In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [3]:
df = pd.read_csv('book_w_images_&_desc.csv')

In [4]:
df.head(1)

Unnamed: 0,bookformat,desc,pages,rating,reviews,totalratings,cover_image
0,Paperback,ship traveled universe longer crew recall true...,512,3.82,110,2051,cover_images/processed_image_111.jpg


In [5]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['desc'], df['rating'], test_size = 0.2, random_state = 42)

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
predictions = model.predict(X_test_tfidf)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.051662756605822126


In [6]:
feature_names = vectorizer.get_feature_names_out()

coefficients = model.coef_

coefficients_df = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})

top_words = coefficients_df.sort_values(by = 'coefficient', ascending = False).head(10)

print('For high ratings, the top ten most important words were: ')
print(top_words)

For high ratings, the top ten most important words were: 
           feature  coefficient
18493        share     1.198828
22553      warrior     0.755163
12304        luffy     0.744516
17785       sakura     0.606792
13540  morganville     0.579293
12943    meanwhile     0.557274
13790       nanami     0.555096
8650          goku     0.539937
23001   wonderland     0.503053
26            abel     0.498316


In [7]:
low_rating_words = coefficients_df.sort_values(by = 'coefficient', ascending = True).head(10)
print('For low ratings, the top ten most important words were: ')
print(low_rating_words)

For low ratings, the top ten most important words were: 
       feature  coefficient
18496   shares    -1.929606
14041    nicky    -0.662036
22859    willo    -0.602208
16375    rafer    -0.601220
8548     girls    -0.598668
19209     soon    -0.561232
7566   felicia    -0.560172
5134     debut    -0.558334
2906    calexa    -0.529575
11023  journal    -0.519264


In [8]:
df_new = pd.DataFrame({
    'title': ['Iron Widow', 'The Long Way to a Small Angry Planet', 'Tress of the Emerald Sea'],
    'desc': ["The boys of Huaxia dream of pairing up with girls to pilot Chrysalises, giant transforming robots that can battle the mecha aliens that lurk beyond the Great Wall. It doesn't matter that the girls often die from the mental strain. When 18-year-old Zetian offers herself up as a concubine-pilot, it's to assassinate the ace male pilot responsible for her sister's death. But she gets her vengeance in a way nobody expected—she kills him through the psychic link between pilots and emerges from the cockpit unscathed. She is labeled an Iron Widow, a much-feared and much-silenced kind of female pilot who can sacrifice boys to power up Chrysalises instead. To tame her unnerving yet invaluable mental strength, she is paired up with Li Shimin, the strongest and most controversial male pilot in Huaxia. But now that Zetian has had a taste of power, she will not cower so easily. She will miss no opportunity to leverage their combined might and infamy to survive attempt after attempt on her life, until she can figure out exactly why the pilot system works in its misogynist way—and stop more girls from being sacrificed.",
             "Follow a motley crew on an exciting journey through space-and one adventurous young explorer who discovers the meaning of family in the far reaches of the universe-in this light-hearted debut space opera from a rising sci-fi star. Rosemary Harper doesn’t expect much when she joins the crew of the aging Wayfarer. While the patched-up ship has seen better days, it offers her a bed, a chance to explore the far-off corners of the galaxy, and most importantly, some distance from her past. An introspective young woman who learned early to keep to herself, she’s never met anyone remotely like the ship’s diverse crew, including Sissix, the exotic reptilian pilot, chatty engineers Kizzy and Jenks who keep the ship running, and Ashby, their noble captain. Life aboard the Wayfarer is chaotic and crazy—exactly what Rosemary wants. It’s also about to get extremely dangerous when the crew is offered the job of a lifetime. Tunneling wormholes through space to a distant planet is definitely lucrative and will keep them comfortable for years. But risking her life wasn’t part of the plan. In the far reaches of deep space, the tiny Wayfarer crew will confront a host of unexpected mishaps and thrilling adventures that force them to depend on each other. To survive, Rosemary’s got to learn how to rely on this assortment of oddballs—an experience that teaches her about love and trust, and that having a family isn’t necessarily the worst thing in the universe.",
             "The only life Tress has known on her island home in an emerald-green ocean has been a simple one, with the simple pleasures of collecting cups brought by sailors from faraway lands and listening to stories told by her friend Charlie. But when his father takes him on a voyage to find a bride and disaster strikes, Tress must stow away on a ship and seek the Sorceress of the deadly Midnight Sea. Amid the spore oceans where pirates abound, can Tress leave her simple life behind and make her own place sailing a sea where a single drop of water can mean instant death?"]
})

In [9]:
# Preprocess new text data
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):
    words = word_tokenize(text.lower())
    words = [ps.stem(word) for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)

In [13]:
df_new.desc = df_new.desc.apply(preprocess_text)

new_X_tfidf = vectorizer.transform(df_new.desc)

new_predictions = model.predict(new_X_tfidf)

df_new['predicted_rating'] = new_predictions

print(df_new[['title', 'predicted_rating']])

                                  title  predicted_rating
0                            Iron Widow          3.818360
1  The Long Way to a Small Angry Planet          3.717837
2              Tress of the Emerald Sea          3.847237
