In [1]:
# Imports
import re
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import seaborn as sns
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.svm import LinearSVR
from sklearn.metrics import accuracy_score


# Fetch the NLTK corpora / lexicons this notebook relies on
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'vader_lexicon'):
    nltk.download(resource)

# bug fix for matplotlib version error
# If the rcParams object has no `_get` attribute, attach one that performs a plain
# item lookup (`rcParams[key]`).
# NOTE(review): presumably some caller (e.g. seaborn) expects `rcParams._get` to exist
# on this matplotlib version — confirm which version pair actually needs this shim.
if not hasattr(matplotlib.rcParams, "_get"):
    def _get(key):
        return matplotlib.rcParams[key]
    matplotlib.rcParams._get = _get


In [2]:
# Load the raw review dataset
df = pd.read_csv('Reviews.csv')
# 'Time' holds Unix epoch seconds; convert it to pandas datetimes
df['Time'] = pd.to_datetime(df['Time'], unit='s')
# Keep only reviews written in 2010 (author's note: 57979 rows), then drop
# reviews whose text is an exact duplicate
df_2010 = df[df['Time'].dt.year == 2010]
df_2010 = df_2010.drop_duplicates(subset='Text')

keep = ['Id', 'Score', 'Time', 'Summary', 'Text']
# .copy() makes df_final an independent frame: without it the column assignments
# below write to a slice of df_2010 and raise SettingWithCopyWarning.
df_final = df_2010[keep].copy()
# Simple surface features of the review text
df_final["Exclamation"] = df_final["Text"].str.count("!")     # number of '!' characters
df_final["Length"] = df_final["Text"].str.split().str.len()   # whitespace-separated word count
df_final["Question"] = df_final["Text"].str.count(r'\?')      # number of '?' characters

print(df_final.size)   # rows * columns
print(df_final.shape)  # (rows, columns)


In [3]:
# Preview the first 20 rows of the filtered 2010 frame
df_final.head(20)

In [4]:
CAP5 = 6000  # upper bound on the number of 5-star reviews kept

# Only the 5-star reviews are down-sampled; every other rating is kept in full
is_five = df_final['Score'] == 5
five = df_final[is_five]
rest = df_final[~is_five]
five_down = five.sample(n=min(CAP5, len(five)), random_state=42)

# Recombine, shuffle with a fixed seed, and renumber the rows
df_cut = (
    pd.concat([five_down, rest], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Bar chart: number of reviews left for each star rating
counts = df_cut['Score'].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(counts.index, counts.values)
ax.set_xticks([1, 2, 3, 4, 5])
ax.set_xlabel('Score')
ax.set_ylabel('Count')
ax.set_title('Review count by score')
fig.tight_layout()
plt.show()


The final dataset contains 26,060 reviews after cleaning and balancing (the figure 156,360 is the DataFrame's `.size`, i.e. rows × columns), with 5-star reviews capped at 6,000.

In [5]:
# Normalise the text column (missing values become "") and count the
# whitespace-separated words in each review
df_cut = df_cut.assign(
    Text=lambda frame: frame['Text'].fillna("").astype(str),
    n_words=lambda frame: frame['Text'].str.split().str.len(),
)
# Mean word count per star rating, rounded to 2 decimals
avg_len = df_cut.groupby('Score')[['n_words']].mean().round(2)

ax = sns.barplot(data=df_cut, x='Score', y='n_words')
ax.set_title('Average review length (in words) by Star Rating')
plt.show()


In [6]:
# Display the balanced, shuffled review frame
df_cut

In [7]:
# VADER sentiment analyser (uses the 'vader_lexicon' resource downloaded above)
sia = SentimentIntensityAnalyzer()


In [8]:
# Sanity check: score a sample sentence that mixes negative and positive words
sia.polarity_scores('and depression and happiness six seven')

In [9]:
# Score every review with VADER; `res` maps review Id -> dict of polarity scores.
# Zipping the two needed columns avoids DataFrame.iterrows(), which builds a
# full Series for every row and is far slower on a frame of this size.
res = {
    review_id: sia.polarity_scores(review_text)
    for review_id, review_text in zip(df_cut['Id'], df_cut['Text'])
}

In [16]:
# One row per review Id with its VADER scores, joined to the review columns of df_cut
vaders = (
    pd.DataFrame(res)
    .T                      # scores become columns, review Ids become the index
    .rename_axis('Id')
    .reset_index()
    .merge(df_cut, how='left', on='Id')
)

In [17]:
# Preview the first 5 rows of the balanced frame
df_cut.head(5)

In [18]:
# Preview the VADER scores joined with the review columns
vaders.head()

In [19]:
# VADER compound score plotted against the star rating
fig, ax = plt.subplots()
sns.barplot(data=vaders, x='Score', y='compound', ax=ax)
ax.set_title('Compound Score by Amazon Review')
plt.show()

In [20]:
# One panel per VADER component (positive / neutral / negative) against star rating
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for panel_ax, (component, panel_title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=component, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

In [24]:
# Start from NLTK's English stop words
stopwords_list = set(stopwords.words("english"))
print(f'List of stopwords:\n{stopwords_list}\n')

# Negation words that must NOT be treated as stop words
no_stopwords = ["not","don't",'aren','don','ain',"aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
               'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
               "won't", 'wouldn', "wouldn't"]
# difference_update ignores entries that are absent from the set (set.remove would
# raise KeyError), and it leaves `no_stopwords` bound to the list, so the cell can
# be re-run safely. The previous loop reused the list's name as its loop variable.
stopwords_list.difference_update(no_stopwords)

print(f'Final list of stopwords:\n{stopwords_list}')

In [27]:
# The 'wordnet' and 'omw-1.4' resources are already downloaded in the imports cell,
# so they are not fetched again here.

# Use the customised stop-word set built above (negation words removed) rather than
# the raw NLTK list: the raw list contains "not", which would be filtered out of
# every review and undo the negation handling in the text-cleaning step.
stop_words = set(stopwords_list)
lemmatizer = WordNetLemmatizer()

def lemma_stem_text(words_list):
    """Lemmatise each token twice: first with the default POS, then as a verb.

    Args:
        words_list: iterable of string tokens.

    Returns:
        A list with one lower-cased, lemmatised string per input token.
    """
    lemmas = []
    for token in words_list:
        first_pass = lemmatizer.lemmatize(token.lower())
        # Second pass treats the intermediate result as a verb ("v")
        lemmas.append(lemmatizer.lemmatize(first_pass.lower(), "v"))
    return lemmas

# Quick demo of lemma_stem_text on a single word
word_example = "feet"
print(f'The word "{word_example}" is transformed to "{lemma_stem_text([word_example])[0]}"')

In [28]:

# Matches the contracted negation suffix with a straight or typographic apostrophe,
# in any letter case ("n't", "N'T", "n’t").
re_negation = re.compile(r"n['\u2019]t", re.IGNORECASE)

# Contractions whose stem is not a word once "n't" is stripped
# (can't -> "ca", won't -> "wo", shan't -> "sha"), with their full forms.
_re_irregular_negation = re.compile(r"\b(ca|wo|sha)n['\u2019]t", re.IGNORECASE)
_IRREGULAR_NEGATION_STEMS = {"ca": "can", "wo": "will", "sha": "shall"}


def negation_abbreviated_to_standard(sent):
    """Expand contracted negations in `sent` so that each contains the word "not".

    Irregular contractions are expanded to their full verb first
    ("can't" -> "can not ", "won't" -> "will not ", "shan't" -> "shall not ");
    every remaining "n't" is then replaced by " not " ("aren't" -> "are not ").

    Args:
        sent: the text to transform.

    Returns:
        The text with contracted negations expanded. The replacement keeps a
        trailing space, so extra whitespace may appear.
    """
    sent = _re_irregular_negation.sub(
        lambda match: _IRREGULAR_NEGATION_STEMS[match.group(1).lower()] + " not ",
        sent,
    )
    return re_negation.sub(" not ", sent)

word_example = "i aren't"
print(f'The sentence {word_example} is transformed to "{negation_abbreviated_to_standard(word_example)}"')

def review_to_words(raw_review):
    """Turn one raw review into a cleaned, space-joined string of tokens.

    Steps: strip HTML markup with BeautifulSoup, expand "n't" contractions,
    replace every non-alphanumeric character with a space, lower-case and split
    on whitespace, drop tokens found in the module-level `stop_words`, and
    lemmatise what is left.

    Args:
        raw_review: the review text; it is converted with str() first.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    plain_text = BeautifulSoup(str(raw_review), "html.parser").get_text()
    expanded = negation_abbreviated_to_standard(plain_text)
    alnum_only = re.sub("[^a-zA-Z0-9]", " ", expanded)
    kept_tokens = (
        lemmatizer.lemmatize(token)
        for token in alnum_only.lower().split()
        if token not in stop_words
    )
    return " ".join(kept_tokens)

# Clean the balanced frame (df_cut), which is the one the model is built from.
# Cleaning df_final instead would process every 2010 review and the result
# would never be used downstream.
df_cut['clean_text'] = df_cut['Text'].apply(review_to_words)


In [29]:
# Attach the VADER scores to the balanced review frame (inner join on Id)
sentiment_columns = ['Id', 'compound', 'pos', 'neg', 'neu']
df_model = df_cut.merge(vaders[sentiment_columns], on='Id', how='inner')

# Punctuation features: number of '!' and '?' characters in each review
df_model = df_model.assign(
    Exclamation_Count=df_model['Text'].str.count('!'),
    Question_Count=df_model['Text'].str.count(r'\?'),
)

# Build 'clean_text' only when the merged frame does not already carry it
if 'clean_text' not in df_model.columns:
    df_model['clean_text'] = df_model['Text'].apply(review_to_words)

# Report the frame size and the total number of missing cells
print(f"Data Shape: {df_model.shape}")
print(f"Missing Values: {df_model.isnull().sum().sum()}")

In [30]:
# Target: the star rating
y = df_model['Score']

# Numeric side features that accompany the TF-IDF text features
meta_columns = ['compound', 'Exclamation_Count', 'Question_Count']

# train_test_split returns one (train, test) pair per input, in input order
(X_train_text, X_test_text,
 y_train, y_test,
 X_train_meta, X_test_meta) = train_test_split(
    df_model['clean_text'],
    y,
    df_model[meta_columns],
    test_size=0.2,
    random_state=42,
)

# TF-IDF over unigrams and bigrams; the vocabulary is learned from the training text only
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

# Standardise the meta features using statistics fitted on the training split only
scaler = StandardScaler().fit(X_train_meta)
X_train_meta_scaled = scaler.transform(X_train_meta)
X_test_meta_scaled = scaler.transform(X_test_meta)

# Place the text features and the meta features side by side
X_train_combined = hstack((X_train_tfidf, X_train_meta_scaled))
X_test_combined = hstack((X_test_tfidf, X_test_meta_scaled))

# Sanity check: columns = up to 20000 TF-IDF terms + the 3 meta features
# (author's note: the training set is 20848 rows)
print(f"Training Matrix Shape: {X_train_combined.shape}")

In [61]:
# Train
# Ridge regression treats the 1-5 star rating as a continuous target.
# (LinearRegression and SGDRegressor were tried as alternatives. The SGDRegressor
# assignment was dead code, overwritten by Ridge before fitting, so it is removed.)
reg_model = Ridge(alpha=2.0)
reg_model.fit(X_train_combined, y_train)

# Predict
y_pred_raw = reg_model.predict(X_test_combined)

# Limit predictions to the valid 1-5 star range
y_pred_clipped = np.clip(y_pred_raw, 1, 5)

# Round half up to whole stars (2.5 becomes 3). np.round rounds halves to the nearest
# even integer (2.5 -> 2), so floor(x + 0.5) is used to match the intended behaviour.
y_pred_final = np.floor(y_pred_clipped + 0.5).astype(int)

# Evaluate
# Note: MSE is computed on the clipped, rounded star predictions, while R2 is
# computed on the raw regression output.
mse = mean_squared_error(y_test, y_pred_final)
r2 = r2_score(y_test, y_pred_raw)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R2 Score: {r2:.4f}")  # 1.0 is perfect; 0.0 equals always predicting the mean rating
print(f"Min Prediction: {y_pred_raw.min():.2f}")
print(f"Max Prediction: {y_pred_raw.max():.2f}")

# View Predictions
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_final})
print(results.head(10))

In [62]:
# Confusion matrix for visualization
# Compares the true star ratings (rows) with the rounded predictions (columns).

print("\nConfusion Matrix (Rows=True, Cols=Pred):")
print(confusion_matrix(y_test, y_pred_final))

In [63]:
# Overlay the distribution of actual ratings and of predicted ratings
fig, ax = plt.subplots(figsize=(10, 5))

series_to_plot = [
    (y_test, 'blue', 'Actual Values'),
    (y_pred_final, 'orange', 'Predicted Values'),
]
for values, colour, legend_label in series_to_plot:
    sns.histplot(values, color=colour, alpha=0.5, label=legend_label, discrete=True, ax=ax)

ax.legend()
ax.set_title('Actual vs Predicted Distribution')
plt.show()

In [64]:
# Share of test reviews whose predicted star rating matches exactly
exact_acc = accuracy_score(y_test, y_pred_final)

# Share of test reviews whose prediction is off by at most one star
diff = (y_test - y_pred_final).abs()
within_one_acc = (diff <= 1).mean()

print("Model Accuracy (in percent)")
print(f"Exact Star Rating:      {exact_acc:.2%}")
print(f"Within 1 Star Rating:   {within_one_acc:.2%}")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3cf9eecd-ea28-4073-b66c-82b4a5f801b0' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>