In [80]:
# Load packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


In [81]:
# load the data
data = pd.read_csv('complete_data.csv')
data.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,category
0,1.0,True,"03 25, 2017",A2K5ZBX75J80TX,B009MA34NY,"{'Size:': ' 9 W US', 'Color:': ' Cool Grey / V...",Marjorie A. Dunbar,Ordered 9(m) received 9 Wide for the second ti...,One Star,1490400000,,,amazon_fashion
1,1.0,True,"09 11, 2016",A3SM5XENOINDNH,B010RRWKT4,"{'Size:': ' 7.5 B(M) US', 'Color:': ' Black/Wh...",Marielys Rodrguez,After using this shoes seven times for regular...,Poor Quality. Don't buy.,1473552000,14.0,['https://images-na.ssl-images-amazon.com/imag...,amazon_fashion
2,1.0,True,"05 21, 2015",A36GNVEQP92OPA,B0017LGD34,,Peter B. Thorp,I'm not sure I was shipped the correct style. ...,Huh,1432166400,,,amazon_fashion
3,1.0,True,"02 21, 2017",ABX2R8Q6GBTJ8,B000YFSR5G,"{'Size:': ' Medium', 'Color:': ' Black'}",Sammy C.,Nice pants but too small for a medium. Return it.,Returned also,1487635200,,,amazon_fashion
4,1.0,True,"12 15, 2016",A3PTZ7IHGU9BA8,B009MA34NY,"{'Size:': ' 7 B(M) US', 'Color:': ' Black/Red ...",Amazon Customer,wrong shoes,One Star,1481760000,,,amazon_fashion


In [82]:
# check the shape of the data
data['reviewText'] = data['reviewText'].fillna('No review').astype(str)
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

# check the shape of the train and test data
print('shape of training set: ', train_data.shape)
print('shape of test set: ', val_data.shape)


# check the distribution of the train and test data
print('training dist', train_data['overall'].value_counts(normalize=True))
print('testing dist', val_data['overall'].value_counts(normalize=True))



# check the shape of the train and validation data
print('shape of training set: ', train_data.shape)
print('shape of validation set: ', val_data.shape)


shape of training set:  (5151, 13)
shape of test set:  (573, 13)
training dist overall
3.0    0.252378
1.0    0.251796
2.0    0.249466
4.0    0.246360
Name: proportion, dtype: float64
testing dist overall
4.0    0.282723
2.0    0.254799
1.0    0.233857
3.0    0.228621
Name: proportion, dtype: float64
shape of training set:  (5151, 13)
shape of validation set:  (573, 13)


In [83]:
# print the first review and its rating using iloc
print(f"First review: {train_data['reviewText'].iloc[0]}")
print(f"First rating: {train_data['overall'].iloc[0]}")

# print the length of the first review
print(f"Length of first review: {len(train_data['reviewText'].iloc[0])}")

# print the average, max, and min length of the reviews
review_lengths = [len(t) for t in train_data['reviewText']]
print(f"Average length of text: {np.mean(review_lengths)}")
print(f"Max length of text: {np.max(review_lengths)}")
print(f"Min length of text: {np.min(review_lengths)}")


First review: Great product - my wife loves it
First rating: 4.0
Length of first review: 32
Average length of text: 189.71927781013395
Max length of text: 2654
Min length of text: 1


In [84]:
# Feature Extraction with TF-IDF
max_features = 200  # Adjust as necessary

tfidf = TfidfVectorizer(max_features=max_features)
X_train_tfidf = tfidf.fit_transform(train_data['reviewText'])
X_val_tfidf = tfidf.transform(val_data['reviewText'])

# Gradient Boosting Classifier
gbc = GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=42)
gbc.fit(X_train_tfidf, train_data['overall'])

# Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)
rf.fit(X_train_tfidf, train_data['overall'])

# Voting Classifier
voting_clf = VotingClassifier(
    estimators=[('gbc', gbc), ('rf', rf)], 
    voting='soft'
)

# Fit the Voting Classifier
voting_clf.fit(X_train_tfidf, train_data['overall'])


In [85]:
# Prediction and Evaluation
y_pred = voting_clf.predict(X_val_tfidf)

# Print the classification report
print("Classification Report:\n")
print(classification_report(val_data['overall'], y_pred))

# Print the accuracy score
accuracy = accuracy_score(val_data['overall'], y_pred)
print("Accuracy Score:", accuracy)

Classification Report:

              precision    recall  f1-score   support

         1.0       0.89      0.88      0.88       134
         2.0       0.89      0.84      0.87       146
         3.0       0.76      0.79      0.77       131
         4.0       0.77      0.79      0.78       162

    accuracy                           0.82       573
   macro avg       0.83      0.82      0.83       573
weighted avg       0.83      0.82      0.82       573

Accuracy Score: 0.8237347294938918


alternative using grid search

In [86]:
# Parameter grid for Gradient Boosting Classifier
param_grid_gbc = {
    'gbc__n_estimators': [50, 100, 200],
    'gbc__max_depth': [3, 5, 10, 15]
}

# Parameter grid for Random Forest Classifier
param_grid_rf = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [3, 5, 10, 15]
}

# Combine the parameter grids
param_grid = {**param_grid_gbc, **param_grid_rf}

# Voting Classifier
voting_clf = VotingClassifier(
    estimators=[('gbc', GradientBoostingClassifier(random_state=42)), 
                ('rf', RandomForestClassifier(random_state=42))], 
    voting='soft'
)

# Grid Search with cross-validation
grid_search = GridSearchCV(voting_clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the Grid Search to the data
grid_search.fit(X_train_tfidf, train_data['overall'])

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
predictions = best_model.predict(X_val_tfidf)


Best parameters: {'gbc__max_depth': 15, 'gbc__n_estimators': 100, 'rf__max_depth': 15, 'rf__n_estimators': 100}
Best score: 0.9172968086408708


In [87]:
# Print the classification report
print("Classification Report:\n")
print(classification_report(val_data['overall'], predictions))

# Print the accuracy score
accuracy = accuracy_score(val_data['overall'], predictions)
print("Accuracy Score:", accuracy)

Classification Report:

              precision    recall  f1-score   support

         1.0       0.97      0.95      0.96       134
         2.0       0.96      0.94      0.95       146
         3.0       0.90      0.92      0.91       131
         4.0       0.90      0.91      0.91       162

    accuracy                           0.93       573
   macro avg       0.93      0.93      0.93       573
weighted avg       0.93      0.93      0.93       573

Accuracy Score: 0.9301919720767888


In [105]:
# Load test data
test_data = pd.read_csv('luxury_beauty.csv')

In [113]:
# check the shape of the data
test_data['reviewText'] = test_data['reviewText'].fillna('No review').astype(str)

# check the shape of the train and test data
print('shape of test set: ', test_data.shape)

# check the distribution of the train and test data
print('test dist', test_data['overall'].value_counts(normalize=True))

# check the shape of the train and validation data
print('shape of test set: ', test_data.shape)

shape of test set:  (17750, 12)
test dist overall
4.0    0.708282
3.0    0.135887
2.0    0.081070
1.0    0.074761
Name: proportion, dtype: float64
shape of test set:  (17750, 12)


In [107]:
X_test_tfidf = tfidf.fit_transform(test_data['reviewText'])
X_data_tfidf = tfidf.fit_transform(data['reviewText'])

In [109]:
# Retrain Model with Best Parameters and All Data
# Gradient Boosting Classifier
gbc = GradientBoostingClassifier(n_estimators=100, max_depth=15, random_state=42)
gbc.fit(X_data_tfidf, data['overall'])

# Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=42)
rf.fit(X_data_tfidf, data['overall'])

# Voting Classifier
voting_clf = VotingClassifier(
    estimators=[('gbc', gbc), ('rf', rf)], 
    voting='soft'
)

# Fit the Voting Classifier
voting_clf.fit(X_data_tfidf, data['overall'])

In [110]:
# Prediction and Evaluation
y_pred = voting_clf.predict(X_test_tfidf)

# Print the classification report
print("Classification Report:\n")
print(classification_report(test_data['overall'], y_pred))

# Print the accuracy score
accuracy = accuracy_score(test_data['overall'], y_pred)
print("Accuracy Score:", accuracy)

Classification Report:

              precision    recall  f1-score   support

         1.0       0.08      0.08      0.08      1327
         2.0       0.08      0.03      0.05      1439
         3.0       0.15      0.15      0.15      2412
         4.0       0.73      0.77      0.75     12572

    accuracy                           0.57     17750
   macro avg       0.26      0.26      0.26     17750
weighted avg       0.55      0.57      0.56     17750

Accuracy Score: 0.5745352112676056
