## CS 349 Final Project
Ian Shi, Ellen Liao, Tim Fu

In [None]:
# importing libraries 
import numpy as np
import pandas as pd
import math

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import GenericUnivariateSelect, RFECV, chi2
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from scipy.stats import randint

from sklearn import svm


In [None]:
# downloads for python nltk (sentiment analysis)
# Everything below is wrapped in a string literal, so none of it executes.
# If the quotes are removed, the code replaces ssl's default HTTPS context
# factory with the unverified one (when that attribute exists) and then calls
# nltk.download('all').

"""
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('all')
"""

In [None]:
# loading in preprocessed dataset to save time for analysis
# prod_train = pd.read_json("./preprocessed_data.json")

### Loading Dataset

In [None]:
# Read the review and product training files, in that order.
CD_rev_train, CD_prod_train = (
    pd.read_json(json_path)
    for json_path in (
        "./CDs_and_Vinyl/train/review_training.json",
        "./CDs_and_Vinyl/train/product_training.json",
    )
)
# Start prod_train from the cached preprocessed data so the preprocessing
# cells do not have to be rerun every time.
prod_train = pd.read_json("./preprocessed_data.json")

In [None]:
# Print the column names of the review table, then of the product table.
for raw_table in (CD_rev_train, CD_prod_train):
    print(list(raw_table.columns))

### Preprocessing

In [None]:
# Per-product (asin) aggregates computed from the individual reviews.

# Clean 'vote' before aggregating. Large counts can be written with a
# thousands separator (e.g. "1,234") -- presumably the case for this dataset,
# TODO confirm. Without stripping the commas, to_numeric coerces those values
# to NaN and they are then silently counted as 0 votes.
vote_text = CD_rev_train['vote'].astype(str).str.replace(',', '', regex=False)
CD_rev_train['vote'] = pd.to_numeric(vote_text, errors='coerce').fillna(0).astype(int)

# Turn 'image' into a has-image flag. Skip the conversion when the column is
# already boolean, otherwise re-running this cell would mark every review as
# having an image (notna() of a boolean column is always True).
if CD_rev_train['image'].dtype != bool:
    CD_rev_train['image'] = CD_rev_train['image'].notna()

# Group once, after the columns above are cleaned, and reuse the grouping.
reviews_by_product = CD_rev_train.groupby('asin')

# number of unique reviewers per product
prod_reviews = reviews_by_product['reviewerID'].nunique()

# average review upvote count per product
prod_votes = reviews_by_product['vote'].mean()

# proportion of verified reviews per product
prod_ver = reviews_by_product['verified'].mean()

# proportion of reviews that include images per product
prod_image = reviews_by_product['image'].mean()

# time of the earliest review per product
earliest_rev = reviews_by_product['unixReviewTime'].min()



#### Sentiment Analysis using Python nltk

In [None]:
# sentiment analysis preprocessing
# Filled on the first preprocess() call so the nltk corpora are only touched
# when text is actually processed.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, creating both on first use."""
    if not _PREPROCESS_CACHE:
        # A frozenset gives constant-time membership tests; the original list
        # was reloaded and scanned linearly for every review.
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess(text):
    """Normalise one piece of review text for sentiment analysis.

    The text is lower-cased and tokenized, tokens that are not purely
    alphabetic are dropped, English stop words are removed, and the remaining
    tokens are lemmatized.

    Args:
        text: the raw text as a str (callers fill missing values with ''
            before applying this function).

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    stop_words, lemmatizer = _preprocess_resources()
    alphabetic = (token for token in word_tokenize(text.lower()) if token.isalpha())
    return ' '.join(
        lemmatizer.lemmatize(token) for token in alphabetic if token not in stop_words
    )

In [None]:
# Some reviewText / summary entries are missing: replace those with an empty
# string, then run every entry through preprocess.
for text_column in ('reviewText', 'summary'):
    CD_rev_train[text_column] = CD_rev_train[text_column].fillna('').apply(preprocess)
# Final expression of the cell, left bare so the notebook displays it.
CD_rev_train['summary']

In [None]:
# running nltk sentiment analysis
analyzer = SentimentIntensityAnalyzer()


def comp_sentiment(text):
    """Return the 'compound' entry of the analyzer's polarity scores for text."""
    return analyzer.polarity_scores(text)['compound']

def sentiment(text):
    """Return 1 when the 'compound' polarity score of text is above 0, else 0."""
    compound = analyzer.polarity_scores(text)['compound']
    return 1 if compound > 0 else 0

def pos_sentiment(text):
    """Return the 'pos' entry of the analyzer's polarity scores for text."""
    return analyzer.polarity_scores(text)['pos']

def neg_sentiment(text):
    """Return the 'neg' entry of the analyzer's polarity scores for text."""
    return analyzer.polarity_scores(text)['neg']

In [None]:
# running nltk sentiment analysis, criteria in the sentiment function could definitely change
# NOTE(review): this cell creates `analyzer` again and redefines `sentiment`.
# When it runs after the cell that defines the 1/0 `sentiment`, this raw-score
# version is the one later cells call -- confirm that is intended.
analyzer = SentimentIntensityAnalyzer()


# just raw polarity score output (and then averaging them across all reviews for certain prod later)
# analyzer.polarity_scores returns -1 to 1
def sentiment(text):
    """Return the raw 'compound' polarity score of text.

    Per the nltk documentation consulted by the authors, 'compound' is the
    aggregate sentiment, whereas 'pos' gives positive sentiment only (so
    negative sentiment isn't subtracted).
    """
    return analyzer.polarity_scores(text)['compound']

In [None]:
# applying sentiment analysis to dataset
# Each text is scored once and all four columns are derived from that single
# result, instead of calling the analyzer separately for every column.
# The 1/0 columns are computed directly from the compound score because the
# name `sentiment` is defined twice in this notebook (a 1/0 version and a
# raw-compound version); calling it here stored raw compound scores in the
# columns that are meant to be binary whenever the second definition was live.
_SCORE_KEYS = ['neg', 'neu', 'pos', 'compound']
for text_column, prefix in (('reviewText', 'rev'), ('summary', 'summ')):
    polarity = pd.DataFrame(
        CD_rev_train[text_column].map(analyzer.polarity_scores).tolist(),
        index=CD_rev_train.index,
        columns=_SCORE_KEYS,
    )

    # compound sentiment values
    CD_rev_train[prefix + '_compSentiment'] = polarity['compound']

    # 1 or 0 sentiment values (1 when the compound score is positive)
    CD_rev_train[prefix + '_Sentiment'] = (polarity['compound'] > 0).astype(int)

    # pos sentiment values
    CD_rev_train[prefix + '_posSentiment'] = polarity['pos']

    # neg sentiment values
    CD_rev_train[prefix + '_negSentiment'] = polarity['neg']

In [None]:
# Per-product (asin) averages of the review-level sentiment columns.
_sentiment_by_product = CD_rev_train.groupby('asin')

# mean compound score of the review texts / summaries
rev_compSentiment = _sentiment_by_product['rev_compSentiment'].mean()
summ_compSentiment = _sentiment_by_product['summ_compSentiment'].mean()

# mean 'pos' score of the review texts / summaries
rev_posSentiment = _sentiment_by_product['rev_posSentiment'].mean()
summ_posSentiment = _sentiment_by_product['summ_posSentiment'].mean()

# mean 'neg' score of the review texts / summaries
rev_negSentiment = _sentiment_by_product['rev_negSentiment'].mean()
summ_negSentiment = _sentiment_by_product['summ_negSentiment'].mean()

# product-level flag: 1 when the mean of the per-review column is at least 0.5, else 0
rev_Sentiment = _sentiment_by_product['rev_Sentiment'].mean().ge(0.5).astype('int64')
summ_Sentiment = _sentiment_by_product['summ_Sentiment'].mean().ge(0.5).astype('int64')

In [None]:
# combining datasets into one (one column per aggregate, indexed by asin)
# earliest_rev (column 'unixReviewTime') is included because the model cells
# further down select 'unixReviewTime' as a feature; it was computed above but
# never added to the table. It is placed before prod_image so that 'image'
# stays the last of these base columns.
prod_train = pd.concat(
    [
        prod_reviews, prod_votes, prod_ver,
        rev_Sentiment, summ_Sentiment,
        rev_compSentiment, summ_compSentiment,
        rev_posSentiment, summ_posSentiment,
        rev_negSentiment, summ_negSentiment,
        earliest_rev, prod_image,
    ],
    axis=1,
)

In [None]:
# Join the product table (expected awesomeness) onto the per-product features:
# the feature table's index is matched against the product table's 'asin' column.
prod_train = pd.merge(prod_train, CD_prod_train, left_index=True, right_on='asin')
prod_train

In [None]:
# downloading preprocessed training data to separate json - i have the git repo cloned to a folder
# prod_train.to_json(r'/Users/IanShi/Desktop/Classwork/CS 349/cs349-project/preprocessed_data.json')

In [None]:
# Ratio features. Numerator and denominator are both shifted by +1 before dividing.

# positive to negative sentiment ratio (review text, then summary)
rev_pos_shifted = prod_train['rev_posSentiment'].add(1)
rev_neg_shifted = prod_train['rev_negSentiment'].add(1)
prod_train['rev_posNegRatio'] = rev_pos_shifted.div(rev_neg_shifted)

summ_pos_shifted = prod_train['summ_posSentiment'].add(1)
summ_neg_shifted = prod_train['summ_negSentiment'].add(1)
prod_train['summ_posNegRatio'] = summ_pos_shifted.div(summ_neg_shifted)

# summary to review sentiment ratio (compound scores)
summ_comp_shifted = prod_train['summ_compSentiment'].add(1)
rev_comp_shifted = prod_train['rev_compSentiment'].add(1)
prod_train['summToRev'] = summ_comp_shifted.div(rev_comp_shifted)

## Analysis Models
### Gaussian Naive Bayes
Below is a hand-coded Gaussian naive Bayes function that we tested first. We later switched to sklearn's GaussianNB model for ease of use.

In [None]:
# gaussian naive-bayes setup
# Per-class (awesomeness) mean, variance and row count of the numeric columns.
_by_awesomeness = prod_train.groupby('awesomeness')
mean_by_awe = _by_awesomeness.mean(numeric_only=True)
var_by_awe = _by_awesomeness.var(numeric_only=True)
count_by_awe = _by_awesomeness['asin'].count()

# gaussian naive-bayes
# Every numeric column except 'image' is used as a feature. 'image' is
# excluded by name: the original dropped position 12, which only removes
# 'image' for one particular column order (a later cell documents that the
# dropped feature is meant to be 'image').
features = [column for column in mean_by_awe.columns if column != 'image']
def gaussian_nb(feature_vec: pd.Series):
    """Classify one row with a hand-written Gaussian naive Bayes rule.

    Reads the module-level tables `count_by_awe`, `mean_by_awe` and
    `var_by_awe` (indexed by class label 0 / 1) and the list `features`.

    For each class the score is log(class count) plus the sum over features of
    the Gaussian log-density
        -0.5 * log(2 * pi * var) - (x - mean)^2 / (2 * var).
    The original divided the squared deviation by 2 * var^2, which is not the
    normal density (var already is sigma^2). Scores are accumulated in log
    space so that multiplying many small densities cannot underflow to 0.

    Args:
        feature_vec: a row holding a value for every name in `features`.

    Returns:
        0 if class 0 has the strictly larger score, otherwise 1.
    """
    # Floor for the variance so a constant feature does not cause a division
    # by zero / log(0); it has no effect on non-degenerate variances.
    min_variance = 1e-9

    log_scores = []
    for label in (0, 1):
        # Class counts act as the (unnormalised) prior.
        log_score = math.log(count_by_awe.at[label])
        for feature in features:
            variance = max(var_by_awe.at[label, feature], min_variance)
            deviation = feature_vec.at[feature] - mean_by_awe.at[label, feature]
            log_score += (-0.5 * math.log(2 * math.pi * variance)
                          - deviation ** 2 / (2 * variance))
        log_scores.append(log_score)

    if log_scores[0] > log_scores[1]:
        return 0
    return 1

In [None]:
# apply gaussian naive-bayes to dataset
# (predictions are made on the same rows the class statistics came from)
gaussianNB = prod_train.apply(gaussian_nb, axis=1)

# compute accuracy, f1
# Predictions and labels are compared position by position on plain arrays.
# The original looked both up with the labels 0..n-1, which raises KeyError
# whenever prod_train's index is not exactly that range.
predicted = gaussianNB.to_numpy()
actual = prod_train['awesomeness'].to_numpy()
mismatch = predicted != actual

num_correct = int((~mismatch).sum())
tp = int(((predicted == 1) & ~mismatch).sum())   # predicted 1, correct
fp = int(((predicted == 1) & mismatch).sum())    # predicted 1, wrong
fn = int(((predicted == 0) & mismatch).sum())    # predicted 0, wrong

total = len(prod_train)
f1_denominator = 2*tp + fp + fn
# Guard both divisions so an empty table or a run without any positive
# prediction/label does not raise ZeroDivisionError.
print("Gaussian NB Accuracy", num_correct/total if total else float('nan'))
print("Gaussian NB F1 Score", 2*tp/f1_denominator if f1_denominator else 0.0)

In [None]:
# gaussian nb setup
# Use every numeric column except 'image'. It is excluded by name; the
# original removed position 12, which depends on the column order.
features = [column for column in mean_by_awe.columns if column != 'image']
X = prod_train[features]
Y = prod_train['awesomeness']

# splitting data into train and test (70/30)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 1)

# gaussian nb with sklearn
# use recursive feature elimination with cv (RFECV) to find best features
# results: ['verified', 'unixReviewTime', 'rev_negSentiment', 'summ_negSentiment']
gnb = GaussianNB()
#rfecv = RFECV(estimator=gnb, scoring='f1', n_jobs=-1, cv=10, verbose=2)
gnb.fit(X_train, Y_train)
gnb_pred = gnb.predict(X_test)
#print("Features:", rfecv.get_feature_names_out(features))
#gnb_cv = pd.DataFrame(rfecv.cv_results_)

# model accuracy on the held-out 30%
print("Gaussian NB Accuracy:",metrics.accuracy_score(Y_test, gnb_pred))

# 10 fold cross validation (F1) over the full feature table
gnb_cv = cross_val_score(gnb, X, Y, cv = 10, scoring='f1')
print("Mean CV F1 Score:", np.mean(gnb_cv))
print("Standard Deviation F1 Score:", np.std(gnb_cv))

### Decision Tree

In [None]:
# decision tree setup
# features = ['reviewerID', 'vote', 'verified', 'rev_Sentiment', 'summ_Sentiment', 'rev_compSentiment', 'summ_compSentiment',
#             'rev_posSentiment', 'summ_posSentiment', 'rev_negSentiment', 'summ_negSentiment', 'image']
#features = ['reviewerID', 'vote', 'verified', 'rev_Sentiment', 'summ_Sentiment', 'image']
# Use every numeric column except 'image'. It is excluded by name; the
# original removed position 12, which depends on the column order.
features = [column for column in mean_by_awe.columns if column != 'image']
X = prod_train[features]
Y = prod_train['awesomeness']

# splitting data into train and test (70/30)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 1)

'''
# use grid search to find best hyperparameters
# the results are criterion: gini/entropy (~same), max_depth: 3
param_grid = {"criterion": ['gini', 'entropy'], "max_depth": [x for x in range(1,11)]}
tree = DecisionTreeClassifier()
gs = GridSearchCV(estimator=tree, param_grid=param_grid, scoring='f1', n_jobs=-1, cv=10, verbose=2)
gs.fit(X_train, Y_train)
tree_pred = gs.predict(X_test)
tree_cv = pd.DataFrame(gs.cv_results_)
'''

"""
# use recursive feature elimination with cv (RFECV) to find best features
# results: ['verified', 'unixReviewTime', 'rev_negSentiment', 'summ_negSentiment']
tree = DecisionTreeClassifier(criterion='entropy', max_depth = 3)
rfecv = RFECV(estimator=tree, scoring='f1', n_jobs=-1, cv=10, verbose=0)
rfecv.fit(X_train, Y_train)
tree_pred = rfecv.predict(X_test)
print("Features:", rfecv.get_feature_names_out(features))
tree_cv = pd.DataFrame(rfecv.cv_results_)
"""

# Features kept by the RFECV run recorded above.
# The tree is given column subsets instead of overwriting X_train / X_test:
# the random forest cell later selects other columns from X_train and raised
# KeyError once X_train had been narrowed to these four.
tree_features = ['verified', 'unixReviewTime', 'rev_negSentiment', 'summ_negSentiment']
tree = DecisionTreeClassifier(criterion='entropy', max_depth = 3)
tree = tree.fit(X_train[tree_features], Y_train)
tree_pred = tree.predict(X_test[tree_features])

# model accuracy
print("Decision Tree Accuracy:",metrics.accuracy_score(Y_test, tree_pred))

# 10 fold cross validation, on the same four features the tree is trained on
# (the original cross-validated on every column of X)
tree_cv = cross_val_score(tree, X[tree_features], Y, cv = 10, scoring='f1')
print("Mean CV F1 Score:", np.mean(tree_cv))
print("Standard Deviation F1 Score:", np.std(tree_cv))

### Random Forest

In [None]:
# random forest

# trying RandomizedSearchCV to find best parameters
# The search below is wrapped in a string literal, so it does not run.
# If re-enabled it samples n_estimators from randint(20,200) and max_depth from
# randint(1,20) over 10 iterations with 10-fold CV scored by F1, then reports
# test accuracy and cross-validated F1 for the best estimator found.
'''
hyperparams = {'n_estimators': randint(20,200),
              'max_depth': randint(1,20)}
rf = RandomForestClassifier(n_jobs=-1)
rand_search = RandomizedSearchCV(rf, param_distributions = hyperparams, n_iter=10, scoring='f1', cv=10, n_jobs=-1)
rand_search.fit(X_train, Y_train)
best_rf = rand_search.best_estimator_
print('Best hyperparameters:',  rand_search.best_params_)

best_pred = best_rf.predict(X_test)
#10 fold cross validation
print("Random Forest Accuracy:",metrics.accuracy_score(Y_test, best_pred))
forest_cv = cross_val_score(best_rf, X, Y, cv = 10, scoring='f1', n_jobs=-1)
print("Mean CV F1 Score:", np.mean(forest_cv))
print("Standard Deviation F1 Score:", np.std(forest_cv))
'''

In [None]:
# using RFECV for feature selection
'''
rf_estimator = RandomForestClassifier(n_estimators = 15, max_depth = 1, n_jobs=-1)
rf_selector = RFECV(estimator = rf_estimator, scoring = 'f1', step = 1, cv = 5, verbose = 2)
rf_selector = rf_selector.fit(X_train, Y_train)
rf_selector.ranking_
# selected features ['verified','rev_posSentiment', 'rev_negSentiment', 'summ_negSentiment', 'rev_posNegRatio', 'summ_posNegRatio', 'summToRev']
'''

# Features selected by the RFECV run recorded above.
rf_features = ['verified','rev_posSentiment', 'rev_negSentiment', 'summ_negSentiment', 'rev_posNegRatio', 'summ_posNegRatio', 'summToRev']

# Split again from the full feature table X rather than slicing the current
# X_train / X_test: an earlier cell may already have narrowed them to columns
# that do not include these features (KeyError). Same test_size and
# random_state as the other cells, so the row split is the same.
X_train, X_test, Y_train, Y_test = train_test_split(X[rf_features], Y, test_size = 0.3, random_state = 1)

rf = RandomForestClassifier(n_estimators = 28, max_depth = 1, n_jobs=-1)
rf.fit(X_train, Y_train)
rf_pred = rf.predict(X_test)
print("Random Forest Accuracy:",metrics.accuracy_score(Y_test, rf_pred))

# 10 fold cross validation on the same features the forest is trained on
# (the original cross-validated on every column of X)
forest_cv = cross_val_score(rf, X[rf_features], Y, cv = 10, scoring='f1', n_jobs=-1)
print("Mean CV F1 Score:", np.mean(forest_cv))
print("Standard Deviation F1 Score:", np.std(forest_cv))

### SVM

In [None]:
# svm
# Trains on whatever columns X_train currently holds (set by the previous cell).
# NOTE(review): SVC is fit on unscaled features here; consider standardising
# them first -- not changed in this cell.
svm_class = svm.SVC()
svm_class = svm_class.fit(X_train, Y_train)
svm_pred = svm_class.predict(X_test)

# model accuracy (label fixed: it previously read "SVM Tree Accuracy:")
print("SVM Accuracy:",metrics.accuracy_score(Y_test, svm_pred))

# 10 fold cross validation, restricted to the columns the model was fit on so
# the CV score describes the same model as the accuracy above
svm_features = list(X_train.columns)
svm_cv = cross_val_score(svm_class, X[svm_features], Y, cv = 10, scoring='f1', n_jobs=-1, verbose=2)
print("Mean CV F1 Score:", np.mean(svm_cv))
print("Standard Deviation F1 Score:", np.std(svm_cv))

### K-Nearest-Neighbors

In [None]:
# k-nearest neighbors
# hyperparameters come from the grid search recorded (disabled) below
kneigh = KNeighborsClassifier(n_neighbors = 9, p=1, n_jobs=-1)

# set up
#features = ['reviewerID', 'vote', 'verified', 'rev_Sentiment', 'summ_Sentiment', 'rev_posSentiment', 'summ_posSentiment', 'rev_negSentiment', 'summ_negSentiment', 'image']
# remove "image" feature for now -- currently broken -ellen
# It is removed by name; the original removed position 12, which depends on
# the column order.
features = [column for column in mean_by_awe.columns if column != 'image']
X = prod_train[features]
Y = prod_train['awesomeness']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 1)

"""
# use grid search to find best hyperparameters
# results:  n_neighbors: 9, weights: uniform, p: 1
param_grid = {"n_neighbors": [x for x in range(1,11)],
              "weights": ['uniform', 'distance'],
              "p": [1, 2]}
kneigh = KNeighborsClassifier(n_jobs=-1)
gs = GridSearchCV(estimator=kneigh, param_grid=param_grid, scoring='f1', n_jobs=-1, cv=10, verbose=2)
gs.fit(X_train, Y_train)
kneigh_pred = gs.predict(X_test)
kneigh_cv = pd.DataFrame(gs.cv_results_)
"""

# train the model
kneigh.fit(X_train, Y_train)

# apply model to test set
kneigh_pred = kneigh.predict(X_test)
kneigh_pred_prob = kneigh.predict_proba(X_test)

# mean accuracy on the test set; printed like the other models' accuracy
# (it was computed but never shown)
kneigh_score = kneigh.score(X_test, Y_test)
print("K-Nearest-Neighbors Accuracy:", kneigh_score)

# 10 fold cross validation
kneigh_cv = cross_val_score(kneigh, X, Y, cv = 10, scoring='f1', n_jobs=-1)
print("Mean CV F1 Score:", np.mean(kneigh_cv))
print("Standard Deviation F1 Score:", np.std(kneigh_cv))