In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

In [2]:
#### LOAD DATASETS ####

train_data_file = "gatsby_train.csv"
test_data_file = "gatsby_test.csv"

# Read both CSV splits into data frames and report how many rows each one starts with
train_data_df = pd.read_csv(train_data_file)
test_data_df = pd.read_csv(test_data_file)
print ("Original train set: ", len(train_data_df))
print (" Original test set: ", len(test_data_df))

# Keep only the rows that carry a sentiment label
train_data_df = train_data_df.dropna(subset=["sentiment"])
test_data_df = test_data_df.dropna(subset=["sentiment"])
print ("After removing instances with no labels, train set size: ", len(train_data_df))
print (" After removing instances with no labels, test set size: ", len(test_data_df))

# Preview the first 5 training rows (shown before the empty-review filter below)
display(train_data_df.head(5))

# Keep only the rows whose processed review text is present, then report the new sizes
train_data_df = train_data_df.dropna(subset=["ProcessedReview"])
test_data_df = test_data_df.dropna(subset=["ProcessedReview"])
print ("After removing empty Review, train set size: ", len(train_data_df))
print (" After removing empty Review, test set size: ", len(test_data_df))

Original train set:  2206
 Original test set:  946
After removing instances with no labels, train set size:  2206
 After removing instances with no labels, test set size:  946


Unnamed: 0,sentiment,review,reviewLength,OriginalReview,CleanedReview,TokenizedReview,StopwordRemovedReview,StemmedReview,ProcessedReview
0,positive,Perfect Reading Of The Perfect Book. I have al...,610,perfect reading of the perfect book. i have al...,perfect reading of the perfect book i have alw...,"['perfect', 'reading', 'of', 'the', 'perfect',...","['perfect', 'reading', 'perfect', 'book', 'alw...","['perfect', 'read', 'of', 'the', 'perfect', 'b...",perfect read of the perfect book i have alway...
1,neutral,"Three Stars. Had writing into it, no pages mis...",52,"three stars. had writing into it, no pages mis...",three stars had writing into it no pages missing,"['three', 'stars', 'had', 'writing', 'into', '...","['three', 'stars', 'writing', 'pages', 'missing']","['three', 'star', 'had', 'write', 'into', 'it'...",three star had write into it no page miss
2,positive,Possibly The Best Novel Ever Written.... Sigh....,627,possibly the best novel ever written.... sigh....,possibly the best novel ever written sigh gasp...,"['possibly', 'the', 'best', 'novel', 'ever', '...","['possibly', 'best', 'novel', 'ever', 'written...","['possibl', 'the', 'best', 'novel', 'ever', 'w...",possibl the best novel ever written sigh gasp...
3,neutral,Not my Favorite. I could never get into this b...,136,not my favorite. i could never get into this b...,not my favorite i could never get into this bo...,"['not', 'my', 'favorite', 'i', 'could', 'never...","['favorite', 'could', 'never', 'get', 'book', ...","['not', 'my', 'favorit', 'i', 'could', 'never'...",not my favorit i could never get into thi boo...
4,positive,Like 'Rating' one of the Gospels. I think I fi...,651,like 'rating' one of the gospels. i think i fi...,like rating one of the gospels i think i first...,"['like', 'rating', 'one', 'of', 'the', 'gospel...","['like', 'rating', 'one', 'gospels', 'think', ...","['like', 'rate', 'one', 'of', 'the', 'gospel',...",like rate one of the gospel i think i first r...


After removing empty Review, train set size:  2206
 After removing empty Review, test set size:  945


In [3]:
# Use the processed reviews for model building: labels come from the
# "sentiment" column, text from the "ProcessedReview" column.
y_train = train_data_df["sentiment"]
y_test = test_data_df["sentiment"]

train_text = train_data_df["ProcessedReview"]
test_text = test_data_df["ProcessedReview"]

# Set the n-gram range (unigrams only). This must be a tuple: scikit-learn's
# parameter validation (1.2+) rejects a list such as [1, 1] when fitting.
vectorizer = CountVectorizer(ngram_range=(1, 1))

# Create the training data representation; the vocabulary is learned from the
# training text only.
train_data_cv = vectorizer.fit_transform(train_text)

# Observe the size of the learned vocabulary and its first 100 (term, column index) entries
print(len(vectorizer.vocabulary_), " ... ", list(vectorizer.vocabulary_.items())[0:100],"\n")

print(train_data_cv.shape,"\n")

# Create the test data representation with the vocabulary fitted above
# (transform only, no re-fitting on the test text).
test_data_cv = vectorizer.transform(test_text)
print(test_data_cv.shape,"\n")

6920  ...  [('perfect', 4493), ('read', 4912), ('of', 4248), ('the', 6097), ('book', 825), ('have', 2934), ('alway', 363), ('love', 3681), ('fsf', 2609), ('write', 6861), ('hi', 2994), ('prose', 4789), ('is', 3369), ('beauti', 668), ('poetic', 4610), ('thi', 6118), ('best', 720), ('work', 6833), ('by', 959), ('far', 2345), ('come', 1240), ('aliv', 336), ('in', 3185), ('alexand', 330), ('scourbi', 5320), ('listen', 3626), ('to', 6189), ('record', 4960), ('probabl', 4740), ('10', 6), ('time', 6173), ('over', 4344), ('last', 3521), ('few', 2410), ('year', 6888), ('and', 408), ('never', 4119), ('tire', 6180), ('stori', 5815), ('or', 4296), ('way', 6700), ('it', 3379), ('also', 357), ('version', 6603), ('tim', 6172), ('robbin', 5165), ('he', 2940), ('doesn', 1847), ('do', 1841), ('justic', 3443), ('portray', 4647), ('gatsbi', 2674), ('as', 511), ('kind', 3470), ('dour', 1873), ('fellow', 2392), ('view', 6622), ('ultim', 6354), ('romant', 5178), ('that', 6096), ('how', 3068), ('can', 979), (

In [4]:
# Use chi-squared statistics to select the best 5000 unigram features
from sklearn.feature_selection import SelectKBest, chi2

selector = SelectKBest(score_func=chi2, k=5000)

# Fit the selector on the training matrix and its labels, keeping the selected columns
X_train_features_filtered_kbest = selector.fit_transform(train_data_cv, y_train)
for caption, matrix in (
    ("Train feature space before filtering: ", train_data_cv),
    (" Train feature space after filtering: ", X_train_features_filtered_kbest),
):
    print (caption, matrix.shape)

# Apply the already-fitted selector to the test matrix (no re-fitting)
X_test_features_filtered_kbest = selector.transform(test_data_cv)
for caption, matrix in (
    (" Test feature space before filtering: ", test_data_cv),
    ("  Test feature space after filtering: ", X_test_features_filtered_kbest),
):
    print (caption, matrix.shape)

Train feature space before filtering:  (2206, 6920)
 Train feature space after filtering:  (2206, 5000)
 Test feature space before filtering:  (945, 6920)
  Test feature space after filtering:  (945, 5000)


In [5]:
# Combine the filtered train and test feature matrices into a single dataset
# (train rows first, then test rows).
from scipy.sparse import vstack
X = vstack((X_train_features_filtered_kbest, X_test_features_filtered_kbest))

# Combine the target data for train and test, in the same row order as X.
# The labels are read from the source data frames rather than from
# y_train / y_test: the train_test_split cell that follows rebinds those two
# names to the re-split labels, so re-running this cell afterwards would pair
# shuffled labels with unshuffled rows without raising any error.
y = pd.concat([train_data_df["sentiment"], test_data_df["sentiment"]])

# Guard against feature rows and labels drifting out of alignment.
assert X.shape[0] == len(y), "feature rows and labels are misaligned"

# NOTE(review): the vectorizer vocabulary and the chi-squared selector were
# fitted on the original training rows (with their labels). Pooling and
# re-splitting means some of those rows land in the new test split, so the
# reported scores may be optimistic. Consider splitting the raw text first and
# fitting the vectorizer/selector inside the model pipeline.

In [6]:
from sklearn.model_selection import train_test_split

# Re-split the pooled samples: 70% of the rows go to training, the rest to test.
# The fixed random_state keeps the shuffle repeatable between runs.
# Note that this rebinds y_train / y_test, which earlier cells also assign.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.70,
    random_state=57,
)

In [7]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2,
# so importing it unconditionally makes this cell fail on current releases.
# Nothing in this cell uses it; keep the name when it exists and otherwise
# leave a placeholder (ConfusionMatrixDisplay is the replacement API).
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

# Multinomial Naive Bayes baseline, wrapped in a single-step pipeline.
step1 = MultinomialNB()

pipe = Pipeline([
    ('step1', step1)
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

# Headline metrics; precision, recall and F1 use average='weighted'.
print('\033[1m' + 'Overall scores NB' + '\033[0m')
print(" Accuracy score:", accuracy_score(y_test, y_pred))
print("Precision Score:", precision_score(y_test, y_pred, average='weighted'))
print("   Recall score:", recall_score(y_test, y_pred, average='weighted'))
print("       F1 score:", f1_score(y_test, y_pred, average='weighted'))

# Per-label precision / recall / F1 breakdown.
print('\033[1m' + '\nIndividual label performance ' + '\033[0m')
print (classification_report(y_test, y_pred))

[1mOverall scores NB[0m
 Accuracy score: 0.8446088794926004
Precision Score: 0.8167261946288339
   Recall score: 0.8446088794926004
       F1 score: 0.811114338014781
[1m
Individual label performance [0m
              precision    recall  f1-score   support

    negative       0.66      0.51      0.58       105
     neutral       0.57      0.09      0.16        87
    positive       0.87      0.98      0.92       754

    accuracy                           0.84       946
   macro avg       0.70      0.53      0.55       946
weighted avg       0.82      0.84      0.81       946



In [33]:
# Linear SVM (C=0.1, fixed seed), wrapped in a single-step pipeline.
step1 = LinearSVC(C=0.1, random_state=17)
pipe = Pipeline(steps=[('step1', step1)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# ANSI escape codes that switch bold text on and off in the cell output.
bold_on, bold_off = '\033[1m', '\033[0m'

# Headline metrics; precision, recall and F1 use average='weighted'.
print(f"{bold_on}Overall scores SVM{bold_off}")
print(" Accuracy score:", accuracy_score(y_test, y_pred))
for label, scorer in (
    ("Precision Score:", precision_score),
    ("   Recall score:", recall_score),
    ("       F1 score:", f1_score),
):
    print(label, scorer(y_test, y_pred, average='weighted'))

# Per-label precision / recall / F1 breakdown.
print(f"{bold_on}\nIndividual label performance SVC Model{bold_off}")
print (classification_report(y_test, y_pred))

[1mOverall scores SVM[0m
 Accuracy score: 0.8414376321353065
Precision Score: 0.8235522411211019
   Recall score: 0.8414376321353065
       F1 score: 0.830331261998578
[1m
Individual label performance SVC Model[0m
              precision    recall  f1-score   support

    negative       0.66      0.54      0.59       105
     neutral       0.41      0.29      0.34        87
    positive       0.89      0.95      0.92       754

    accuracy                           0.84       946
   macro avg       0.65      0.59      0.62       946
weighted avg       0.82      0.84      0.83       946



In [9]:
# Decision tree (max depth 43, fixed seed), wrapped in a single-step pipeline.
step1 = DecisionTreeClassifier(max_depth=43, random_state=17)
pipe = Pipeline(steps=[('step1', step1)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# ANSI escape codes that switch bold text on and off in the cell output.
bold_on, bold_off = '\033[1m', '\033[0m'

# Headline metrics; precision, recall and F1 use average='weighted'.
print(f"{bold_on}Overall scores{bold_off}")
print(" Accuracy score:", accuracy_score(y_test, y_pred))
for label, scorer in (
    ("Precision Score:", precision_score),
    ("   Recall score:", recall_score),
    ("       F1 score:", f1_score),
):
    print(label, scorer(y_test, y_pred, average='weighted'))

# Per-label precision / recall / F1 breakdown.
print(f"{bold_on}\nIndividual label performance {bold_off}")
print (classification_report(y_test, y_pred))

[1mOverall scores[0m
 Accuracy score: 0.7928118393234672
Precision Score: 0.768579756646761
   Recall score: 0.7928118393234672
       F1 score: 0.7778274090697043
[1m
Individual label performance [0m
              precision    recall  f1-score   support

    negative       0.55      0.39      0.46       105
     neutral       0.29      0.21      0.24        87
    positive       0.86      0.92      0.88       754

    accuracy                           0.79       946
   macro avg       0.56      0.50      0.53       946
weighted avg       0.77      0.79      0.78       946



In [22]:
# AdaBoost (50 estimators, learning rate 0.99, fixed seed), wrapped in a single-step pipeline.
step1 = AdaBoostClassifier(n_estimators=50, learning_rate=.99, random_state=17)
pipe = Pipeline(steps=[('step1', step1)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# ANSI escape codes that switch bold text on and off in the cell output.
bold_on, bold_off = '\033[1m', '\033[0m'

# Headline metrics; precision, recall and F1 use average='weighted'.
print(f"{bold_on}Overall scores{bold_off}")
print(" Accuracy score:", accuracy_score(y_test, y_pred))
for label, scorer in (
    ("Precision Score:", precision_score),
    ("   Recall score:", recall_score),
    ("       F1 score:", f1_score),
):
    print(label, scorer(y_test, y_pred, average='weighted'))

# Per-label precision / recall / F1 breakdown.
print(f"{bold_on}\nIndividual label performance {bold_off}")
print (classification_report(y_test, y_pred))

[1mOverall scores[0m
 Accuracy score: 0.8276955602536998
Precision Score: 0.8025056229867586
   Recall score: 0.8276955602536998
       F1 score: 0.8044839070209432
[1m
Individual label performance [0m
              precision    recall  f1-score   support

    negative       0.74      0.38      0.50       105
     neutral       0.38      0.22      0.28        87
    positive       0.86      0.96      0.91       754

    accuracy                           0.83       946
   macro avg       0.66      0.52      0.56       946
weighted avg       0.80      0.83      0.80       946



In [25]:
# Gradient boosting (89 estimators, learning rate 0.5, squared-error split
# criterion, fixed seed), wrapped in a single-step pipeline.
step1 = GradientBoostingClassifier(
    learning_rate=.5,
    n_estimators=89,
    criterion='squared_error',
    random_state=17,
)
pipe = Pipeline(steps=[('step1', step1)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# ANSI escape codes that switch bold text on and off in the cell output.
bold_on, bold_off = '\033[1m', '\033[0m'

# Headline metrics; precision, recall and F1 use average='weighted'.
print(f"{bold_on}Overall scores{bold_off}")
print(" Accuracy score:", accuracy_score(y_test, y_pred))
for label, scorer in (
    ("Precision Score:", precision_score),
    ("   Recall score:", recall_score),
    ("       F1 score:", f1_score),
):
    print(label, scorer(y_test, y_pred, average='weighted'))

# Per-label precision / recall / F1 breakdown.
print(f"{bold_on}\nIndividual label performance {bold_off}")
print (classification_report(y_test, y_pred))

[1mOverall scores[0m
 Accuracy score: 0.8181818181818182
Precision Score: 0.7947691197691198
   Recall score: 0.8181818181818182
       F1 score: 0.8016495390552915
[1m
Individual label performance [0m
              precision    recall  f1-score   support

    negative       0.64      0.44      0.52       105
     neutral       0.40      0.25      0.31        87
    positive       0.86      0.94      0.90       754

    accuracy                           0.82       946
   macro avg       0.63      0.54      0.58       946
weighted avg       0.79      0.82      0.80       946



In [31]:
# Extra-trees ensemble (100 estimators, gini criterion, min_samples_split=3,
# fixed seed), wrapped in a single-step pipeline.
step1 = ExtraTreesClassifier(
    n_estimators=100,
    criterion='gini',
    min_samples_split=3,
    random_state=17,
)
pipe = Pipeline(steps=[('step1', step1)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# ANSI escape codes that switch bold text on and off in the cell output.
bold_on, bold_off = '\033[1m', '\033[0m'

# Headline metrics; precision, recall and F1 use average='weighted'.
print(f"{bold_on}Overall scores{bold_off}")
print(" Accuracy score:", accuracy_score(y_test, y_pred))
for label, scorer in (
    ("Precision Score:", precision_score),
    ("   Recall score:", recall_score),
    ("       F1 score:", f1_score),
):
    print(label, scorer(y_test, y_pred, average='weighted'))

# Per-label precision / recall / F1 breakdown.
print(f"{bold_on}\nIndividual label performance {bold_off}")
print (classification_report(y_test, y_pred))

[1mOverall scores[0m
 Accuracy score: 0.8329809725158562
Precision Score: 0.8280263336613469
   Recall score: 0.8329809725158562
       F1 score: 0.7902784350833737
[1m
Individual label performance [0m
              precision    recall  f1-score   support

    negative       0.81      0.25      0.38       105
     neutral       0.79      0.17      0.28        87
    positive       0.83      0.99      0.91       754

    accuracy                           0.83       946
   macro avg       0.81      0.47      0.52       946
weighted avg       0.83      0.83      0.79       946



# Summary  
  
  
<center><img src="findings/final.jpg" /></center>