# Baseline Model: Logistic Regression (clustered)


In [None]:
import numpy as np
import re
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
import os
os.getcwd()


'C:\\Users\\Brad\\Desktop\\Keras - GPU\\Baseline Models'

## Data Prep

In [None]:
# create a dataset by sampling labeled_training_set_clust.csv

#my_data = "../w266_proj/data/labeled_training_set_clust.csv"
#mine = pd.read_csv(my_data)

# Scaled annual hvar using global min-max values
#my_data = "../w266_proj/data/labeled_training_set_clust_v3.csv"
#mine = pd.read_csv(my_data)

# Class labels calculated using annual hvar values from test/dev/train
# - calculated min, max, mean, standard dev values to facilitate cluster assignment
#my_data = "../w266_proj/data/labeled_training_set_clust_v4.csv"
#mine = pd.read_csv(my_data)

# Class labels calculated using annual hvar values from test/dev
# - calculated min, max, mean, standard dev values to facilitate cluster assignment
#my_data = "../w266_proj/data/labeled_training_set_clust_v5.csv"
#mine = pd.read_csv(my_data)

train = pd.read_csv("../w266_proj/data/labeled_training_set_clust_FINAL.csv")
dev = pd.read_csv("../w266_proj/data/labeled_dev_set_clust_FINAL.csv")
test = pd.read_csv("../w266_proj/data/labeled_test_set_clust_FINAL.csv")

**Data has columns to test 3 types of labeling approaches:**
1. Jen's original approach (`most_helpful`)
2. 2-class cluster assigned labels (`class_2`)
3. 2-class cluster assigned labels (`group_z_class`)

**Approach 3 resulted in the most-favorable label assignment**

In [None]:
# Cleanup reviews without review content
#mine.iloc[94073]['reviewText'] # Example has 'nan' as reviewText
#mine.dropna(subset=['reviewText'],inplace=True)

train.dropna(subset=['reviewText'],inplace=True)
dev.dropna(subset=['reviewText'],inplace=True)
test.dropna(subset=['reviewText'],inplace=True)

In [4]:
# How many reviews have exactly 0 helpful votes?
#sum(mine.helpful_votes == 0)

print(sum(train.helpful_votes == 0))
print(sum(dev.helpful_votes == 0))
print(sum(test.helpful_votes == 0))

271443
32445
32456


In [23]:
train.group_z_class.value_counts()

0.0    2348039
1.0     311377
Name: group_z_class, dtype: int64

In [25]:
dev.group_z_class.value_counts()

0.0    281301
1.0     32779
Name: group_z_class, dtype: int64

In [55]:
test.group_z_class.value_counts()

0.0    281029
1.0     32573
Name: group_z_class, dtype: int64

## Final Model: Data Prep

In [5]:
# How many examples of each group in training set?
print('neg_helpful: {}'.format(train[(train.overall == 1) & (train.group_z_class == 1) & (train.helpful_votes != 0)].shape))
print('neg_unhelpful: {}'.format(train[(train.overall == 1) & (train.group_z_class == 0) & (train.helpful_votes == 0)].shape))
print('pos_unhelpful: {}'.format(train[(train.overall == 5) & (train.group_z_class == 0) & (train.helpful_votes == 0)].shape))
print('pos_helpful: {}'.format(train[(train.overall == 5) & (train.group_z_class == 1) & (train.helpful_votes != 0)].shape))

neg_helpful: (31348, 21)
neg_unhelpful: (13033, 21)
pos_unhelpful: (121879, 21)
pos_helpful: (158714, 21)


In [17]:
# Sample Training Set
# sample to have equal amounts of pos/neg reviews and equal amounts of top-quartile-HVAR vs 0 helpful votes

num_per_condition = 13033
repl=False

train_neg_helpful = train[(train.overall == 1) & (train.group_z_class == 1) & (train.helpful_votes != 0)].sample(num_per_condition, replace=repl, random_state=42)
train_neg_unhelpful = train[(train.overall == 1) & (train.group_z_class == 0) & (train.helpful_votes == 0)].sample(num_per_condition, replace=repl, random_state=42)
train_pos_unhelpful = train[(train.overall == 5) & (train.group_z_class == 0) & (train.helpful_votes == 0)].sample(num_per_condition, replace=repl, random_state=42)
train_pos_helpful = train[(train.overall == 5) & (train.group_z_class == 1) & (train.helpful_votes != 0)].sample(num_per_condition, replace=repl, random_state=42)

In [57]:
test.shape

(313602, 19)

In [58]:
# How many examples of each group in dev set?
print('neg_helpful: {}'.format(dev[(dev.overall == 1) & (dev.group_z_class == 1) & (dev.helpful_votes != 0)].shape))
print('neg_unhelpful: {}'.format(dev[(dev.overall == 1) & (dev.group_z_class == 0) & (dev.helpful_votes == 0)].shape))
print('pos_unhelpful: {}'.format(dev[(dev.overall == 5) & (dev.group_z_class == 0) & (dev.helpful_votes == 0)].shape))
print('pos_helpful: {}'.format(dev[(dev.overall == 5) & (dev.group_z_class == 1) & (dev.helpful_votes != 0)].shape))

neg_helpful: (3418, 21)
neg_unhelpful: (1504, 21)
pos_unhelpful: (14786, 21)
pos_helpful: (16719, 21)


In [64]:
# Sample Dev Set
# 1504 = no oversampling

num_per_condition = 1504
repl=False

dev_neg_helpful = dev[(dev.overall == 1) & (dev.group_z_class == 1) & (dev.helpful_votes != 0)].sample(num_per_condition, replace=repl)
dev_neg_unhelpful = dev[(dev.overall == 1) & (dev.group_z_class == 0) & (dev.helpful_votes == 0)].sample(num_per_condition, replace=repl)
dev_pos_unhelpful = dev[(dev.overall == 5) & (dev.group_z_class == 0) & (dev.helpful_votes == 0)].sample(num_per_condition, replace=repl)
dev_pos_helpful = dev[(dev.overall == 5) & (dev.group_z_class == 1) & (dev.helpful_votes != 0)].sample(num_per_condition, replace=repl)

In [60]:
# How many examples of each group in test set?
print('neg_helpful: {}'.format(test[(test.overall == 1) & (test.group_z_class == 1) & (test.helpful_votes != 0)].shape))
print('neg_unhelpful: {}'.format(test[(test.overall == 1) & (test.group_z_class == 0) & (test.helpful_votes == 0)].shape))
print('pos_unhelpful: {}'.format(test[(test.overall == 5) & (test.group_z_class == 0) & (test.helpful_votes == 0)].shape))
print('pos_helpful: {}'.format(test[(test.overall == 5) & (test.group_z_class == 1) & (test.helpful_votes != 0)].shape))

neg_helpful: (3493, 19)
neg_unhelpful: (1533, 19)
pos_unhelpful: (14569, 19)
pos_helpful: (16272, 19)


In [65]:
# Sample Test Set
# 1533 = no oversampling

num_per_condition = 1533
repl=False

test_neg_helpful = test[(test.overall == 1) & (test.group_z_class == 1) & (test.helpful_votes != 0)].sample(num_per_condition, replace=repl)
test_neg_unhelpful = test[(test.overall == 1) & (test.group_z_class == 0) & (test.helpful_votes == 0)].sample(num_per_condition, replace=repl)
test_pos_unhelpful = test[(test.overall == 5) & (test.group_z_class == 0) & (test.helpful_votes == 0)].sample(num_per_condition, replace=repl)
test_pos_helpful = test[(test.overall == 5) & (test.group_z_class == 1) & (test.helpful_votes != 0)].sample(num_per_condition, replace=repl)

In [18]:
# Prepend training set

train_neg_helpful['prepReviewText'] = train_neg_helpful.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)
train_neg_unhelpful['prepReviewText'] = train_neg_unhelpful.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)
train_pos_unhelpful['prepReviewText'] = train_pos_unhelpful.apply(lambda x: 'BEST ' + x.reviewText,axis = 1)
train_pos_helpful['prepReviewText'] = train_pos_helpful.apply(lambda x: 'BEST ' + x.reviewText,axis = 1)

In [67]:
# Prepend dev set

dev_neg_helpful['prepReviewText'] = dev_neg_helpful.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)
dev_neg_unhelpful['prepReviewText'] = dev_neg_unhelpful.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)
dev_pos_unhelpful['prepReviewText'] = dev_pos_unhelpful.apply(lambda x: 'BEST ' + x.reviewText,axis = 1)
dev_pos_helpful['prepReviewText'] = dev_pos_helpful.apply(lambda x: 'BEST ' + x.reviewText,axis = 1)

In [68]:
# Prepend test set

test_neg_helpful['prepReviewText'] = test_neg_helpful.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)
test_neg_unhelpful['prepReviewText'] = test_neg_unhelpful.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)
test_pos_unhelpful['prepReviewText'] = test_pos_unhelpful.apply(lambda x: 'BEST ' + x.reviewText,axis = 1)
test_pos_helpful['prepReviewText'] = test_pos_helpful.apply(lambda x: 'BEST ' + x.reviewText,axis = 1)

In [19]:
# Assemble training set

stratdf_train = train_neg_helpful.append(train_neg_unhelpful, ignore_index=True)
stratdf_train = stratdf_train.append(train_pos_unhelpful, ignore_index=True)
stratdf_train = stratdf_train.append(train_pos_helpful, ignore_index=True)
print(f"Our training dataset is now {stratdf_train.shape[0]} reviews.")

Our training dataset is now 52132 reviews.


In [70]:
# Assemble dev set

stratdf_dev = dev_neg_helpful.append(dev_neg_unhelpful, ignore_index=True)
stratdf_dev = stratdf_dev.append(dev_pos_unhelpful, ignore_index=True)
stratdf_dev = stratdf_dev.append(dev_pos_helpful, ignore_index=True)
print(f"Our development dataset is now {stratdf_dev.shape[0]} reviews.")

Our development dataset is now 6016 reviews.


In [72]:
# Assemble test set

stratdf_test = test_neg_helpful.append(test_neg_unhelpful, ignore_index=True)
stratdf_test = stratdf_test.append(test_pos_unhelpful, ignore_index=True)
stratdf_test = stratdf_test.append(test_pos_helpful, ignore_index=True)
print(f"Our test dataset is now {stratdf_test.shape[0]} reviews.")

Our test dataset is now 6132 reviews.


In [22]:
df_train_prep = shuffle(stratdf_train,random_state=42)[['asin','prepReviewText','group_z_class']]
df_dev_prep = shuffle(stratdf_dev,random_state=42)[['prepReviewText','group_z_class']]
df_test_prep = shuffle(stratdf_test,random_state=42)[['prepReviewText','group_z_class']]

NameError: name 'stratdf_dev' is not defined

In [24]:
group_asin_list = list(df_train_prep['asin'].unique())

import pickle

with open('group_asin_list.pkl', 'wb') as f:
    pickle.dump(group_asin_list, f)

In [23]:
df_train_prep.to_csv('train_prep_clust_FINAL.csv')
df_dev_prep.to_csv('dev_prep_clust_FINAL.csv')
df_test_prep.to_csv('test_prep_clust_FINAL.csv')

In [15]:
# skip the above steps if restarting kernel/notebook and just load the data

df_train_prep = pd.read_csv('../w266_proj/data/train_clust_FINAL.csv')
df_dev_prep = pd.read_csv('../w266_proj/data/dev_clust_FINAL.csv')
df_test_prep = pd.read_csv('../w266_proj/data/test_clust_FINAL.csv')

In [16]:
df_train_prep.head()

Unnamed: 0.1,Unnamed: 0,prepReviewText,group_z_class
0,11245,WORST This is my least favorite book on breast...,1.0
1,18683,WORST This is a depressing book. I like upli...,0.0
2,43825,BEST One of my favorite television programs of...,1.0
3,9253,"WORST Aryan Nation, torture, suicide, rape, de...",1.0
4,1165,WORST Chelsea is touted as one of the top come...,1.0


In [4]:
X_train_prep = df_train_prep['prepReviewText']
X_dev_prep = df_dev_prep['prepReviewText']
X_test_prep = df_test_prep['prepReviewText']

y_train_prep = df_train_prep['group_z_class']
y_dev_prep = df_dev_prep['group_z_class']
y_test_prep = df_test_prep['group_z_class']

In [5]:
# Transform text examples

vectorizer = TfidfVectorizer(lowercase=True,
                             #tokenizer=Tokenizer,
                             analyzer='word',
                             stop_words=None,
                             token_pattern=r"(?u)\b\w\w+\b|!|\?|\"|\'|\*|\-|\;|\:|\,|\.",
                             ngram_range=(1,2),
                             max_features=None)

X_train_prep_vec = vectorizer.fit_transform(X_train_prep)
X_dev_prep_vec = vectorizer.transform(X_dev_prep)
X_test_prep_vec = vectorizer.transform(X_test_prep)

print("Token Count: {}".format(len(vectorizer.get_feature_names())))

Token Count: 2110756


In [6]:
# Logistic Regression Classifier

clf = LogisticRegression(penalty='l2',
                         C=1.0,
                         random_state=42,
                         solver='saga',
                         multi_class='ovr',
                         max_iter=100,
                         n_jobs=-1,
                         verbose=True)

In [7]:
# Fit model

clf.fit(X_train_prep_vec, y_train_prep)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 22 epochs took 10 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   10.0s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=-1, penalty='l2', random_state=42,
                   solver='saga', tol=0.0001, verbose=True, warm_start=False)

In [10]:
dev_predicted_labels = clf.predict(X_dev_prep_vec)
f1_weighted = metrics.f1_score(y_dev_prep, dev_predicted_labels, average='weighted')
accuracy = metrics.accuracy_score(y_dev_prep, dev_predicted_labels)
    
print('Logistic Regression Classifer - 2 Class (cluster) Labels FINAL')
print('-------------\n')
print('Accuracy on test set: {:0.4f}'.format(accuracy))
print('f_1 score (Weighted): {:0.4f}'.format(f1_weighted))

Logistic Regression Classifer - 2 Class (cluster) Labels FINAL
-------------

Accuracy on test set: 0.7738
f_1 score (Weighted): 0.7737


In [9]:
target_names = ['not_helpful', 'helpful']

print(metrics.classification_report(y_dev_prep, dev_predicted_labels, target_names=target_names, digits=4))

              precision    recall  f1-score   support

 not_helpful     0.7858    0.7527    0.7689      3008
     helpful     0.7627    0.7949    0.7784      3008

    accuracy                         0.7738      6016
   macro avg     0.7743    0.7738    0.7737      6016
weighted avg     0.7743    0.7738    0.7737      6016



In [11]:
test_predicted_labels = clf.predict(X_test_prep_vec)
f1_weighted = metrics.f1_score(y_test_prep, test_predicted_labels, average='weighted')
accuracy = metrics.accuracy_score(y_test_prep, test_predicted_labels)
    
print('Logistic Regression Classifer - 2 Class (cluster) Labels FINAL (TEST SET)')
print('-------------\n')
print('Accuracy on test set: {:0.4f}'.format(accuracy))
print('f_1 score (Weighted): {:0.4f}'.format(f1_weighted))

Logistic Regression Classifer - 2 Class (cluster) Labels FINAL (TEST SET)
-------------

Accuracy on test set: 0.7652
f_1 score (Weighted): 0.7651


## FINAL MODEL RESULTS ON TEST SET

In [12]:
target_names = ['not_helpful', 'helpful']

print(metrics.classification_report(y_test_prep, test_predicted_labels, target_names=target_names, digits=4))

              precision    recall  f1-score   support

 not_helpful     0.7773    0.7433    0.7599      3066
     helpful     0.7541    0.7870    0.7702      3066

    accuracy                         0.7652      6132
   macro avg     0.7657    0.7652    0.7651      6132
weighted avg     0.7657    0.7652    0.7651      6132



## EXPERIMENTATION/TESTING CODE BLOCKS BELOW


## 2-Class Testing With Groups

Jen's buckets using random sampling to form 4 balanced groups across 2 classes

1. Oversampling all groups produces odd results (equal precision, recall, and f_1 scores across the board)
2. Oversampling underrepresented group might improve results
3. Balanced groups (11,889) produces best results (~77%)
4. Random sampling the 2 classes underperforms using groups (~65.8%)

In [119]:
# First check group membership based on the original outcome variable `most_helpful`
# How many examples of each group?
print('orig_neg_helpful: {}'.format(mine[(mine.overall == 1) & (mine.most_helpful == 1) & (mine.helpful_votes != 0)].shape))
print('orig_neg_unhelpful: {}'.format(mine[(mine.overall == 1) & (mine.most_helpful == 0) & (mine.helpful_votes == 0)].shape))
print('orig_pos_unhelpful: {}'.format(mine[(mine.overall == 5) & (mine.most_helpful == 0) & (mine.helpful_votes == 0)].shape))
print('orig_pos_helpful: {}'.format(mine[(mine.overall == 5) & (mine.most_helpful == 1) & (mine.helpful_votes != 0)].shape))

orig_neg_helpful: (81152, 21)
orig_neg_unhelpful: (13033, 21)
orig_pos_unhelpful: (121879, 21)
orig_pos_helpful: (316234, 21)


In [120]:
# Next, check group membership based on clustering on scaled annual_HVAR values
# How many examples of each group?
print('cl2_neg_helpful: {}'.format(mine[(mine.overall == 1) & (mine.group_z_class == 1) & (mine.helpful_votes != 0)].shape))
print('cl2_neg_unhelpful: {}'.format(mine[(mine.overall == 1) & (mine.group_z_class == 0) & (mine.helpful_votes == 0)].shape))
print('cl2_pos_unhelpful: {}'.format(mine[(mine.overall == 5) & (mine.group_z_class == 0) & (mine.helpful_votes == 0)].shape))
print('cl2_pos_helpful: {}'.format(mine[(mine.overall == 5) & (mine.group_z_class == 1) & (mine.helpful_votes != 0)].shape))

cl2_neg_helpful: (33416, 21)
cl2_neg_unhelpful: (13033, 21)
cl2_pos_unhelpful: (121879, 21)
cl2_pos_helpful: (167093, 21)


**Original and new (cluster) labeling scheme have same quantity of membership for unhelpful reviews**
This makes sense. The clustering approach changes the criteria for inclusion in a helpful category (it makes it harder to get labeled as helpful). This means that the extreme (1-star and 5-star) unhelpful cases should still be the same, but extreme helpful cases will be a smaller set.

**Maximum balanced group size old labels (sampling without replacement) = 11,889**
**Maximum balanced group size 2-class (sampling without replacement) = 11,889**

In [15]:
# Prepare groups based on original labels
# Below I sample to have equal amounts of pos/neg reviews and equal amounts of top-quartile-HVAR vs 0 helpful votes

num_per_condition = 13033
repl=False
orig_neg_helpful = mine[(mine.overall == 1) & (mine.most_helpful == 1) & (mine.helpful_votes != 0)].sample(num_per_condition, replace=repl, random_state=42)
orig_neg_unhelpful = mine[(mine.overall == 1) & (mine.most_helpful == 0) & (mine.helpful_votes == 0)].sample(num_per_condition, replace=repl, random_state=42)
orig_pos_unhelpful = mine[(mine.overall == 5) & (mine.most_helpful == 0) & (mine.helpful_votes == 0)].sample(num_per_condition, replace=repl, random_state=42)
orig_pos_helpful = mine[(mine.overall == 5) & (mine.most_helpful == 1) & (mine.helpful_votes != 0)].sample(num_per_condition, replace=repl, random_state=42)

In [99]:
# Prepare groups based on new 2-class labels

num_per_condition = 13033
repl=False
cl2_neg_helpful = mine[(mine.overall == 1) & (mine.group_z_class == 1) & (mine.helpful_votes != 0)].sample(num_per_condition, replace=repl, random_state=42)
cl2_neg_unhelpful = mine[(mine.overall == 1) & (mine.group_z_class == 0) & (mine.helpful_votes == 0)].sample(num_per_condition, replace=True, random_state=42)
cl2_pos_unhelpful = mine[(mine.overall == 5) & (mine.group_z_class == 0) & (mine.helpful_votes == 0)].sample(num_per_condition, replace=repl, random_state=42)
cl2_pos_helpful = mine[(mine.overall == 5) & (mine.group_z_class == 1) & (mine.helpful_votes != 0)].sample(num_per_condition, replace=repl, random_state=42)
# "reviewText" has the review content
# "most_helpful" has the label of 0 or 1
# "overall" has the star-rating {1,2,3,4,5}

In [249]:
# Prepare groups based on random sample from 2-classes

#num_per_condition = 300000
#repl=False

#cl2_0 = mine[mine.class_2 == 0].sample(num_per_condition, replace=repl, random_state=42)
#cl2_1 = mine[mine.class_2 == 1].sample(num_per_condition, replace=repl, random_state=42)

In [250]:
#cl2_0['prepReviewText'] = cl2_0.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)
#cl2_1['prepReviewText'] = cl2_1.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)


In [17]:
# Experiment with prepending TEXT representation of starts to the reviews, 
# as a way to pass overall rating to our classifier
# because haven't figured out how to send categorical data AROUND the transformer yet
orig_neg_helpful['prepReviewText'] = orig_neg_helpful.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)
orig_neg_unhelpful['prepReviewText'] = orig_neg_unhelpful.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)
orig_pos_unhelpful['prepReviewText'] = orig_pos_unhelpful.apply(lambda x: 'BEST ' + x.reviewText,axis = 1)
orig_pos_helpful['prepReviewText'] = orig_pos_helpful.apply(lambda x: 'BEST ' + x.reviewText,axis = 1)

In [100]:
# Experiment with prepending TEXT representation of starts to the reviews, 
# as a way to pass overall rating to our classifier
# because haven't figured out how to send categorical data AROUND the transformer yet
cl2_neg_helpful['prepReviewText'] = cl2_neg_helpful.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)
cl2_neg_unhelpful['prepReviewText'] = cl2_neg_unhelpful.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)
cl2_pos_unhelpful['prepReviewText'] = cl2_pos_unhelpful.apply(lambda x: 'BEST ' + x.reviewText,axis = 1)
cl2_pos_helpful['prepReviewText'] = cl2_pos_helpful.apply(lambda x: 'BEST ' + x.reviewText,axis = 1)

In [251]:
# Put the subsets into the same dataframe again
#stratdf_cl2_rand = cl2_0.append(cl2_1, ignore_index=True)
#print(f"Our dataset with 2-class (random) labels is now {stratdf_cl2_rand.shape[0]} reviews.")

Our dataset with 2-class (random) labels is now 600000 reviews.


In [19]:
# Put the subsets into the same dataframe again
stratdf_orig = orig_neg_helpful.append(orig_neg_unhelpful, ignore_index=True)
stratdf_orig = stratdf_orig.append(orig_pos_unhelpful, ignore_index=True)
stratdf_orig = stratdf_orig.append(orig_pos_helpful, ignore_index=True)
print(f"Our dataset with original labels is now {stratdf_orig.shape[0]} reviews.")

Our dataset with original labels is now 52132 reviews.


In [101]:
# Put the subsets into the same dataframe again
stratdf_cl2 = cl2_neg_helpful.append(cl2_neg_unhelpful, ignore_index=True)
stratdf_cl2 = stratdf_cl2.append(cl2_pos_unhelpful, ignore_index=True)
stratdf_cl2 = stratdf_cl2.append(cl2_pos_helpful, ignore_index=True)
print(f"Our dataset with 2-class labels is now {stratdf_cl2.shape[0]} reviews.")

Our dataset with 2-class labels is now 52132 reviews.


In [21]:
# Balances classes
stratdf_orig['most_helpful'].value_counts()

1    26066
0    26066
Name: most_helpful, dtype: int64

In [80]:
# Balances classes
stratdf_cl2['class_2'].value_counts()

0.0    30937
1.0    21195
Name: class_2, dtype: int64

In [102]:
# Balances classes
stratdf_cl2['group_z_class'].value_counts()

0.0    26066
1.0    26066
Name: group_z_class, dtype: int64

In [75]:
stratdf_orig.head()

Unnamed: 0,asin,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,helpful_votes,review_age_days,annual_HVAR,book_num_reviews,std_HVAR,top_quartile_HVAR,min_max,most_helpful,scaled,class_2,class_3,prepReviewText
0,812885147,1,Many of the more sensational items in this boo...,2005-08-29,A6GMEO3VRY51S,microjoe,Not a balanced view,1125273600,29,3250,3.256923,10,1.069925,0.276051,"(0.0, 3.256923076923077)",1,1.0,1.0,2.0,WORST Many of the more sensational items in th...
1,1608194809,1,Save your money; here are the only observation...,2012-01-01,A298FZ0IH1Y09R,"Roberta Proctor ""Roberta""",A Little Too Secretive,1325376000,68,934,26.573876,20,10.879657,9.667526,"(0.8039647577092511, 36.94891201513718)",1,0.71296,1.0,2.0,WORST Save your money; here are the only obser...
2,312578075,1,Most authors get better with time and experien...,2012-10-13,A3NORUUG9HYUQY,SSB,Just Wretched! Big Yawn!,1350086400,8,648,4.506173,11,1.650425,3.210855,"(0.0, 4.506172839506172)",1,1.0,1.0,2.0,WORST Most authors get better with time and ex...
3,385301898,1,"Because of good marketing, Lauurie Cabot is pr...",2008-03-28,A26TY6GCGBF5HT,Graves,badly flawed and egotistical.,1206662400,8,2308,1.265165,20,0.723741,0.761366,"(0.08130986856760972, 3.0692108667529108)",1,0.396216,0.0,1.0,"WORST Because of good marketing, Lauurie Cabot..."
4,442273118,1,This book disappoints. The idea of an undergr...,2012-03-10,A2MMTR804CCXYF,Zaur,Title attractive but content disappointing.,1331337600,28,865,11.815029,12,23.419022,3.75726,"(0.0, 82.80399274047187)",1,0.142687,0.0,0.0,WORST This book disappoints. The idea of an u...


In [23]:
# Original
df_orig = shuffle(stratdf_orig,random_state=42)[['prepReviewText','overall','most_helpful']]

In [103]:
# 2-class
df_cl2 = shuffle(stratdf_cl2,random_state=42)[['prepReviewText','overall','group_z_class']]
df_cl2.to_csv('df_cl2_v5.csv')

In [329]:
# 2-class random (no groups)
#df_cl2_rand = shuffle(stratdf_cl2_rand,random_state=42)[['prepReviewText','overall','class_2']]

In [25]:
# Original Labels
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(df_orig.prepReviewText,df_orig.most_helpful, test_size=0.2, \
                                    random_state=42,stratify=df_orig.most_helpful)


In [104]:
# 2-class labels
X_train_cl2, X_test_cl2, y_train_cl2, y_test_cl2 = train_test_split(df_cl2.prepReviewText,df_cl2.group_z_class, test_size=0.2, \
                                   random_state=42,stratify=df_cl2.group_z_class)
# Ideally I would like to stratify such that train and test have stratified samples across
# BOTH the most_helpful values AND the overall rating, but I keep getting errors when I try to do that

In [253]:
# 2-class labels (random)
#X_train_cl2_rand, X_test_cl2_rand, y_train_cl2_rand, y_test_cl2_rand = train_test_split(df_cl2_rand.prepReviewText,df_cl2_rand.class_2, test_size=0.2, \
                                   #random_state=42,stratify=df_cl2_rand.class_2)
# Ideally I would like to stratify such that train and test have stratified samples across
# BOTH the most_helpful values AND the overall rating, but I keep getting errors when I try to do that




## Data Preprocessing

In [105]:
# Transform text examples

vectorizer = TfidfVectorizer(lowercase=True,
                             #tokenizer=Tokenizer,
                             analyzer='word',
                             stop_words=None,
                             token_pattern=r"(?u)\b\w\w+\b|!|\?|\"|\'|\*|\-|\;|\:|\,|\.",
                             ngram_range=(1,2),
                             max_features=None)

#X_train_orig_vec = vectorizer.fit_transform(X_train_orig)
#X_test_orig_vec = vectorizer.transform(X_test_orig)

#print("Token Count - Original: {}".format(len(vectorizer.get_feature_names())))

X_train_cl2_vec = vectorizer.fit_transform(X_train_cl2)
X_test_cl2_vec = vectorizer.transform(X_test_cl2)

print("Token Count - 2 class: {}".format(len(vectorizer.get_feature_names())))



#X_train_cl2_rand_vec = vectorizer.fit_transform(X_train_cl2_rand)
#X_test_cl2_rand_vec = vectorizer.transform(X_test_cl2_rand)

#print("Token Count - 2 class: {}".format(len(vectorizer.get_feature_names())))

Token Count - 2 class: 1754541


## Logistic Regression - Original Labels

In [28]:
clf = LogisticRegression(penalty='l2',
                         C=1.0,
                         random_state=42,
                         solver='saga',
                         multi_class='ovr',
                         max_iter=100,
                         n_jobs=-1,
                         verbose=True)



clf.fit(X_train_orig_vec, y_train_orig)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 21 epochs took 6 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.8s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=-1, penalty='l2', random_state=42,
                   solver='saga', tol=0.0001, verbose=True, warm_start=False)

In [29]:
test_predicted_labels_orig = clf.predict(X_test_orig_vec)


In [30]:
# Evaluate with various f1 metrics
f1_weighted = metrics.f1_score(y_test_orig, test_predicted_labels_orig, average='weighted')
accuracy = metrics.accuracy_score(y_test_orig, test_predicted_labels_orig)
    
print('Logistic Regression Classifer - Original Labels')
print('-------------\n')
print('Accuracy on test set: {:0.3f}'.format(accuracy))
print('f_1 score (Weighted): {:0.3f}'.format(f1_weighted))


Logistic Regression Classifer - Original Labels
-------------

Accuracy on test set: 0.731
f_1 score (Weighted): 0.731


In [31]:
target_names = ['not_helpful', 'helpful']

print(metrics.classification_report(y_test_orig, test_predicted_labels_orig, target_names=target_names))

              precision    recall  f1-score   support

 not_helpful       0.74      0.71      0.73      5214
     helpful       0.72      0.75      0.74      5213

    accuracy                           0.73     10427
   macro avg       0.73      0.73      0.73     10427
weighted avg       0.73      0.73      0.73     10427



## Logistic Regression - 2 Class Labels

In [106]:
clf = LogisticRegression(penalty='l2',
                         C=1.0,
                         random_state=42,
                         solver='saga',
                         multi_class='ovr',
                         max_iter=100,
                         n_jobs=-1,
                         verbose=True)

In [107]:
clf.fit(X_train_cl2_vec, y_train_cl2)
test_predicted_labels_cl2 = clf.predict(X_test_cl2_vec)
f1_weighted = metrics.f1_score(y_test_cl2, test_predicted_labels_cl2, average='weighted')
accuracy = metrics.accuracy_score(y_test_cl2, test_predicted_labels_cl2)
    
print('Logistic Regression Classifer - 2 Class (cluster) Labels v5')
print('-------------\n')
print('Accuracy on test set: {:0.3f}'.format(accuracy))
print('f_1 score (Weighted): {:0.3f}'.format(f1_weighted))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 21 epochs took 7 seconds
Logistic Regression Classifer - 2 Class (cluster) Labels v5
-------------

Accuracy on test set: 0.786
f_1 score (Weighted): 0.786


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    6.8s finished


In [108]:
print(metrics.confusion_matrix(y_test_cl2, test_predicted_labels_cl2))
tn, fp, fn, tp = metrics.confusion_matrix(y_test_cl2, test_predicted_labels_cl2).ravel()
print("True Negative: {}".format(tn))
print("True Positive: {}".format(tp))
print("False Negative: {}".format(fn))
print("False Positive: {}".format(fp))

[[4045 1169]
 [1063 4150]]
True Negative: 4045
True Positive: 4150
False Negative: 1063
False Positive: 1169


In [109]:


target_names = ['not_helpful', 'helpful']

print(metrics.classification_report(y_test_cl2, test_predicted_labels_cl2, target_names=target_names))

              precision    recall  f1-score   support

 not_helpful       0.79      0.78      0.78      5214
     helpful       0.78      0.80      0.79      5213

    accuracy                           0.79     10427
   macro avg       0.79      0.79      0.79     10427
weighted avg       0.79      0.79      0.79     10427



In [36]:
# Reference Testing

# Here I have a few "canary in the coal mine" examples
# If the model can't get these right, I put very little faith in it

# This is 1* without sharing why
vacuous_negative = "WORST I like to read many different books from many points of view.  This book was a complete waste of time that I will never get back. Save yourself the trouble and money."

# This is 5* but worthless
vacuous_positive = "BEST My husband has really loved this series and I have heard the same comments from many others who have also read this series."

# This looks like someone who didn't read the book was paid to write a long 5* review
# it sounds like a completely generic description of any cookbook ("it provides recipes to prepare foods...")
# Perhaps mentions of many concepts (ingredients, gourmet, etc.) can fool some people and also an algorithm
# Helpful votes: 71
# Annual HVAR: 5
# For this book, the top quartile was HVAR of 1.7
# Surely adding GENRE would knock this one down
vacuous_cookbook = "BEST As someone who is learning to cook only late in her life, I was apprehensive and embarrassed about asking simple basic questions of friends and family.  Perceiving this, my parents gave me this cookbook, and voila!  -- I can cook!With step-by-step instructions on everything from  cookware, ingredients, buying, preapring, cooking, and serving, there's  nothing this book can't handle.  It provides recipes to prepare foods in  the simplest ways, all the way up to complex gourmet dishes.  And it covers  every imaginable food -- if it isn't in here, I can't imagine where you'd  find it.The language is straightforward and encouraging, with  appropriate editorializing on the author's preferences, and the layout is  clean and easy to read.  I can't say enough good things about this cookbook  -- it never leaves my kitchen counter."

# Reviewer hasn't read it yet
vacuous_not_read = "BEST Just downloaded this series.  Looking forward to getting to read it, once i get past some of the other books on my reading list.  I just love beig able to carryall these books without having to carry them individually!"

# This excerpt of a review got 5* and JenD finds it helpful 
meaty_positive = "BEST After \"Riding Lessons\", which I loved - and \"Flying Changes\", which was a huge disappointment to me, I was not sure what I would find in \"Water for Elephants\".  Wow - what a great read!  Research does pay off hugely - when it enables a writer to place the reader inside another world so easily - the world of the circus. This was a world totally foreign to most of us - but now, so familiar, thanks to Sara.  This book satisfied my three requirements - transportation (take me away from all this),  levitation (lift my spirits and leave me thinking good thoughts) and infiltration (let me get inside the characters so I feel I really know them). Reading \"Water for Elephants\" is time well-spent.  I'm happy to know Sara is working on a fourth book. I'll be first in line to buy it."
#Research does pay off hugely - when it enables a writer to place the reader inside another world so easily - the world of the circus. This was a world totally foreign to most of us - but now, so familiar, thanks to Sara. This book satisfied my three requirements - transportation (take me away from all this),  levitation (lift my spirits and leave me thinking good thoughts) and infiltration (let me get inside the characters so I feel I really know them). Time well spent!"

# Another excerpt from a 5* with many helpful reviews
meaty_positive2 = "BEST This had the flavor of The Great Gatsby, the well to do characters spend their days going from luncheon to evening parties and everyone is concerned about who's who. That is, on the surface it has that flavor. Beneath is a gripping story more about Trudy and Will than about the piano teacher, Claire and the desperate desire to survive.The characters in this book are well developed. There is a lot going on beneath the surface that the author lets you discern.Life in Hong Kong during the 40's is a lark and all about the parties you go to, until the Japanese occupation.  Will is interned along with many of the other socialites.  Life becomes getting food, keeping warm, keeping from being infested, keeping from being singled out for abuse."

# This excerpt is from a negative 1* review and includes arguments (>300 helpful votes) 
meaty_negative = "WORST Colin Campbell is so intent on promoting a vegan data that he misrepresents the data in the real China Study and cherry picks anti-animal food data. For instance, he rightly cites the link between milk and autoimmune disease but fails to mention that gluten, from wheat and related grains, is at least as important a cause. He writes of the association between casein, a milk protein, with cancer, but fails to mention that whey and butterfat are protective against cancer, and in milk you get all of them. He makes completely false statements like folate not being in meat when organ meats are much higher in folate than any plant source according to the USDA. He assumes nutrient consistency with the US without actually measuring it, despite the fact that soil nutrients and species differences have a huge effect on nutrition."

# This is a scholar reviewing another scholar's 5* work
# Helpful votes: 788 and annual HVAR: 74
meaty_scholar = "BEST Noted historian of the early church Elaine Pagels has produced a clear, cogent, and very effective introduction to the subject of Gnosticism, a different form of Christianity that was declared heretical and virtually stamped out by the orthodox church by the start of the second century after Christ.  Most of what we knew of the Gnostic belief system came from the religious authors who worked so hard to destroy the movement, but that changed drastically with the still relatively recent discovery of a number of lost Gnostic writings near Nag Hammadi in Upper Egypt.  Unlike the Dead Sea Scrolls, this momentous discovery of ancient papyri has received little attention, and I must admit I went into this book knowing virtually nothing about Gnosticism.  As an historian by training and a Christian, the information in these &quot;heretical&quot; texts intrigue me, and I believe that Christians should challenge their faith by examining material that does not fall in line"

# Make the list of canaries
pred_sentences = [vacuous_negative,\
                  vacuous_positive, \
                  vacuous_cookbook, \
                  vacuous_not_read, \
                  meaty_negative, \
                  meaty_positive, \
                  meaty_positive2, \
                  meaty_scholar]

In [37]:
canaries = vectorizer.transform(pred_sentences)

In [38]:
canaries_pred = clf.predict(canaries)
canaries_pred_prob = clf.predict_proba(canaries)

for i in canaries_pred:
    print(i)

canaries_pred
np.argmax(canaries_pred_prob[0])

0.0
0.0
1.0
0.0
1.0
0.0
1.0
1.0


0

In [40]:
for pred in canaries_pred_prob:
    max_class_prob = max(pred)
    
    boundary_dist = max_class_prob - 0.5
    
    if boundary_dist < 0.1:
        print('Gee...not really sure')
    else:
        print(np.argmax(pred))

Gee...not really sure
0
1
0
1
Gee...not really sure
1
1


In [41]:
vectorizer = TfidfVectorizer(lowercase=True,
                             #tokenizer=Tokenizer,
                             analyzer='word',
                             stop_words=None,
                             token_pattern=r"(?u)\b\w\w+\b|!|\?|\"|\'|\*|\-|\;|\:|\,|\.",
                             ngram_range=(1,2),
                             max_features=None)

X_train_cl2_vec = vectorizer.fit_transform(X_train_cl2)

In [16]:
def getPrediction(in_sentences):
    labels = ["Unhelpful", "Helpful"]
    
    tfidf_transform = vectorizer.transform(in_sentences)
    
    predictions = clf.predict_proba(tfidf_transform)
    
    return [(sentence, prediction, labels[np.argmax(prediction)]) for sentence, prediction in zip(in_sentences, predictions)]

In [43]:
predictions = getPrediction(pred_sentences)
predictions

[('WORST I like to read many different books from many points of view.  This book was a complete waste of time that I will never get back. Save yourself the trouble and money.',
  array([0.5854154, 0.4145846]),
  'Unhelpful'),
 ('BEST My husband has really loved this series and I have heard the same comments from many others who have also read this series.',
  array([0.86454695, 0.13545305]),
  'Unhelpful'),
 ("BEST As someone who is learning to cook only late in her life, I was apprehensive and embarrassed about asking simple basic questions of friends and family.  Perceiving this, my parents gave me this cookbook, and voila!  -- I can cook!With step-by-step instructions on everything from  cookware, ingredients, buying, preapring, cooking, and serving, there's  nothing this book can't handle.  It provides recipes to prepare foods in  the simplest ways, all the way up to complex gourmet dishes.  And it covers  every imaginable food -- if it isn't in here, I can't imagine where you'd  

In [34]:
test = ['wonderful best favorite excellent' for x in range(500)]
test = ' '.join(test)

In [40]:
amazon_help_rev_pos_1 = "I've had this iPad for just a few weeks but I love it. It arrived brand new in original packaging with all the standard accessories. I bought the iPad for the sole purpose of using it to start getting into digital art. When I say sole purpose, I mean I only use this iPad for ProCreate. I never have the internet on or use it for videos since I use my MacAir or iPhone for those things. I purchased the Apple Pencil and it works wonderfully on this iPad. For anyone who is interested in this more budget iPad for digital art or taking notes, I would recommend it. I bought a case for the iPad and it isn't as light as you might think. It has some weight to it. So if you are someone who really needs a REALLY lightweight tablet, this might not be great. The battery life so far is amazing. But, like I said before I don't have the internet on or any other app running besides ProCreate when I am using it. Maybe in two years this little guy won't be running so great, but for right now I am so pleased with my purchase. The one thing I might regret is not buying the one with more GB of memory. If you are looking into breaking into digital art, or would like to start doing digital illustrations, I would recommend this. However, for someone who has a good grasp of digital art or has used a previous tablet, consider the size of this tablet. If you need a lot of space to draw, this screen size might be too small and you should consider the 10 or 12 inch ipad Pro line. I have small hands and haven't ever used a tablet to draw before so it works great for me."

In [44]:
amazon_help_rev_pos_low_votes = "Much faster and less glitchy than my Air 2, battery lasts longer, no longer have trouble finding nearby networks. I know people said it’s not worth the upgrade if you have an Air 2 but it was totally worth it to me."

In [46]:
amazon_help_rev_pos_more_votes = "Ordered the new iPad because the sale was too good to let go. I had wanted the iPad Apple Pencil pair, so being able to get both for the price of just the iPad after tax was worth it. Apple got us, yet again. Not that I mind though, I love my apple products. This iPad is a great example. I will primarily be using it for digital art/graphic design/photo editing adobe type programs. I’ve used half the Wacom models, including the cintique. Haven’t had a chance to use the new iPad Pro and pencil, but for the more affordable version, this earns all 5 stars. #1, gorgeous screen, easy to set up, easy to use right out of the box. Set up is even easier now if you have another apple device. The setup prompt me to grab an apple device logged into my account already. From there I entered my passcode on my phone, before I knew it the iPad was asking if I wanted all my apps brought over, or if I wanted to start fresh. They took what I thought was already a pretty easy process thanks to iCloud, and made it even more user friendly. #2, it feels like the money you spent. Unlike other android devices I have attempted, iOS devices have never let me down in the quality department. This being said, it’s not as light as the kindle fire or what have you, but it’s going to last longer and be a more enjoyable experience over all for the extra couple of ounces. It’s not heavy compared to my MacBook, which is why I have it. Fits in my purse while still being a gorgeous screen to watch movies on. Retina displays will change your life, and ruin every other screen you look at. #3, the Apple Pencil. I get it now. And do not regret a single penny spent. The pressure sensitivity, accuracy of the accelerometer, how fast it charges, and how the pen to screen experience is closer to paper than anything I’ve ever used. I’m talking Wacom Cintique level accuracy on a tablet. It feels unreal. I love my moleskines more than the next person, but I’m worried they might be getting less use in the coming months just because this is so user friendly. I’ll be sure to add battery details or any updates, but those 3 alone were enough to sell me on the purchase."

In [41]:
amazon_help_rev_neg_1 = "I bought this iPad as Christmas gift for myself and together with the apple pencil I was planning on using them for a more productive semester at school. One day I put this iPad on the table(about 1.65 foot tall) and it dropped shortly after I left because of imbalance. I didn't even think what was about to happen as I used my iPad Air(first Generation) for years and it dropped multiple times from place much higher than this table. When I went check my iPad, I was shocked, the entire screen was broken, the tiny glass pieces falling off and cut my finger. I couldn't believe this screen being this fragile, if this wasn't me but a young kid I couldn't able to imagine how bad things could turn to. Apple shouldn't allow its product to has screen this fragile, it's definitely a safety concerns for all its consumers. I called Apple and hope they can help me, the people whom answered my phone call keep telling me there is nothing they can do, and my only option is to replace my screen for 249.99 (yes, exactly how much I got this iPad). This whole thing is so ridiculous, I've never seen a electronic product being this fragile. For all of you thinking about buying this product, please make sure you got yourself a case, for your own safety."

In [64]:
az_help_pos = "I wanted to write a review, not just about these headphones, but comparing them to the G4ME ZERO. I originally purchased the G4ME ONE, and upon waking up the next day, realized I bought the wrong ones. I wanted the closed version. So I ordered the ZEROs right away, and within a couple days I had $400 worth of headphones sitting on my desk. First off, both sound absolutely fantastic paired with an adequate amplifier. I gave the ZEROs a good test run with a couple long gaming sessions and listening to some tunes. I was impressed by the quality of the sound, they sounded great. Gaming is where I ran into trouble. I typically use closed headphones, but I've always used cheaper ones, which don't isolate the sound as well as higher quality headphones. Games sounded great, but talking to teammates was very troublesome. They are so well isolated, I can't hear myself talking. At all. Plug your ears with your fingers and try holding a conversation with someone, exactly like that. Even enabling the stereo mix setting that allows you to quietly hear yourself through the mic, it wasn't enough. I just could not stand it. Enter the G4ME ONE. I finally unboxed my G4ME ONEs today and plugged them in. The sound quality is close on both, but these seem to be a bit cleaner, especially on the lows. On the ZEROs, bass felt ever so slightly 'detached' from the mids, if that makes any sense. On the ONEs, the frequencies seem to blend better. They're outstanding. Mids and highs are crystal clear, bass is clean and punchy but not boomy at the extreme low end. They sound very natural all around. One small con about both headsets, is that the volume wheel on the right ear cup doesn't go to 0 volume. It only goes down to about 25% volume, which is a little irritating if you want to quickly silence your headset. A few other differences between the two is that the ONEs are an all-plastic design, whereas the ZEROs have a metal rod connecting the ear cups to the headband. How much that actually adds to the structural integrity, I don't know. The ZEROs also come in a carrying case (that nobody in their right mind would use) and so the ear cups rotate 90 degrees to fit into the case. The ONEs only rotate a few degrees, which is fine for me. The ZEROs win hands down in comfort; it's like soft, leather-wrapped pillows around your ears, and they're large, but my ears got very hot very fast (and I was wearing them in a ~55-60 degree house). The ONEs have slightly smaller felt cups and are a little denser but no temperature issues. Other than that, they're very similar."

In [68]:
test='Call me Ishmael. Some years ago—never mind how long precisely—having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little and see the watery part of the world. It is a way I have of driving off the spleen and regulating the circulation. Whenever I find myself growing grim about the mouth; whenever it is a damp, drizzly November in my soul; whenever I find myself involuntarily pausing before coffin warehouses, and bringing up the rear of every funeral I meet; and especially whenever my hypos get such an upper hand of me, that it requires a strong moral principle to prevent me from deliberately stepping into the street, and methodically knocking people’s hats off—then, I account it high time to get to sea as soon as I can. This is my substitute for pistol and ball. With a philosophical flourish Cato throws himself upon his sword; I quietly take to the ship. There is nothing surprising in this. If they but knew it, almost all men in their degree, some time or other, cherish very nearly the same feelings towards the ocean with me.'

In [69]:
getPrediction([test])

[('Call me Ishmael. Some years ago—never mind how long precisely—having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little and see the watery part of the world. It is a way I have of driving off the spleen and regulating the circulation. Whenever I find myself growing grim about the mouth; whenever it is a damp, drizzly November in my soul; whenever I find myself involuntarily pausing before coffin warehouses, and bringing up the rear of every funeral I meet; and especially whenever my hypos get such an upper hand of me, that it requires a strong moral principle to prevent me from deliberately stepping into the street, and methodically knocking people’s hats off—then, I account it high time to get to sea as soon as I can. This is my substitute for pistol and ball. With a philosophical flourish Cato throws himself upon his sword; I quietly take to the ship. There is nothing surprising in this. If they but knew it, almost

## Tomeks removed

In [46]:
# 2-class Tomek links removed
# current winner: ratio='majority'

tl = TomekLinks(return_indices=True, ratio='majority', n_jobs=-1)
X_tl, y_tl, id_tl = tl.fit_sample(vectorizer.fit_transform(df_cl2['prepReviewText']), df_cl2['class_2'])

In [47]:
# train/test split
# 2-class labels
X_train_cl2_tl, X_test_cl2_tl, y_train_cl2_tl, y_test_cl2_tl = train_test_split(X_tl,y_tl, test_size=0.2, \
                                   random_state=42,stratify=y_tl)

In [48]:
clf.fit(X_train_cl2_tl, y_train_cl2_tl)
test_predicted_labels_cl2_tl = clf.predict(X_test_cl2_tl)
f1_weighted = metrics.f1_score(y_test_cl2_tl, test_predicted_labels_cl2_tl, average='weighted')
accuracy = metrics.accuracy_score(y_test_cl2_tl, test_predicted_labels_cl2_tl)
    
print('Logistic Regression Classifer - 2 Class (cluster) Labels')
print('-------------\n')
print('Accuracy on test set: {:0.3f}'.format(accuracy))
print('f_1 score (Weighted): {:0.3f}'.format(f1_weighted))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 21 epochs took 6 seconds
Logistic Regression Classifer - 2 Class (cluster) Labels
-------------

Accuracy on test set: 0.760
f_1 score (Weighted): 0.760


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    6.4s finished


In [49]:
target_names = ['not_helpful', 'helpful']

print(metrics.classification_report(y_test_cl2_tl, test_predicted_labels_cl2_tl, target_names=target_names))

              precision    recall  f1-score   support

 not_helpful       0.75      0.77      0.76      5112
     helpful       0.77      0.75      0.76      5213

    accuracy                           0.76     10325
   macro avg       0.76      0.76      0.76     10325
weighted avg       0.76      0.76      0.76     10325



In [50]:
predictions = getPrediction(pred_sentences)
predictions

[('WORST I like to read many different books from many points of view.  This book was a complete waste of time that I will never get back. Save yourself the trouble and money.',
  array([0.61822559, 0.38177441]),
  'Unhelpful'),
 ('BEST My husband has really loved this series and I have heard the same comments from many others who have also read this series.',
  array([0.87253292, 0.12746708]),
  'Unhelpful'),
 ("BEST As someone who is learning to cook only late in her life, I was apprehensive and embarrassed about asking simple basic questions of friends and family.  Perceiving this, my parents gave me this cookbook, and voila!  -- I can cook!With step-by-step instructions on everything from  cookware, ingredients, buying, preapring, cooking, and serving, there's  nothing this book can't handle.  It provides recipes to prepare foods in  the simplest ways, all the way up to complex gourmet dishes.  And it covers  every imaginable food -- if it isn't in here, I can't imagine where you'd

## Test Values for C - 2 Class Model

In [51]:
clf_2 = LogisticRegression(penalty='l2',
                           C=1.0,
                           random_state=42,
                           solver='saga',
                           multi_class='ovr',
                           max_iter=100,
                           n_jobs=-1,
                           verbose=True)

param_grid = {'C':list(np.linspace(0.1,1.0,10))}            
clf_2 = GridSearchCV(clf_2, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=True)
clf_2.fit(X_train_cl2_tl, y_train_cl2_tl)
best_c = round(clf_2.best_params_['C'],2)
print('Best value for C: {}'.format(best_c))

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.3min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 21 epochs took 7 seconds
Best value for C: 1.0


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    6.8s finished


In [52]:
# Fit a model with the best C value
clf_2 = LogisticRegression(penalty='l2',
                           C=best_c,
                           random_state=42,
                           solver='saga',
                           multi_class='ovr',
                           max_iter=100,
                           n_jobs=-1,
                           verbose=True)

clf_2.fit(X_train_cl2_tl, y_train_cl2_tl)
test_predicted_labels_cl2_tl = clf_2.predict(X_test_cl2_tl)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 21 epochs took 7 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    6.5s finished


In [53]:
# Evaluate with various f1 metrics
f1_weighted = metrics.f1_score(y_test_cl2_tl, test_predicted_labels_cl2_tl, average='weighted')
accuracy = metrics.accuracy_score(y_test_cl2_tl, test_predicted_labels_cl2_tl)
    
print('Logistic Regression Classifer - 2 Class (cluster) Labels')
print('-------------\n')
print('Accuracy on test set: {:0.3f}'.format(accuracy))
print('f_1 score (Weighted): {:0.3f}'.format(f1_weighted))

Logistic Regression Classifer - 2 Class (cluster) Labels
-------------

Accuracy on test set: 0.760
f_1 score (Weighted): 0.760


In [54]:
metrics.confusion_matrix(y_test_cl2, test_predicted_labels_cl2)
tn, fp, fn, tp = metrics.confusion_matrix(y_test_cl2, test_predicted_labels_cl2).ravel()
print("True Negative: {}".format(tn))
print("True Positive: {}".format(tp))
print("False Negative: {}".format(fn))
print("False Positive: {}".format(fp))

True Negative: 3968
True Positive: 4075
False Negative: 1138
False Positive: 1246


## Logistic Regression - 3 Class Labels
1. Trying to put middle class into buckets produces poor results (f_1 ~ 51%); likely too much variance
2. Randomly sampling from entire middle class is only slightly better (f_1 ~ 59.6%)
3. Randomly dampling all classes (no buckets) terrible performance (f_1 ~ 44.5%)

In [152]:
# Next, check group membership based on clustering on scaled annual_HVAR values
# How many examples of each group?
print('cl3_neg_helpful: {}'.format(mine[(mine.overall == 1) & (mine.class_3 == 1) & (mine.helpful_votes != 0)].shape))
print('cl3_neg_unhelpful: {}'.format(mine[(mine.overall == 1) & (mine.class_3 == 0) & (mine.helpful_votes == 0)].shape))
# a negaitve middle ground review has accumulated SOME helpfulness
print('cl3_neg_middle: {}'.format(mine[(mine.overall == 1) & (mine.class_3 == 2) & (mine.helpful_votes != 0)].shape))

print('cl3_pos_unhelpful: {}'.format(mine[(mine.overall == 5) & (mine.class_3 == 0) & (mine.helpful_votes == 0)].shape))
print('cl3_pos_helpful: {}'.format(mine[(mine.overall == 5) & (mine.class_3 == 1) & (mine.helpful_votes != 0)].shape))
# a positive middle ground review has accumulated SOME helpfulness
print('cl3_pos_middle: {}'.format(mine[(mine.overall == 5) & (mine.class_3 == 2) & (mine.helpful_votes != 0)].shape))



cl3_neg_helpful: (36977, 19)
cl3_neg_unhelpful: (11889, 19)
cl3_neg_middle: (20698, 19)
cl3_pos_unhelpful: (116227, 19)
cl3_pos_helpful: (193895, 19)
cl3_pos_middle: (141347, 19)


In [160]:
# Prepare groups based on new 3-class labels (poor results)

#num_per_condition = 11889
#repl=False
#cl3_neg_helpful = mine[(mine.overall == 1) & (mine.class_3 == 1) & (mine.helpful_votes != 0)].sample(num_per_condition, replace=repl, random_state=42)
#cl3_neg_unhelpful = mine[(mine.overall == 1) & (mine.class_3 == 0) & (mine.helpful_votes == 0)].sample(num_per_condition, replace=repl, random_state=42)
#cl3_neg_middle = mine[(mine.overall == 1) & (mine.class_3 == 2) & (mine.helpful_votes != 0)].sample(num_per_condition, replace=repl, random_state=42)

#cl3_pos_unhelpful = mine[(mine.overall == 5) & (mine.class_3 == 0) & (mine.helpful_votes == 0)].sample(num_per_condition, replace=repl, random_state=42)
#cl3_pos_helpful = mine[(mine.overall == 5) & (mine.class_3 == 1) & (mine.helpful_votes != 0)].sample(num_per_condition, replace=repl, random_state=42)
#cl3_pos_middle = mine[(mine.overall == 5) & (mine.class_3 == 2) & (mine.helpful_votes != 0)].sample(num_per_condition, replace=repl, random_state=42)
# "reviewText" has the review content
# "most_helpful" has the label of 0 or 1
# "overall" has the star-rating {1,2,3,4,5}

In [201]:
# Try a random sample from entire middle class (poor results)

#num_per_condition = 11889
#repl=False
#cl3_neg_helpful = mine[(mine.overall == 1) & (mine.class_3 == 1) & (mine.helpful_votes != 0)].sample(num_per_condition, replace=repl, random_state=42)
#cl3_neg_unhelpful = mine[(mine.overall == 1) & (mine.class_3 == 0) & (mine.helpful_votes == 0)].sample(num_per_condition, replace=repl, random_state=42)

#cl3_pos_unhelpful = mine[(mine.overall == 5) & (mine.class_3 == 0) & (mine.helpful_votes == 0)].sample(num_per_condition, replace=repl, random_state=42)
#cl3_pos_helpful = mine[(mine.overall == 5) & (mine.class_3 == 1) & (mine.helpful_votes != 0)].sample(num_per_condition, replace=repl, random_state=42)

# two strategies:
# - some accumulated helpful votes
# - random
#cl3_middle = mine[(mine.class_3 == 2) & (mine.helpful_votes != 0)].sample(num_per_condition*1, replace=repl, random_state=42)
#cl3_middle = mine[mine.class_3 == 2].sample(num_per_condition*2, replace=repl, random_state=42)

In [209]:
mine['class_3'].value_counts()

0.0    1820423
1.0     394950
2.0     261174
Name: class_3, dtype: int64

In [211]:
num_per_condition = 100000
repl=False
cl3_0 = mine[mine.class_3 == 0].sample(num_per_condition, replace=repl, random_state=42)
cl3_1 = mine[mine.class_3 == 1].sample(num_per_condition, replace=repl, random_state=42)
cl3_2 = mine[mine.class_3 == 2].sample(num_per_condition, replace=repl, random_state=42)


In [202]:
# Experiment with prepending TEXT representation of starts to the reviews, 
# as a way to pass overall rating to our classifier
# because haven't figured out how to send categorical data AROUND the transformer yet
#cl3_neg_helpful['prepReviewText'] = cl3_neg_helpful.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)
#cl3_neg_unhelpful['prepReviewText'] = cl3_neg_unhelpful.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)
#cl3_neg_middle['prepReviewText'] = cl3_neg_middle.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)

#cl3_pos_unhelpful['prepReviewText'] = cl3_pos_unhelpful.apply(lambda x: 'BEST ' + x.reviewText,axis = 1)
#cl3_pos_helpful['prepReviewText'] = cl3_pos_helpful.apply(lambda x: 'BEST ' + x.reviewText,axis = 1)
#cl3_pos_middle['prepReviewText'] = cl3_pos_middle.apply(lambda x: 'BEST ' + x.reviewText,axis = 1)

#cl3_middle['prepReviewText'] = cl3_middle.apply(lambda x: 'BEST ' + x.reviewText,axis = 1)

In [212]:
cl3_0['prepReviewText'] = cl3_0.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)
cl3_1['prepReviewText'] = cl3_1.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)
cl3_2['prepReviewText'] = cl3_2.apply(lambda x: 'WORST ' + x.reviewText,axis = 1)

In [164]:
# Put the subsets into the same dataframe again
#stratdf_cl3 = cl3_neg_helpful.append(cl3_neg_unhelpful, ignore_index=True)
#stratdf_cl3 = stratdf_cl3.append(cl3_pos_unhelpful, ignore_index=True)
#stratdf_cl3 = stratdf_cl3.append(cl3_pos_helpful, ignore_index=True)
#stratdf_cl3 = stratdf_cl3.append(cl3_neg_middle, ignore_index=True)
#stratdf_cl3 = stratdf_cl3.append(cl3_pos_middle, ignore_index=True)

#print(f"Our dataset with 3-class labels is now {stratdf_cl3.shape[0]} reviews.")

Our dataset with 3-class labels is now 23778 reviews.
Our dataset with 3-class labels is now 35667 reviews.
Our dataset with 3-class labels is now 47556 reviews.
Our dataset with 3-class labels is now 59445 reviews.
Our dataset with 3-class labels is now 71334 reviews.


In [203]:
# Put the subsets into the same dataframe again

#stratdf_cl3 = cl3_neg_helpful.append(cl3_neg_unhelpful, ignore_index=True)
#stratdf_cl3 = stratdf_cl3.append(cl3_pos_unhelpful, ignore_index=True)
#stratdf_cl3 = stratdf_cl3.append(cl3_pos_helpful, ignore_index=True)
#stratdf_cl3 = stratdf_cl3.append(cl3_middle, ignore_index=True)

#print(f"Our dataset with 3-class labels is now {stratdf_cl3.shape[0]} reviews.")

Our dataset with 3-class labels is now 71334 reviews.


In [213]:
stratdf_cl3 = cl3_0.append(cl3_1, ignore_index=True)
stratdf_cl3 = stratdf_cl3.append(cl3_2, ignore_index=True)

print(f"Our dataset with 3-class labels is now {stratdf_cl3.shape[0]} reviews.")

Our dataset with 3-class labels is now 300000 reviews.


In [214]:
# Balances classes
stratdf_cl3['class_3'].value_counts()

1.0    100000
2.0    100000
0.0    100000
Name: class_3, dtype: int64

In [215]:
df_cl3 = shuffle(stratdf_cl3,random_state=42)[['prepReviewText','overall','class_3']]

In [216]:
# 3-class labels
X_train_cl3, X_test_cl3, y_train_cl3, y_test_cl3 = train_test_split(df_cl3.prepReviewText,df_cl3.class_3, test_size=0.2, \
                                   random_state=42,stratify=df_cl3.class_3)

In [217]:
# Transform text examples

vectorizer = TfidfVectorizer(lowercase=True,
                             #tokenizer=Tokenizer,
                             analyzer='word',
                             stop_words=None,
                             token_pattern=r"(?u)\b\w\w+\b|!|\?|\"|\'|\*|\-|\;|\:|\,|\.",
                             ngram_range=(1,2),
                             max_features=None)

X_train_cl3_vec = vectorizer.fit_transform(X_train_cl3)
X_test_cl3_vec = vectorizer.transform(X_test_cl3)

print("Token Count - 3 class: {}".format(len(vectorizer.get_feature_names())))

Token Count - 3 class: 6893652


In [218]:
clf.fit(X_train_cl3_vec, y_train_cl3)
test_predicted_labels_cl3 = clf.predict(X_test_cl3_vec)
f1_weighted = metrics.f1_score(y_test_cl3, test_predicted_labels_cl3, average='weighted')
accuracy = metrics.accuracy_score(y_test_cl3, test_predicted_labels_cl3)
    
print('Logistic Regression Classifer - 3 Class (cluster) Labels')
print('-------------\n')
print('Accuracy on test set: {:0.3f}'.format(accuracy))
print('f_1 score (Weighted): {:0.3f}'.format(f1_weighted))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 20 epochs took 86 seconds
convergence after 22 epochs took 93 seconds
convergence after 24 epochs took 99 seconds


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.6min finished


Logistic Regression Classifer - 3 Class (cluster) Labels
-------------

Accuracy on test set: 0.457
f_1 score (Weighted): 0.445


In [170]:
print(metrics.confusion_matrix(y_test_cl3, test_predicted_labels_cl3))
tn, fp, fn, tp = metrics.confusion_matrix(y_test_cl3, test_predicted_labels_cl3).ravel()
print("True Negative: {}".format(tn))
print("True Positive: {}".format(tp))
print("False Negative: {}".format(fn))
print("False Positive: {}".format(fp))

[[3408  595  752]
 [1389 1165 2202]
 [ 839  949 2968]]


ValueError: too many values to unpack (expected 4)

## Interpret Correct and Incorrect Classifications

In [111]:
# When classifications are incorrect, what are the class probabilities that resulted in class assignment?

pred_prob = clf_2.predict_proba(X_test_cl2_vec)

In [134]:
pred_prob[:26]

array([[0.65465361, 0.34534639],
       [0.74181191, 0.25818809],
       [0.61751734, 0.38248266],
       [0.44185769, 0.55814231],
       [0.24397539, 0.75602461],
       [0.2921176 , 0.7078824 ],
       [0.74290291, 0.25709709],
       [0.84839834, 0.15160166],
       [0.03871746, 0.96128254],
       [0.22270836, 0.77729164],
       [0.1638336 , 0.8361664 ],
       [0.34632303, 0.65367697],
       [0.19883285, 0.80116715],
       [0.2823253 , 0.7176747 ],
       [0.30060976, 0.69939024],
       [0.23455909, 0.76544091],
       [0.45543343, 0.54456657],
       [0.85395959, 0.14604041],
       [0.85331909, 0.14668091],
       [0.47681776, 0.52318224],
       [0.03183905, 0.96816095],
       [0.84278728, 0.15721272],
       [0.12928942, 0.87071058],
       [0.88353061, 0.11646939],
       [0.86053012, 0.13946988],
       [0.14695511, 0.85304489]])

In [135]:
y_test_cl2[:26]

23267    0.0
23962    0.0
21392    0.0
44106    1.0
42631    1.0
37555    1.0
29978    0.0
17454    0.0
2686     1.0
5939     1.0
43259    1.0
3822     1.0
45849    1.0
34321    0.0
7479     1.0
35753    1.0
44595    1.0
34694    0.0
19730    0.0
11465    1.0
8760     1.0
35647    0.0
36497    1.0
13115    0.0
20720    0.0
16914    0.0
Name: class_2, dtype: float64

In [130]:
y_test_cl2.iloc[25]

0.0

In [106]:
misclassified = np.where(y_test_cl2 != clf_2.predict(X_test_cl2_vec))

In [143]:
len(misclassified[0])

2185

In [144]:
for i in range(2185):
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [145]:
error = []

for i in range(len(misclassified[0])):
    index = misclassified[0][i]
    #print('prediction: {}; correct: {}'.format(pred_prob[index], y_test_cl2.iloc[index]))
    #print(0.5 - min(pred_prob[index]))
    error.append(0.5 - min(pred_prob[index]))
    

In [149]:
np.min(error)

0.0001853522413891029

In [109]:
misclassified[0].shape

(2185,)

In [125]:
misclassified[0][1]

25

In [126]:
y_test_cl2.iloc[25]

0.0

In [118]:
print('prediction: {}'.format(pred_prob[25]))
print('actual: {}'.format(y_test_cl2.iloc[13]))

prediction: [0.2823253 0.7176747]
actual: 0.0


In [131]:
print(X_test_cl2.iloc[25])
print(y_test_cl2.iloc[25])

WORST In writing this review, I would like to note two items of importance.First, I have had the opportunity to read a number of Mr. Ringo's other works and have been generally favorably impressed by their quality.Second, although the information on this book both on Amazon and the cover appears to promote it as a novel in reality it is three stories of significantly different lengths with an attempt to pull them  together with some very uninspired interludes.In summary, I found this book to be two hundred pages of boredom surrounded by several somewhat stories which appear to have been written (and probably rejected) early in the author's career.  I found the long parade of characters to be uninteresting and their motivations to be extremely confused. Would any woman with the experience, intelligence and abilities of the primary character really be the submissive housewife as characterized in this volume - impossible for me to rationalize.It appears to me that either the author or pub

In [120]:
stratdf_cl2.iloc[13]

asin                                                        0385332947
overall                                                              1
reviewText           Don't bother to read this one, folks.  Coleman...
reviewTime                                                  2006-08-08
reviewerID                                              A2OU2NCXK689Z2
reviewerName                                          Tigger "kkegley"
summary                                               Guilty after all
unixReviewTime                                              1154995200
helpful_votes                                                       12
review_age_days                                                   2906
annual_HVAR                                                    1.50723
book_num_reviews                                                    10
std_HVAR                                                      0.988172
top_quartile_HVAR                                              1.13868
min_ma

## Tweaking

In [None]:
# Tweaking lessons

# acc improves with more data
# use tfidf max_features and/or regularization parameter to control for overfitting

# as per-group samples increase NOT using SVD is better
# - might as well experiment with max_features
# - 10000 features -> acc decr from 79.8 to 75.4
# - 20000 features -> acc 76.6
# - 100000 features -> acc 78.2
# - 300000 features -> acc 79.4
# - 600000 features -> acc 79.9
# - 1000000 features -> acc 80.1
# - 1.97mil features -> acc 79.8

# 32000 examples; 100000 features -> 81.8
#               1mil features -> 84.0

# do not have vectorizer ignore any tokens (min_df, max_df)
# do not remove stop words
# increase to (1,3) ngrams
# try ngrams (1,3) + truncated svd;
# - svd 100 dim acc: .694 (c=0.95)
# - svd 120 dim acc: .682 (C=0.95)
# - svd 130 dim acc: .685 (C=0.86)
# - svd 150 dim acc: .689 (C=0.91)
# - svd 50 dim acc: .688 (C=0.91)
# - svd 300 dim acc: .689 (C=0.67); more dimensions -> regularization works harder

# More data
# Did not oversample:
#  - sample 1500 per group without replacement
#  - svd 300 dim; C=1.0; acc=.732
#  - svd 100 dim; C=1.0; acc=.725
#  - svd 150 dim; C=0.76; acc=.732

# results:

#True Negative: 386
#True Positive: 492
#False Negative: 108
#False Positive: 214

# Oversample:
#  - sample 2000 per group with replacement
#  - svd 100 dim; C=0.95; acc=.742
#  - svd 150 dim; C=0.81; acc=.745
#  - svd 300 dim; C=1.0; acc=.751


# oversampling underrepresented increases accuracy

# Oversample (8000 total) with no SVD acc: 78.2



# reduce dimensions
# - stemming



In [352]:
# Reduce dimensions with truncated SVD
dimensions=100
svd = TruncatedSVD(n_components=dimensions, n_iter=5, random_state=42)
X_train_svd = svd.fit_transform(X_train_prep_vec)
X_test_svd = svd.transform(X_test_prep_vec)

In [353]:
clf_3 = LogisticRegression(penalty='l2',
                           C=1.0,
                           random_state=42,
                           solver='saga',
                           multi_class='ovr',
                           max_iter=100,
                           n_jobs=-1,
                           verbose=True)

param_grid = {'C':list(np.linspace(0.1,1.0,20))}            
clf_3 = GridSearchCV(clf_3, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=True)
clf_3.fit(X_train_svd, y_train_prep)
best_c = round(clf_3.best_params_['C'],2)
print('Best value for C: {}'.format(best_c))

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.3s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 17 epochs took 0 seconds
Best value for C: 0.91


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.2s finished


In [354]:
# Fit a model with the best C value
clf_3 = LogisticRegression(penalty='l2',
                           C=best_c,
                           random_state=42,
                           solver='saga',
                           multi_class='ovr',
                           max_iter=100,
                           n_jobs=-1,
                           verbose=True)

clf_3.fit(X_train_svd, y_train_prep)
test_predicted_labels_3 = clf_3.predict(X_test_svd)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 17 epochs took 1 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


In [355]:
# Evaluate with various f1 metrics
f1_weighted = metrics.f1_score(y_test_prep, test_predicted_labels_3, average='weighted')
accuracy = metrics.accuracy_score(y_test_prep, test_predicted_labels_3)
    
print('Logistic Regression Classifer - SVD Dimensions = {}'.format(dimensions))
print('-------------\n')
print('Accuracy on test set: {:0.3f}'.format(accuracy))
print('f_1 score (Weighted): {:0.3f}'.format(f1_weighted))

Logistic Regression Classifer - SVD Dimensions = 100
-------------

Accuracy on test set: 0.735
f_1 score (Weighted): 0.735


In [356]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_prep, test_predicted_labels_3)
tn, fp, fn, tp = confusion_matrix(y_test_prep, test_predicted_labels_3).ravel()
print("True Negative: {}".format(tn))
print("True Positive: {}".format(tp))
print("False Negative: {}".format(fn))
print("False Positive: {}".format(fp))

True Negative: 1186
True Positive: 1166
False Negative: 434
False Positive: 414


## Alternate Labeling Strategy
Adapt from Jen's labeling notebook
1. KBinsDiscretizer with strategy='kmeans'
2. Try 3 classes (helpful, unhelpful, undetermined)
3. Try training a model on helpful and unhelpful classes only; and on all 3