# Hypothesis to test:
### Removing objective sentences from reviews helps predict star rating from reviews

In [7]:
import numpy as np
import pandas as pd
import pickle
import gzip
import math
import random
from IPython.display import Markdown, display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, make_scorer
import statsmodels.api as sm

from utils import rmse, rmse_train_cv, classifier_report, confusion_rmse
from to_bag_of_words import create_bag_of_words

# Avoid restarting Kernel
%load_ext autoreload
%autoreload 2

# %autosave 50

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Configuration

In [2]:
# Subsampling from Amazon reviews
NB_SAMPLES = 20000  # up to 200k, then change the input file

## Get user comments

In [3]:
data_path = '../../datasets/'
file_name = '360000_balanced_train_test_reviews.pkl'

In [10]:
# Load the pickled dictionary of review DataFrames (its 'train' entry is used below).
# The context manager closes the file handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code — only open files from a trusted source.
with open(data_path + file_name, "rb") as pickle_in:
    train_test_dic = pickle.load(pickle_in)

## Create bag of words
* Remove accents
* Tokenize
* Lowercase
* Apply custom stop words (keep all negations)
 
Output:  
One list of words for each review 

In [12]:
train_test_dic['train'].shape

(360000, 9)

In [33]:
# Take the text of the first three training reviews as a small sample
# for trying out the bag-of-words helper, then show the second one.
first_three_reviews = train_test_dic['train']['reviewText'].iloc[:3]
reviews_text = first_three_reviews.values
reviews_text[1]

"Why would someone buy this pan & scan trash? do yourself a favor and recycle it!! The igonorance of some of these companies is overwhelming. This isn't the '80's any more, is it? Boycott rip-offs like this!"

In [37]:
%reload_ext autoreload
res = create_bag_of_words(reviews_text)

pd.set_option('display.max_colwidth', -1)
res[0]

['in',
 'the',
 'book',
 'the',
 'official',
 'godzilla',
 'compendium',
 ',',
 'king',
 'kong',
 'vs',
 'godzilla',
 'is',
 'described',
 'as',
 '``',
 'the',
 'jaws',
 'of',
 'the',
 'japanese',
 'film',
 'industry',
 "''",
 'or',
 '``',
 'one',
 'of',
 'the',
 'great',
 'monster',
 'battles',
 'of',
 'cinema',
 'history',
 "''",
 '.',
 'it',
 'is',
 'referred',
 'to',
 'as',
 'the',
 'film',
 'that',
 '``',
 '...',
 'lifted',
 'godzilla',
 'from',
 'the',
 'swelling',
 'ranks',
 'of',
 'interchangeable',
 'atomic',
 'monsters',
 'of',
 'the',
 'fifties',
 'and',
 'placed',
 'him',
 'among',
 'the',
 'pantheon',
 'of',
 'cinema',
 'creatures',
 "''",
 '.',
 'thus',
 ',',
 'i',
 'had',
 'high',
 'expectations',
 'when',
 'i',
 'watched',
 'this',
 'on',
 'the',
 'sci-fi',
 'channel.all',
 'i',
 'have',
 'to',
 'say',
 'is',
 ':',
 'wow',
 '!',
 'the',
 'special',
 'effects',
 'are',
 'incredible',
 '!',
 'the',
 'acting',
 'is',
 'top',
 'notch',
 '.',
 'the',
 'screenplay',
 'is',
 '

In [38]:
from nltk.corpus import stopwords

# NLTK's English stop-word list as a set; print its size, then its content.
stopwords_ = set(stopwords.words('english'))
print(len(stopwords_), stopwords_, sep='\n')

179
{'then', "you're", 'she', 'few', 'yourself', 'or', 'itself', 'does', 'out', 'isn', 'until', 'themselves', 'you', 'y', 'ain', 'wouldn', 'because', "won't", 'mustn', 'of', 'on', 'not', 'very', 'is', "doesn't", "weren't", 'below', "mightn't", 'further', 'himself', 'myself', "didn't", 'so', 'only', 'll', 'my', "don't", 'same', 'am', 'her', 'but', 'had', 'a', "hasn't", 'where', 'aren', 'other', 'be', "shan't", 'him', "you've", 'own', 'the', "isn't", 'are', 'were', 'haven', 'this', "couldn't", 'at', 'our', 'from', 'doesn', 'if', 'and', 'm', 'before', 'off', 've', 'by', "you'd", 'after', 'who', 'has', 'some', 'those', 'as', 'about', 'no', 'me', 'its', "you'll", 'have', 'through', "wouldn't", 'over', 'too', 'such', 'yourselves', 'he', 'his', 'them', "aren't", 'under', 'they', 'again', 'above', 'herself', 'between', "it's", 'don', 're', 'against', 'all', 'whom', "should've", 'during', 'here', 'now', 'do', 'to', 'can', 'up', 'ma', 'it', 'shan', 'theirs', 'we', 'what', 'yours', 'o', 'hadn', '

## TODO: Remove reviews that are not about the movie but about Amazon/support instead
Input: 
* word tokens 
* one line per review 

## TODO: merge negations with next word, remove next word

## TODO: encode the review length as an additional input

## Star rating modeling for base case, based on tf-idf

In [7]:
# Hold out 20% of the reviews as a cross-validation set; random_state fixes the shuffle.
# X_* hold the raw review text, y_* the star rating ('overall').
# NOTE(review): `balanced_reviews` is not defined in any cell above — presumably a
# balanced subsample of train_test_dic['train']; restore the cell that builds it.
X_train, X_cv, y_train, y_cv = train_test_split(
  balanced_reviews['reviewText'], balanced_reviews['overall'], test_size=0.2, random_state=0)
X_train.shape

(20000,)

### TF-IDF setup

In [8]:
MAX_FEATURES = 20000

In [11]:
# Tf-idf vectorizer for the raw review text (fitted in the next cell).
tfidf = TfidfVectorizer(lowercase=True, 
                        stop_words='english', 
                        max_features=MAX_FEATURES,  # cap the vocabulary size
                        norm='l2',            # scale each review vector to unit L2 length
                        use_idf=True)        # down-weight terms that occur in many reviews

In [12]:
# Learn the vocabulary and IDF weights from the training reviews only, then apply
# that fitted vectorizer unchanged to the CV reviews.
X_train_tf = tfidf.fit_transform(X_train)
X_cv_tf = tfidf.transform(X_cv)
print(X_cv_tf.shape)

20000
(5000, 20000)


In [19]:
# Persist the fitted vectorizer; the context manager closes the file even if
# pickling raises, which the explicit open()/close() pair did not guarantee.
with open("pickles/tfidf_25kBalancedSamples_20kFeats.pkl", "wb") as pickle_out:
    pickle.dump(tfidf, pickle_out)
print(len(tfidf.vocabulary_))

20000


### Gradient Boosting Classifier

In [10]:
# Gradient Boosting Classifier parameters
N_TREES = 300 # math.floor(np.sqrt(NB_SAMPLES) * 1.2)
LEARN_RATE = 0.1
MAX_DEPTH = 15
MIN_IN_LEAF = 2 #7

In [13]:
# Star-rating classifier on the tf-idf features, configured from the constants above.
gbc = GradientBoostingClassifier(
    learning_rate=LEARN_RATE,
    n_estimators=N_TREES,
    min_samples_leaf=MIN_IN_LEAF,
    max_depth=MAX_DEPTH,
    random_state=0,  # fixed seed; matches the fitted-model repr shown after gbc.fit
)

In [21]:
gbc.fit(X_train_tf, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=15,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=2, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              n_iter_no_change=None, presort='auto', random_state=0,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [22]:
# Cache the fitted classifier; the hyper-parameters are encoded in the file name.
gbc_pickle_path = (
    'pickles/GBC_after_tfidf_balanced_comments_'
    '{}_trees_{}_lr_{}_maxdpth_{}_minleaf_{}_feats_.pkl'
    .format(N_TREES, LEARN_RATE, MAX_DEPTH, MIN_IN_LEAF, MAX_FEATURES)
)
# Context manager: the handle created inline by open() was never closed before.
with open(gbc_pickle_path, 'wb') as pickle_out:
    pickle.dump(gbc, pickle_out)

In [23]:
# Header line with the hyper-parameters, then the same report on both splits
# (tf-idf matrices in, since gbc was fitted on X_train_tf).
print('{}  features {} trees;  {} learn_rate;  {} max_dpth;  {} min_in_leaf'
      .format(MAX_FEATURES, N_TREES, LEARN_RATE, MAX_DEPTH, MIN_IN_LEAF))
for split_X, split_y, split_name in ((X_train_tf, y_train, 'training set'),
                                     (X_cv_tf, y_cv, 'CV set')):
    classifier_report(gbc, split_X, split_y,
                      'Gradient Boosting Classifier on ' + split_name)

20000  features 300 trees;  0.1 learn_rate;  15 max_dpth;  2 min_in_leaf


### Report for Gradient Boosting Classifier on training set:

##### Confusion RMSE: 0.101

#### Confusion Matrix:

[[4017    0    2    0    0]
 [   1 3964    8    4    3]
 [   8    4 3995   10   16]
 [   1    2    3 3972    4]
 [   0    0    3    0 3983]]


#### Classification Report:

              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00      4019
         2.0       1.00      1.00      1.00      3980
         3.0       1.00      0.99      0.99      4033
         4.0       1.00      1.00      1.00      3982
         5.0       0.99      1.00      1.00      3986

   micro avg       1.00      1.00      1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000



### Report for Gradient Boosting Classifier on CV set:

##### Confusion RMSE: 1.305

#### Confusion Matrix:

[[513 230 123  63  52]
 [224 339 270 121  66]
 [ 94 199 339 216 119]
 [ 62 110 282 326 238]
 [ 45  56 144 233 536]]


#### Classification Report:

              precision    recall  f1-score   support

         1.0       0.55      0.52      0.53       981
         2.0       0.36      0.33      0.35      1020
         3.0       0.29      0.35      0.32       967
         4.0       0.34      0.32      0.33      1018
         5.0       0.53      0.53      0.53      1014

   micro avg       0.41      0.41      0.41      5000
   macro avg       0.41      0.41      0.41      5000
weighted avg       0.41      0.41      0.41      5000



In [14]:
# Toggle: True fits the vectorizer + classifier pipeline on the raw text and caches it;
# False loads a previously pickled pipeline instead.
REFIT_GB_PIPE = False

if REFIT_GB_PIPE:
    gb_pipe = Pipeline([('vect', tfidf), ('gb', gbc)])
    gb_pipe.fit(X_train, y_train)
    gb_pipe_path = (
        'pickles/GBCpipe_balanced_comments_'
        '{}_trees_{}_lr_{}_maxdpth_{}_minleaf_{}_feats_.pkl'
        .format(N_TREES, LEARN_RATE, MAX_DEPTH, MIN_IN_LEAF, MAX_FEATURES)
    )
    with open(gb_pipe_path, 'wb') as pickle_out:
        pickle.dump(gb_pipe, pickle_out)
else:
    # NOTE(review): this file name starts with "GBC_", while the branch above writes
    # "GBCpipe_..." — confirm the cached file really is the pipeline.
    # NOTE: pickle.load can execute arbitrary code — only open trusted files.
    with open("pickles/GBC_balanced_comments_300_trees_0.1_lr_15_maxdpth_2_minleaf_20000_feats_.pkl",
              "rb") as pickle_in:
        gb_pipe = pickle.load(pickle_in)

In [24]:
%reload_ext autoreload
print(MAX_FEATURES, ' features', N_TREES,'trees; ',
      LEARN_RATE,'learn_rate; ', MAX_DEPTH, 'max_dpth; ',
      MIN_IN_LEAF, 'min_in_leaf')
classifier_report(gb_pipe, X_train, y_train,
                  'Gradient Boosting Classifier on training set')
classifier_report(gb_pipe, X_cv, y_cv, 
                  'Gradient Boosting Classifier on CV set')

20000  features 300 trees;  0.1 learn_rate;  15 max_dpth;  2 min_in_leaf


### Report for Gradient Boosting Classifier on training set:

##### Confusion RMSE: 0.101

##### Off diagonal: 0.00

#### Confusion Matrix:

[[4017    0    2    0    0]
 [   1 3964    8    4    3]
 [   8    4 3995   10   16]
 [   1    2    3 3972    4]
 [   0    0    3    0 3983]]


#### Classification Report:

              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00      4019
         2.0       1.00      1.00      1.00      3980
         3.0       1.00      0.99      0.99      4033
         4.0       1.00      1.00      1.00      3982
         5.0       0.99      1.00      1.00      3986

   micro avg       1.00      1.00      1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000



### Report for Gradient Boosting Classifier on CV set:

##### Confusion RMSE: 1.305

##### Off diagonal: 0.59

#### Confusion Matrix:

[[513 230 123  63  52]
 [224 339 270 121  66]
 [ 94 199 339 216 119]
 [ 62 110 282 326 238]
 [ 45  56 144 233 536]]


#### Classification Report:

              precision    recall  f1-score   support

         1.0       0.55      0.52      0.53       981
         2.0       0.36      0.33      0.35      1020
         3.0       0.29      0.35      0.32       967
         4.0       0.34      0.32      0.33      1018
         5.0       0.53      0.53      0.53      1014

   micro avg       0.41      0.41      0.41      5000
   macro avg       0.41      0.41      0.41      5000
weighted avg       0.41      0.41      0.41      5000



### Misclassified CV samples: 5 stars instead of 1 star
Some words are very positive; the negativity is more subtle.

In [16]:
y_predict = gb_pipe.predict(X_cv)

In [17]:
# np.set_printoptions(threshold=sys.maxsize)
# None = show the full review text. A negative width was deprecated in pandas 1.0
# and is no longer accepted by current releases.
pd.set_option('display.max_colwidth', None)
# Boolean mask over the CV set: true rating is 1 star, predicted rating is 5 stars.
true_1_pred_5 = (y_predict == 5) & (y_cv == 1)
print(np.unique(true_1_pred_5, return_counts=True))
# Sanity check: the mask and the CV set must have the same length.
print(len(true_1_pred_5))
print(len(X_cv))
X_cv[true_1_pred_5][:10]

(array([False,  True]), array([4948,   52]))
5000
5000


149193    I did not know one could write, produce and especially direct a film like this one. The accumulation of unbearable (I hope this film does not reflect the average American suburban precollege teen) nonsense is hardly  imaginable. I am really sorry about those people who liked this film and  especially two things:  a. the dichotomy between explicit verbal sex and  the absolute absence of any realistic love scene (I would be very  astonished if every single American teen is negotiating his First Time like  this).  b. the stereotypical image of the east-European girl. Although  Nadia was really amazing (but a bit too solarium burnt and silicon breasted  for being a Check or Slovakian girl (note for the scriptwriter: there is no  Checkoslovakia any more), I wonder why she was the only one who had to get  naked (and no American girl or guy)!  Well that's it. Thanks for reading  and sorry for those who might like the film.                                                             

# Other models in star_rating_modeling2 !!!

## Split comments into separate sentences

In [None]:
from nltk.tokenize import sent_tokenize
# Add a 'sentence' column holding, for each review, the result of sent_tokenize
# applied to its text.
# NOTE(review): `small` is not defined in any cell above — presumably a subsample of
# the reviews DataFrame; restore the cell that creates it.
small['sentence'] = small['reviewText'].map(sent_tokenize)

In [None]:
small.columns

In [None]:
# Drop the columns the sentence-level analysis does not use (later cells rely on
# reviewerID, asin, overall and sentence).
# Note: this mutates `small` in place, so running the cell a second time raises
# KeyError because the columns are already gone.
small.drop(['reviewerName', 'helpful', 'reviewText', 'summary', 
            'unixReviewTime', 'reviewTime'], axis=1, inplace=True)

In [None]:
small.shape

In [None]:
# Reshape from one row per review (holding a list of sentences) to one row per sentence:
#   1. .apply(pd.Series) spreads each sentence list over numbered columns 0, 1, 2, ...
#   2. the merge on the index re-attaches the review's other columns
#   3. melt turns the numbered columns into rows, keeping reviewerID, asin and overall
#   4. dropna discards the NaN padding of reviews with fewer sentences than the longest
sentences = small['sentence'] \
.apply(pd.Series) \
.merge(small, left_index = True, right_index = True) \
.drop(['sentence'], axis = 1) \
.melt(id_vars = ['reviewerID', 'asin','overall'], value_name = 'sentence') \
.drop(['variable'], axis = 1) \
.dropna()

print(sentences.shape)
sentences.head(3)

## Sentence-level prep & cleaning

In [None]:
%reload_ext autoreload
from utils import split_n_lower, not_about_support

### Split into words and lower the case

In [None]:
# Tokenize every sentence with the project helper; pass the function directly
# instead of wrapping it in a lambda.
sentences['words'] = sentences['sentence'].apply(split_n_lower)

In [None]:
print(sentences.shape)
sentences.head(3)

### Keep support-related sentences, as they probably have an impact on the rating

In [None]:
# The support-sentence filter is switched off, so every sentence is kept;
# sentences_on_movie is the same object as sentences, not a copy.
# on_movies_filter = [not_about_support(word) for word in sentences['words']]
sentences_on_movie = sentences #[on_movies_filter]

n_removed = len(sentences) - len(sentences_on_movie)
print('Removing {} records'.format(n_removed))

In [None]:
sentences_on_movie.shape

### Base case (A): reviews with both objective and subjective sentences

In [None]:
# Kernel dies here at 50K samples
all_reviews_groups = sentences_on_movie.groupby(['reviewerID','asin'])


In [None]:
all_reviews_stars = all_reviews_groups['overall'].mean()
all_reviews_stars[:3]

In [None]:
# Merge each review's per-sentence 'words' values back into a single value per review
# (assumes 'words' holds Python lists, so summing concatenates them — TODO confirm).
all_reviews_comments = all_reviews_groups['words'].sum()
print(sentences_on_movie.iloc[0, 4])
print(all_reviews_comments.shape)
# The result is indexed by (reviewerID, asin); use explicit positional access instead
# of a bare [0], which relies on pandas' deprecated integer-fallback lookup.
print(all_reviews_comments.iloc[0])
len(all_reviews_comments)

### Remove objective sentences for case B using obj-subj model

In [None]:
# pd.set_option('display.max_colwidth', -1)
# sentences_on_movie['sentence']
sentences_on_movie.shape

In [None]:
sentences_on_movie.head(2)

#### Vectorize along the word space of the obj-subj training set

In [None]:
# Load the vectorizer saved for the obj-subj model. This rebinds `tfidf`, replacing
# the star-rating vectorizer created earlier in the notebook.
# The context manager closes the handle that the inline open() used to leak.
with open('pickles/Obj-Subj_tfidf.pkl', 'rb') as pickle_in:
    tfidf = pickle.load(pickle_in)
len(tfidf.vocabulary_)

In [None]:
# Keep the tf-idf matrix sparse: .todense() materialises every zero and returns an
# np.matrix, which newer scikit-learn releases reject. Gradient-boosting models accept
# sparse input for predict(); call .toarray() only if a dense ndarray is really needed.
sentences_tfidf = tfidf.transform(sentences_on_movie['sentence'])

In [None]:
sentences_tfidf.shape

#### Apply the obj-subj model

In [None]:
# Hyper-parameters that identify the pickled obj-subj model file.
# Note: these assignments rebind the globals used for the tf-idf star-rating model
# above, and every later cell sees the new values.
N_TREES = 100
LEARN_RATE = 0.1
MIN_IN_LEAF = 10
obj_subj_model_path = ('pickles/GBC_{}_{}_{}_20min.pkl'
                       .format(N_TREES, LEARN_RATE, MIN_IN_LEAF))
# Context manager so the file handle is closed after loading.
with open(obj_subj_model_path, 'rb') as pickle_in:
    gb_model = pickle.load(pickle_in)

In [None]:
# Predicted class for every sentence. Despite its name, y_test holds model
# predictions, not held-out labels; the following cell keeps the rows predicted as 1.
y_test = gb_model.predict(sentences_tfidf)
len(y_test)

In [None]:
subjective_sentences = sentences_on_movie[y_test == 1]

In [None]:
display(Markdown('### Removing {} objective sentences'
                 .format(len(y_test) - len(subjective_sentences))))

In [None]:
subjective_sentences.head(2)

#### Merge the sentences back into paragraph reviews

In [None]:
# Regroup the sentences kept as subjective by review (reviewerID, asin) and take the
# mean of 'overall' per group as that review's star rating (presumably all sentences
# of one review carry the same 'overall' value — verify).
subj_groups = subjective_sentences.groupby(['reviewerID','asin'])
subj_reviews_stars = subj_groups['overall'].mean()
subj_reviews_stars[:3]

In [None]:
# Merge the per-sentence 'words' values of each review's subjective sentences.
subj_review_comments = subj_groups['words'].sum()
print(subj_review_comments.shape)
# Indexed by (reviewerID, asin): use .iloc for positional access rather than a bare
# [0], which relies on pandas' deprecated integer-fallback lookup.
print(subj_review_comments.iloc[0])
subj_review_comments[:3]

### Check that stars still correspond to the right movie

In [None]:
start = 6000
end = 6010
all_reviews_comments.loc[('A33Z7JTV7SSW9Y', '0718000315')]


In [None]:
# Compare the aggregated rating with the sentence rows of the very same review.
# Both key parts are matched: the asin condition had been commented out because
# `and` cannot combine boolean Series — element-wise `&` is required.
review_key = ('A33Z7JTV7SSW9Y', '0718000315')
print(all_reviews_stars.loc[review_key])
same_review = ((sentences_on_movie['reviewerID'] == review_key[0])
               & (sentences_on_movie['asin'] == review_key[1]))
print(sentences_on_movie.loc[same_review])

In [None]:
# None = do not truncate column text. A negative width was deprecated in pandas 1.0
# and is no longer accepted by current releases.
pd.options.display.max_colwidth = None
print(small.loc[small['reviewerID']=='A33Z7JTV7SSW9Y'])

In [None]:
sentences_on_movie[start:end]

## Create emotion vectors

In [None]:
print('Total number of reviews:', all_reviews_comments.shape[0])
print('Total number of subjective reviews:', subj_review_comments.shape[0])

In [None]:
%reload_ext autoreload
from emotions_seven import Emotions7
emote = Emotions7()

In [None]:
all_reviews_emotions = emote.vectorize(all_reviews_comments)
print(all_reviews_emotions.shape)

In [None]:
# emote.emotions_in_text

In [None]:
all_reviews_emotions.shape

In [None]:
# all_revs_with_emotions = all_reviews_emotions[emote.emotions_in_text == True]

In [None]:
# print(all_revs_with_emotions.shape)
# all_revs_stars = all_reviews_stars[emote.emotions_in_text]
all_reviews_emotions[0]

In [None]:
subj_reviews_emotions = emote.vectorize(subj_review_comments)
print(subj_reviews_emotions.shape)
subj_reviews_emotions[0]

## Fit models for star rating prediction: base case (all comments) vs. subjective-only

In [None]:
X_subj_train, X_subj_cv, y_subj_train, y_subj_cv = train_test_split(
    subj_reviews_emotions, subj_reviews_stars, test_size=0.2, random_state=0)
X_subj_train.shape

In [None]:
# Classifier for the subjective-only case, trained on the emotion vectors.
# Note: N_TREES, LEARN_RATE and MIN_IN_LEAF were re-assigned in the obj-subj model
# cell (100 / 0.1 / 10); when the notebook runs top to bottom those values are used
# here, not the ones of the tf-idf star-rating model.
gbc_subj = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
                                n_estimators=N_TREES, 
                                min_samples_leaf=MIN_IN_LEAF,
                                random_state=0)
gbc_subj.fit(X_subj_train, y_subj_train)

In [None]:
# Base case (A): emotion vectors of the complete reviews.
# No earlier cell creates `gbc_all` or a numeric split of the base-case features
# (X_train/y_train still hold the raw-text split used for tf-idf), so build both here
# with the same split settings and hyper-parameters as the subjective-only model.
# Note: this rebinds X_train, X_cv, y_train and y_cv.
X_train, X_cv, y_train, y_cv = train_test_split(
    all_reviews_emotions, all_reviews_stars, test_size=0.2, random_state=0)
gbc_all = GradientBoostingClassifier(learning_rate=LEARN_RATE,
                                     n_estimators=N_TREES,
                                     min_samples_leaf=MIN_IN_LEAF,
                                     random_state=0)
gbc_all.fit(X_train, y_train)

# Accuracy of both cases side by side — the comparison the hypothesis is about.
print('Gradient Boosting Classifier')
print('Training score using all comments: {0:.2f}'
      .format(gbc_all.score(X_train, y_train)))
print('CV score using all comments: {0:.2f}'
      .format(gbc_all.score(X_cv, y_cv)))
print('')

print('Training score using subjective comments only: {0:.2f}'
      .format(gbc_subj.score(X_subj_train, y_subj_train)))
print('CV score using subjective comments only: {0:.2f}'
      .format(gbc_subj.score(X_subj_cv, y_subj_cv)))


## Other techniques

In [None]:
# from sklearn.linear_model import LogisticRegression
# lr = LogisticRegression(random_state=0, solver='lbfgs',
#                        multi_class='multinomial',max_iter=1000)
# lr.fit(X_subj_train, y_subj_train)
# print(lr.score(X_subj_train, y_subj_train))
# print(lr.score(X_subj_cv, y_subj_cv))

In [None]:
# Ordinary least squares of the star rating on the base-case features.
# NOTE(review): sm.OLS fits no intercept unless a constant column is added
# (sm.add_constant) — confirm that a regression through the origin is intended.
# NOTE(review): the only visible assignment of X_train / y_train is the raw-text split
# made for tf-idf; OLS needs a numeric split (presumably of all_reviews_emotions) —
# TODO confirm which cell is supposed to create it.
ols_all = sm.OLS(y_train, X_train)
results_all = ols_all.fit()
results_all.summary()

In [None]:
# Ordinary least squares of the star rating on the subjective-only emotion vectors.
# NOTE(review): sm.OLS fits no intercept unless a constant column is added
# (sm.add_constant) — confirm that a regression through the origin is intended.
ols_subj = sm.OLS(y_subj_train, X_subj_train)
results_subj = ols_subj.fit()
results_subj.summary()

In [None]:
import seaborn as sns

# `raw_df` was never defined and the bare tuple expression did nothing; build the
# frame the heat map needs from the base-case emotion vectors plus the star rating.
# assumes all_reviews_emotions is a 2-D array-like with one row per review, in the
# same order as all_reviews_stars — TODO confirm against Emotions7.vectorize
emotions_stars_df = pd.DataFrame(np.asarray(all_reviews_emotions))
emotions_stars_df['stars'] = np.asarray(all_reviews_stars)

# Pairwise correlations between the emotion dimensions and the rating;
# the trailing semicolon hides the Axes repr.
sns.heatmap(emotions_stars_df.corr(), annot=True);