# Hypothesis to test:
### Removing objective sentences from reviews helps predict star rating from reviews

In [124]:
import numpy as np
import pandas as pd
import pickle
import gzip
import math
import random
from IPython.display import Markdown, display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import statsmodels.api as sm

from utils import rmse, rmse_train_cv, classifier_report

# Avoid restarting Kernel
%load_ext autoreload
%autoreload 2

# %autosave 50

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Configuration

In [53]:
# Subsampling from Amazon reviews
NB_SAMPLES = 20000  # up to 200k, then change the input file

# Gradient Boosting Classifier parameters
N_TREES = math.floor(np.sqrt(NB_SAMPLES) * 1.2)
LEARN_RATE = 0.01
MIN_IN_LEAF = 3 #7
N_TREES

169

## Get user comments

In [3]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path, trunc=0):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1        
    if trunc > 0 and i > trunc: 
        break
  return pd.DataFrame.from_dict(df, orient='index')

In [4]:
data_path = '../../datasets/'
file_name = 'reviews_Movies_and_TV.json.gz'

In [5]:
# comments_df = getDF(data_path + file_name, 200000)
# comments_df.loc[0,'reviewText']
# print(comments_df.shape)
# comments_df.head(1)

In [6]:
# Save the records
# import pickle
# pickle_out = open(data_path + "amzn_200k.pickle","wb")
# pickle.dump(comments_df, pickle_out)
# pickle_out.close()

In [7]:
# Reads records
import pickle
pickle_in = open(data_path + "amzn_200k.pickle","rb")
comments_df = pickle.load(pickle_in)

## Subsample

In [96]:
# small = comments_df.loc[:NB_SAMPLES, :]
# print(len(small.index))
# small.head(3)

In [None]:
pickle_in = open('pickles/five_k_balanced_reviews.pkl',"rb")
balanced_reviews = pickle.load(pickle_in)
pickle_in.close;

In [103]:
# balanced_reviews.iloc[random.sample(range(25000), 15), :]

## Star rating for base case, based on tf-idf

In [104]:
all_tfidf_vectorizer = TfidfVectorizer(max_df=0.98) # at 0.98 or 0.99, same result as 1: 
all_comments_tfidf = all_tfidf_vectorizer.fit_transform(balanced_reviews['reviewText'])
vocab = all_tfidf_vectorizer.vocabulary_
print(len(vocab))

63029


## Star rating modeling for base case, based on tf-idf

In [126]:
X_train, X_cv, y_train, y_cv = train_test_split(
    all_comments_tfidf, balanced_reviews['overall'], test_size=0.2, random_state=0)
X_train.shape

(20000, 63029)

### Gradient Boosting Classifier

In [138]:
# gbc_all = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
#                                 n_estimators=N_TREES, 
#                                 min_samples_leaf=MIN_IN_LEAF,
#                                 random_state=0)
# gbc_all.fit(X_train, y_train)

In [82]:
np.unique(gbc_all.predict(X_train))

array([1., 2., 3., 4., 5.])

In [None]:
gbc_pred = gbc_all.predict(X_cv)
print(classification_report(y_cv, gbc_pred))
confusion_matrix(y_cv, gbc_pred)

In [84]:
pickle.dump(gbc_all, open('pickles/GBC_all_comments'+ str(N_TREES) +'_' + str(LEARN_RATE) 
                        +'_' + str(MIN_IN_LEAF) + '.pkl', 'wb'))

In [89]:
# ols_all = sm.OLS(y_train, X_train)
# results_subj = ols_all.fit()
# results_subj.summary()

### Logistic regressor

In [108]:
lr = LogisticRegression(random_state=0, solver='lbfgs',
                       multi_class='multinomial',max_iter=5000)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=5000, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [132]:
%reload_ext autoreload
classifier_report(lr, X_train, y_train, 'training set')

### Report for LogisticRegression on training set:

#### Classification Report:

              precision    recall  f1-score   support

         1.0       0.76      0.82      0.79      4019
         2.0       0.75      0.72      0.74      3980
         3.0       0.72      0.70      0.71      4033
         4.0       0.74      0.69      0.71      3982
         5.0       0.77      0.82      0.79      3986

   micro avg       0.75      0.75      0.75     20000
   macro avg       0.75      0.75      0.75     20000
weighted avg       0.75      0.75      0.75     20000



#### Confusion Matrix:

[[3296  381  172   83   87]
 [ 500 2882  348  128  122]
 [ 292  321 2806  368  246]
 [ 139  168  397 2744  534]
 [ 100   71  170  387 3258]]


In [133]:
classifier_report(lr, X_cv, y_cv, 'CV set')
# lr_pred = lr.predict(X_cv)
# print(classification_report(y_cv, lr_pred))
# confusion_matrix(y_cv, lr_pred)

### Report for LogisticRegression on CV set:

#### Classification Report:

              precision    recall  f1-score   support

         1.0       0.58      0.62      0.60       981
         2.0       0.43      0.41      0.42      1020
         3.0       0.36      0.37      0.37       967
         4.0       0.41      0.39      0.40      1018
         5.0       0.60      0.60      0.60      1014

   micro avg       0.48      0.48      0.48      5000
   macro avg       0.48      0.48      0.48      5000
weighted avg       0.48      0.48      0.48      5000



#### Confusion Matrix:

[[610 235  69  30  37]
 [243 422 239  76  40]
 [ 93 209 358 220  87]
 [ 56  84 244 393 241]
 [ 53  24  82 242 613]]


In [134]:
lr_pipe = Pipeline([('vect', TfidfVectorizer(lowercase=True, stop_words=None)),
                ('lr', lr)])
lr_pipe.fit(balanced_reviews['reviewText'], balanced_reviews['overall'])

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
..., penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [137]:
%reload_ext autoreload
classifier_report(lr_pipe, balanced_reviews['reviewText'], 
                  balanced_reviews['overall'], 'training set')

### Report for Pipeline on training set:

#### Classification Report:

              precision    recall  f1-score   support

         1.0       0.75      0.81      0.78      5000
         2.0       0.73      0.71      0.72      5000
         3.0       0.71      0.67      0.69      5000
         4.0       0.71      0.68      0.69      5000
         5.0       0.76      0.81      0.78      5000

   micro avg       0.73      0.73      0.73     25000
   macro avg       0.73      0.73      0.73     25000
weighted avg       0.73      0.73      0.73     25000



#### Confusion Matrix:

[[4053  522  200  109  116]
 [ 666 3526  468  192  148]
 [ 363  464 3342  520  311]
 [ 167  234  513 3383  703]
 [ 127   89  196  541 4047]]


### Stochastic gradient descent classifier

In [120]:

sgd= SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, 
                                        random_state=0, max_iter=500, tol=None)
sgd.fit(X_train, y_train)

y_cv_pred = sgd.predict(X_cv)

In [119]:
print(classification_report(y_cv, y_cv_pred))
print(confusion_matrix(y_cv,y_cv_pred))

              precision    recall  f1-score   support

         1.0       0.50      0.76      0.60       981
         2.0       0.42      0.22      0.29      1020
         3.0       0.39      0.31      0.34       967
         4.0       0.40      0.25      0.31      1018
         5.0       0.51      0.77      0.61      1014

   micro avg       0.46      0.46      0.46      5000
   macro avg       0.44      0.46      0.43      5000
weighted avg       0.44      0.46      0.43      5000

[[749  90  57  26  59]
 [430 229 191  80  90]
 [162 141 295 181 188]
 [ 97  70 175 257 419]
 [ 70  14  48 101 781]]


### Gradient boosting regressor

In [58]:
gbr_all = GradientBoostingRegressor(learning_rate=LEARN_RATE, 
                                n_estimators=N_TREES, 
                                min_samples_leaf=MIN_IN_LEAF,
                                random_state=0)
gbr_all.fit(X_train, y_train)
print('Gradient Boosting Regressor')
print('Training score using all comments: {0:.2f}'
      .format(gbr_all.score(X_train, y_train)))
print('CV score using all comments: {0:.2f}'
      .format(gbr_all.score(X_cv, y_cv)))

Gradient Boosting Regressor
Training score using all comments: 0.16
CV score using all comments: 0.12


## Split comments into separate sentences

In [9]:
from nltk.tokenize import sent_tokenize
small['sentence'] = small['reviewText'].map(sent_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
small.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime', 'sentence'],
      dtype='object')

In [11]:
small.drop(['reviewerName', 'helpful', 'reviewText', 'summary', 
            'unixReviewTime', 'reviewTime'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [12]:
small.shape

(2001, 4)

In [13]:
sentences = small['sentence'] \
.apply(pd.Series) \
.merge(small, left_index = True, right_index = True) \
.drop(['sentence'], axis = 1) \
.melt(id_vars = ['reviewerID', 'asin','overall'], value_name = 'sentence') \
.drop(['variable'], axis = 1) \
.dropna()

print(sentences.shape)
sentences.head(3)

(11115, 4)


Unnamed: 0,reviewerID,asin,overall,sentence
0,A3R5OBKS7OM2IR,143502,5.0,This has some great tips as always and is help...
1,A3R5OBKS7OM2IR,143529,5.0,This is a great pastry guide.
2,AH3QC2PC1VTGP,143561,2.0,I have to admit that I am a fan of Giada's coo...


## Sentence-level prep & cleaning

In [14]:
%reload_ext autoreload
from utils import split_n_lower, not_about_support

### Split into words and lower the case

In [15]:
sentences['words'] = sentences['sentence'].apply(lambda s: split_n_lower(s))

In [16]:
print(sentences.shape)
sentences.head(3)

(11115, 5)


Unnamed: 0,reviewerID,asin,overall,sentence,words
0,A3R5OBKS7OM2IR,143502,5.0,This has some great tips as always and is help...,"[this, has, some, great, tips, as, always, and..."
1,A3R5OBKS7OM2IR,143529,5.0,This is a great pastry guide.,"[this, is, a, great, pastry, guide, .]"
2,AH3QC2PC1VTGP,143561,2.0,I have to admit that I am a fan of Giada's coo...,"[i, have, to, admit, that, i, am, a, fan, of, ..."


### Keep support-related sentences as they probably have impact on rating

In [17]:
# on_movies_filter = [not_about_support(word) for word in sentences['words']]
sentences_on_movie = sentences #[on_movies_filter]

print('Removing {} records'.format(sentences.shape[0]- sentences_on_movie.shape[0]))

Removing 0 records


In [18]:
sentences_on_movie.shape

(11115, 5)

### Base case: A reviews with objective and subjective sentences

In [19]:
# Kernel dies here at 50K samples
all_reviews_groups = sentences_on_movie.groupby(['reviewerID','asin'])


In [20]:
all_reviews_stars = all_reviews_groups['overall'].mean()
all_reviews_stars[:3]

reviewerID      asin      
A100CQXJ6D44T9  0005119367    5.0
A100Z2S0880G9A  0005019281    5.0
A102Z4PIK7CYD8  030714139X    5.0
Name: overall, dtype: float64

In [21]:
all_reviews_comments = all_reviews_groups['words'].sum()
print(sentences_on_movie.iloc[0, 4])
print(all_reviews_comments.shape)
print(all_reviews_comments[0])
len(all_reviews_comments)

['this', 'has', 'some', 'great', 'tips', 'as', 'always', 'and', 'is', 'helping', 'me', 'to', 'complete', 'my', 'good', 'eats', 'collection', '.']
(2000,)
['awesome', 'movie', 'and', 'great', 'story', '.', 'highly', 'recommend', 'this', 'movie', 'and', 'follows', 'closely', 'to', 'biblical', 'accounts', '.', 'the', 'message', 'is', 'strong', 'about', 'forgiveness', 'and', 'faithfulness', '.']


2000

### Remove objective sentences for case B using obj-subj model

In [22]:
# pd.set_option('display.max_colwidth', -1)
# sentences_on_movie['sentence']
sentences_on_movie.shape

(11115, 5)

In [23]:
sentences_on_movie.head(2)

Unnamed: 0,reviewerID,asin,overall,sentence,words
0,A3R5OBKS7OM2IR,143502,5.0,This has some great tips as always and is help...,"[this, has, some, great, tips, as, always, and..."
1,A3R5OBKS7OM2IR,143529,5.0,This is a great pastry guide.,"[this, is, a, great, pastry, guide, .]"


#### Vectorize along the word space of the obj-subj training set

In [25]:
tfidf = pickle.load(open('pickles/Obj-Subj_tfidf.pkl', 'rb'))
len(tfidf.vocabulary_)

20893

In [35]:
sentences_tfidf = tfidf.transform(sentences_on_movie['sentence']).todense()

In [36]:
sentences_tfidf.shape

(11115, 20893)

#### Apply the obj-subj model

In [29]:
N_TREES = 100
LEARN_RATE = 0.1
MIN_IN_LEAF = 10
pickle_in = open('pickles/GBC_'+ str(N_TREES) +'_' + str(LEARN_RATE) 
                        +'_' + str(MIN_IN_LEAF) + '_20min.pkl', 'rb')
gb_model = pickle.load(pickle_in)

In [30]:
y_test = gb_model.predict(sentences_tfidf)
len(y_test)

11115

In [None]:
subjective_sentences = sentences_on_movie[y_test == 1]

In [34]:
display(Markdown('### Removing {} objective sentences'
                 .format(len(y_test) - len(subjective_sentences))))

### Removing 3340 objective sentences

In [None]:
subjective_sentences.head(2)

#### Merge the sentences back into paragraph reviews

In [None]:
subj_groups = subjective_sentences.groupby(['reviewerID','asin'])
subj_reviews_stars = subj_groups['overall'].mean()
# subjective_reviewssubjective_reviews['sentence'].apply(lambda x: x.sum())
# subjective_reviews_reviews = 
subj_reviews_stars[:3]

In [None]:
subj_review_comments = subj_groups['words'].sum()
print(subj_review_comments.shape)
print(subj_review_comments[0])
subj_review_comments[:3]

### Check that stars still correspond to the right movie

In [None]:
start = 6000
end = 6010
all_reviews_comments.loc[('A33Z7JTV7SSW9Y', '0718000315')]


In [None]:
print(all_reviews_stars.loc[('A33Z7JTV7SSW9Y', '0718000315')])
print(sentences_on_movie.loc[sentences_on_movie['reviewerID']=='A33Z7JTV7SSW9Y']) 
# and sentences_on_movie['asin']=='0718000315'])

In [None]:
pd.options.display.max_colwidth = -1
print(small.loc[small['reviewerID']=='A33Z7JTV7SSW9Y'])

In [None]:
sentences_on_movie[start:end]

## Create emotion vectors

In [None]:
print('Total number of reviews:', all_reviews_comments.shape[0])
print('Total number of subjective reviews:', subj_review_comments.shape[0])

In [None]:
%reload_ext autoreload
from emotions_seven import Emotions7
emote = Emotions7()

In [None]:
all_reviews_emotions = emote.vectorize(all_reviews_comments)
print(all_reviews_emotions.shape)

In [None]:
# emote.emotions_in_text

In [None]:
all_reviews_emotions.shape

In [None]:
# all_revs_with_emotions = all_reviews_emotions[emote.emotions_in_text == True]

In [None]:
# print(all_revs_with_emotions.shape)
# all_revs_stars = all_reviews_stars[emote.emotions_in_text]
all_reviews_emotions[0]

In [None]:
subj_reviews_emotions = emote.vectorize(subj_review_comments)
print(subj_reviews_emotions.shape)
subj_reviews_emotions[0]

## Fit a model on base case (all comments) for star rating prediction

In [None]:
X_subj_train, X_subj_cv, y_subj_train, y_subj_cv = train_test_split(
    subj_reviews_emotions, subj_reviews_stars, test_size=0.2, random_state=0)
X_subj_train.shape

In [None]:
gbc_subj = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
                                n_estimators=N_TREES, 
                                min_samples_leaf=MIN_IN_LEAF,
                                random_state=0)
gbc_subj.fit(X_subj_train, y_subj_train)

In [None]:
print('Gradient Boosting Classifier')
print('Training score using all comments: {0:.2f}'
      .format(gbc_all.score(X_train, y_train)))
print('CV score using all comments: {0:.2f}'
      .format(gbc_all.score(X_cv, y_cv)))
print('')

# print('Training score using subjective comments only: {0:.2f}'
#       .format(gbc_subj.score(X_subj_train, y_subj_train)))
# print('CV score using subjective comments only: {0:.2f}'
#       .format(gbc_subj.score(X_subj_cv, y_subj_cv)))


## Other techniques

In [None]:
# from sklearn.linear_model import LogisticRegression
# lr = LogisticRegression(random_state=0, solver='lbfgs',
#                        multi_class='multinomial',max_iter=1000)
# lr.fit(X_subj_train, y_subj_train)
# print(lr.score(X_subj_train, y_subj_train))
# print(lr.score(X_subj_cv, y_subj_cv))

In [None]:
ols_all = sm.OLS(y_train, X_train)
results_all = ols_all.fit()
results_all.summary()

In [None]:
ols_subj = sm.OLS(y_subj_train, X_subj_train)
results_subj = ols_subj.fit()
results_subj.summary()

In [None]:
import seaborn as sns

all_reviews_emotions, all_reviews_stars

sns.heatmap(raw_df.corr(), annot=True)