In [1]:
import numpy as np
import pandas as pd
import pickle
import gzip
import math

# Avoid restarting Kernel
%load_ext autoreload
%autoreload 2

# %autosave 50

## Configuration

In [63]:
# Subsampling
NB_SAMPLES = 20000  # up to 200k, then change the input file

# Gradient Boosting Classifier parameters
N_TREES = 200 #math.floor(np.sqrt(NB_SAMPLES))
LEARN_RATE = 0.01
MIN_IN_LEAF = 10
N_TREES

200

## Get user comments

In [3]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path, trunc=0):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1        
    if trunc > 0 and i > trunc: 
        break
  return pd.DataFrame.from_dict(df, orient='index')

In [4]:
data_path = '../../datasets/'
file_name = 'reviews_Movies_and_TV.json.gz'

In [5]:
# comments_df = getDF(data_path + file_name, 200000)
# comments_df.loc[0,'reviewText']
# print(comments_df.shape)
# comments_df.head(1)

In [6]:
# Save the records
# import pickle
# pickle_out = open(data_path + "amzn_200k.pickle","wb")
# pickle.dump(comments_df, pickle_out)
# pickle_out.close()

In [7]:
# Reads records
import pickle
pickle_in = open(data_path + "amzn_200k.pickle","rb")
comments_df = pickle.load(pickle_in)

## Split comments into separate sentences

In [8]:
small = comments_df.loc[:NB_SAMPLES, :]
len(small.index)

20001

In [9]:
from nltk.tokenize import sent_tokenize
small['sentence'] = small['reviewText'].map(sent_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
small.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime', 'sentence'],
      dtype='object')

In [11]:
small.drop(['reviewerName', 'helpful', 'reviewText', 'summary', 
            'unixReviewTime', 'reviewTime'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [12]:
small.shape

(20001, 4)

In [13]:
sentences = small['sentence'] \
.apply(pd.Series) \
.merge(small, left_index = True, right_index = True) \
.drop(['sentence'], axis = 1) \
.melt(id_vars = ['reviewerID', 'asin','overall'], value_name = 'sentence') \
.drop(['variable'], axis = 1) \
.dropna()

print(sentences.shape)
sentences.head(3)

(118281, 4)


Unnamed: 0,reviewerID,asin,overall,sentence
0,A3R5OBKS7OM2IR,143502,5.0,This has some great tips as always and is help...
1,A3R5OBKS7OM2IR,143529,5.0,This is a great pastry guide.
2,AH3QC2PC1VTGP,143561,2.0,I have to admit that I am a fan of Giada's coo...


## Sentence-level prep & cleaning

In [14]:
%reload_ext autoreload
from utils import split_n_lower, not_about_support

### Split into words and lower the case

In [15]:
sentences['words'] = sentences['sentence'].apply(lambda s: split_n_lower(s))

In [16]:
print(sentences.shape)
sentences.head(3)

(118281, 5)


Unnamed: 0,reviewerID,asin,overall,sentence,words
0,A3R5OBKS7OM2IR,143502,5.0,This has some great tips as always and is help...,"[this, has, some, great, tips, as, always, and..."
1,A3R5OBKS7OM2IR,143529,5.0,This is a great pastry guide.,"[this, is, a, great, pastry, guide, .]"
2,AH3QC2PC1VTGP,143561,2.0,I have to admit that I am a fan of Giada's coo...,"[i, have, to, admit, that, i, am, a, fan, of, ..."


### Remove support-related sentences

In [17]:
on_movies_filter = [not_about_support(word) for word in sentences['words']]
sentences_on_movie = sentences[on_movies_filter]

print('Removing {} records'.format(sentences.shape[0]- sentences_on_movie.shape[0]))

Removing 7257 records


In [18]:
sentences_on_movie.shape

(111024, 5)

### Base case: A reviews with objective and subjective sentences

In [19]:
# Kernel dies here at 50K samples
all_reviews_groups = sentences_on_movie.groupby(['reviewerID','asin'])


In [20]:
all_reviews_stars = all_reviews_groups['overall'].mean()
all_reviews_stars[:3]

reviewerID             asin      
A00295401U6S2UG3RAQSZ  0767015533    4.0
A0047322388NOTO4N8SKD  0310274281    5.0
A00473363TJ8YSZ3YAGG9  0310263662    4.0
Name: overall, dtype: float64

In [21]:
all_reviews_comments = all_reviews_groups['words'].sum()
print(sentences_on_movie.iloc[0, 4])
print(all_reviews_comments.shape)
print(all_reviews_comments[0])
len(all_reviews_comments)

['this', 'has', 'some', 'great', 'tips', 'as', 'always', 'and', 'is', 'helping', 'me', 'to', 'complete', 'my', 'good', 'eats', 'collection', '.']
(19632,)
['item', 'delivered', 'on', 'time', 'and', 'well', 'packaged', ',', 'slip', 'case', 'a', 'bit', 'worn', '.', 'essential', 'diana', 'rigg', 'episodes', 'from', '1965', 'in', 'b', '&', 'w', '.', 'volumes', '3', '&', '4', '.', 'vol', '3', ':', 'the', 'murder', 'market', ',', 'a', 'surfeit', 'of', 'h2o', ',', 'the', 'hour', 'that', 'never', 'was', '.', 'vol', '4', ':', 'dial', 'a', 'deadly', 'number', ',', 'man-eater', 'of', 'surrey', 'green', ',', 'two', "'s", 'a', 'crowd', ',', 'and', 'bonus', 'episode', 'too', 'many', 'christmas', 'trees', '.']


19632

### Remove objective sentences for case B using obj-subj model

In [22]:
# pd.set_option('display.max_colwidth', -1)
# sentences_on_movie['sentence']
sentences_on_movie.shape

(111024, 5)

In [23]:
sentences_on_movie.head(2)

Unnamed: 0,reviewerID,asin,overall,sentence,words
0,A3R5OBKS7OM2IR,143502,5.0,This has some great tips as always and is help...,"[this, has, some, great, tips, as, always, and..."
1,A3R5OBKS7OM2IR,143529,5.0,This is a great pastry guide.,"[this, is, a, great, pastry, guide, .]"


#### Vectorize along the word space of the obj-subj training set

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = pickle.load(open('Obj-Subj_tfidf.pkl', 'rb'))
len(tfidf.vocabulary_)

20893

In [25]:
mat = tfidf.transform(sentences_on_movie['sentence']).todense()

In [26]:
mat

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

#### Apply the obj-subj model

In [27]:
N_TREES = 100
LEARN_RATE = 0.1
MIN_IN_LEAF = 10
pickle_in = open('GBC_'+ str(N_TREES) +'_' + str(LEARN_RATE) 
                        +'_' + str(MIN_IN_LEAF) + '_20min.pkl', 'rb')
gb_model = pickle.load(pickle_in)

In [28]:
y_test = gb_model.predict(mat)
len(y_test)

111024

In [29]:
subjective_sentences = sentences_on_movie[y_test == 1]

In [30]:
subjective_sentences.head(2)

Unnamed: 0,reviewerID,asin,overall,sentence,words
0,A3R5OBKS7OM2IR,143502,5.0,This has some great tips as always and is help...,"[this, has, some, great, tips, as, always, and..."
1,A3R5OBKS7OM2IR,143529,5.0,This is a great pastry guide.,"[this, is, a, great, pastry, guide, .]"


#### Merge the sentences back into paragraph reviews

In [31]:
subj_groups = subjective_sentences.groupby(['reviewerID','asin'])
subj_reviews_stars = subj_groups['overall'].mean()
# subjective_reviewssubjective_reviews['sentence'].apply(lambda x: x.sum())
# subjective_reviews_reviews = 
subj_reviews_stars[:3]

reviewerID             asin      
A00295401U6S2UG3RAQSZ  0767015533    4.0
A00473363TJ8YSZ3YAGG9  0310263662    4.0
A017699216H6YAFBGYJOW  0740328271    5.0
Name: overall, dtype: float64

In [32]:
subj_review_comments = subj_groups['words'].sum()
print(subj_review_comments.shape)
print(subj_review_comments[0])
subj_review_comments[:3]

(18797,)
['item', 'delivered', 'on', 'time', 'and', 'well', 'packaged', ',', 'slip', 'case', 'a', 'bit', 'worn', '.', 'volumes', '3', '&', '4', '.', 'vol', '3', ':', 'the', 'murder', 'market', ',', 'a', 'surfeit', 'of', 'h2o', ',', 'the', 'hour', 'that', 'never', 'was', '.', 'vol', '4', ':', 'dial', 'a', 'deadly', 'number', ',', 'man-eater', 'of', 'surrey', 'green', ',', 'two', "'s", 'a', 'crowd', ',', 'and', 'bonus', 'episode', 'too', 'many', 'christmas', 'trees', '.']


reviewerID             asin      
A00295401U6S2UG3RAQSZ  0767015533    [item, delivered, on, time, and, well, package...
A00473363TJ8YSZ3YAGG9  0310263662    [good, movie, showing, the, &, #, 34, ;, passi...
A017699216H6YAFBGYJOW  0740328271    [it, was, put, in, lay, terms, for, everyone, ...
Name: words, dtype: object

### Check that stars still correspond to the right movie

In [82]:
start = 6000
end = 6010
all_reviews_comments.loc[('A33Z7JTV7SSW9Y', '0718000315')]


['one',
 'of',
 'the',
 'first',
 'extended',
 'modern',
 'presentations',
 'of',
 'the',
 'life',
 'of',
 'jesus',
 'was',
 '&',
 'quot',
 ';',
 'jesus',
 'of',
 'nazareth',
 '&',
 'quot',
 ';',
 'starring',
 'robert',
 'powell',
 '.',
 'this',
 'one',
 'is',
 'the',
 'standard',
 'and',
 'measure',
 'of',
 'all',
 'other',
 'productions',
 'on',
 'the',
 'same',
 'subject',
 '.',
 'bruce',
 'marchiano',
 ',',
 'although',
 'very',
 'sincere',
 'in',
 'his',
 'efforts',
 'to',
 'portray',
 'jesus',
 ',',
 'just',
 'does',
 "n't",
 'have',
 'the',
 '&',
 'quot',
 ';',
 'gravitas',
 '&',
 'quot',
 ';',
 'of',
 'a',
 'robert',
 'powell.the',
 'one',
 'point',
 'in',
 'the',
 'visual',
 'bible',
 "'s",
 'favor',
 'is',
 'it',
 "'s",
 'dedication',
 'to',
 'following',
 'the',
 'gospel',
 'of',
 'st.',
 'matthew',
 'without',
 'any',
 'deviation',
 '.',
 '&',
 'quot',
 ';',
 'jesus',
 'of',
 'nazareth',
 '&',
 'quot',
 ';',
 'on',
 'the',
 'other',
 'hand',
 ',',
 'has',
 'the',
 'advantag

In [88]:
print(all_reviews_stars.loc[('A33Z7JTV7SSW9Y', '0718000315')])
print(sentences_on_movie.loc[sentences_on_movie['reviewerID']=='A33Z7JTV7SSW9Y']) 
# and sentences_on_movie['asin']=='0718000315'])

3.0
           reviewerID        asin  overall  \
6755   A33Z7JTV7SSW9Y  0718000315      3.0   
26756  A33Z7JTV7SSW9Y  0718000315      3.0   
46757  A33Z7JTV7SSW9Y  0718000315      3.0   
66758  A33Z7JTV7SSW9Y  0718000315      3.0   
86759  A33Z7JTV7SSW9Y  0718000315      3.0   

                                                sentence  \
6755   One of the first extended modern presentations...   
26756  This one is the standard and measure of all ot...   
46757  Bruce Marchiano, although very sincere in his ...   
66758  &quot;Jesus of Nazareth&quot; on the other han...   
86759  &quot;The Visaul Bible's Matthew&quot; comes a...   

                                                   words  
6755   [one, of, the, first, extended, modern, presen...  
26756  [this, one, is, the, standard, and, measure, o...  
46757  [bruce, marchiano, ,, although, very, sincere,...  
66758  [&, quot, ;, jesus, of, nazareth, &, quot, ;, ...  
86759  [&, quot, ;, the, visaul, bible, 's, matthew, ...  


In [94]:
pd.options.display.max_colwidth = -1
print(small.loc[small['reviewerID']=='A33Z7JTV7SSW9Y'])

          reviewerID        asin  overall  \
6755  A33Z7JTV7SSW9Y  0718000315  3.0       

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              sentence  
6755  [One of the first extended modern presentations of the life of Jesus was &quot;Jesus of Nazareth&quot; starring Robert Powell., This one is th

In [78]:
sentences_on_movie[start:end]

Unnamed: 0,reviewerID,asin,overall,sentence,words
6750,ABFSJM48TAT9C,718000315,5.0,"If you find ""The Greatest Story Ever Told"" bel...","[if, you, find, ``, the, greatest, story, ever..."
6751,A322DG05X2P9S6,718000315,5.0,I can't think of anything that makes a better ...,"[i, ca, n't, think, of, anything, that, makes,..."
6752,A3ETTZ88C79S3I,718000315,5.0,"I have seen all the movies and films, &quot;Je...","[i, have, seen, all, the, movies, and, films, ..."
6753,A1UESZE25C420N,718000315,5.0,I have found the Visual Gospel of Matthew an e...,"[i, have, found, the, visual, gospel, of, matt..."
6755,A33Z7JTV7SSW9Y,718000315,3.0,One of the first extended modern presentations...,"[one, of, the, first, extended, modern, presen..."
6756,A2GE5C1WR3MACR,718000315,2.0,"Watching Bruce Marchiano, with his forced syru...","[watching, bruce, marchiano, ,, with, his, for..."
6758,ATWGAAUYYLACR,718000315,5.0,Please ignore some of the low ratings and nega...,"[please, ignore, some, of, the, low, ratings, ..."
6759,AU8JDMFEW67K3,718000315,5.0,This is by far the best Jesus movie I have eve...,"[this, is, by, far, the, best, jesus, movie, i..."
6761,A2AYFAUPGHUV7J,718000315,5.0,I enjoyed these films.,"[i, enjoyed, these, films, .]"
6762,A1D0WBE8QNG6W2,718000315,5.0,I can more fully understand the text of Matthe...,"[i, can, more, fully, understand, the, text, o..."


## Create emotion vectors

In [33]:
print('Total number of reviews:', all_reviews_comments.shape[0])
print('Total number of subjective reviews:', subj_review_comments.shape[0])

Total number of reviews: 19632
Total number of subjective reviews: 18797


In [None]:
%reload_ext autoreload
from emotions_seven import Emotions7
emote = Emotions7()

In [None]:
all_reviews_emotions = emote.vectorize(all_reviews_comments)
print(all_reviews_emotions.shape)

In [52]:
# emote.emotions_in_text

In [56]:
all_reviews_emotions.shape

(19632, 7)

In [54]:
# all_revs_with_emotions = all_reviews_emotions[emote.emotions_in_text == True]

In [57]:
# print(all_revs_with_emotions.shape)
# all_revs_stars = all_reviews_stars[emote.emotions_in_text]
all_reviews_emotions[0]

array([0.24092674, 0.47308685, 0.19926242, 0.46941505, 0.37559606,
       0.47539984, 0.30167819])

In [36]:
subj_reviews_emotions = emote.vectorize(subj_review_comments)
print(subj_reviews_emotions.shape)
subj_reviews_emotions[0]

Emotions empty for comment:  ['?']
Emotions empty for comment:  ['!']
Emotions empty for comment:  ['!']
Emotions empty for comment:  ['!']
Emotions empty for comment:  [':', '^', ')']
Emotions empty for comment:  ['!']
Emotions empty for comment:  ['!']
Emotions empty for comment:  ['!']
Emotions empty for comment:  ['.', '.']
Emotions empty for comment:  ['!']
Emotions empty for comment:  ['booyah', '!']
Emotions empty for comment:  ['!']
Emotions empty for comment:  ['!']
Emotions empty for comment:  ['!']
Emotions empty for comment:  ['!']
Emotions empty for comment:  [')', '.jim']
Emotions empty for comment:  ['!']
Emotions empty for comment:  ['!']
Emotions empty for comment:  ['magoo', '!', '!']
Emotions empty for comment:  ['!']
Emotions empty for comment:  ['5', 'stars', '!', '!', '!']
Emotions empty for comment:  ['lotus', 'mccart', ',', 'carmichael']
Emotions empty for comment:  ['!']
Emotions empty for comment:  ['aaa']
Emotions empty for comment:  ['!']
Emotions empty for 

array([0.23922288, 0.42554062, 0.19818131, 0.50833329, 0.3574801 ,
       0.51337347, 0.26956389])

## Fit a model on base case (all comments) for star rating prediction

In [37]:
from sklearn.model_selection import train_test_split

X_train, X_cv, y_train, y_cv = train_test_split(
    all_reviews_emotions, all_reviews_stars, test_size=0.2, random_state=0)

In [58]:
X_train.shape

(15705, 7)

In [39]:
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier

# loss: deviance: logistic log likelihood

In [64]:
gbc_all = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
                                n_estimators=N_TREES, 
                                min_samples_leaf=MIN_IN_LEAF,
                                random_state=0)
gbc_all.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=10, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              n_iter_no_change=None, presort='auto', random_state=0,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [59]:
X_subj_train, X_subj_cv, y_subj_train, y_subj_cv = train_test_split(
    subj_reviews_emotions, subj_reviews_stars, test_size=0.2, random_state=0)
X_subj_train.shape

(15037, 7)

In [41]:
gbc_subj = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
                                n_estimators=N_TREES, 
                                min_samples_leaf=MIN_IN_LEAF,
                                random_state=0)
gbc_subj.fit(X_subj_train, y_subj_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=10, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=0,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [65]:
print('Training score using all comments: {0:.2f}'
      .format(gbc_all.score(X_train, y_train)))
print('CV score using all comments: {0:.2f}'
      .format(gbc_all.score(X_cv, y_cv)))
print('')

print('Training score using subjective comments only: {0:.2f}'
      .format(gbc_subj.score(X_subj_train, y_subj_train)))
print('CV score using subjective comments only: {0:.2f}'
      .format(gbc_subj.score(X_subj_cv, y_subj_cv)))


Training score using all comments: 0.67
CV score using all comments: 0.67

Training score using subjective comments only: 0.67
CV score using subjective comments only: 0.67


In [43]:
np.unique(gbc_subj.predict(X_subj_train))

array([1., 2., 3., 4., 5.])

## Other techniques

In [61]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0, solver='lbfgs',
                       multi_class='multinomial',max_iter=1000)
lr.fit(X_subj_train, y_subj_train)
print(lr.score(X_subj_train, y_subj_train))
print(lr.score(X_subj_cv, y_subj_cv))

0.6680853893728802
0.674468085106383
