In [1]:
import numpy as np
import pandas as pd
import pickle
import gzip
import math

# Avoid restarting Kernel
%load_ext autoreload
%autoreload 2

# %autosave 50

## Configuration

In [2]:
# Subsampling
NB_SAMPLES = 50000  # up to 200k, then change the input file

# Gradient Boosting Classifier parameters
N_TREES = math.floor(np.sqrt(NB_SAMPLES))
LEARN_RATE = 0.1
MIN_IN_LEAF = 10

## Get user comments

In [3]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path, trunc=0):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1        
    if trunc > 0 and i > trunc: 
        break
  return pd.DataFrame.from_dict(df, orient='index')

In [4]:
data_path = '../../datasets/'
file_name = 'reviews_Movies_and_TV.json.gz'

In [None]:
# comments_df = getDF(data_path + file_name, 200000)
# comments_df.loc[0,'reviewText']
# print(comments_df.shape)
# comments_df.head(1)

In [None]:
# Save the records
# import pickle
# pickle_out = open(data_path + "amzn_200k.pickle","wb")
# pickle.dump(comments_df, pickle_out)
# pickle_out.close()

In [None]:
# Reads records
import pickle
pickle_in = open(data_path + "amzn_200k.pickle","rb")
comments_df = pickle.load(pickle_in)

## Split comments into separate sentences

In [None]:
small = comments_df.loc[:NB_SAMPLES, :]
len(small.index)

50001

In [None]:
from nltk.tokenize import sent_tokenize
small['sentence'] = small['reviewText'].map(sent_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [None]:
small.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime', 'sentence'],
      dtype='object')

In [None]:
small.drop(['reviewerName', 'helpful', 'reviewText', 'summary', 
            'unixReviewTime', 'reviewTime'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [None]:
small.shape

(50001, 4)

In [None]:
sentences = small['sentence'] \
.apply(pd.Series) \
.merge(small, left_index = True, right_index = True) \
.drop(['sentence'], axis = 1) \
.melt(id_vars = ['reviewerID', 'asin','overall'], value_name = 'sentence') \
.drop(['variable'], axis = 1) \
.dropna()

print(sentences.shape)
sentences.head(3)

(314947, 4)


Unnamed: 0,reviewerID,asin,overall,sentence
0,A3R5OBKS7OM2IR,143502,5.0,This has some great tips as always and is help...
1,A3R5OBKS7OM2IR,143529,5.0,This is a great pastry guide.
2,AH3QC2PC1VTGP,143561,2.0,I have to admit that I am a fan of Giada's coo...


## Sentence-level prep & cleaning

In [None]:
%reload_ext autoreload
from utils import split_n_lower, not_about_support

### Split into words and lower the case

In [None]:
sentences['words'] = sentences['sentence'].apply(lambda s: split_n_lower(s))

In [None]:
print(sentences.shape)
sentences.head(3)

### Remove support-related sentences

In [None]:
on_movies_filter = [not_about_support(word) for word in sentences['words']]
sentences_on_movie = sentences[on_movies_filter]

print('Removing {} records'.format(sentences.shape[0]- sentences_on_movie.shape[0]))

In [None]:
sentences_on_movie.shape

### Base case: A reviews with objective and subjective sentences

In [None]:
all_reviews_groups = sentences_on_movie.groupby(['reviewerID','asin'])
all_reviews_stars = all_reviews_groups['overall'].mean()
all_reviews_stars[:3]

In [None]:
all_reviews_comments = all_reviews_groups['words'].sum()
print(sentences_on_movie.iloc[0, 4])
print(all_reviews_comments.shape)
print(all_reviews_comments[0])
len(all_reviews_comments)

### Remove objective sentences for case B using obj-subj model

In [None]:
# pd.set_option('display.max_colwidth', -1)
# sentences_on_movie['sentence']
sentences_on_movie.shape

In [None]:
sentences_on_movie.head(2)

#### Vectorize along the word space of the obj-subj training set

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = pickle.load(open('Obj-Subj_tfidf.pkl', 'rb'))
len(tfidf.vocabulary_)

In [None]:
mat = tfidf.transform(sentences_on_movie['sentence']).todense()

In [None]:
mat

#### Apply the obj-subj model

In [None]:
N_TREES = 100
LEARN_RATE = 0.1
MIN_IN_LEAF = 10
pickle_in = open('GBC_'+ str(N_TREES) +'_' + str(LEARN_RATE) 
                        +'_' + str(MIN_IN_LEAF) + '_20min.pkl', 'rb')
gb_model = pickle.load(pickle_in)

In [None]:
y_test = gb_model.predict(mat)
len(y_test)

In [None]:
subjective_sentences = sentences_on_movie[y_test == 1]

In [None]:
subjective_sentences.head(2)

#### Merge the sentences back into paragraph reviews

In [None]:
subj_groups = subjective_sentences.groupby(['reviewerID','asin'])
subj_reviews_stars = subj_groups['overall'].mean()
# subjective_reviewssubjective_reviews['sentence'].apply(lambda x: x.sum())
# subjective_reviews_reviews = 
subj_reviews_stars[:3]

In [None]:
subj_review_comments = subj_groups['words'].sum()
print(subj_review_comments.shape)
print(subj_review_comments[0])
subj_review_comments[:3]

## Create emotion vectors

In [None]:
print('Total number of reviews:', all_reviews_comments.shape[0])
print('Total number of subjective reviews:', subj_review_comments.shape[0])

In [None]:
%reload_ext autoreload
from emotions_seven import Emotions7
emote = Emotions7()

In [None]:
all_reviews_emotions = emote.vectorize(all_reviews_comments)
print(all_reviews_emotions.shape)
all_reviews_emotions[0]

In [None]:
subj_reviews_emotions = emote.vectorize(subj_review_comments)
print(subj_reviews_emotions.shape)
subj_reviews_emotions[0]

## Fit a model on base case (all comments) for star rating prediction

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_cv, y_train, y_cv = train_test_split(
    all_reviews_emotions, all_reviews_stars, test_size=0.2, random_state=0)

In [None]:
y_train.shape

In [None]:
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier

# loss: deviance: logistic log likelihood

In [None]:
gbc_all = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
                                n_estimators=N_TREES, 
                                min_samples_leaf=MIN_IN_LEAF,
                                random_state=0)
gbc_all.fit(X_train, y_train)

In [None]:
X_subj_train, X_subj_cv, y_subj_train, y_subj_cv = train_test_split(
    subj_reviews_emotions, subj_reviews_stars, test_size=0.2, random_state=0)

gbc_subj = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
                                n_estimators=N_TREES, 
                                min_samples_leaf=MIN_IN_LEAF,
                                random_state=0)
gbc_subj.fit(X_subj_train, y_subj_train)

In [None]:
print('Training score using all comments: {0:.2f}'
      .format(gbc_all.score(X_train, y_train)))
print('CV score using all comments: {0:.2f}'
      .format(gbc_all.score(X_cv, y_cv)))
print('')

print('Training score using subjective comments only: {0:.2f}'
      .format(gbc_subj.score(X_subj_train, y_subj_train)))
print('CV score using subjective comments only: {0:.2f}'
      .format(gbc_subj.score(X_subj_cv, y_subj_cv)))


In [None]:
np.unique(gbc_subj.predict(X_subj_train))