# Hypothesis to test:
### Removing objective sentences from reviews helps predict star rating from reviews

In [1]:
import gzip
import math
import pickle
import random

import numpy as np
import pandas as pd
import statsmodels.api as sm
from IPython.display import Markdown, display
from scipy import sparse
from sklearn.ensemble import GradientBoostingRegressor, \
GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, \
classification_report, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
import sys
sys.path.append('..')

In [20]:
# From this project
from utils import rmse, rmse_train_cv, classifier_report, confusion_rmse
from NLP import WordBag, AboutMovie
from subjective_filter import SubjectiveFilter

# Avoid restarting Kernel
%load_ext autoreload
%autoreload 2

# Never truncate long review texts when frames are displayed. None means
# "no limit"; the former sentinel -1 is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

# %autosave 50

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Configuration

In [4]:
# Subsampling from Amazon reviews
NB_SAMPLES = 360000 #4000  # up to 200k, then change the input file

data_path = '../../../datasets/'

## Get users' positive and negative reviews

In [5]:
# file_name = '360000_balanced_train_test_reviews.pkl'
file_name = '_balanced_pos_neg_train_test_reviews.pkl'

In [6]:
# Load the nested {'train'/'test': {'positive'/'negative': DataFrame}} dict.
# The context manager closes the file handle, which was previously leaked.
# NOTE(review): pickle.load can execute arbitrary code -- only use it on
# dataset files produced by this project.
with open(data_path + str(NB_SAMPLES) + file_name, "rb") as pickle_in:
    train_test_dic0 = pickle.load(pickle_in)

## Subsample

In [7]:
SAMPLE_FRACTION = 0.01

In [8]:
train_test_dic = {'train': {}, 'test':{}}

In [9]:
# Keep only the leading SAMPLE_FRACTION of every split/polarity frame and
# drop the metadata columns that are not used further down.
dropped_cols = ['reviewerName', 'helpful', 'summary', 'unixReviewTime', 'reviewTime']
for split in ('train', 'test'):
    for polarity in ('positive', 'negative'):
        full_frame = train_test_dic0[split][polarity]
        n_kept = math.floor(len(full_frame.index) * SAMPLE_FRACTION)
        train_test_dic[split][polarity] = (
            full_frame.iloc[:n_kept, :].drop(dropped_cols, axis=1)
        )
            

## Create Train-CV and Test sets

In [12]:
# For each split, stack positive reviews on top of negative ones and build
# the matching label vector: 1.0 for positive rows, 0.0 for negative rows.
reviews = {}
y = {}
for split in ('train', 'test'):
    positives = train_test_dic[split]['positive']
    negatives = train_test_dic[split]['negative']
    reviews[split] = pd.concat([positives, negatives]).reset_index(drop=True)
    y[split] = np.concatenate([
        np.ones((positives.shape[0],)),
        np.zeros((negatives.shape[0],)),
    ])

In [18]:
def info(dic, nb_lines=2):
    """Display the shape and the first rows of the train and test entries.

    Parameters
    ----------
    dic : mapping
        Must provide the keys 'train' and 'test'; each value needs a
        ``shape`` attribute and a ``head(n)`` method.
    nb_lines : int, default 2
        Number of leading rows to display for each entry.
    """
    for split in ('train', 'test'):
        frame = dic[split]
        display(Markdown('#### {}:'.format(split)))
        print(frame.shape)
        display(frame.head(nb_lines))

In [19]:
info(reviews)

#### train:

(2880, 4)


Unnamed: 0,reviewerID,asin,reviewText,overall
0,A32244V7CQUBD6,B00005QFEK,This video actually focuses mostly on one of the characters that Emmanuelle (Krista Allen) is trying to teach about sex & love. It's still pretty entertaining but if you are mostly interested in Kirsta Allen then you should know that she's not really in much of this episode.,4.0
1,A32244V7CQUBD6,B00005QFEN,"This episode pretty much has Hafron and Emmanuelle teleporting to different parts of the world and &quot;doing it&quot;. There is the continuing plot from an earlier episode of some group on Earth trying to track them down. That's the main reason for Emmanuelle and Hafron to jump to different parts of the world. Otherwise, this episode is mostly sex scenes.",4.0


#### test:

(720, 4)


Unnamed: 0,reviewerID,asin,reviewText,overall
0,A2D832OA6Q5ZAS,B00005QFFP,"What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known. Like a great actress, she dissolves into her role, and yet her voice and style are easily recognizable for the effortless simplicity with which she nails every note and figure. I wish they could have taped in color in those days, and I would have liked to see her perform more examples of the coloratura repertoire and less of the popular romantic themes. Overall, I am delighted with this video.",4.0
1,A1JF78EP4GPAOO,B00005QG2N,The concert is fantastic as are the videos and Cradle of Fear trailer. The rest however seems to be uninspired filler. I would have liked this better if it included the Pandaemonaeon music videos and Her Ghost in the Fog. Where was the BBC documentary?,4.0


## Remove objective sentences for case B

In [22]:
%reload_ext autoreload
# Case A keeps the reviews untouched; case B holds the output of
# SubjectiveFilter.transform applied to the same train and test frames.
# NOTE(review): `cases` is not referenced by any later cell visible in this
# notebook -- the modelling cells still read other variables.
subj_filter = SubjectiveFilter()
cases = {
    'A': reviews,
    'B': {
        'train' : subj_filter.transform(reviews['train']),
        'test': subj_filter.transform(reviews['test']),
    }}

## Create bag of words
Remove accents  
Tokenize  
Lower the case  
Apply custom stop words (keep all negations)  
Remove all non alphabetic characters  
Lemmatize  
 
Output:  
One list of words for each review 

In [None]:
train_test_dic['train']['positive'].iloc[:3,:]

In [None]:
%reload_ext autoreload
word_bag = WordBag()

# Add a 'words' column to every split/polarity frame, built by
# WordBag.create from that frame's raw 'reviewText' column.
for i in ['train','test']:
    for j in ['positive','negative']:
        train_test_dic[i][j]['words'] = \
            word_bag.create(train_test_dic[i][j]['reviewText'])

## Remove reviews that may not be about the movie, but about Amazon/support instead
Input: 
* word tokens 
* one line per review 

In [None]:
%reload_ext autoreload
about_movie = AboutMovie()
movie_reviews = {'train':{}, 'test':{}}
# Build one value per review with AboutMovie.check(words) and use that list
# to index the frame.
# NOTE(review): this only acts as a row filter if check() returns real
# booleans (a list of bools is treated as a row mask) -- confirm in NLP.py.
for i in ['train','test']:
    for j in ['positive','negative']:
         movie_reviews[i][j] = train_test_dic[i][j][[about_movie.check(words) \
                                                    for words in train_test_dic[i][j]['words']]]

In [None]:
# train_test_dic['test']['positive'][[not i for i in \
#                                     [about_movie.check(words) for words in train_test_dic[i][j]['words']]]]

In [None]:
# Report how many reviews the about-movie filter dropped per split/polarity
# and accumulate the number of reviews before filtering in tot_reviews.
tot_reviews = 0
for split in ('train', 'test'):
    for polarity in ('positive', 'negative'):
        n_before = train_test_dic[split][polarity].shape[0]
        n_after = movie_reviews[split][polarity].shape[0]
        removed = n_before - n_after
        tot_reviews += n_before
        print('Removed {0} ({1:.0%}) {2} {3} reviews'.format(
            removed, removed / n_before, split, polarity))

In [None]:
# save results
import datetime
if False:
    currentDT = datetime.datetime.now()
    for i in ['train','test']:
        for j in ['positive','negative']:
            train_test_dic[i][j].to_hdf(data_path + currentDT.strftime("%Y-%m-%d-%H-%M-%S") + '_' + 
                      str(tot_reviews) + '_cleaned_reviews_before_B_' + i + '_' + j + '.pkl'
              , key='df', mode='w', complevel=9)

### TF-IDF setup

In [None]:
MAX_FEATURES = 10000

In [None]:
def dummy_fun(doc):
    """Return ``doc`` unchanged.

    Passed to TfidfVectorizer as both tokenizer and preprocessor so that
    input which is already a list of words goes through untouched.
    """
    return doc

# TF-IDF over pre-tokenized reviews: tokenizing, preprocessing, lowercasing
# and stop-word removal are all disabled here.
tfidf = TfidfVectorizer(
    analyzer='word',       # Feed a list of words to TF-IDF
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    lowercase=False, 
    stop_words=None, 
    max_features=MAX_FEATURES,
    norm='l2',            # normalize each review
    use_idf=True)        # Down-weight words that occur in many reviews

In [None]:
SPARSE = True

# Fit the vocabulary on the training reviews only, then reuse it for test.
# NOTE(review): train_words / test_words are not defined in any cell visible
# in this notebook -- they must be rebuilt from the 'words' columns before
# this cell survives Restart & Run All.
if SPARSE:
    # Keep the scipy sparse matrices returned by the vectorizer.
    tf_train = tfidf.fit_transform(train_words)
    tf_test = tfidf.transform(test_words)
else:
    # .toarray() gives a plain ndarray; .todense() returns np.matrix, which
    # recent scikit-learn releases refuse as estimator input.
    tf_train = tfidf.fit_transform(train_words).toarray()
    tf_test = tfidf.transform(test_words).toarray()

In [None]:
# print(len(tfidf.vocabulary_))
# tfidf.vocabulary_

## Add review length to modeling input

In [None]:
ADD_LENGTH = False


def _append_length_column(features, docs):
    """Return ``features`` with one extra column holding ``len(doc)`` per row.

    Parameters
    ----------
    features : scipy sparse matrix or array-like, shape (n_docs, n_features)
        TF-IDF matrix with one row per document.
    docs : iterable of sequences
        The tokenized documents, in the same order as the rows of ``features``.

    Returns
    -------
    CSR matrix when ``features`` is sparse, otherwise a dense ndarray; in both
    cases the shape is (n_docs, n_features + 1).
    """
    lengths = np.array([len(words) for words in docs], dtype=float).reshape(-1, 1)
    if sparse.issparse(features):
        return sparse.hstack([features, sparse.csr_matrix(lengths)], format='csr')
    return np.concatenate([np.asarray(features), lengths], axis=1)


if ADD_LENGTH:
    # The former sparse branch assigned into `tf_train[idx]`, which is a
    # temporary copy of the row, so the lengths were never stored (and the
    # intent was to overwrite an existing word feature). Appending a real
    # column keeps all word features and matches the dense branch.
    X_train = _append_length_column(tf_train, train_words)
    X_test = _append_length_column(tf_test, test_words)
else:
    X_train = tf_train
    X_test = tf_test

### Test and save

In [None]:
# Sanity check: every feature matrix needs exactly one label per row.
rows_match = (X_train.shape[0] == y_train.shape[0]
              and X_test.shape[0] == y_test.shape[0])
if not rows_match:
    print('@@@ Problem! @@@')
    for part in (X_train, y_train, X_test, y_test):
        print(part.shape)

In [None]:
# Disabled: persist the fitted TF-IDF vectorizer, with the sample and
# feature counts encoded in the file name.
if False:
    tfidf_pickle_path = (data_path
                         + 'tfidf_'
                         + str(X_train.shape[0]) + 'Pos_Neg_Samples_'
                         + str(X_train.shape[1]) + 'Feats.pkl')
    # The context manager closes the file even if pickle.dump raises.
    with open(tfidf_pickle_path, "wb") as pickle_out:
        pickle.dump(tfidf, pickle_out)

## Gradient Boosting Classifier for Base

In [None]:
# Gradient Boosting Classifier parameters
# N_TREES = math.floor(np.sqrt(NB_SAMPLES) * 1.2)
N_TREES = 500
LEARN_RATE = 0.2
MAX_DEPTH = 8
MIN_IN_LEAF = 5 #7
# NOTE(review): this rebinds MAX_FEATURES, which the TF-IDF setup defines as
# the vocabulary size (10000). Re-running the TfidfVectorizer cell after
# this one would pass 'sqrt' as its max_features.
MAX_FEATURES = 'sqrt'

In [None]:
# Baseline classifier. max_features='sqrt' makes every split sample a random
# feature subset, so the seed is pinned (to the same value the grid search
# uses) to keep results reproducible between runs.
gbc = GradientBoostingClassifier(learning_rate=LEARN_RATE,
                                 n_estimators=N_TREES,
                                 min_samples_leaf=MIN_IN_LEAF,
                                 max_depth=MAX_DEPTH,
                                 max_features=MAX_FEATURES,
                                 random_state=0)

In [None]:
gbc.fit(X_train, y_train)

In [None]:
# Disabled: persist the fitted classifier, with its hyper-parameters encoded
# in the file name.
if False:
    gbc_pickle_path = (data_path + 'GBC_'
                       + str(NB_SAMPLES) + '_samples_'
                       + str(N_TREES) + '_trees_'
                       + str(LEARN_RATE) + '_lr_'
                       + str(MAX_DEPTH) + '_maxdpth_'
                       + str(MIN_IN_LEAF) + '_minleaf_'
                       + str(MAX_FEATURES) + '_feats_'
                       + '.pkl')
    # The former inline open(...) was never closed.
    with open(gbc_pickle_path, 'wb') as pickle_out:
        pickle.dump(gbc, pickle_out)

In [None]:
%reload_ext autoreload

# Echo the hyper-parameters, then report on the training and the test set
# with the project's classifier_report helper.
# NOTE(review): y_train / y_test are not defined in any visible cell; the
# labels built earlier live in y['train'] / y['test'].
print(MAX_FEATURES, ' features', N_TREES,'trees; ',
      LEARN_RATE,'learn_rate; ', MAX_DEPTH, 'max_dpth; ',
      MIN_IN_LEAF, 'min_in_leaf')
classifier_report(gbc, X_train, y_train,
                  'Gradient Boosting Classifier on training set')
classifier_report(gbc, X_test, y_test, 
                  'Gradient Boosting Classifier on test set')

In [None]:
print('SAMPLE_FRACTION:', SAMPLE_FRACTION,'ADD_LENGTH:',ADD_LENGTH,' SPARSE:',SPARSE,' MAX_FEATURES:',MAX_FEATURES)

## Grid search

In [None]:
# Disabled: fit and pickle the complete TF-IDF + gradient boosting pipeline.
if False:
    gb_pipe = Pipeline([('vect', tfidf), ('gb', gbc)])
    # The pipeline vectorizes its own input, so it must be fed the token
    # lists rather than X_train, which is already the TF-IDF matrix.
    gb_pipe.fit(train_words, y_train)
    gb_pipe_path = ('pickles/GBCpipe_balanced_comments_'
                    + str(N_TREES) + '_trees_'
                    + str(LEARN_RATE) + '_lr_'
                    + str(MAX_DEPTH) + '_maxdpth_'
                    + str(MIN_IN_LEAF) + '_minleaf_'
                    + str(MAX_FEATURES) + '_feats_'
                    + '.pkl')
    with open(gb_pipe_path, 'wb') as pickle_out:
        pickle.dump(gb_pipe, pickle_out)
# An `else:` branch that contains only comments is a syntax error, so the
# disabled reload snippet is kept as plain comments:
#     pickle_in = open("pickles/GBC_balanced_comments_300_trees_0.1_lr_15_maxdpth_2_minleaf_20000_feats_.pkl",
#                      "rb")
#     gb_pipe = pickle.load(pickle_in)

In [None]:
# Choose the hyper-parameter grid: the `if True:` branch is the real search
# (only learning_rate varies), the else branch is a tiny smoke-test grid.
if True:
    grid = {
        'learning_rate': [.1,0.2,0.3],
        'max_depth': [8],
        'min_samples_leaf': [5],
        'max_features': ['sqrt'],
        'n_estimators': [300],
        'random_state': [0]
    }
else:  # TEST
    grid = {
    'learning_rate': [1],
    'max_depth': [2], 
    'min_samples_leaf': [2],
#     'max_features': ['sqrt', None],
    'n_estimators': [2],
    'random_state': [0]
}
    
# confusion_score = make_scorer(confusion_rmse, greater_is_better=False)

# No `scoring` argument is given, so the estimator's own default scorer
# ranks the candidates; n_jobs=-1 uses every available core.
gbc_grid_cv = GridSearchCV(
    GradientBoostingClassifier(), 
    grid,
    cv=4,  # number of folds
    return_train_score=True,
    verbose=1, 
    n_jobs=-1)
gbc_grid_cv.fit(X_train, y_train)

In [None]:
# Predict on the test set with the grid search's predict method.
# NOTE(review): y_pred is not used by any later cell visible here.
y_pred = gbc_grid_cv.predict(X_test)

print('SAMPLE_FRACTION:', SAMPLE_FRACTION,'ADD_LENGTH:',ADD_LENGTH,' SPARSE:',SPARSE,' MAX_FEATURES:',MAX_FEATURES)

# Best parameter combination, its cross-validated score, and the full
# per-candidate results table as the cell output.
print(gbc_grid_cv.best_params_)
print(gbc_grid_cv.best_score_)
res_df = pd.DataFrame(gbc_grid_cv.cv_results_)
res_df

# Case B: with objective sentences removed

In [None]:
## Split comments into separate sentences

In [None]:
test_reviews.head(1)

In [None]:
from nltk.tokenize import sent_tokenize
test_reviews['sentence'] = test_reviews['reviewText'].map(sent_tokenize)

In [None]:
test_reviews.head(1)

In [None]:
test_reviews.shape

In [None]:
# One row per sentence, carrying the review keys and the star rating.
# The former apply(pd.Series) + merge + melt chain left 'reviewText' among
# the melted value columns, so every complete review was also emitted as an
# extra "sentence" row. Exploding the list column (pandas >= 0.25) yields
# only the real sentences and names test_reviews in a single place.
sentences = (
    test_reviews[['reviewerID', 'asin', 'overall', 'sentence']]
    .explode('sentence')
    .dropna()
    .reset_index(drop=True)
)

print(sentences.shape)

In [None]:
sentences.head(10)

## Vectorize along the word space of the obj-subj training set

In [None]:
obj_model_path = '../obj_subj_dev/'

In [None]:
# Load the TF-IDF vectorizer fitted for the objective/subjective sentence
# classifier. The context manager closes the file even if unpickling fails.
with open(obj_model_path + 'fit_tfidf_vectorizer_for_obj_subj_sentences_classification.pkl', 'rb') as pickle_in:
    obj_tfidf = pickle.load(pickle_in)
len(obj_tfidf.vocabulary_)

In [None]:
# Vectorize every sentence with the obj/subj vocabulary. .toarray() yields a
# plain ndarray; .todense() returns np.matrix, which recent scikit-learn
# releases refuse as estimator input.
obj_X = obj_tfidf.transform(sentences['sentence']).toarray()

In [None]:
obj_X.shape

## Apply the obj-subj model

In [None]:
# NOTE(review): these three constants are not used by the load below and do
# not match the loaded file name ('GBC_300_0.5_5'). They also rebind the
# values configured for the baseline classifier -- confirm and remove.
N_TREES = 100
LEARN_RATE = 0.1
MIN_IN_LEAF = 10
# Load the pre-trained objective/subjective sentence classifier; the context
# manager closes the file handle, which was previously leaked.
with open(obj_model_path + 'GBC_300_0.5_5.pkl', 'rb') as pickle_in:
    obj_model = pickle.load(pickle_in)

In [None]:
# Predict one label per sentence with the obj/subj model; later cells keep
# the sentences whose label equals 1 as the subjective ones.
# NOTE(review): this reuses the name y_test, which earlier cells use for the
# test-set labels passed to classifier_report -- re-running those cells after
# this one would use the wrong values.
y_test = obj_model.predict(obj_X)
print(len(y_test))
y_test[:10]

## Remove objective sentences for case B using obj-subj model

In [None]:
print(sentences.shape)
subjective_sentences = sentences[y_test == 1]

In [None]:
diff = len(y_test) - len(subjective_sentences)

display(Markdown('## => Removing {0} ({1:.0%}) objective sentences'
                 .format(diff, diff/len(y_test))))

### Quality control: objective sentences discarded: really bad filter!!!

In [None]:
sentences.shape

In [None]:
sentences[y_test == 0][:10]

In [None]:
subjective_sentences.head(2)

### Merge the sentences back into paragraph reviews

In [None]:
subj_groups = subjective_sentences.groupby(['reviewerID','asin'])

In [None]:
# Reviews are rebuilt from the raw sentences: the per-review 'words' column
# is not available on the sentence frame.
def add_space(sentence):
    """Return ``sentence`` prefixed with a single space."""
    return ' ' + sentence


def merge_sentences(series):
    """Join the sentences of one review into a single string.

    Parameters
    ----------
    series : iterable of str
        The sentences of one review, in reading order.

    Returns
    -------
    str
        The sentences separated by single spaces, without a leading space;
        the empty string when ``series`` is empty.
    """
    # str.join avoids the repeated string concatenation of Series.sum() and
    # returns '' instead of 0 for an empty group.
    return ' '.join(series)

subj_reviews_sentences = subj_groups['sentence'].agg(merge_sentences)
print(type(sentences))

In [None]:
subj_reviews_stars = subj_groups['overall'].mean() 

In [None]:
subj_reviews = pd.merge(subj_reviews_sentences, 
                        subj_reviews_stars, 
                        how='inner', on=['reviewerID', 'asin']).reset_index()
subj_reviews.head(2)

In [None]:
# Reviews left without any subjective sentence have no row in subj_reviews.
# (The former name `subj_review_comments` is not defined anywhere in the
# notebook and raised a NameError.)
review_diff = len(test_reviews) - len(subj_reviews)

display(Markdown('## => Removing {0} ({1:.0%}) reviews with no emotional content'
                 .format(review_diff, review_diff/len(test_reviews))))

### Check that stars still correspond to the right movie

In [None]:
# test = pd.merge(subj_reviews, )
test_reviews_groups = test_reviews.groupby(['reviewerID','asin'])

test_reviews_stars = test_reviews_groups['overall'].mean()

In [None]:
# Join the original per-review mean rating with the rebuilt subjective
# reviews on (reviewerID, asin); both sides carry an 'overall' column, so
# the merge suffixes them as overall_x / overall_y.
check = pd.merge(test_reviews_stars, subj_reviews,
                 how='inner', on=['reviewerID', 'asin'])
# Fraction of reviews whose rating is identical on both sides; anything
# other than exactly 1 (including NaN for an empty merge) is reported.
res = (check['overall_x'] == check['overall_y']).mean()
if res == 1:
    print('OK!')
else:
    print('### @@@@@@@@ PROBLEM! @@@@@@')
    