# Hypothesis to test:
### Removing objective sentences from reviews helps predict star rating from reviews

In [1]:
import numpy as np
import pandas as pd
import pickle
import gzip
import math
import random
from IPython.display import Markdown, display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, \
GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, \
classification_report, make_scorer
import statsmodels.api as sm

In [2]:
import sys
sys.path.append('..')

In [3]:
# From this project
from utils import rmse, rmse_train_cv, classifier_report, confusion_rmse
from NLP import WordBag, AboutMovie
from subjective_filter import SubjectiveFilter

# Avoid restarting Kernel
%load_ext autoreload
%autoreload 2

pd.set_option('display.max_colwidth', -1)

# %autosave 50

## Configuration

In [4]:
# Subsampling from Amazon reviews
NB_SAMPLES = 360000 #4000  # up to 200k, then change the input file

data_path = '../../../datasets/'

## Get users' positive and negative reviews

In [5]:
# file_name = '360000_balanced_train_test_reviews.pkl'
file_name = '_balanced_pos_neg_train_test_reviews.pkl'

In [6]:
pickle_in = open(data_path + str(NB_SAMPLES) + file_name,"rb")
train_test_dic0 = pickle.load(pickle_in)

## Subsample

In [7]:
SAMPLE_FRACTION = 0.01

In [8]:
train_test_dic = {'train': {}, 'test':{}}

In [9]:
for i in ['train','test']:
    for j in ['positive','negative']:
         train_test_dic[i][j] = train_test_dic0[i][j] \
            .iloc[:math.floor(len(train_test_dic0[i][j].index) * SAMPLE_FRACTION), :] \
            .drop(['reviewerName', 'helpful', 'summary', 'unixReviewTime', 'reviewTime'], axis=1)
            

## Create Train-CV and Test sets

In [10]:
reviews = {}
y = {}
for i in ['train','test']:
    reviews[i] = pd.concat([train_test_dic[i]['positive'],
                     train_test_dic[i]['negative']]).reset_index(drop=True)
#     y[i] = np.concatenate([np.ones((train_test_dic[i]['positive'].shape[0],)), 
#                           np.zeros((train_test_dic[i]['negative'].shape[0],))])

In [12]:
def info0(dic, nb_lines=2):
    for i in ['train','test']:
        display(Markdown('#### {}:'.format(i)))
        print(dic[i].shape)
        display(dic[i].head(nb_lines))

In [13]:
info0(reviews)

#### train:

(2880, 4)


Unnamed: 0,reviewerID,asin,reviewText,overall
0,A32244V7CQUBD6,B00005QFEK,This video actually focuses mostly on one of the characters that Emmanuelle (Krista Allen) is trying to teach about sex & love. It's still pretty entertaining but if you are mostly interested in Kirsta Allen then you should know that she's not really in much of this episode.,4.0
1,A32244V7CQUBD6,B00005QFEN,"This episode pretty much has Hafron and Emmanuelle teleporting to different parts of the world and &quot;doing it&quot;. There is the continuing plot from an earlier episode of some group on Earth trying to track them down. That's the main reason for Emmanuelle and Hafron to jump to different parts of the world. Otherwise, this episode is mostly sex scenes.",4.0


#### test:

(720, 4)


Unnamed: 0,reviewerID,asin,reviewText,overall
0,A2D832OA6Q5ZAS,B00005QFFP,"What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known. Like a great actress, she dissolves into her role, and yet her voice and style are easily recognizable for the effortless simplicity with which she nails every note and figure. I wish they could have taped in color in those days, and I would have liked to see her perform more examples of the coloratura repertoire and less of the popular romantic themes. Overall, I am delighted with this video.",4.0
1,A1JF78EP4GPAOO,B00005QG2N,The concert is fantastic as are the videos and Cradle of Fear trailer. The rest however seems to be uninspired filler. I would have liked this better if it included the Pandaemonaeon music videos and Her Ghost in the Fog. Where was the BBC documentary?,4.0


## Remove objective sentences for case B

In [14]:
OBJ_THRESHOLD = 0.8

In [15]:
%reload_ext autoreload
obj_path = '../obj_subj_dev/'
fit_obj_tf = obj_path + 'fit_tfidf_vectorizer_for_obj_subj_sentences_classification.pkl'
fit_obj_model = obj_path + 'GBC_300_0.5_5_0.88cv.pkl'
subj_filter = SubjectiveFilter(fit_obj_tf, fit_obj_model)

In [16]:
abtest = {
    'A': {
        'train': {
            'reviews_df': reviews['train']
        },
        'test': {
            'reviews_df': reviews['test']
        }
    },
    'B': {
        'train': {
            'reviews_df': subj_filter.transform(reviews['train'],'reviewText', 
                                        threshold=OBJ_THRESHOLD,
                                        debug_level=2)
                                    .rename(columns={"sentence": "reviewText"}),
        },
        'test': {
            'reviews_df': subj_filter.transform(reviews['test'],'reviewText',
                                        threshold=OBJ_THRESHOLD,
                                        debug_level=2)
                                    .rename(columns={"sentence": "reviewText"}),
        }
    },
}

#### => Removed 2360 (10%) objective sentences

#### => Removed 16 (1%) reviews with no emotional content

OK! Split-merge match


##### A few objective sentences removed:

Unnamed: 0,reviewerID,asin,overall,sentence
3,A33P47VEH0YULL,B00005QG2N,4.0,"A well put together DVD for the ""Stinkiest band"" in the world."
18,A20EEWWSFMZ1PN,B00005QIVV,4.0,"Based on Hugh Wiley's inscrutable Oriental detective Mr. Wong, Boris Karloff is called upon by his detective friend to solve the murder of undercover detective Dan O'Grady that includes smuggling."
21,A3NKLBL5XYHRH1,B00005QJHJ,4.0,"This concert shows Bonnie's versatility as a performer, from her brand of country-rock to blues to pure vocalist."


#### => Removed 671 (12%) objective sentences

#### => Removed 1 (0%) reviews with no emotional content

OK! Split-merge match


##### A few objective sentences removed:

Unnamed: 0,reviewerID,asin,overall,sentence
0,A2D832OA6Q5ZAS,B00005QFFP,4.0,What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known.
1,A1JF78EP4GPAOO,B00005QG2N,4.0,The concert is fantastic as are the videos and Cradle of Fear trailer.
3,AINEXVPR5094O,B00005QIVM,4.0,storyline: eking cheng is like former gang boss that retired and is now restaurant worker or somthjing...his gf turns into a gangsta re[prsentin...so she tries to bring him back to thegangsta world...pretty ok action...


In [17]:
# res = subj_filter.transform(test_reviews.iloc[:200,:],'reviewText', debug_level=10)

In [18]:
def info(ab, train_test, df, nb_lines=3):
    print(abtest[ab][train_test][df].shape)
    display(abtest[ab][train_test][df].head(nb_lines))
info('A','test','reviews_df', 2)
info('B','test','reviews_df', 2)

(720, 5)


Unnamed: 0,reviewerID,asin,reviewText,overall,sentence
0,A2D832OA6Q5ZAS,B00005QFFP,"What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known. Like a great actress, she dissolves into her role, and yet her voice and style are easily recognizable for the effortless simplicity with which she nails every note and figure. I wish they could have taped in color in those days, and I would have liked to see her perform more examples of the coloratura repertoire and less of the popular romantic themes. Overall, I am delighted with this video.",4.0,"[What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known., Like a great actress, she dissolves into her role, and yet her voice and style are easily recognizable for the effortless simplicity with which she nails every note and figure., I wish they could have taped in color in those days, and I would have liked to see her perform more examples of the coloratura repertoire and less of the popular romantic themes., Overall, I am delighted with this video.]"
1,A1JF78EP4GPAOO,B00005QG2N,The concert is fantastic as are the videos and Cradle of Fear trailer. The rest however seems to be uninspired filler. I would have liked this better if it included the Pandaemonaeon music videos and Her Ghost in the Fog. Where was the BBC documentary?,4.0,"[The concert is fantastic as are the videos and Cradle of Fear trailer., The rest however seems to be uninspired filler., I would have liked this better if it included the Pandaemonaeon music videos and Her Ghost in the Fog., Where was the BBC documentary?]"


(719, 4)


Unnamed: 0,reviewerID,asin,reviewText,overall
0,A100RW34WSLTUW,B00005QTA2,"This was predictable, and well done. I love Kira, however it was lacking originality. I went away a little disappointed. It was good, however it could have been written better. This was predictable, and well done. I love Kira, however it was lacking originality. I went away a little disappointed. It was good, however it could have been written better.",4.0
1,A106WW1XZXU3JZ,630369067X,"This particular version of the film is incomplete. The tape ends right in the middle of the movie, literally end of the spindle ends. Unless mine was a fluke copy, which I doubt, the whole movie is not on the tape, which is lame, because you're going to want to see where this film goes. The movie itself is pretty good and weird, but this release of it is the pits. This particular version of the film is incomplete. The tape ends right in the middle of the movie, literally end of the spindle ends. Unless mine was a fluke copy, which I doubt, the whole movie is not on the tape, which is lame, because you're going to want to see where this film goes. The movie itself is pretty good and weird, but this release of it is the pits.",1.0


## Create bag of words
Remove accents  
Tokenize  
Lower the case  
Apply custom stop words (keep all negations)  
Remove all non alphabetic characters  
Lematize  
 
Output:  
One list of words for each review 

In [21]:
abtest.pop('y', None)

In [22]:
word_bag = WordBag()
for train_test_dico in abtest.values():
    for dico in train_test_dico.values():
        # print(dico['reviews_df'].keys())
        dico['reviews_df']['word_bag'] = word_bag.create_word_bag(
            dico['reviews_df']['reviewText'], 
            remove_stop_words=True, 
            lemmatize=True)

In [23]:
info('A','test','reviews_df', 1)
# info('B','test','reviews_df', 2)

(720, 6)


Unnamed: 0,reviewerID,asin,reviewText,overall,sentence,word_bag
0,A2D832OA6Q5ZAS,B00005QFFP,"What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known. Like a great actress, she dissolves into her role, and yet her voice and style are easily recognizable for the effortless simplicity with which she nails every note and figure. I wish they could have taped in color in those days, and I would have liked to see her perform more examples of the coloratura repertoire and less of the popular romantic themes. Overall, I am delighted with this video.",4.0,"[What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known., Like a great actress, she dissolves into her role, and yet her voice and style are easily recognizable for the effortless simplicity with which she nails every note and figure., I wish they could have taped in color in those days, and I would have liked to see her perform more examples of the coloratura repertoire and less of the popular romantic themes., Overall, I am delighted with this video.]","[pleasure, see, peerless, diva, perform, purest, example, art, ever, known, like, great, actress, dissolve, role, and, yet, voice, and, style, easily, recognizable, effortless, simplicity, nail, every, note, and, figure, wish, could, taped, color, day, and, would, liked, see, perform, more, example, coloratura, repertoire, and, le, popular, romantic, theme, overall, delighted, video]"


In [None]:
# %reload_ext autoreload
# word_bag = WordBag()

# for i in ['train','test']:
#     for j in ['positive','negative']:
#         train_test_dic[i][j]['words'] = \
#             word_bag.create(train_test_dic[i][j]['reviewText'])

## Remove reviews that may not be on the movie, but on Amazon/support instead
Input: 
* word tokens 
* one line per review 

In [24]:
%reload_ext autoreload
about_movie = AboutMovie()

for train_test_dico in abtest.values():
    for dico in train_test_dico.values():
        dico['movie_reviews'] = dico['reviews_df'][[about_movie.check(words) \
                                for words in dico['reviews_df']['word_bag']]]
        

In [25]:
info('A','test','movie_reviews', 1)
# info('B','test','reviews_df', 2)

(518, 6)


Unnamed: 0,reviewerID,asin,reviewText,overall,sentence,word_bag
0,A2D832OA6Q5ZAS,B00005QFFP,"What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known. Like a great actress, she dissolves into her role, and yet her voice and style are easily recognizable for the effortless simplicity with which she nails every note and figure. I wish they could have taped in color in those days, and I would have liked to see her perform more examples of the coloratura repertoire and less of the popular romantic themes. Overall, I am delighted with this video.",4.0,"[What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known., Like a great actress, she dissolves into her role, and yet her voice and style are easily recognizable for the effortless simplicity with which she nails every note and figure., I wish they could have taped in color in those days, and I would have liked to see her perform more examples of the coloratura repertoire and less of the popular romantic themes., Overall, I am delighted with this video.]","[pleasure, see, peerless, diva, perform, purest, example, art, ever, known, like, great, actress, dissolve, role, and, yet, voice, and, style, easily, recognizable, effortless, simplicity, nail, every, note, and, figure, wish, could, taped, color, day, and, would, liked, see, perform, more, example, coloratura, repertoire, and, le, popular, romantic, theme, overall, delighted, video]"


In [26]:
# %reload_ext autoreload
# about_movie = AboutMovie()
# movie_reviews = {'train':{}, 'test':{}}
# for i in ['train','test']:
#     for j in ['positive','negative']:
#          movie_reviews[i][j] = train_test_dic[i][j][[about_movie.check(words) \
#                                                     for words in train_test_dic[i][j]['words']]]

In [27]:
# train_test_dic['test']['positive'][[not i for i in \
#                                     [about_movie.check(words) for words in train_test_dic[i][j]['words']]]]

In [28]:
tot_reviews = 0

for ab_name, train_test_dico in abtest.items():
    for train_test_name, dico in train_test_dico.items():
        removed = dico['reviews_df'].shape[0] - dico['movie_reviews'].shape[0]
        tot_reviews += dico['reviews_df'].shape[0]
        print('Removed {0} ({1:.0%}) {2} {3} reviews'
              .format(removed, removed / dico['reviews_df'].shape[0],
                      train_test_name, ab_name))
        

Removed 889 (31%) train A reviews
Removed 202 (28%) test A reviews
Removed 878 (31%) train B reviews
Removed 200 (28%) test B reviews


In [None]:
# tot_reviews = 0
# for i in ['train','test']:
#     for j in ['positive','negative']:
#         removed = train_test_dic[i][j].shape[0] - movie_reviews[i][j].shape[0]
#         tot_reviews += train_test_dic[i][j].shape[0]
#         print('Removed {0} ({1:.0%}) {2} {3} reviews'.format(removed, removed / train_test_dic[i][j].shape[0],
#                                                 i, j))

In [None]:
# save results
import datetime
if False:
    currentDT = datetime.datetime.now()
    for i in ['train','test']:
        for j in ['positive','negative']:
            train_test_dic[i][j].to_hdf(data_path + currentDT.strftime("%Y-%m-%d-%H-%M-%S") + '_' + 
                      str(tot_reviews) + '_cleaned_reviews_before_B_' + i + '_' + j + '.pkl'
              , key='df', mode='w', complevel=9)

### TF-IDF setup

In [29]:
MAX_FEATURES = 10000

In [30]:
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',       # Feed a list of words to TF-IDF
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    lowercase=False, 
    stop_words=None, 
    max_features=MAX_FEATURES,
    norm='l2',            # normalize each review
    use_idf=True)        # Keep high weight for most common words

In [31]:
SPARSE = True

for ab_dico in abtest.values():
    for dico in ab_dico.values():
        dico['tf_transform'] = tfidf.fit_transform(dico['movie_reviews']['word_bag'])
        if not SPARSE:
            dico['tf_transform'] = dico['tf_transform'].todense()

In [77]:
def print_ab():
    for abname, ab_dico in abtest.items():
        print(abname)
        for ttname, tt in ab_dico.items():
            print('--',ttname)
            if type(tt) == dict:
                for key in tt.keys():
                    print('    ',key)
print_ab()

A
-- train
     reviews_df
     movie_reviews
     tf_transform
-- test
     reviews_df
     movie_reviews
     tf_transform
-- gbc
B
-- train
     reviews_df
     movie_reviews
     tf_transform
-- test
     reviews_df
     movie_reviews
     tf_transform
-- gbc


In [80]:
for ab_dico in abtest.values():
    for tt in ['train','test']:
        print(ab_dico[tt]['tf_transform'].shape)

(1991, 10000)
(518, 8833)
(1986, 10000)
(519, 8155)


In [None]:
# SPARSE = True

# if SPARSE:
#     # Optimization: add the review length while keeping sparse matrix
#     tf_train = tfidf.fit_transform(train_words)
#     tf_test = tfidf.transform(test_words)
# else:
#     tf_train = tfidf.fit_transform(train_words).todense()
#     tf_test = tfidf.transform(test_words).todense()

In [None]:
# print(len(tfidf.vocabulary_))
# tfidf.vocabulary_

## Add review length to modeling input

In [69]:
ADD_LENGTH = False

# if ADD_LENGTH:
#     if SPARSE:
#         # Hack: pick an existing word to store the count
#         len_idx = 0
#         test_lengths = [len(words) for words in test_words]

#         for idx,words in enumerate(train_words):
#             tf_train[idx][len_idx] = len(words)
#         for idx,words in enumerate(test_words):
#             tf_test[idx][len_idx] = len(words)
#         X_train = tf_train
#         X_test = tf_test
#     else:
#         train_lengths = np.array([len(words) for words in train_words]).reshape(-1,1)
#         test_lengths = np.array([len(words) for words in test_words]).reshape(-1,1)
#         X_train = np.concatenate([tf_train, train_lengths],axis=1)
#         X_test = np.concatenate([tf_test, test_lengths],axis=1)


## Create and test y

In [67]:
# [(abtest['A']['train']['movie_reviews']['overall'] > 3) * 1.0]
y = {'A':{},'B':{}}
start = 100
end = 103
for ab, dico in abtest.items():
    for tt in ['train','test']:
        y[ab][tt] = (dico[tt]['movie_reviews']['overall'] > 3) * 1.0

        print('\n--------\n{} {} {}'.format(ab, tt, len(y[ab][tt])))
        print(dico[tt]['movie_reviews'][['reviewText','overall']]
              .iloc[start:end,:]) #[:5])
        print(y[ab][tt][start:end]) #[-5:])    


--------
A train 1991
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

### Test and save

In [None]:
# for train_test_dico in abtest.values():
#     for dico in train_test_dico.values():
#         if dico['tf_transform'].shape[0] != 

In [35]:
# if abtest['B']['train'].shape[0] != y['train'].shape[0] \
#     or abtest['B']['test'] != y['test'].shape[0]:
#     print('@@@ Problem! @@@')
#     print(abtest['B']['train'].shape)
#     print(y['train'].shape)
#     print(abtest['B']['test'].shape)
#     print(y['test'].shape)
# else:
#     print('OK!')

In [None]:
if False:
    pickle_out = open(data_path 
                      + 'tfidf_' 
                      + str(X_train.shape[0]) + 'Pos_Neg_Samples_'
                      + str(X_train.shape[1]) + 'Feats.pkl'
                      ,"wb")
    pickle.dump(tfidf, pickle_out)
    pickle_out.close()

## Gradient Boosting Classifier for Base

In [56]:
# Gradient Boosting Classifier parameters
# N_TREES = math.floor(np.sqrt(NB_SAMPLES) * 1.2)
N_TREES = 500
LEARN_RATE = 0.2
MAX_DEPTH = 8
MIN_IN_LEAF = 5 #7
MAX_FEATURES = 'sqrt'

In [58]:
for ab, ab_dico in abtest.items():
    ab_dico['gbc'] = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
                                n_estimators=N_TREES, 
                                min_samples_leaf=MIN_IN_LEAF,
                                max_depth=MAX_DEPTH,
                                max_features=MAX_FEATURES)

    
    ab_dico['gbc'].fit(ab_dico['train']['tf_transform'], y[ab]['train'])

In [None]:
# gbc = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
#                                 n_estimators=N_TREES, 
#                                 min_samples_leaf=MIN_IN_LEAF,
#                                 max_depth=MAX_DEPTH,
#                                 max_features=MAX_FEATURES)

In [None]:
# gbc.fit(X_train, y_train)

In [None]:
if False:
    pickle.dump(gbc, open(data_path + 'GBC_'
                       + str(NB_SAMPLES) + '_samples_'
                       + str(N_TREES) + '_trees_' 
                       + str(LEARN_RATE) + '_lr_' 
                       + str(MAX_DEPTH) + '_maxdpth_'
                       + str(MIN_IN_LEAF) + '_minleaf_'
                       + str(MAX_FEATURES) + '_feats_'
                       + '.pkl', 'wb'))

In [59]:
for ab, ab_dico in abtest.items():
    classifier_report(ab_dico['gbc'], 
                      ab_dico['train']['tf_transform'], 
                      y[ab]['train'],
                      'Gradient Boosting Classifier on training set {}'.format(ab))

### Report for Gradient Boosting Classifier on training set A:

##### Off diagonal: 0%

#### Confusion Matrix:

[[ 964    0]
 [   0 1027]]


#### Classification Report:

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       964
         1.0       1.00      1.00      1.00      1027

    accuracy                           1.00      1991
   macro avg       1.00      1.00      1.00      1991
weighted avg       1.00      1.00      1.00      1991



### Report for Gradient Boosting Classifier on training set B:

##### Off diagonal: 0%

#### Confusion Matrix:

[[ 963    0]
 [   0 1023]]


#### Classification Report:

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       963
         1.0       1.00      1.00      1.00      1023

    accuracy                           1.00      1986
   macro avg       1.00      1.00      1.00      1986
weighted avg       1.00      1.00      1.00      1986



In [70]:
print('SAMPLE_FRACTION:', SAMPLE_FRACTION,'ADD_LENGTH:',ADD_LENGTH,' SPARSE:',SPARSE,' MAX_FEATURES:',MAX_FEATURES)

print(MAX_FEATURES, ' features', N_TREES,'trees; ',
      LEARN_RATE,'learn_rate; ', MAX_DEPTH, 'max_dpth; ',
      MIN_IN_LEAF, 'min_in_leaf')

for ab, ab_dico in abtest.items():
    classifier_report(ab_dico['gbc'], 
                      ab_dico['test']['tf_transform'], 
                      y[ab]['test'],
                      'Gradient Boosting Classifier on test set {}'.format(ab))



SAMPLE_FRACTION: 0.01 ADD_LENGTH: False  SPARSE: True  MAX_FEATURES: sqrt
sqrt  features 500 trees;  0.2 learn_rate;  8 max_dpth;  5 min_in_leaf


ValueError: Number of features of the model must match the input. Model n_features is 10000 and input n_features is 8833 

In [None]:
print('SAMPLE_FRACTION:', SAMPLE_FRACTION,'ADD_LENGTH:',ADD_LENGTH,' SPARSE:',SPARSE,' MAX_FEATURES:',MAX_FEATURES)


## Grid search

In [None]:
if False:
    gb_pipe = Pipeline([('vect', tfidf), ('gb', gbc)])
    gb_pipe.fit(X_train, y_train)
    pickle.dump(gb_pipe, open('pickles/GBCpipe_balanced_comments_'
                           + str(N_TREES) + '_trees_' 
                           + str(LEARN_RATE) + '_lr_' 
                           + str(MAX_DEPTH) + '_maxdpth_'
                           + str(MIN_IN_LEAF) + '_minleaf_'
                           + str(MAX_FEATURES) + '_feats_'
                           + '.pkl', 'wb'))
else:
#     pickle_in = open("pickles/GBC_balanced_comments_300_trees_0.1_lr_15_maxdpth_2_minleaf_20000_feats_.pkl",
#                      "rb")
#     gb_pipe = pickle.load(pickle_in)

In [None]:
if True:
    grid = {
        'learning_rate': [.1,0.2,0.3],
        'max_depth': [8],
        'min_samples_leaf': [5],
        'max_features': ['sqrt'],
        'n_estimators': [300],
        'random_state': [0]
    }
else:  # TEST
    grid = {
    'learning_rate': [1],
    'max_depth': [2], 
    'min_samples_leaf': [2],
#     'max_features': ['sqrt', None],
    'n_estimators': [2],
    'random_state': [0]
}
    
# confusion_score = make_scorer(confusion_rmse, greater_is_better=False)

gbc_grid_cv = GridSearchCV(
    GradientBoostingClassifier(), 
    grid,
    cv=4,  # number of folds
    return_train_score=True,
    verbose=1, 
    n_jobs=-1)
gbc_grid_cv.fit(X_train, y_train)

In [None]:
y_pred = gbc_grid_cv.predict(X_test)

print('SAMPLE_FRACTION:', SAMPLE_FRACTION,'ADD_LENGTH:',ADD_LENGTH,' SPARSE:',SPARSE,' MAX_FEATURES:',MAX_FEATURES)

print(gbc_grid_cv.best_params_)
print(gbc_grid_cv.best_score_)
res_df = pd.DataFrame(gbc_grid_cv.cv_results_)
res_df

# Case B: with objective sentences removed

## Split comments into separate sentences

In [None]:
test_reviews = reviews['test']
test_reviews.head(1)

In [None]:
from nltk.tokenize import sent_tokenize
test_reviews['sentence'] = test_reviews['reviewText'].map(sent_tokenize)

In [None]:
test_reviews.head(1)

In [None]:
test_reviews.shape

In [None]:
# WARNING: update test_reviews in 2 places!
sentences = test_reviews['sentence'] \
.apply(pd.Series) \
.merge(test_reviews, left_index = True, right_index = True) \
.drop(['sentence'], axis = 1) \
.melt(id_vars = ['reviewerID', 'asin','overall'], value_name = 'sentence') \
.drop(['variable'], axis = 1) \
.dropna()

print(sentences.shape)

In [None]:
sentences.head(2)

## Vectorize along the word space of the obj-subj training set

In [None]:
obj_model_path = '../obj_subj_dev/'

In [None]:
pickle_in = open(obj_model_path + 'fit_tfidf_vectorizer_for_obj_subj_sentences_classification.pkl', 'rb')
obj_tfidf = pickle.load(pickle_in)
pickle_in.close()
len(obj_tfidf.vocabulary_)

In [None]:
obj_X = obj_tfidf.transform(sentences['sentence']).todense()

In [None]:
obj_X.shape

## Apply the obj-subj model

In [None]:
N_TREES = 100
LEARN_RATE = 0.1
MIN_IN_LEAF = 10
pickle_in = open(obj_model_path + 'GBC_300_0.5_5.pkl', 'rb')
obj_model = pickle.load(pickle_in)

In [None]:
y_test = obj_model.predict(obj_X)
print(len(y_test))
y_test[:10]

## Remove objective sentences for case B using obj-subj model

In [None]:
print(sentences.shape)
subjective_sentences = sentences[y_test == 1]

In [None]:
diff = len(y_test) - len(subjective_sentences)

display(Markdown('## => Removing {0} ({1:.0%}) objective sentences'
                 .format(diff, diff/len(y_test))))

### Quality control: objective sentences discarded: really bad filter!!!

In [None]:
sentences.shape

In [None]:
sentences[y_test == 0][:10]

In [None]:
subjective_sentences.head(2)

### Merge the sentences back into paragraph reviews

In [None]:
subj_groups = subjective_sentences.groupby(['reviewerID','asin'])

In [None]:
# can't rely on words, won't be there
def add_space(sentence):
    return ' ' + sentence
def merge_sentences(series):
    s = series.map(add_space)
    res = s.sum()
#     print(res)
#     print('##########')
    return res

subj_reviews_sentences = subj_groups['sentence'].agg(merge_sentences)
print(type(sentences))

In [None]:
subj_reviews_stars = subj_groups['overall'].mean() 

In [None]:
subj_reviews = pd.merge(subj_reviews_sentences, 
                        subj_reviews_stars, 
                        how='inner', on=['reviewerID', 'asin']).reset_index()
subj_reviews.head(2)

In [None]:
review_diff = len(test_reviews) - len(subj_review_comments)

display(Markdown('## => Removing {0} ({1:.0%}) reviews with no emotional content'
                 .format(review_diff, review_diff/len(test_reviews))))

### Check that stars still correspond to the right movie

In [None]:
# test = pd.merge(subj_reviews, )
test_reviews_groups = test_reviews.groupby(['reviewerID','asin'])

test_reviews_stars = test_reviews_groups['overall'].mean()

In [None]:
check = pd.merge(test_reviews_stars, subj_reviews,
                 how='inner', on=['reviewerID', 'asin'])
res = (check['overall_x'] == check['overall_y']).mean()
if res == 1:
    print('OK!')
else:
    print('### @@@@@@@@ PROBLEM! @@@@@@')
    