# Hypothesis to test: 
### Removing objective sentences from reviews helps predict star rating from reviews

In [1]:
import numpy as np
import pandas as pd
import pickle
import gzip
import math
import random
from IPython.display import Markdown, display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, \
GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, \
classification_report, make_scorer
import statsmodels.api as sm

# From this project
from utils import rmse, rmse_train_cv, classifier_report, confusion_rmse
from NLPv0 import WordBag, AboutMovie


# Avoid restarting Kernel
%load_ext autoreload
%autoreload 2

pd.set_option('display.max_colwidth', -1)

# %autosave 50

In [2]:
%reload_ext autoreload
word_bag = WordBag()

## Configuration

In [3]:
# Subsampling from Amazon reviews
NB_SAMPLES = 360000 #4000  # up to 200k, then change the input file

data_path = '../../datasets/'

In [5]:
# 0.4*360

## Get users' positive and negative reviews

In [None]:
# # file_name = '360000_balanced_train_test_reviews.pkl'
# file_name = '_balanced_pos_neg_train_test_reviews.pkl'

# pickle_in = open(data_path + str(NB_SAMPLES) + file_name,"rb")
# train_test_dic0 = pickle.load(pickle_in)

## Subsample

In [30]:
SAMPLE_FRACTION = 0.4

# train_test_dic = {'train': {}, 'test':{}}

# for i in ['train','test']:
#     for j in ['positive','negative']:
#          train_test_dic[i][j] = train_test_dic0[i][j] \
#             .iloc[:math.floor(len(train_test_dic0[i][j].index) * SAMPLE_FRACTION), :] \
#             .drop(['reviewerName', 'helpful', 'summary', 'unixReviewTime', 'reviewTime'], axis=1)

In [8]:
train_test_dic['train']['positive'].head(3)
# df1[(df1['reviewerID'] == 'A32244V7CQUBD6') & (df1['asin'] == 'B00005QFEK')]

Unnamed: 0,asin,reviewerID,reviewText,overall
0,0,A32244V7CQUBD6,This video actually focuses mostly on one of the characters that Emmanuelle (Krista Allen) is trying to teach about sex & love. It's still pretty entertaining but if you are mostly interested in Kirsta Allen then you should know that she's not really in much of this episode.,4.0
1,1,A32244V7CQUBD6,"This episode pretty much has Hafron and Emmanuelle teleporting to different parts of the world and &quot;doing it&quot;. There is the continuing plot from an earlier episode of some group on Earth trying to track them down. That's the main reason for Emmanuelle and Hafron to jump to different parts of the world. Otherwise, this episode is mostly sex scenes.",4.0
2,2,A33KKMGGVLZ29T,"This is an intimate concert of Robert Mirabal. Although I thought that it was, as I said, masterful, the sound, at times sounded a little muffled.The storytelling of the songs gave an insight of native culture and of Mirabal's own family stories and history.The Dance and Ee You Oo are my picks for the best songs, but they are all a joy to watch. The Rare Tribal Mob and the Mirabal Singers/Dancers are great and provide a mesmerising stage performance.Very enjoyable",4.0
3,3,A33P47VEH0YULL,"A well put together DVD for the ""Stinkiest band"" in the world. Easy to navigate, and with some pretty interesting side notes...a great collectors item for any fan of Cradle of Filth or Swedish metal.",4.0
4,4,ADG33WELAQRZJ,"Great show from the inventors of the extreme gothic metal genre !! 75'of pure mayhem with good visuals and a great sound, although the vocals are a little high in the mix for my taste and the guitar on the right channel is 10 times louder than the one on the left so I ended up switching it to mono to even things up. It's hard to give this more stars esp. when you've watched the PTSFirepower DVD first like I did, that one is so much better. The 5 bonus videoclips totally rule and round up the package making the purchase a worthy one.Now the mockumentary may not be suited to everyone's tastes, personally it bored me and know of a lot of people who feels it ruins the band's image. Well don't watch it then.",4.0
5,5,A3YXITJWFW4BW,"I rented this dvd, i didnt buy it. I also rented PanDamonAeon and Mannequin. This dvd has excellent very long concert footage. Its practically the same songs as on Livebait for the Dead. And then we have the rest of the dvd. Im sorry, i love CoF but that ""schockumentary"" thing was just sad. I could barely stand to watch it. It's stupid and boring, maybe if theyd shown more of the band in a more coherent manner...and it just goes on and on forever. This is a 2 hour plus dvd adn half of it is crap. The ""blair twit Project"" is slightly amusing, but mostly boring. The music videos are good though. I would still recommend buying this dvd just for the music videos and the concert footage. PanDamonAeon gave me more joy though.",4.0
6,6,A214NHULS3H7OX,"Excellent Sound, Excellent Picture, Tons and Tons of songs. Love the Scorched Earth Erotic video. The only reason I didn't give this 5 stars is because I don't think they let their female singer sing enough, it adds a nice goth touch. I also did not give it five stars because it would have been nice to have the ""her ghost in the fog"" video on here, and finally I thin Dani could have done better on some parts of the songs. For example: On ""her ghost in the fog"" (live version), he chooses to growl in that typical death metal voice, instead of talking in the deep gothic voice like he does on the album. By using this growly voice, I think he is cheese'n off some of the songs best parts. Anyway, these are minor gripes and this is a must have DVD, so definitely BUY IT!...",4.0
7,7,A2IZOU2G1QX0JD,"As most of my friends hate, I'm massively into Cradle of Filth. In the style of Napoleon Dynamite describing a Liger, I would say ""...they're pretty much my favorite band."" Their music is among the worst in the industry, they suck in a way few bands can meaningfully aspire to. I've seen them live, bought pretty much every t-shirt from every album and tour, and have almost every album that they've released over the last ""decade and a bit"". I love it, I love it all. I recently purchased Heavy, Left-Handed, & Candid. If you don't like Cradle of Filth, then you'll most certainly hate this DVD. It's absolutely fantastic, full of everything I was hoping for.There is a bunch of documentary type stuff, eleven live performances, a couple of music videos, and a few other ""shorts"" put together by the band. Another one of the bonus items they threw on the disc is a short movie called ""The Blair Twit Project"". This is basically footage of the band walking through woodland, filmed entirely using the night-vision mode on a video camera as they follow the trail of ""a pissed northern bastard."" Being a ""northerner"" myself, it felt very much like a standard night walking home from the pub on a Friday night being idiots. It's funny to watch *once*.The DVD is actually really well put together and there is a ton of material included, which is a refreshing change when compared to some other music DVDs I've bought that barely even take the time to put a decent menu together. As with their music, most people will hate this DVD. I loved it, but then again, I'm weird. The live performances are pretty awesome and it has solidified my decision to have to see this band again soon. I don't know if they come to the US very often if at all, but some guy in Hot Topic assured me that he'd seen them here a few years ago. I'll have to keep my eyes on the tour dates. Then the hardest job will be convincing anyone to actually go with me. My wife say's she's up for it, but I think that will have to be seen to be believed.",4.0
8,8,ARUHLQVP23GQ1,"This category 3 classic mixes genres like only Hong Kong use to be able to do before the takeover. It over priced because of it's rarity so unless you collect rare Category 3 films you might want to get something else. As Category 3 films go this is entertaining but not as over the top as Dr. lamb, Red to Kill or the Untold Story.",4.0
9,9,ARYXA0US2VB5Q,"I remember this when i had it on video cd,it was a 2 cd set.I got because i wanted to know more about the asian culture.The story was strange,somewhat brutal & interesting.There is nudity for the guys here.I learned much from the movies.",4.0


In [None]:
# df1 = movie_reviews['train']['positive']
# df1[(df1['reviewerID'] == 'A32244V7CQUBD6') & (df1['asin'] == 'B00005QFEK')]

In [None]:
# movie_reviews = train_test_dic

# Choose case

In [32]:
# if False:
#     pickle_out = open(data_path
#                     + 'movie_reviews_' 
#                     + str(tot_reviews) + 'Pos_Neg_Samples.pkl'
#                     , "wb")
#     pickle.dump(movie_reviews, pickle_out)
#     pickle_out.close()
CASE = 'A'

if CASE == 'A':
    pickle_in = open(data_path
                    + 'A_forreviews_wout_most_subj_0.15.pkl'
                    , "rb")
    movie_reviews = pickle.load(pickle_in)
else:
    pickle_in = open(data_path
                    + 'reviews_wout_most_subj_0.15.pkl'
                    , "rb")
    movie_reviews = pickle.load(pickle_in)
pickle_in.close()

In [10]:
# if False:
#     for i in ['train','test']:
#         for j in ['positive','negative']:
#             movie_reviews[i][j]['words'] = \
#                 word_bag.create(movie_reviews[i][j]['reviewText'])

In [33]:
# total = 0
# for i in ['train','test']:
#     for j in ['positive','negative']:
#         print(train_test_dic[i][j].shape)
#         total += train_test_dic[i][j].shape[0]
# total

In [34]:
total = 0
for i in ['train','test']:
    for j in ['positive','negative']:
        print(movie_reviews[i][j].shape)
        total += movie_reviews[i][j].shape[0]
total

(57193, 4)
(57219, 4)
(14309, 4)
(14319, 4)


143040

In [35]:
movie_reviews['train']['positive'].head(3)

Unnamed: 0,asin,reviewerID,reviewText,overall
0,0,A32244V7CQUBD6,This video actually focuses mostly on one of the characters that Emmanuelle (Krista Allen) is trying to teach about sex & love. It's still pretty entertaining but if you are mostly interested in Kirsta Allen then you should know that she's not really in much of this episode.,4.0
1,1,A32244V7CQUBD6,"This episode pretty much has Hafron and Emmanuelle teleporting to different parts of the world and &quot;doing it&quot;. There is the continuing plot from an earlier episode of some group on Earth trying to track them down. That's the main reason for Emmanuelle and Hafron to jump to different parts of the world. Otherwise, this episode is mostly sex scenes.",4.0
2,2,A33KKMGGVLZ29T,"This is an intimate concert of Robert Mirabal. Although I thought that it was, as I said, masterful, the sound, at times sounded a little muffled.The storytelling of the songs gave an insight of native culture and of Mirabal's own family stories and history.The Dance and Ee You Oo are my picks for the best songs, but they are all a joy to watch. The Rare Tribal Mob and the Mirabal Singers/Dancers are great and provide a mesmerising stage performance.Very enjoyable",4.0


### TF-IDF setup

In [36]:
MAX_FEATURES = 10000

In [37]:
tfidf = TfidfVectorizer(
    lowercase=True, 
    stop_words=None, 
    max_features=MAX_FEATURES,
    norm='l2',            # normalize each review
    use_idf=True) 

In [38]:
train_words = pd.concat([movie_reviews['train']['positive']['reviewText'],
                     movie_reviews['train']['negative']['reviewText']])
y_train = np.concatenate([np.ones((movie_reviews['train']['positive'].shape[0],)), 
                          np.zeros((movie_reviews['train']['negative'].shape[0],))])
test_words = pd.concat([movie_reviews['test']['positive']['reviewText'],
                     movie_reviews['test']['negative']['reviewText']])
y_test = np.concatenate([np.ones((movie_reviews['test']['positive'].shape[0],)), 
                          np.zeros((movie_reviews['test']['negative'].shape[0],))])

In [39]:
SPARSE = True

if SPARSE:
    # Optimization: add the review length while keeping sparse matrix
    tf_train = tfidf.fit_transform(train_words)
    tf_test = tfidf.transform(test_words)
else:
    tf_train = tfidf.fit_transform(train_words).todense()
    tf_test = tfidf.transform(test_words).todense()

In [40]:
# print(len(tfidf.vocabulary_))
# tfidf.vocabulary_

## Add review length to modeling input

In [41]:
ADD_LENGTH = False

if ADD_LENGTH:
    if SPARSE:
        # Hack: pick an existing word to store the count
        len_idx = 0
        test_lengths = [len(words) for words in test_words]

        for idx,words in enumerate(train_words):
            tf_train[idx][len_idx] = len(words)
        for idx,words in enumerate(test_words):
            tf_test[idx][len_idx] = len(words)
        X_train = tf_train
        X_test = tf_test
    else:
        train_lengths = np.array([len(words) for words in train_words]).reshape(-1,1)
        test_lengths = np.array([len(words) for words in test_words]).reshape(-1,1)
        X_train = np.concatenate([tf_train, train_lengths],axis=1)
        X_test = np.concatenate([tf_test, test_lengths],axis=1)
else:
    X_train = tf_train
    X_test = tf_test

### Test and save

In [42]:
if X_train.shape[0] != y_train.shape[0] or X_test.shape[0] != y_test.shape[0]:
    print('@@@ Problem! @@@')
    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)

In [43]:
if False:
    pickle_out = open(data_path 
                      + 'tfidf_' 
                      + str(X_train.shape[0]) + 'Pos_Neg_Samples_'
                      + str(X_train.shape[1]) + 'Feats.pkl'
                      ,"wb")
    pickle.dump(tfidf, pickle_out)
    pickle_out.close()

## Gradient Boosting Classifier for Base

In [44]:
# Gradient Boosting Classifier parameters
# N_TREES = math.floor(np.sqrt(NB_SAMPLES) * 1.2)
N_TREES = 500
LEARN_RATE = 0.2
MAX_DEPTH = 8
MIN_IN_LEAF = 5 #7
MAX_FEATURES = 'sqrt'

In [45]:
gbc = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
                                n_estimators=N_TREES, 
                                min_samples_leaf=MIN_IN_LEAF,
                                max_depth=MAX_DEPTH,
                                max_features=MAX_FEATURES)

In [46]:
gbc.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.2, loss='deviance', max_depth=8,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=5, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [47]:
if False:
    pickle.dump(gbc, open(data_path + 'GBC_'
                       + str(NB_SAMPLES) + '_samples_'
                       + str(N_TREES) + '_trees_' 
                       + str(LEARN_RATE) + '_lr_' 
                       + str(MAX_DEPTH) + '_maxdpth_'
                       + str(MIN_IN_LEAF) + '_minleaf_'
                       + str(MAX_FEATURES) + '_feats_'
                       + '.pkl', 'wb'))

In [48]:
%reload_ext autoreload

print(MAX_FEATURES, ' features', N_TREES,'trees; ',
      LEARN_RATE,'learn_rate; ', MAX_DEPTH, 'max_dpth; ',
      MIN_IN_LEAF, 'min_in_leaf')
classifier_report(gbc, X_train, y_train,
                  'Gradient Boosting Classifier on training set')
classifier_report(gbc, X_test, y_test, 
                  'Gradient Boosting Classifier on test set')

sqrt  features 500 trees;  0.2 learn_rate;  8 max_dpth;  5 min_in_leaf


### Report for Gradient Boosting Classifier on training set:

##### Off diagonal: 5%

#### Confusion Matrix:

[[54241  2978]
 [ 2356 54837]]


#### Classification Report:

              precision    recall  f1-score   support

         0.0       0.96      0.95      0.95     57219
         1.0       0.95      0.96      0.95     57193

    accuracy                           0.95    114412
   macro avg       0.95      0.95      0.95    114412
weighted avg       0.95      0.95      0.95    114412



### Report for Gradient Boosting Classifier on test set:

##### Off diagonal: 11%

#### Confusion Matrix:

[[12608  1711]
 [ 1373 12936]]


#### Classification Report:

              precision    recall  f1-score   support

         0.0       0.90      0.88      0.89     14319
         1.0       0.88      0.90      0.89     14309

    accuracy                           0.89     28628
   macro avg       0.89      0.89      0.89     28628
weighted avg       0.89      0.89      0.89     28628



In [49]:
print('SAMPLE_FRACTION:', SAMPLE_FRACTION,'ADD_LENGTH:',ADD_LENGTH,' SPARSE:',SPARSE,' MAX_FEATURES:',MAX_FEATURES)

SAMPLE_FRACTION: 0.4 ADD_LENGTH: False  SPARSE: True  MAX_FEATURES: sqrt


## Grid search

In [None]:
if False:
    gb_pipe = Pipeline([('vect', tfidf), ('gb', gbc)])
    gb_pipe.fit(X_train, y_train)
    pickle.dump(gb_pipe, open('pickles/GBCpipe_balanced_comments_'
                           + str(N_TREES) + '_trees_' 
                           + str(LEARN_RATE) + '_lr_' 
                           + str(MAX_DEPTH) + '_maxdpth_'
                           + str(MIN_IN_LEAF) + '_minleaf_'
                           + str(MAX_FEATURES) + '_feats_'
                           + '.pkl', 'wb'))
else:
#     pickle_in = open("pickles/GBC_balanced_comments_300_trees_0.1_lr_15_maxdpth_2_minleaf_20000_feats_.pkl",
#                      "rb")
#     gb_pipe = pickle.load(pickle_in)

In [None]:
# if True:
#     grid = {
#         'learning_rate': [.1,0.2,0.3],
#         'max_depth': [8],
#         'min_samples_leaf': [5],
#         'max_features': ['sqrt'],
#         'n_estimators': [300],
#         'random_state': [0]
#     }
# else:  # TEST
#     grid = {
#     'learning_rate': [1],
#     'max_depth': [2], 
#     'min_samples_leaf': [2],
# #     'max_features': ['sqrt', None],
#     'n_estimators': [2],
#     'random_state': [0]
# }
    
# # confusion_score = make_scorer(confusion_rmse, greater_is_better=False)

# gbc_grid_cv = GridSearchCV(
#     GradientBoostingClassifier(), 
#     grid,
#     cv=4,  # number of folds
#     return_train_score=True,
#     verbose=1, 
#     n_jobs=-1)
# gbc_grid_cv.fit(X_train, y_train)

In [None]:
# y_pred = gbc_grid_cv.predict(X_test)

In [None]:
# print('SAMPLE_FRACTION:', SAMPLE_FRACTION,'ADD_LENGTH:',ADD_LENGTH,' SPARSE:',SPARSE,' MAX_FEATURES:',MAX_FEATURES)

In [None]:
# print(gbc_grid_cv.best_params_)
# print(gbc_grid_cv.best_score_)
# res_df = pd.DataFrame(gbc_grid_cv.cv_results_)
# res_df