# Hypothesis to test:
### Removing objective sentences from reviews helps predict star rating from reviews

In [1]:
import numpy as np
import pandas as pd
import pickle
import gzip
import math
import random
from IPython.display import Markdown, display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, \
GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, \
classification_report, make_scorer
import statsmodels.api as sm

In [2]:
import sys
sys.path.append('..')

In [3]:
# From this project
from utils import rmse, rmse_train_cv, classifier_report, confusion_rmse
from NLP import WordBag, AboutMovie
from subjective_filter import SubjectiveFilter

# Avoid restarting Kernel
%load_ext autoreload
%autoreload 2

pd.set_option('display.max_colwidth', -1)

# %autosave 50

## Configuration

In [4]:
data_path = '../../../datasets/'


In [5]:
pickle_in = open(data_path
                + 'movie_reviews_144000Pos_Neg_Samples.pkl'
                , "rb")
test_dic = pickle.load(pickle_in)

## Remove objective sentences for case B

In [6]:
OBJ_THRESHOLD = 0.8

In [7]:
%reload_ext autoreload
obj_path = '../obj_subj_dev/'
fit_obj_tf = obj_path + 'fit_tfidf_vectorizer_for_obj_subj_sentences_classification.pkl'
fit_obj_model = obj_path + 'GBC_300_0.5_5_0.88cv.pkl'
subj_filter = SubjectiveFilter(fit_obj_tf, fit_obj_model)

In [10]:
# for tt in ['train','test']:
#     for pn in ['positive','negative']:
#         print(tt,pn,test_dic[tt][np].shape)

for tt in test_dic.values():
    for df in tt.values():
        display(df.shape)

(40164, 5)

(41953, 5)

(10071, 5)

(10431, 5)

In [15]:
test_dic['train']['positive'].iloc[40162:40200,:]

Unnamed: 0,reviewerID,asin,reviewText,overall,words
1897726,A3CVEWCVAKE93O,B00020BVYW,"It was a good movie showing the harshness of Alaska, and the human spirit to keep going. I liked it and would recommend it.",4.0,"[wa, good, movie, showing, harshness, alaska, and, human, spirit, keep, going, liked, and, would, recommend]"
1897729,A12KWSWHLN5YE1,B00020BVYW,"We really enjoyed this movie..scary survival scenario under any circumstances, gripping right to the end. Just the right mix of fear and feel good for is.",4.0,"[really, enjoyed, movie..scary, survival, scenario, circumstance, gripping, right, end, right, mix, fear, and, feel, good]"


In [None]:
# from IPython.display import Markdown, display

# for tt in test_dic.values():
#     for df in tt.values():
#         display(df.head(3))

In [None]:
# type(test_dic['train']['positive']['reviewText'])

In [None]:
# test = test_dic['train']['positive'].drop(['sentence','words'],axis=1).iloc[:2,:]  # 
# test

In [None]:
# res = subj_filter.transform(
#             test,
#             'reviewText', 
#             threshold=OBJ_THRESHOLD,
#             debug_level=2)

In [None]:
tp = subj_filter.transform(
            test_dic['train']['positive'].drop(['sentence','words'],axis=1,errors='ignore'),
            'reviewText', 
            threshold=OBJ_THRESHOLD,
            debug_level=2)

In [None]:
pickle_out = open(data_path + 'tp_subj_movie_reviews_0.8.pkl', "wb")
pickle.dump(movie_reviews, pickle_out)

In [19]:
df.iloc[:5,:]

Unnamed: 0,reviewerID,asin,reviewText,overall
861912,A3KEABH9BZ1MPB,6303625800,The adults found it very boring and the children ages 8 and 10 said it was just ok,1.0
861931,A1EBEAG4PJOIB2,6303625800,"Love this film, especially at Thanksgiving. But as long as it's only available in the phony and insulting colorized version I won't touch it. This is a piece of art. Shame on those who tamper with this and other films. It's like someone drawing a mustache on the Mona Lisa.",1.0
862419,A3MV1KKHX51FYT,6303626475,"This is the film version of the J.B. Priestly horror novel. It was a dark and stormy night, a couple is driving on a muddy country road through the Welsh mountains. Their touring car does not have side curtains. They are lost, and pull into a house with lights. The Femm household has some strange characters. This house has its own electricity. There is conflict among the household, like when prayers are offered before a meal. Then a knock on the door brings another couple to this house; they are also lost. The guests tell about themselves. [Filming in the rain must have been a technical breakthrough that year.]When they go to fetch a lamp they hear a strange sound. Is there someone in a locked room? Does the butler have a drinking problem? Will something terrible happen? If a madman is locked up what will happen if he is released? Will the innocent survive and the guilty die?This is boring and almost unwatchable film, a waste of time. It does show the performances of some actors who became more famous years later. Some of the scenes reflect the hidden quirks of the actors.",1.0
862466,AM55WQ075SIK7,6303626475,the movie called The Old Dark House should be called The Old Dark Horrible Movie. Terrible!!!!! I'm sorry I listened to the other reviews. I gave it to Goodwill.,1.0
862856,A9XKE4OE48BNK,6303631940,"This is actually neck-in-neck with Colin Baker's horrible regeneration story 'The Twin Dilemma'. This mess isn't really Sylvester McCoy's fault. The script is wretched, the plot is nonsensical (even by Who standards), and the Rani actually seems to be looking at the TV audience at one point. Ugh. It, like all of McCoy's first season, also commits the cardinal sin of being both excruciating and boring. Don't give up on McCoy though. Check out some of his stories from later in his run like Remembrance of the Daleks and Curse of Fenric.",1.0


In [24]:
res['test']['negative']

Unnamed: 0,reviewerID,asin,reviewText,overall
0,A1EBEAG4PJOIB2,6303625800,"Love this film, especially at Thanksgiving. But as long as it's only available in the phony and insulting colorized version I won't touch it. This is a piece of art. Shame on those who tamper with this and other films. It's like someone drawing a mustache on the Mona Lisa. Love this film, especially at Thanksgiving. But as long as it's only available in the phony and insulting colorized version I won't touch it. This is a piece of art. Shame on those who tamper with this and other films. It's like someone drawing a mustache on the Mona Lisa.",1.0
1,A3KEABH9BZ1MPB,6303625800,The adults found it very boring and the children ages 8 and 10 said it was just ok The adults found it very boring and the children ages 8 and 10 said it was just ok,1.0
2,A3MV1KKHX51FYT,6303626475,"This is the film version of the J.B. Priestly horror novel. The Femm household has some strange characters. This house has its own electricity. There is conflict among the household, like when prayers are offered before a meal. The guests tell about themselves. [Filming in the rain must have been a technical breakthrough that year. Is there someone in a locked room? Does the butler have a drinking problem? Will something terrible happen? If a madman is locked up what will happen if he is released? Will the innocent survive and the guilty die?This is boring and almost unwatchable film, a waste of time. It does show the performances of some actors who became more famous years later. Some of the scenes reflect the hidden quirks of the actors.",1.0
3,AM55WQ075SIK7,6303626475,the movie called The Old Dark House should be called The Old Dark Horrible Movie. Terrible!!!!! I'm sorry I listened to the other reviews. I gave it to Goodwill. the movie called The Old Dark House should be called The Old Dark Horrible Movie. Terrible!!!!! I'm sorry I listened to the other reviews. I gave it to Goodwill.,1.0
4,A9XKE4OE48BNK,6303631940,"This is actually neck-in-neck with Colin Baker's horrible regeneration story 'The Twin Dilemma'. This mess isn't really Sylvester McCoy's fault. The script is wretched, the plot is nonsensical (even by Who standards), and the Rani actually seems to be looking at the TV audience at one point. Ugh. It, like all of McCoy's first season, also commits the cardinal sin of being both excruciating and boring. Don't give up on McCoy though. Check out some of his stories from later in his run like Remembrance of the Daleks and Curse of Fenric. This is actually neck-in-neck with Colin Baker's horrible regeneration story 'The Twin Dilemma'. This mess isn't really Sylvester McCoy's fault. The script is wretched, the plot is nonsensical (even by Who standards), and the Rani actually seems to be looking at the TV audience at one point. Ugh. It, like all of McCoy's first season, also commits the cardinal sin of being both excruciating and boring. Don't give up on McCoy though. Check out some of his stories from later in his run like Remembrance of the Daleks and Curse of Fenric.",1.0


In [25]:
CHUNK_SZ = 5000
res = {'train':{},'test':{}}

for ttname, tt in test_dic.items():
    for pn, df in tt.items():
        df = df.drop(['sentence','words'],axis=1,errors='ignore')
        df_list = []
        start = 0
        while start < df.shape[0]:
            end = start + CHUNK_SZ
            df_list.append(
                subj_filter.transform(
                    df.iloc[start:end,:],
                    'reviewText', 
                    threshold=OBJ_THRESHOLD,
                    debug_level=0) \
                .rename(columns={"sentence": "reviewText"}))
            start = end

        res[ttname][pn] = df_list.pop()
        while len(df_list) > 0:
            res[ttname][pn] = pd.merge(df_list.pop(), res[ttname][pn], how='outer')
#         res[ttname][pn] = reduce(lambda x, y: pd.merge(x, y, on = ['reviewerID','asin']), dfList)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 5902 (15%) objective sentences

#### => Removed 37 (1%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 6064 (15%) objective sentences

#### => Removed 37 (1%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 5923 (16%) objective sentences

#### => Removed 37 (1%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 6436 (16%) objective sentences

#### => Removed 42 (1%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 6141 (16%) objective sentences

#### => Removed 38 (1%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 5373 (15%) objective sentences

#### => Removed 38 (1%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 5286 (15%) objective sentences

#### => Removed 38 (1%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 6017 (16%) objective sentences

#### => Removed 36 (1%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 72 (11%) objective sentences

#### => Removed 2 (1%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 3218 (9%) objective sentences

#### => Removed 17 (0%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 3119 (8%) objective sentences

#### => Removed 14 (0%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 3359 (9%) objective sentences

#### => Removed 17 (0%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 3725 (9%) objective sentences

#### => Removed 13 (0%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 3690 (9%) objective sentences

#### => Removed 19 (0%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 4048 (10%) objective sentences

#### => Removed 9 (0%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 3405 (9%) objective sentences

#### => Removed 21 (0%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 3721 (10%) objective sentences

#### => Removed 21 (0%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 1338 (9%) objective sentences

#### => Removed 5 (0%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 6199 (16%) objective sentences

#### => Removed 34 (1%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 5785 (15%) objective sentences

#### => Removed 36 (1%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 48 (13%) objective sentences

#### => Removed 3 (4%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 3411 (9%) objective sentences

#### => Removed 10 (0%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 3850 (9%) objective sentences

#### => Removed 14 (0%) reviews with no emotional content

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['sentence'] = df['reviewText'].map(


#### => Removed 280 (9%) objective sentences

#### => Removed 3 (1%) reviews with no emotional content

In [35]:
total = 0
old_total = 0
for tt in ['train', 'test']:
    for pn in ['positive', 'negative']:
        print('\n {} {}'.format(tt,pn))
        tot = test_dic[tt][pn].shape[0]
        new = res[tt][pn].shape[0]
        dropped = tot - new
        print('Dropped: {0} ({1:.1%})'.format(dropped, dropped/tot))
        total += new
        old_total += tot
print('\nOld total:', old_total)
print('New total:', total)
print('Removed a total of:', old_total - total)


 train positive
Dropped: 305 (0.8%)

 train negative
Dropped: 136 (0.3%)

 test positive
Dropped: 73 (0.7%)

 test negative
Dropped: 27 (0.3%)

Old total: 102619
New total: 102078
Removed a total of: 541


In [34]:
pickle_out = open(data_path
                    + 'subj_movie_reviews_0.8.pkl'
                    , "wb")
pickle.dump(res, pickle_out)
pickle_out.close()

In [None]:
if 'A' in CASES:
    abtest['A'] = {
            'train': {
                'reviews_df': reviews['train']
            },
            'test': {
                'reviews_df': reviews['test']
            }
        }

if 'B' in CASES:
    'B': {
        'train': {
            'reviews_df': subj_filter.transform(reviews['train'],'reviewText', 
                                        threshold=OBJ_THRESHOLD,
                                        debug_level=2)
                                    .rename(columns={"sentence": "reviewText"}),
        },
        'test': {
            'reviews_df': subj_filter.transform(reviews['test'],'reviewText',
                                        threshold=OBJ_THRESHOLD,
                                        debug_level=2)
                                    .rename(columns={"sentence": "reviewText"}),
        }
    }

In [None]:
# res = subj_filter.transform(test_reviews.iloc[:200,:],'reviewText', debug_level=10)

In [None]:
def info(ab, train_test, df, nb_lines=3):
    print(abtest[ab][train_test][df].shape)
    display(abtest[ab][train_test][df].head(nb_lines))
info('A','test','reviews_df', 2)
info('B','test','reviews_df', 2)

## Create bag of words
Remove accents  
Tokenize  
Lower the case  
Apply custom stop words (keep all negations)  
Remove all non alphabetic characters  
Lematize  
 
Output:  
One list of words for each review 

In [None]:
abtest.pop('y', None)

In [None]:
word_bag = WordBag()
for train_test_dico in abtest.values():
    for dico in train_test_dico.values():
        # print(dico['reviews_df'].keys())
        dico['reviews_df']['word_bag'] = word_bag.create_word_bag(
            dico['reviews_df']['reviewText'], 
            remove_stop_words=True, 
            lemmatize=True)

In [None]:
info('A','test','reviews_df', 1)
# info('B','test','reviews_df', 2)

In [None]:
# %reload_ext autoreload
# word_bag = WordBag()

# for i in ['train','test']:
#     for j in ['positive','negative']:
#         train_test_dic[i][j]['words'] = \
#             word_bag.create(train_test_dic[i][j]['reviewText'])

## Remove reviews that may not be on the movie, but on Amazon/support instead
Input: 
* word tokens 
* one line per review 

In [None]:
%reload_ext autoreload
about_movie = AboutMovie()

for train_test_dico in abtest.values():
    for dico in train_test_dico.values():
        dico['movie_reviews'] = dico['reviews_df'][[about_movie.check(words) \
                                for words in dico['reviews_df']['word_bag']]]
        

In [None]:
info('A','test','movie_reviews', 1)
# info('B','test','reviews_df', 2)

In [None]:
# %reload_ext autoreload
# about_movie = AboutMovie()
# movie_reviews = {'train':{}, 'test':{}}
# for i in ['train','test']:
#     for j in ['positive','negative']:
#          movie_reviews[i][j] = train_test_dic[i][j][[about_movie.check(words) \
#                                                     for words in train_test_dic[i][j]['words']]]

In [None]:
# train_test_dic['test']['positive'][[not i for i in \
#                                     [about_movie.check(words) for words in train_test_dic[i][j]['words']]]]

In [None]:
tot_reviews = 0

for ab_name, train_test_dico in abtest.items():
    for train_test_name, dico in train_test_dico.items():
        removed = dico['reviews_df'].shape[0] - dico['movie_reviews'].shape[0]
        tot_reviews += dico['reviews_df'].shape[0]
        print('Removed {0} ({1:.0%}) {2} {3} reviews'
              .format(removed, removed / dico['reviews_df'].shape[0],
                      train_test_name, ab_name))
        

In [None]:
# tot_reviews = 0
# for i in ['train','test']:
#     for j in ['positive','negative']:
#         removed = train_test_dic[i][j].shape[0] - movie_reviews[i][j].shape[0]
#         tot_reviews += train_test_dic[i][j].shape[0]
#         print('Removed {0} ({1:.0%}) {2} {3} reviews'.format(removed, removed / train_test_dic[i][j].shape[0],
#                                                 i, j))

In [None]:
# save results
import datetime
if False:
    currentDT = datetime.datetime.now()
    for i in ['train','test']:
        for j in ['positive','negative']:
            train_test_dic[i][j].to_hdf(data_path + currentDT.strftime("%Y-%m-%d-%H-%M-%S") + '_' + 
                      str(tot_reviews) + '_cleaned_reviews_before_B_' + i + '_' + j + '.pkl'
              , key='df', mode='w', complevel=9)

### TF-IDF setup

In [None]:
MAX_FEATURES = 10000

In [None]:
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',       # Feed a list of words to TF-IDF
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    lowercase=False, 
    stop_words=None, 
    max_features=MAX_FEATURES,
    norm='l2',            # normalize each review
    use_idf=True)        # Keep high weight for most common words

In [None]:
SPARSE = True

for ab_dico in abtest.values():
    for dico in ab_dico.values():
        dico['tf_transform'] = tfidf.fit_transform(dico['movie_reviews']['word_bag'])
        if not SPARSE:
            dico['tf_transform'] = dico['tf_transform'].todense()

In [None]:
def print_ab():
    for abname, ab_dico in abtest.items():
        print(abname)
        for ttname, tt in ab_dico.items():
            print('--',ttname)
            if type(tt) == dict:
                for key in tt.keys():
                    print('    ',key)
print_ab()

In [None]:
for ab_dico in abtest.values():
    for tt in ['train','test']:
        print(ab_dico[tt]['tf_transform'].shape)

In [None]:
# SPARSE = True

# if SPARSE:
#     # Optimization: add the review length while keeping sparse matrix
#     tf_train = tfidf.fit_transform(train_words)
#     tf_test = tfidf.transform(test_words)
# else:
#     tf_train = tfidf.fit_transform(train_words).todense()
#     tf_test = tfidf.transform(test_words).todense()

In [None]:
# print(len(tfidf.vocabulary_))
# tfidf.vocabulary_

## Add review length to modeling input

In [None]:
ADD_LENGTH = False

# if ADD_LENGTH:
#     if SPARSE:
#         # Hack: pick an existing word to store the count
#         len_idx = 0
#         test_lengths = [len(words) for words in test_words]

#         for idx,words in enumerate(train_words):
#             tf_train[idx][len_idx] = len(words)
#         for idx,words in enumerate(test_words):
#             tf_test[idx][len_idx] = len(words)
#         X_train = tf_train
#         X_test = tf_test
#     else:
#         train_lengths = np.array([len(words) for words in train_words]).reshape(-1,1)
#         test_lengths = np.array([len(words) for words in test_words]).reshape(-1,1)
#         X_train = np.concatenate([tf_train, train_lengths],axis=1)
#         X_test = np.concatenate([tf_test, test_lengths],axis=1)


## Create and test y

In [None]:
# [(abtest['A']['train']['movie_reviews']['overall'] > 3) * 1.0]
y = {'A':{},'B':{}}
start = 100
end = 103
for ab, dico in abtest.items():
    for tt in ['train','test']:
        y[ab][tt] = (dico[tt]['movie_reviews']['overall'] > 3) * 1.0

        print('\n--------\n{} {} {}'.format(ab, tt, len(y[ab][tt])))
        print(dico[tt]['movie_reviews'][['reviewText','overall']]
              .iloc[start:end,:]) #[:5])
        print(y[ab][tt][start:end]) #[-5:])    

### Test and save

In [None]:
# for train_test_dico in abtest.values():
#     for dico in train_test_dico.values():
#         if dico['tf_transform'].shape[0] != 

In [None]:
# if abtest['B']['train'].shape[0] != y['train'].shape[0] \
#     or abtest['B']['test'] != y['test'].shape[0]:
#     print('@@@ Problem! @@@')
#     print(abtest['B']['train'].shape)
#     print(y['train'].shape)
#     print(abtest['B']['test'].shape)
#     print(y['test'].shape)
# else:
#     print('OK!')

In [None]:
if False:
    pickle_out = open(data_path 
                      + 'tfidf_' 
                      + str(X_train.shape[0]) + 'Pos_Neg_Samples_'
                      + str(X_train.shape[1]) + 'Feats.pkl'
                      ,"wb")
    pickle.dump(tfidf, pickle_out)
    pickle_out.close()

## Gradient Boosting Classifier

In [None]:
# Gradient Boosting Classifier parameters
# N_TREES = math.floor(np.sqrt(NB_SAMPLES) * 1.2)
N_TREES = 500
LEARN_RATE = 0.2
MAX_DEPTH = 8
MIN_IN_LEAF = 5 #7
MAX_FEATURES = 'sqrt'

In [None]:
for ab, ab_dico in abtest.items():
    ab_dico['gbc'] = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
                                n_estimators=N_TREES, 
                                min_samples_leaf=MIN_IN_LEAF,
                                max_depth=MAX_DEPTH,
                                max_features=MAX_FEATURES)

    
    ab_dico['gbc'].fit(ab_dico['train']['tf_transform'], y[ab]['train'])

In [None]:
# gbc = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
#                                 n_estimators=N_TREES, 
#                                 min_samples_leaf=MIN_IN_LEAF,
#                                 max_depth=MAX_DEPTH,
#                                 max_features=MAX_FEATURES)

In [None]:
# gbc.fit(X_train, y_train)

In [None]:
if False:
    pickle.dump(gbc, open(data_path + 'GBC_'
                       + str(NB_SAMPLES) + '_samples_'
                       + str(N_TREES) + '_trees_' 
                       + str(LEARN_RATE) + '_lr_' 
                       + str(MAX_DEPTH) + '_maxdpth_'
                       + str(MIN_IN_LEAF) + '_minleaf_'
                       + str(MAX_FEATURES) + '_feats_'
                       + '.pkl', 'wb'))

In [None]:
for ab, ab_dico in abtest.items():
    classifier_report(ab_dico['gbc'], 
                      ab_dico['train']['tf_transform'], 
                      y[ab]['train'],
                      'Gradient Boosting Classifier on training set {}'.format(ab))

In [None]:
print('SAMPLE_FRACTION:', SAMPLE_FRACTION,'ADD_LENGTH:',ADD_LENGTH,' SPARSE:',SPARSE,' MAX_FEATURES:',MAX_FEATURES)

print(MAX_FEATURES, ' features', N_TREES,'trees; ',
      LEARN_RATE,'learn_rate; ', MAX_DEPTH, 'max_dpth; ',
      MIN_IN_LEAF, 'min_in_leaf')

for ab, ab_dico in abtest.items():
    classifier_report(ab_dico['gbc'], 
                      ab_dico['test']['tf_transform'], 
                      y[ab]['test'],
                      'Gradient Boosting Classifier on test set {}'.format(ab))



In [None]:
# print('SAMPLE_FRACTION:', SAMPLE_FRACTION,'ADD_LENGTH:',ADD_LENGTH,' SPARSE:',SPARSE,' MAX_FEATURES:',MAX_FEATURES)


## Grid search

In [None]:
if False:
    gb_pipe = Pipeline([('vect', tfidf), ('gb', gbc)])
    gb_pipe.fit(X_train, y_train)
    pickle.dump(gb_pipe, open('pickles/GBCpipe_balanced_comments_'
                           + str(N_TREES) + '_trees_' 
                           + str(LEARN_RATE) + '_lr_' 
                           + str(MAX_DEPTH) + '_maxdpth_'
                           + str(MIN_IN_LEAF) + '_minleaf_'
                           + str(MAX_FEATURES) + '_feats_'
                           + '.pkl', 'wb'))
else:
#     pickle_in = open("pickles/GBC_balanced_comments_300_trees_0.1_lr_15_maxdpth_2_minleaf_20000_feats_.pkl",
#                      "rb")
#     gb_pipe = pickle.load(pickle_in)

In [None]:
if False:
    if True:
        grid = {
            'learning_rate': [.1,0.2,0.3],
            'max_depth': [8],
            'min_samples_leaf': [5],
            'max_features': ['sqrt'],
            'n_estimators': [300],
            'random_state': [0]
        }
    else:  # TEST
        grid = {
        'learning_rate': [1],
        'max_depth': [2], 
        'min_samples_leaf': [2],
    #     'max_features': ['sqrt', None],
        'n_estimators': [2],
        'random_state': [0]
    }

    # confusion_score = make_scorer(confusion_rmse, greater_is_better=False)

    gbc_grid_cv = GridSearchCV(
        GradientBoostingClassifier(), 
        grid,
        cv=4,  # number of folds
        return_train_score=True,
        verbose=1, 
        n_jobs=-1)
    gbc_grid_cv.fit(X_train, y_train)

In [None]:
if False:
    y_pred = gbc_grid_cv.predict(X_test)

    print('SAMPLE_FRACTION:', SAMPLE_FRACTION,'ADD_LENGTH:',ADD_LENGTH,' SPARSE:',SPARSE,' MAX_FEATURES:',MAX_FEATURES)

    print(gbc_grid_cv.best_params_)
    print(gbc_grid_cv.best_score_)
    res_df = pd.DataFrame(gbc_grid_cv.cv_results_)
    res_df