# Hypothesis to test:
### Removing objective sentences from reviews helps predict star rating from reviews

In [1]:
import numpy as np
import pandas as pd
import pickle
import gzip
import math
import random
from IPython.display import Markdown, display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, \
GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, \
classification_report, make_scorer
import statsmodels.api as sm

In [2]:
import sys
sys.path.append('..')

In [3]:
# From this project
from utils import rmse, rmse_train_cv, classifier_report, confusion_rmse
from NLP import WordBag, AboutMovie
from subjective_filter import SubjectiveFilter

# Avoid restarting Kernel
%load_ext autoreload
%autoreload 2

pd.set_option('display.max_colwidth', -1)

# %autosave 50

## Configuration

In [7]:
# Subsampling from Amazon reviews
NB_SAMPLES = 360000 #4000  # up to 200k, then change the input file
SAMPLE_FRACTION = 0.1
CASES = ['A']

data_path = '../../../datasets/'


## Get users' positive and negative reviews

In [5]:
# file_name = '360000_balanced_train_test_reviews.pkl'
file_name = '_balanced_pos_neg_train_test_reviews.pkl'

In [6]:
pickle_in = open(data_path + str(NB_SAMPLES) + file_name,"rb")
train_test_dic0 = pickle.load(pickle_in)

## Subsample

In [8]:
train_test_dic = {'train': {}, 'test':{}}

In [9]:
for i in ['train','test']:
    for j in ['positive','negative']:
         train_test_dic[i][j] = train_test_dic0[i][j] \
            .iloc[:math.floor(len(train_test_dic0[i][j].index) * SAMPLE_FRACTION), :] \
            .drop(['reviewerName', 'helpful', 'summary', 'unixReviewTime', 'reviewTime'], axis=1)
            

## Create Train-CV and Test sets

In [10]:
reviews = {}
y = {}
for i in ['train','test']:
    reviews[i] = pd.concat([train_test_dic[i]['positive'],
                     train_test_dic[i]['negative']]).reset_index(drop=True)
#     y[i] = np.concatenate([np.ones((train_test_dic[i]['positive'].shape[0],)), 
#                           np.zeros((train_test_dic[i]['negative'].shape[0],))])

In [11]:
def info0(dic, nb_lines=2):
    for i in ['train','test']:
        display(Markdown('#### {}:'.format(i)))
        print(dic[i].shape)
        display(dic[i].head(nb_lines))

In [12]:
info0(reviews)

#### train:

(28800, 4)


Unnamed: 0,reviewerID,asin,reviewText,overall
0,A32244V7CQUBD6,B00005QFEK,This video actually focuses mostly on one of the characters that Emmanuelle (Krista Allen) is trying to teach about sex & love. It's still pretty entertaining but if you are mostly interested in Kirsta Allen then you should know that she's not really in much of this episode.,4.0
1,A32244V7CQUBD6,B00005QFEN,"This episode pretty much has Hafron and Emmanuelle teleporting to different parts of the world and &quot;doing it&quot;. There is the continuing plot from an earlier episode of some group on Earth trying to track them down. That's the main reason for Emmanuelle and Hafron to jump to different parts of the world. Otherwise, this episode is mostly sex scenes.",4.0


#### test:

(7200, 4)


Unnamed: 0,reviewerID,asin,reviewText,overall
0,A2D832OA6Q5ZAS,B00005QFFP,"What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known. Like a great actress, she dissolves into her role, and yet her voice and style are easily recognizable for the effortless simplicity with which she nails every note and figure. I wish they could have taped in color in those days, and I would have liked to see her perform more examples of the coloratura repertoire and less of the popular romantic themes. Overall, I am delighted with this video.",4.0
1,A1JF78EP4GPAOO,B00005QG2N,The concert is fantastic as are the videos and Cradle of Fear trailer. The rest however seems to be uninspired filler. I would have liked this better if it included the Pandaemonaeon music videos and Her Ghost in the Fog. Where was the BBC documentary?,4.0


## Remove objective sentences for case B

In [13]:
OBJ_THRESHOLD = 0.8

In [14]:
%reload_ext autoreload
obj_path = '../obj_subj_dev/'
fit_obj_tf = obj_path + 'fit_tfidf_vectorizer_for_obj_subj_sentences_classification.pkl'
fit_obj_model = obj_path + 'GBC_300_0.5_5_0.88cv.pkl'
subj_filter = SubjectiveFilter(fit_obj_tf, fit_obj_model)

In [15]:
if 'A' in CASES:
    abtest['A'] = {
            'train': {
                'reviews_df': reviews['train']
            },
            'test': {
                'reviews_df': reviews['test']
            }
        }

if 'B' in CASES:
    'B': {
        'train': {
            'reviews_df': subj_filter.transform(reviews['train'],'reviewText', 
                                        threshold=OBJ_THRESHOLD,
                                        debug_level=2)
                                    .rename(columns={"sentence": "reviewText"}),
        },
        'test': {
            'reviews_df': subj_filter.transform(reviews['test'],'reviewText',
                                        threshold=OBJ_THRESHOLD,
                                        debug_level=2)
                                    .rename(columns={"sentence": "reviewText"}),
        }
    }

#### => Removed 26412 (11%) objective sentences

#### => Removed 130 (0%) reviews with no emotional content

OK! Split-merge match


##### A few objective sentences removed:

Unnamed: 0,reviewerID,asin,overall,sentence
3,A33P47VEH0YULL,B00005QG2N,4.0,"A well put together DVD for the ""Stinkiest band"" in the world."
18,A20EEWWSFMZ1PN,B00005QIVV,4.0,"Based on Hugh Wiley's inscrutable Oriental detective Mr. Wong, Boris Karloff is called upon by his detective friend to solve the murder of undercover detective Dan O'Grady that includes smuggling."
21,A3NKLBL5XYHRH1,B00005QJHJ,4.0,"This concert shows Bonnie's versatility as a performer, from her brand of country-rock to blues to pure vocalist."


#### => Removed 7015 (12%) objective sentences

#### => Removed 24 (0%) reviews with no emotional content

OK! Split-merge match


##### A few objective sentences removed:

Unnamed: 0,reviewerID,asin,overall,sentence
0,A2D832OA6Q5ZAS,B00005QFFP,4.0,What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known.
1,A1JF78EP4GPAOO,B00005QG2N,4.0,The concert is fantastic as are the videos and Cradle of Fear trailer.
3,AINEXVPR5094O,B00005QIVM,4.0,storyline: eking cheng is like former gang boss that retired and is now restaurant worker or somthjing...his gf turns into a gangsta re[prsentin...so she tries to bring him back to thegangsta world...pretty ok action...


In [16]:
# res = subj_filter.transform(test_reviews.iloc[:200,:],'reviewText', debug_level=10)

In [17]:
def info(ab, train_test, df, nb_lines=3):
    print(abtest[ab][train_test][df].shape)
    display(abtest[ab][train_test][df].head(nb_lines))
info('A','test','reviews_df', 2)
info('B','test','reviews_df', 2)

(7200, 5)


Unnamed: 0,reviewerID,asin,reviewText,overall,sentence
0,A2D832OA6Q5ZAS,B00005QFFP,"What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known. Like a great actress, she dissolves into her role, and yet her voice and style are easily recognizable for the effortless simplicity with which she nails every note and figure. I wish they could have taped in color in those days, and I would have liked to see her perform more examples of the coloratura repertoire and less of the popular romantic themes. Overall, I am delighted with this video.",4.0,"[What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known., Like a great actress, she dissolves into her role, and yet her voice and style are easily recognizable for the effortless simplicity with which she nails every note and figure., I wish they could have taped in color in those days, and I would have liked to see her perform more examples of the coloratura repertoire and less of the popular romantic themes., Overall, I am delighted with this video.]"
1,A1JF78EP4GPAOO,B00005QG2N,The concert is fantastic as are the videos and Cradle of Fear trailer. The rest however seems to be uninspired filler. I would have liked this better if it included the Pandaemonaeon music videos and Her Ghost in the Fog. Where was the BBC documentary?,4.0,"[The concert is fantastic as are the videos and Cradle of Fear trailer., The rest however seems to be uninspired filler., I would have liked this better if it included the Pandaemonaeon music videos and Her Ghost in the Fog., Where was the BBC documentary?]"


(7176, 4)


Unnamed: 0,reviewerID,asin,reviewText,overall
0,A03950922R6OEJTOJ49IY,B000063TQS,Was a great gift for my wife. Not really my show but I've watched a few and it is humorous. Was a great gift for my wife. Not really my show but I've watched a few and it is humorous.,4.0
1,A08280701W5RIXUPR6BZT,6304107641,I never gotr it and Amazon still charged me for it and then declined my card and now what do i think i think this whole thing stinks and i want my money back I never gotr it and Amazon still charged me for it and then declined my card and now what do i think i think this whole thing stinks and i want my money back,1.0


## Create bag of words
Remove accents  
Tokenize  
Lower the case  
Apply custom stop words (keep all negations)  
Remove all non alphabetic characters  
Lematize  
 
Output:  
One list of words for each review 

In [18]:
abtest.pop('y', None)

In [19]:
word_bag = WordBag()
for train_test_dico in abtest.values():
    for dico in train_test_dico.values():
        # print(dico['reviews_df'].keys())
        dico['reviews_df']['word_bag'] = word_bag.create_word_bag(
            dico['reviews_df']['reviewText'], 
            remove_stop_words=True, 
            lemmatize=True)

In [20]:
info('A','test','reviews_df', 1)
# info('B','test','reviews_df', 2)

(7200, 6)


Unnamed: 0,reviewerID,asin,reviewText,overall,sentence,word_bag
0,A2D832OA6Q5ZAS,B00005QFFP,"What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known. Like a great actress, she dissolves into her role, and yet her voice and style are easily recognizable for the effortless simplicity with which she nails every note and figure. I wish they could have taped in color in those days, and I would have liked to see her perform more examples of the coloratura repertoire and less of the popular romantic themes. Overall, I am delighted with this video.",4.0,"[What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known., Like a great actress, she dissolves into her role, and yet her voice and style are easily recognizable for the effortless simplicity with which she nails every note and figure., I wish they could have taped in color in those days, and I would have liked to see her perform more examples of the coloratura repertoire and less of the popular romantic themes., Overall, I am delighted with this video.]","[pleasure, see, peerless, diva, perform, purest, example, art, ever, known, like, great, actress, dissolve, role, and, yet, voice, and, style, easily, recognizable, effortless, simplicity, nail, every, note, and, figure, wish, could, taped, color, day, and, would, liked, see, perform, more, example, coloratura, repertoire, and, le, popular, romantic, theme, overall, delighted, video]"


In [21]:
# %reload_ext autoreload
# word_bag = WordBag()

# for i in ['train','test']:
#     for j in ['positive','negative']:
#         train_test_dic[i][j]['words'] = \
#             word_bag.create(train_test_dic[i][j]['reviewText'])

## Remove reviews that may not be on the movie, but on Amazon/support instead
Input: 
* word tokens 
* one line per review 

In [22]:
%reload_ext autoreload
about_movie = AboutMovie()

for train_test_dico in abtest.values():
    for dico in train_test_dico.values():
        dico['movie_reviews'] = dico['reviews_df'][[about_movie.check(words) \
                                for words in dico['reviews_df']['word_bag']]]
        

In [23]:
info('A','test','movie_reviews', 1)
# info('B','test','reviews_df', 2)

(5083, 6)


Unnamed: 0,reviewerID,asin,reviewText,overall,sentence,word_bag
0,A2D832OA6Q5ZAS,B00005QFFP,"What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known. Like a great actress, she dissolves into her role, and yet her voice and style are easily recognizable for the effortless simplicity with which she nails every note and figure. I wish they could have taped in color in those days, and I would have liked to see her perform more examples of the coloratura repertoire and less of the popular romantic themes. Overall, I am delighted with this video.",4.0,"[What a pleasure to see this peerless diva perform -- she is the purest example of her art that I have ever known., Like a great actress, she dissolves into her role, and yet her voice and style are easily recognizable for the effortless simplicity with which she nails every note and figure., I wish they could have taped in color in those days, and I would have liked to see her perform more examples of the coloratura repertoire and less of the popular romantic themes., Overall, I am delighted with this video.]","[pleasure, see, peerless, diva, perform, purest, example, art, ever, known, like, great, actress, dissolve, role, and, yet, voice, and, style, easily, recognizable, effortless, simplicity, nail, every, note, and, figure, wish, could, taped, color, day, and, would, liked, see, perform, more, example, coloratura, repertoire, and, le, popular, romantic, theme, overall, delighted, video]"


In [24]:
# %reload_ext autoreload
# about_movie = AboutMovie()
# movie_reviews = {'train':{}, 'test':{}}
# for i in ['train','test']:
#     for j in ['positive','negative']:
#          movie_reviews[i][j] = train_test_dic[i][j][[about_movie.check(words) \
#                                                     for words in train_test_dic[i][j]['words']]]

In [25]:
# train_test_dic['test']['positive'][[not i for i in \
#                                     [about_movie.check(words) for words in train_test_dic[i][j]['words']]]]

In [26]:
tot_reviews = 0

for ab_name, train_test_dico in abtest.items():
    for train_test_name, dico in train_test_dico.items():
        removed = dico['reviews_df'].shape[0] - dico['movie_reviews'].shape[0]
        tot_reviews += dico['reviews_df'].shape[0]
        print('Removed {0} ({1:.0%}) {2} {3} reviews'
              .format(removed, removed / dico['reviews_df'].shape[0],
                      train_test_name, ab_name))
        

Removed 8420 (29%) train A reviews
Removed 2117 (29%) test A reviews
Removed 8286 (29%) train B reviews
Removed 2098 (29%) test B reviews


In [27]:
# tot_reviews = 0
# for i in ['train','test']:
#     for j in ['positive','negative']:
#         removed = train_test_dic[i][j].shape[0] - movie_reviews[i][j].shape[0]
#         tot_reviews += train_test_dic[i][j].shape[0]
#         print('Removed {0} ({1:.0%}) {2} {3} reviews'.format(removed, removed / train_test_dic[i][j].shape[0],
#                                                 i, j))

In [28]:
# save results
import datetime
if False:
    currentDT = datetime.datetime.now()
    for i in ['train','test']:
        for j in ['positive','negative']:
            train_test_dic[i][j].to_hdf(data_path + currentDT.strftime("%Y-%m-%d-%H-%M-%S") + '_' + 
                      str(tot_reviews) + '_cleaned_reviews_before_B_' + i + '_' + j + '.pkl'
              , key='df', mode='w', complevel=9)

### TF-IDF setup

In [29]:
MAX_FEATURES = 10000

In [30]:
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',       # Feed a list of words to TF-IDF
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    lowercase=False, 
    stop_words=None, 
    max_features=MAX_FEATURES,
    norm='l2',            # normalize each review
    use_idf=True)        # Keep high weight for most common words

In [31]:
SPARSE = True

for ab_dico in abtest.values():
    for dico in ab_dico.values():
        dico['tf_transform'] = tfidf.fit_transform(dico['movie_reviews']['word_bag'])
        if not SPARSE:
            dico['tf_transform'] = dico['tf_transform'].todense()

In [32]:
def print_ab():
    for abname, ab_dico in abtest.items():
        print(abname)
        for ttname, tt in ab_dico.items():
            print('--',ttname)
            if type(tt) == dict:
                for key in tt.keys():
                    print('    ',key)
print_ab()

A
-- train
     reviews_df
     movie_reviews
     tf_transform
-- test
     reviews_df
     movie_reviews
     tf_transform
B
-- train
     reviews_df
     movie_reviews
     tf_transform
-- test
     reviews_df
     movie_reviews
     tf_transform


In [33]:
for ab_dico in abtest.values():
    for tt in ['train','test']:
        print(ab_dico[tt]['tf_transform'].shape)

(20380, 10000)
(5083, 10000)
(20384, 10000)
(5078, 10000)


In [34]:
# SPARSE = True

# if SPARSE:
#     # Optimization: add the review length while keeping sparse matrix
#     tf_train = tfidf.fit_transform(train_words)
#     tf_test = tfidf.transform(test_words)
# else:
#     tf_train = tfidf.fit_transform(train_words).todense()
#     tf_test = tfidf.transform(test_words).todense()

In [35]:
# print(len(tfidf.vocabulary_))
# tfidf.vocabulary_

## Add review length to modeling input

In [36]:
ADD_LENGTH = False

# if ADD_LENGTH:
#     if SPARSE:
#         # Hack: pick an existing word to store the count
#         len_idx = 0
#         test_lengths = [len(words) for words in test_words]

#         for idx,words in enumerate(train_words):
#             tf_train[idx][len_idx] = len(words)
#         for idx,words in enumerate(test_words):
#             tf_test[idx][len_idx] = len(words)
#         X_train = tf_train
#         X_test = tf_test
#     else:
#         train_lengths = np.array([len(words) for words in train_words]).reshape(-1,1)
#         test_lengths = np.array([len(words) for words in test_words]).reshape(-1,1)
#         X_train = np.concatenate([tf_train, train_lengths],axis=1)
#         X_test = np.concatenate([tf_test, test_lengths],axis=1)


## Create and test y

In [37]:
# [(abtest['A']['train']['movie_reviews']['overall'] > 3) * 1.0]
y = {'A':{},'B':{}}
start = 100
end = 103
for ab, dico in abtest.items():
    for tt in ['train','test']:
        y[ab][tt] = (dico[tt]['movie_reviews']['overall'] > 3) * 1.0

        print('\n--------\n{} {} {}'.format(ab, tt, len(y[ab][tt])))
        print(dico[tt]['movie_reviews'][['reviewText','overall']]
              .iloc[start:end,:]) #[:5])
        print(y[ab][tt][start:end]) #[-5:])    


--------
A train 20380
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

### Test and save

In [38]:
# for train_test_dico in abtest.values():
#     for dico in train_test_dico.values():
#         if dico['tf_transform'].shape[0] != 

In [39]:
# if abtest['B']['train'].shape[0] != y['train'].shape[0] \
#     or abtest['B']['test'] != y['test'].shape[0]:
#     print('@@@ Problem! @@@')
#     print(abtest['B']['train'].shape)
#     print(y['train'].shape)
#     print(abtest['B']['test'].shape)
#     print(y['test'].shape)
# else:
#     print('OK!')

In [40]:
if False:
    pickle_out = open(data_path 
                      + 'tfidf_' 
                      + str(X_train.shape[0]) + 'Pos_Neg_Samples_'
                      + str(X_train.shape[1]) + 'Feats.pkl'
                      ,"wb")
    pickle.dump(tfidf, pickle_out)
    pickle_out.close()

## Gradient Boosting Classifier

In [41]:
# Gradient Boosting Classifier parameters
# N_TREES = math.floor(np.sqrt(NB_SAMPLES) * 1.2)
N_TREES = 500
LEARN_RATE = 0.2
MAX_DEPTH = 8
MIN_IN_LEAF = 5 #7
MAX_FEATURES = 'sqrt'

In [42]:
for ab, ab_dico in abtest.items():
    ab_dico['gbc'] = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
                                n_estimators=N_TREES, 
                                min_samples_leaf=MIN_IN_LEAF,
                                max_depth=MAX_DEPTH,
                                max_features=MAX_FEATURES)

    
    ab_dico['gbc'].fit(ab_dico['train']['tf_transform'], y[ab]['train'])

In [43]:
# gbc = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
#                                 n_estimators=N_TREES, 
#                                 min_samples_leaf=MIN_IN_LEAF,
#                                 max_depth=MAX_DEPTH,
#                                 max_features=MAX_FEATURES)

In [44]:
# gbc.fit(X_train, y_train)

In [45]:
if False:
    pickle.dump(gbc, open(data_path + 'GBC_'
                       + str(NB_SAMPLES) + '_samples_'
                       + str(N_TREES) + '_trees_' 
                       + str(LEARN_RATE) + '_lr_' 
                       + str(MAX_DEPTH) + '_maxdpth_'
                       + str(MIN_IN_LEAF) + '_minleaf_'
                       + str(MAX_FEATURES) + '_feats_'
                       + '.pkl', 'wb'))

In [46]:
for ab, ab_dico in abtest.items():
    classifier_report(ab_dico['gbc'], 
                      ab_dico['train']['tf_transform'], 
                      y[ab]['train'],
                      'Gradient Boosting Classifier on training set {}'.format(ab))

### Report for Gradient Boosting Classifier on training set A:

##### Off diagonal: 1%

#### Confusion Matrix:

[[ 9987   175]
 [  119 10099]]


#### Classification Report:

              precision    recall  f1-score   support

         0.0       0.99      0.98      0.99     10162
         1.0       0.98      0.99      0.99     10218

    accuracy                           0.99     20380
   macro avg       0.99      0.99      0.99     20380
weighted avg       0.99      0.99      0.99     20380



### Report for Gradient Boosting Classifier on training set B:

##### Off diagonal: 2%

#### Confusion Matrix:

[[ 9937   217]
 [  121 10109]]


#### Classification Report:

              precision    recall  f1-score   support

         0.0       0.99      0.98      0.98     10154
         1.0       0.98      0.99      0.98     10230

    accuracy                           0.98     20384
   macro avg       0.98      0.98      0.98     20384
weighted avg       0.98      0.98      0.98     20384



In [47]:
print('SAMPLE_FRACTION:', SAMPLE_FRACTION,'ADD_LENGTH:',ADD_LENGTH,' SPARSE:',SPARSE,' MAX_FEATURES:',MAX_FEATURES)

print(MAX_FEATURES, ' features', N_TREES,'trees; ',
      LEARN_RATE,'learn_rate; ', MAX_DEPTH, 'max_dpth; ',
      MIN_IN_LEAF, 'min_in_leaf')

for ab, ab_dico in abtest.items():
    classifier_report(ab_dico['gbc'], 
                      ab_dico['test']['tf_transform'], 
                      y[ab]['test'],
                      'Gradient Boosting Classifier on test set {}'.format(ab))



SAMPLE_FRACTION: 0.1 ADD_LENGTH: False  SPARSE: True  MAX_FEATURES: sqrt
sqrt  features 500 trees;  0.2 learn_rate;  8 max_dpth;  5 min_in_leaf


### Report for Gradient Boosting Classifier on test set A:

##### Off diagonal: 46%

#### Confusion Matrix:

[[1241 1281]
 [1044 1517]]


#### Classification Report:

              precision    recall  f1-score   support

         0.0       0.54      0.49      0.52      2522
         1.0       0.54      0.59      0.57      2561

    accuracy                           0.54      5083
   macro avg       0.54      0.54      0.54      5083
weighted avg       0.54      0.54      0.54      5083



### Report for Gradient Boosting Classifier on test set B:

##### Off diagonal: 47%

#### Confusion Matrix:

[[1213 1309]
 [1083 1473]]


#### Classification Report:

              precision    recall  f1-score   support

         0.0       0.53      0.48      0.50      2522
         1.0       0.53      0.58      0.55      2556

    accuracy                           0.53      5078
   macro avg       0.53      0.53      0.53      5078
weighted avg       0.53      0.53      0.53      5078



In [48]:
# print('SAMPLE_FRACTION:', SAMPLE_FRACTION,'ADD_LENGTH:',ADD_LENGTH,' SPARSE:',SPARSE,' MAX_FEATURES:',MAX_FEATURES)


## Grid search

In [None]:
if False:
    gb_pipe = Pipeline([('vect', tfidf), ('gb', gbc)])
    gb_pipe.fit(X_train, y_train)
    pickle.dump(gb_pipe, open('pickles/GBCpipe_balanced_comments_'
                           + str(N_TREES) + '_trees_' 
                           + str(LEARN_RATE) + '_lr_' 
                           + str(MAX_DEPTH) + '_maxdpth_'
                           + str(MIN_IN_LEAF) + '_minleaf_'
                           + str(MAX_FEATURES) + '_feats_'
                           + '.pkl', 'wb'))
else:
#     pickle_in = open("pickles/GBC_balanced_comments_300_trees_0.1_lr_15_maxdpth_2_minleaf_20000_feats_.pkl",
#                      "rb")
#     gb_pipe = pickle.load(pickle_in)

In [None]:
if False:
    if True:
        grid = {
            'learning_rate': [.1,0.2,0.3],
            'max_depth': [8],
            'min_samples_leaf': [5],
            'max_features': ['sqrt'],
            'n_estimators': [300],
            'random_state': [0]
        }
    else:  # TEST
        grid = {
        'learning_rate': [1],
        'max_depth': [2], 
        'min_samples_leaf': [2],
    #     'max_features': ['sqrt', None],
        'n_estimators': [2],
        'random_state': [0]
    }

    # confusion_score = make_scorer(confusion_rmse, greater_is_better=False)

    gbc_grid_cv = GridSearchCV(
        GradientBoostingClassifier(), 
        grid,
        cv=4,  # number of folds
        return_train_score=True,
        verbose=1, 
        n_jobs=-1)
    gbc_grid_cv.fit(X_train, y_train)

In [None]:
if False:
    y_pred = gbc_grid_cv.predict(X_test)

    print('SAMPLE_FRACTION:', SAMPLE_FRACTION,'ADD_LENGTH:',ADD_LENGTH,' SPARSE:',SPARSE,' MAX_FEATURES:',MAX_FEATURES)

    print(gbc_grid_cv.best_params_)
    print(gbc_grid_cv.best_score_)
    res_df = pd.DataFrame(gbc_grid_cv.cv_results_)
    res_df