# Hypothesis to test: 
### Removing objective sentences from reviews helps predict star rating from reviews

In [7]:
import numpy as np
import pandas as pd
import pickle
import math
from IPython.display import Markdown, display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# From this project
from utils import confusion_off_diagonal
from NLPv0 import WordBag, AboutMovie


# Avoid restarting Kernel
%load_ext autoreload
%autoreload 2

pd.set_option('display.max_colwidth', -1)

# %autosave 50

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
%reload_ext autoreload
word_bag = WordBag()

## Configuration

In [9]:
# Subsampling from Amazon reviews
NB_SAMPLES = 360000 #4000  # up to 200k, then change the input file

data_path = '../../datasets/'
xl_report = 'gbc_500_trees_02_rate_8_depth_5_leaf_sqrt_10k_tfidf.xlsx'

In [10]:
# TIDF setup
MAX_FEATURES = 10000

tfidf = TfidfVectorizer(
    lowercase=True, 
    stop_words=None, 
    max_features=MAX_FEATURES,
    norm='l2',            # normalize each review
    use_idf=True) 

In [11]:
N_TREES = 500
LEARN_RATE = 0.2
MAX_DEPTH = 8
MIN_IN_LEAF = 5 #7
MAX_FEATURES = 'sqrt'

gbc = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
                                n_estimators=N_TREES, 
                                min_samples_leaf=MIN_IN_LEAF,
                                max_depth=MAX_DEPTH,
                                max_features=MAX_FEATURES)

In [13]:
report = {'case':[], 'remove':[], 'percent': [], 
          'in_train_p': [], 'in_train_n': [], 'in_test_p': [],'in_test_n': [], 'in_total': [],
          'xy_check': [],
          'precision': [], 'recall': [], 'f1': [], 'support': [], 'off_diag': []
         }
    
for CASE in ['A', 'B']:
    for REMOVE in ['obj', 'subj']:
        for PCT in ['20', '40', '60', '80']:
            report['case'].append(CASE)
            report['remove'].append(REMOVE)
            report['percent'].append(PCT)
            root = 'reviews_wout_top_' + PCT + 'pct_' + REMOVE
            if CASE == 'A':
                pickle_in = open(data_path + root + '_A.pkl', "rb")
                movie_reviews = pickle.load(pickle_in)
            else:
                pickle_in = open(data_path + root + '_B.pkl', "rb")
                movie_reviews = pickle.load(pickle_in)
            pickle_in.close()

            for i in ['train','test']:
                for j in ['positive','negative']:
                    movie_reviews[i][j] = movie_reviews[i][j].rename(columns={'sentence':'reviewText'})

            total = 0
            for i in ['train','test']:
                for j in ['positive','negative']:
                    total += movie_reviews[i][j].shape[0]
            report['in_train_p'].append(movie_reviews['train']['positive'].shape[0])
            report['in_train_n'].append(movie_reviews['train']['negative'].shape[0])
            report['in_test_p'].append(movie_reviews['test']['positive'].shape[0])
            report['in_test_n'].append(movie_reviews['test']['negative'].shape[0])
            report['in_total'].append(total)

            train_words = pd.concat([movie_reviews['train']['positive']['reviewText'],
                                 movie_reviews['train']['negative']['reviewText']])
            y_train = np.concatenate([np.ones((movie_reviews['train']['positive'].shape[0],)), 
                                      np.zeros((movie_reviews['train']['negative'].shape[0],))])
            test_words = pd.concat([movie_reviews['test']['positive']['reviewText'],
                                 movie_reviews['test']['negative']['reviewText']])
            y_test = np.concatenate([np.ones((movie_reviews['test']['positive'].shape[0],)), 
                                      np.zeros((movie_reviews['test']['negative'].shape[0],))])

            SPARSE = True

            if SPARSE:
                # Optimization: add the review length while keeping sparse matrix
                tf_train = tfidf.fit_transform(train_words)
                tf_test = tfidf.transform(test_words)
            else:
                tf_train = tfidf.fit_transform(train_words).todense()
                tf_test = tfidf.transform(test_words).todense()

            # option: add length to input
            ADD_LENGTH = False

            if ADD_LENGTH:
                if SPARSE:
                    # Hack: pick an existing word to store the count
                    len_idx = 0
                    test_lengths = [len(words) for words in test_words]

                    for idx,words in enumerate(train_words):
                        tf_train[idx][len_idx] = len(words)
                    for idx,words in enumerate(test_words):
                        tf_test[idx][len_idx] = len(words)
                    X_train = tf_train
                    X_test = tf_test
                else:
                    train_lengths = np.array([len(words) for words in train_words]).reshape(-1,1)
                    test_lengths = np.array([len(words) for words in test_words]).reshape(-1,1)
                    X_train = np.concatenate([tf_train, train_lengths],axis=1)
                    X_test = np.concatenate([tf_test, test_lengths],axis=1)
            else:
                X_train = tf_train
                X_test = tf_test

            if X_train.shape[0] != y_train.shape[0] or X_test.shape[0] != y_test.shape[0]:
                report['xy_check'].append('problem!!!')
            else:
                report['xy_check'].append('OK')

            gbc.fit(X_train, y_train)

            y_pred = gbc.predict(X_test)
            prec, rec, f1, supp = precision_recall_fscore_support(y_test, y_pred, average='binary')
            report['precision'].append(prec)
            report['recall'].append(rec)
            report['f1'].append(f1)
            report['support'].append(supp)
            report['off_diag'].append(confusion_off_diagonal(confusion_matrix(y_test, y_pred)))

#             for key, val in report.items():
#                 print(' ')
#                 print(key)
#                 print(val)
            
            pd.DataFrame(report).to_excel(xl_report)


 
case
[]
 
remove
[]
 
percent
[]
 
in_train_p
[5759]
 
in_train_n
[5760]
 
in_test_p
[1440]
 
in_test_n
[1440]
 
in_total
[14399]
 
xy_check
['OK']
 
precision
[0.85809906291834]
 
recall
[0.8902777777777777]
 
f1
[0.8738922972051806]
 
support
[None]
 
off_diag
[0.1284722222222222]
 
case
[]
 
remove
[]
 
percent
[]
 
in_train_p
[5759, 5759]
 
in_train_n
[5760, 5760]
 
in_test_p
[1440, 1440]
 
in_test_n
[1440, 1440]
 
in_total
[14399, 14399]
 
xy_check
['OK', 'OK']
 
precision
[0.85809906291834, 0.8640321500334897]
 
recall
[0.8902777777777777, 0.8958333333333334]
 
f1
[0.8738922972051806, 0.8796454142516195]
 
support
[None, None]
 
off_diag
[0.1284722222222222, 0.12256944444444444]


KeyboardInterrupt: 

## Grid search

In [None]:
# if False:
#     gb_pipe = Pipeline([('vect', tfidf), ('gb', gbc)])
#     gb_pipe.fit(X_train, y_train)
#     pickle.dump(gb_pipe, open('pickles/GBCpipe_balanced_comments_'
#                            + str(N_TREES) + '_trees_' 
#                            + str(LEARN_RATE) + '_lr_' 
#                            + str(MAX_DEPTH) + '_maxdpth_'
#                            + str(MIN_IN_LEAF) + '_minleaf_'
#                            + str(MAX_FEATURES) + '_feats_'
#                            + '.pkl', 'wb'))
# else:
# #     pickle_in = open("pickles/GBC_balanced_comments_300_trees_0.1_lr_15_maxdpth_2_minleaf_20000_feats_.pkl",
# #                      "rb")
# #     gb_pipe = pickle.load(pickle_in)

In [None]:
# if True:
#     grid = {
#         'learning_rate': [0.2,0.3],
#         'max_depth': [8],
#         'min_samples_leaf': [5],
#         'max_features': [None],
#         'n_estimators': [300],
#         'random_state': [0]
#     }
# else:  # TEST
#     grid = {
#     'learning_rate': [1],
#     'max_depth': [2], 
#     'min_samples_leaf': [2],
# #     'max_features': ['sqrt', None],
#     'n_estimators': [2],
#     'random_state': [0]
# }
    
# # confusion_score = make_scorer(confusion_rmse, greater_is_better=False)

# gbc_grid_cv = GridSearchCV(
#     GradientBoostingClassifier(), 
#     grid,
#     cv=4,  # number of folds
#     return_train_score=True,
#     verbose=1, 
#     n_jobs=-1)
# gbc_grid_cv.fit(X_train, y_train)

In [None]:
y_pred = gbc_grid_cv.predict(X_test)

In [None]:
print('SAMPLE_FRACTION:', SAMPLE_FRACTION,'ADD_LENGTH:',ADD_LENGTH,' SPARSE:',SPARSE,' MAX_FEATURES:',MAX_FEATURES)

In [None]:
print(gbc_grid_cv.best_params_)
print(gbc_grid_cv.best_score_)
res_df = pd.DataFrame(gbc_grid_cv.cv_results_)
res_df