# Hypothesis to test:
### Removing objective sentences from reviews helps predict star rating from reviews

In [1]:
import numpy as np
import pandas as pd
import pickle
import math
from nltk.tokenize import sent_tokenize
import datetime

from IPython.display import Markdown, display

In [2]:
import sys

# Make the parent directory importable so modules that live there can be
# imported by later cells. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate '..' entry each time.
if '..' not in sys.path:
    sys.path.append('..')

In [3]:
# Avoid restarting the kernel after editing imported modules: load IPython's
# autoreload extension and switch it to mode 2.
# NOTE(review): per the IPython docs, mode 2 re-imports all modules before
# each cell execution — confirm against the installed IPython version.
%load_ext autoreload
%autoreload 2

# Do not truncate long text columns when a DataFrame is displayed.
# None means "no limit"; the former -1 spelling has been deprecated since
# pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

# %autosave 50

In [4]:
# From this project
# from utils import rmse, rmse_train_cv, classifier_report, confusion_rmse
from NLP import WordBag, AboutMovie

## Configuration

In [5]:
# Directory holding the pickled review datasets (relative to this notebook).
data_path = '../../../datasets/'

# Subsampling from Amazon reviews: this count is the prefix of the input
# file name. Up to 200k, then change the input file (4000 was an earlier value).
NB_SAMPLES = 200_000


In [6]:
# Alternative input kept for reference:
# file_name = '360000_balanced_train_test_reviews.pkl'
file_name = '_balanced_pos_neg_train_test_reviews_5_sentences.pkl'

# The input file name is "<NB_SAMPLES><file_name>" inside data_path.
# The `with` block closes the handle even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files that were
# produced by a trusted step of this project.
with open(data_path + str(NB_SAMPLES) + file_name, "rb") as pickle_in:
    train_test_dic0 = pickle.load(pickle_in)

In [7]:
# Sanity check: (rows, columns) of the positive training reviews as loaded.
train_test_dic0['train']['positive'].shape

(80000, 9)

In [13]:
# NOTE(review): ad-hoc arithmetic whose purpose is not stated anywhere in this
# notebook, and the cell was executed out of order (In [13]) — document what
# the 200, 0.05 and 14 stand for, or remove the cell.
200 * 0.05 / 14

0.7142857142857143

In [22]:
# Fraction of each loaded frame (taken from the top) that is kept for this run.
SAMPLE_FRACTION = 0.15

test_dic = {'train': {}, 'test':{}}

for split in ('train', 'test'):
    for label in ('positive', 'negative'):
        source_df = train_test_dic0[split][label]
        n_keep = math.floor(len(source_df.index) * SAMPLE_FRACTION)

        # Two reset_index() calls in a row: the first moves the old index into
        # an 'index' column, the second adds a fresh positional counter that
        # pandas names 'level_0' because 'index' is already taken.
        numbered = source_df.iloc[:n_keep, :].reset_index().reset_index()

        trimmed = numbered.drop(
            ['reviewerName', 'helpful', 'asin', 'index',
             'summary', 'unixReviewTime', 'reviewTime'],
            axis=1,
        )

        # The positional counter takes over the name of the dropped 'asin'
        # column, so from here on 'asin' is a per-frame row number.
        test_dic[split][label] = trimmed.rename(columns={'level_0': 'asin'})


In [23]:
# Preview the first three rows of the sub-sampled positive training frame.
test_dic['train']['positive'].head(3)

Unnamed: 0,asin,reviewerID,reviewText,overall
0,0,A3MW5OB6IMNQI7,"Hank Greenberg was more than just a baseball superstar. He was an icon -- the pride and joy of the Jewish Americans in the 1930s and 40s amidst rampant anti-Semitism. As the first prominent Jewish player in the Major League Baseball, Greenberg not only established himself as one of the best sluggers in baseball history, but he also gave the Jewish Americans something to cheer about. He also paved a path for other baseball pioneers like Jackie Robinson as he quietly fought discrimination by letting his stats speak for themselves.This movie isn't your typical boring documentary either. It shows a nice balance of Greenberg's baseball achievements and personal life (though I wish they focused a bit more on the baseball part) with a good mixture of old baseball footage, interviews with fans, family, friends and Greenberg himself, as well as clips from classic baseball movies such as The Pride of the Yankees. You don't have to be a baseball fan/historian to enjoy and appreciate this movie. Besides, there is nothing quite like listening to &quot;Take Me out to the Ball Game&quot; sung in Yiddish.",4.0
1,1,A1BKGGKR8FI97M,"Demi Moore plays a naive, sheltered young woman who ""dreams"" the man she will marry will visit the small island she inhabits off the east coast. He turns out to be a butcher from NY. They marry quickly and problems arise as people discover that she is psychic, leading to all kinds of comical mix-ups. Mary Steenberger is terrific as a shy church choir leader who dreams of being a blues singer. Cute, silly, chick flick.",4.0
2,2,A2X7QZS1SSPQ8N,"This was a cute mix of reality and fantasy, with a moral of no matter what you use to rule your life, fate has a plan for you. Demi Moore plays Marina, a naive southern girl with a gift, who dreams her new husband (sort of) and ends up in New York married to the neighborhood Butcher (George Dzundza)! Jeff Daniels is the non-commital psychiatrist across the street and Mary Steenbergen is a mild-mannered church chior director who wants to sing in a Lounge. Between the four of them, they turn their lives upside-down and all around, affecting the lives of all those around them. It's a chick flick and very good!",4.0


In [24]:
# Print the shape of every sub-sampled frame (train before test, positive
# before negative) and show the overall number of reviews.
shapes = [
    test_dic[split][label].shape
    for split in ['train', 'test']
    for label in ['positive', 'negative']
]
for shape in shapes:
    print(shape)
total = sum(shape[0] for shape in shapes)
total

(12000, 4)
(12000, 4)
(3000, 4)
(3000, 4)


30000

## Remove subjective or objective sentences for case B (case A keeps the same reviews unfiltered)

In [25]:
# Re-load the autoreload extension, then import the sentence filter class
# used below (presumably found through the '..' entry added to sys.path).
%reload_ext autoreload
from subjective_filter import SubjectiveFilter

In [26]:
%reload_ext autoreload
# Directory containing the two pickles the filter is constructed from.
obj_path = '../obj_subj_dev/'

# Full paths of the two pickles, passed to SubjectiveFilter in this order.
# NOTE(review): going by the file names, the first is a fitted TF-IDF
# vectorizer and the second a fitted classifier — confirm in subjective_filter.
fit_obj_tf = '{}{}'.format(
    obj_path, 'fit_tfidf_vectorizer_for_obj_subj_sentences_classification.pkl')
fit_obj_model = '{}{}'.format(obj_path, 'GBC_300_0.5_5_0.88cv.pkl')

subj_filter = SubjectiveFilter(fit_obj_tf, fit_obj_model)

In [27]:
# Walk every frame in test_dic, display its shape and print the summed
# number of rows at the end.
total = 0
for frames_by_label in test_dic.values():
    for frame in frames_by_label.values():
        frame_shape = frame.shape
        total = total + frame_shape[0]
        display(frame_shape)
print(total)

(12000, 4)

(12000, 4)

(3000, 4)

(3000, 4)

30000


In [28]:
CHUNK_SZ = 1  # number of reviews (rows of test_dic frames) passed to the filter per call


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``; the ``with`` block closes the file even on error."""
    with open(path, "wb") as pickle_out:
        pickle.dump(obj, pickle_out)


# For every (REMOVE, REMOVE_FRACTION) combination build and save two datasets:
#   B - the sentence-level frames returned by subj_filter.transform
#   A - the rows of test_dic whose 'asin' also appears in B
for REMOVE in ['subj', 'obj']:
    for REMOVE_FRACTION in [0.8, 0.6, 0.4, 0.2]:
        print('Starting computations for {} {}'.format(REMOVE, REMOVE_FRACTION))
        print (str(datetime.datetime.now()))
        sent_dfs = {'train':{},'test':{}}
        nb_sentences_removed = 0
        # Common prefix of both output files; '_A.pkl' / '_B.pkl' is appended below.
        out_prefix = (data_path + 'reviews_wout_top_'
                      + str(int(round(REMOVE_FRACTION * 100))) + 'pct_' + REMOVE)

        for ttname, tt in test_dic.items():
            for pn, df in tt.items():
                df_list = []
                for start in range(0, df.shape[0], CHUNK_SZ):
                    df1 = df.iloc[start:start + CHUNK_SZ, :]
                    df2 = subj_filter.to_one_sent_per_row(df1)
                    df3, removed = subj_filter.transform(
                            df2,
                            'sentence',
                            remove_fraction = REMOVE_FRACTION,
                            debug_level=0,
                            remove=REMOVE)
                    # A count of -1 makes this chunk be skipped entirely
                    # (presumably the filter's "nothing usable" signal — TODO confirm).
                    if removed == -1:
                        continue
                    df_list.append(df3)
                    nb_sentences_removed += removed

                if len(df_list) == 0:
                    sent_dfs[ttname][pn] = None
                    print('No reviews for {} {}'.format(ttname, pn))
                    continue

                # Fold the per-chunk frames together, last chunk first.
                # NOTE(review): an outer merge per chunk gets slower as the
                # result grows; pd.concat(df_list) would be linear, but may
                # change row order/index, so it is left for a verified follow-up.
                merged = df_list.pop()
                while len(df_list) > 0:
                    merged = pd.merge(df_list.pop(), merged, how='outer')
                sent_dfs[ttname][pn] = merged

        # Save B
        print('Saving B for {} {}'.format(REMOVE, REMOVE_FRACTION))
        print (str(datetime.datetime.now()))
        _dump_pickle(sent_dfs, out_prefix + '_B.pkl')

        # Create A with same number of reviews
        A_dic = {'train': {}, 'test':{}}

        for tt in ['train', 'test']:
            for pn in ['positive', 'negative']:
                df = test_dic[tt][pn]
                kept = sent_dfs[tt][pn]
                if kept is None:
                    # B has no reviews for this split; the matching A is an
                    # empty frame with the same columns. Subscripting None
                    # here used to raise a TypeError.
                    A_dic[tt][pn] = df.iloc[0:0]
                else:
                    A_dic[tt][pn] = df[df['asin'].isin(kept['asin'])]

        # Save A
        print('Saving A for {} {}'.format(REMOVE, REMOVE_FRACTION))
        print (str(datetime.datetime.now()))
        _dump_pickle(A_dic, out_prefix + '_A.pkl')

Starting computations for subj 0.8
2019-07-24 23:03:36.975795
Saving B for subj 0.8
2019-07-24 23:54:16.024403
Saving A for subj 0.8
2019-07-24 23:54:16.068284
Starting computations for subj 0.6
2019-07-24 23:54:16.173581
Saving B for subj 0.6
2019-07-25 00:44:48.795600
Saving A for subj 0.6
2019-07-25 00:44:48.865019
Starting computations for subj 0.4
2019-07-25 00:44:49.003034
Saving B for subj 0.4
2019-07-25 01:35:30.221232
Saving A for subj 0.4
2019-07-25 01:35:30.305414
Starting computations for subj 0.2
2019-07-25 01:35:30.437522
Saving B for subj 0.2
2019-07-25 02:26:07.558121
Saving A for subj 0.2
2019-07-25 02:26:07.664769
Starting computations for obj 0.8
2019-07-25 02:26:07.758983
Saving B for obj 0.8
2019-07-25 03:16:41.300029
Saving A for obj 0.8
2019-07-25 03:16:41.341071
Starting computations for obj 0.6
2019-07-25 03:16:41.466941
Saving B for obj 0.6
2019-07-25 04:07:10.156751
Saving A for obj 0.6
2019-07-25 04:07:10.224793
Starting computations for obj 0.4
2019-07-25 0