# Hypothesis to test:
### Removing objective sentences from reviews helps predict star rating from reviews

In [1]:
import numpy as np
import pandas as pd
import pickle
import math
from nltk.tokenize import sent_tokenize
import datetime

from IPython.display import Markdown, display

In [2]:
import sys
# Put the parent directory on the import path; the project modules imported
# further down (NLP, subjective_filter) are presumably located there — TODO confirm.
sys.path.append('..')

In [3]:
# Reload edited project modules automatically before each cell runs,
# so the kernel does not have to be restarted after changing them.
%load_ext autoreload
%autoreload 2

# Show full review texts instead of truncating wide columns.
# None is the supported "no limit" value; the former -1 is deprecated since
# pandas 1.0 and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# %autosave 50

In [4]:
# From this project
# from utils import rmse, rmse_train_cv, classifier_report, confusion_rmse
from NLP import WordBag, AboutMovie

## Configuration

In [5]:
# Subsampling from Amazon reviews
# NB_SAMPLES is used as the numeric prefix of the input pickle's file name.
NB_SAMPLES = 200000 #4000  # up to 200k, then change the input file

# Directory the input pickle is read from; the A/B pickles produced by this
# notebook are written to the same place.
data_path = '../../../datasets/'


In [8]:
# file_name = '360000_balanced_train_test_reviews.pkl'
file_name = '_balanced_pos_neg_train_test_reviews.pkl'

# Load the nested {'train'/'test': {'positive'/'negative': DataFrame}} dict.
# The context manager closes the file handle even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code — only load files
# produced by this project.
with open(data_path + str(NB_SAMPLES) + file_name, "rb") as pickle_in:
    train_test_dic0 = pickle.load(pickle_in)

In [9]:
# Sanity check: size of the positive training frame as loaded from the pickle.
train_test_dic0['train']['positive'].shape

(80000, 10)

In [None]:
# NOTE(review): scratch arithmetic — this cell has no execution count and its
# result is not referenced by any other visible cell; purpose to be confirmed.
200 * 0.05 / 14

In [21]:
SAMPLE_FRACTION = 1

# Columns removed from every frame; 'index' is the helper column created by
# the first reset_index() call below.
dropped_columns = ['reviewerName', 'helpful', 'asin', 'index', 'nb_sentences',
                   'summary', 'unixReviewTime', 'reviewTime']

test_dic = {'train': {}, 'test': {}}

for i in ['train', 'test']:
    for j in ['positive', 'negative']:
        reviews = train_test_dic0[i][j]

        # Keep only the reviews made of exactly five sentences.
        five_sentence_reviews = reviews[reviews['nb_sentences'] == 5]

        # Resetting the index twice leaves a 0..n-1 row counter in 'level_0'.
        renumbered = five_sentence_reviews.reset_index().reset_index()
        trimmed = renumbered.drop(columns=dropped_columns)

        # The row counter takes over the 'asin' name of the dropped column.
        test_dic[i][j] = trimmed.rename(columns={'level_0': 'asin'})


In [22]:
# Preview the positive training reviews after the five-sentence filter and column selection.
test_dic['train']['positive'].head(3)

Unnamed: 0,asin,reviewerID,reviewText,overall
0,0,A1BKGGKR8FI97M,"Demi Moore plays a naive, sheltered young woman who ""dreams"" the man she will marry will visit the small island she inhabits off the east coast. He turns out to be a butcher from NY. They marry quickly and problems arise as people discover that she is psychic, leading to all kinds of comical mix-ups. Mary Steenberger is terrific as a shy church choir leader who dreams of being a blues singer. Cute, silly, chick flick.",4.0
1,1,A2X7QZS1SSPQ8N,"This was a cute mix of reality and fantasy, with a moral of no matter what you use to rule your life, fate has a plan for you. Demi Moore plays Marina, a naive southern girl with a gift, who dreams her new husband (sort of) and ends up in New York married to the neighborhood Butcher (George Dzundza)! Jeff Daniels is the non-commital psychiatrist across the street and Mary Steenbergen is a mild-mannered church chior director who wants to sing in a Lounge. Between the four of them, they turn their lives upside-down and all around, affecting the lives of all those around them. It's a chick flick and very good!",4.0
2,2,A2WDMII0ZK0O5X,"Lots of Magic and chance encounters are interpreted by Demi Moore with Blond Hair (I am sure it is a wig lol). In truth, she mistakens the Butcher for being her knight in shining armor but he served her purpose in her leaving the place where she was born and raised. It's not a bad environment but I guess she got tired of the beach. lol In any case, it is a delightful movie, and it does leave you wondering whom she really ends up with, not only for her but for the other characters in the movie as well. Good entertainment.",4.0


In [23]:
# Print the shape of each split/polarity frame, then show the overall row count.
frame_shapes = [test_dic[i][j].shape
                for i in ['train', 'test']
                for j in ['positive', 'negative']]
for frame_shape in frame_shapes:
    print(frame_shape)
total = sum(nb_rows for nb_rows, _ in frame_shapes)
total

(15175, 4)
(13491, 4)
(3746, 4)
(3418, 4)


35830

## Remove objective (or subjective) sentences for case B

In [24]:
%reload_ext autoreload
from subjective_filter import SubjectiveFilter

In [25]:
%reload_ext autoreload
# Directory holding the pickled artifacts handed to SubjectiveFilter.
obj_path = '../obj_subj_dev/'
# Going by its file name, a fitted TF-IDF vectorizer for objective/subjective sentence classification.
fit_obj_tf = obj_path + 'fit_tfidf_vectorizer_for_obj_subj_sentences_classification.pkl'
# Presumably the fitted classifier (file name suggests gradient boosting, ~0.88 CV score) — TODO confirm.
fit_obj_model = obj_path + 'GBC_300_0.5_5_0.88cv.pkl'
subj_filter = SubjectiveFilter(fit_obj_tf, fit_obj_model)

In [26]:
# Same tally as before, this time walking the nested dict values directly:
# display every frame's shape, then print the total number of reviews.
all_frames = [df for tt in test_dic.values() for df in tt.values()]
for df in all_frames:
    display(df.shape)
total = sum(df.shape[0] for df in all_frames)
print(total)

(15175, 4)

(13491, 4)

(3746, 4)

(3418, 4)

35830


In [27]:
# Number of reviews handed to the filter per call. remove_fraction is applied
# by subj_filter.transform to whatever it receives in one call, so changing
# this value may change which sentences get removed.
CHUNK_SZ = 1


def _output_path(remove, remove_fraction, case):
    """Return the pickle path for case 'A' or 'B' of one (remove, fraction) run."""
    pct = int(round(remove_fraction * 100))
    return data_path + 'reviews_wout_top_' + str(pct) + 'pct_' + remove + '_' + case + '.pkl'


def _save_pickle(obj, path):
    """Pickle obj to path; the file is closed even if dumping fails."""
    with open(path, "wb") as pickle_out:
        pickle.dump(obj, pickle_out)


def _filter_reviews(df, remove, remove_fraction):
    """Run the subjectivity filter over df in chunks of CHUNK_SZ reviews.

    Returns (sentences, nb_removed): sentences is a single DataFrame with the
    filtered output of every accepted chunk, or None when no chunk was
    accepted; nb_removed is the sum of the removed counts reported by
    subj_filter.transform.
    """
    kept_chunks = []
    nb_removed = 0
    for start in range(0, df.shape[0], CHUNK_SZ):
        chunk = df.iloc[start:start + CHUNK_SZ, :]
        one_sent_per_row = subj_filter.to_one_sent_per_row(chunk)
        filtered, removed = subj_filter.transform(
            one_sent_per_row,
            'sentence',
            remove_fraction=remove_fraction,
            debug_level=0,
            remove=remove)
        # transform reports -1 for chunks that must be skipped
        # (presumably chunks it could not process — confirm in SubjectiveFilter).
        if removed == -1:
            continue
        kept_chunks.append(filtered)
        nb_removed += removed

    if not kept_chunks:
        return None, nb_removed
    # One concat instead of a chain of outer merges: linear instead of
    # quadratic, keeps the review order, and never collapses identical rows.
    return pd.concat(kept_chunks, ignore_index=True), nb_removed


for REMOVE in ['subj', 'obj']:
    for REMOVE_FRACTION in [0.8, 0.6, 0.4, 0.2]:
        print('Starting computations for {} {}'.format(REMOVE, REMOVE_FRACTION))
        print(str(datetime.datetime.now()))
        sent_dfs = {'train': {}, 'test': {}}
        nb_sentences_removed = 0

        # Case B: reviews with the selected sentences removed.
        for ttname, tt in test_dic.items():
            for pn, df in tt.items():
                sent_dfs[ttname][pn], removed = _filter_reviews(df, REMOVE, REMOVE_FRACTION)
                nb_sentences_removed += removed
                if sent_dfs[ttname][pn] is None:
                    print('No reviews for {} {}'.format(ttname, pn))

        # Save B
        print('Saving B for {} {}'.format(REMOVE, REMOVE_FRACTION))
        print(str(datetime.datetime.now()))
        _save_pickle(sent_dfs, _output_path(REMOVE, REMOVE_FRACTION, 'B'))

        # Create A with same number of reviews: the unfiltered reviews whose
        # 'asin' id also appears in B.
        A_dic = {'train': {}, 'test': {}}

        for ttname, tt in test_dic.items():
            for pn, df in tt.items():
                kept = sent_dfs[ttname][pn]
                # B is None when no review survived; mirror that in A instead
                # of failing on None['asin'].
                A_dic[ttname][pn] = None if kept is None else df[df['asin'].isin(kept['asin'])]

        # Save A
        print('Saving A for {} {}'.format(REMOVE, REMOVE_FRACTION))
        print(str(datetime.datetime.now()))
        _save_pickle(A_dic, _output_path(REMOVE, REMOVE_FRACTION, 'A'))

Starting computations for subj 0.8
2019-07-26 18:18:27.155797


KeyboardInterrupt: 