# Hypothesis to test:
### Removing objective sentences from reviews helps predict star rating from reviews

In [1]:
import numpy as np
import pandas as pd
import pickle
import math
from nltk.tokenize import sent_tokenize
import datetime

from IPython.display import Markdown, display

In [2]:
import sys
# Add the parent directory to the module search path; presumably this is
# where the project-local modules imported further down live (verify).
sys.path.append('..')

In [3]:
# Avoid restarting Kernel
%load_ext autoreload
%autoreload 2

# Never truncate cell contents when displaying DataFrames (review texts are
# long). `None` is the supported "no limit" value: passing a negative number
# was deprecated in pandas 1.0 and is rejected by newer releases.
# NOTE: pandas releases older than 1.0 only accept an integer here.
pd.set_option('display.max_colwidth', None)

# %autosave 50

In [4]:
# From this project
# from utils import rmse, rmse_train_cv, classifier_report, confusion_rmse
from NLP import WordBag, AboutMovie

## Configuration

In [5]:
# Directory holding the pickled datasets, relative to this notebook.
data_path = '../../../datasets/'

# Size of the Amazon-review subsample. The value is also used as the prefix
# of the pickle file name loaded below, so it has to match an existing file.
# Author's note (possibly stale): "up to 200k, then change the input file";
# 4000 was used as a smaller alternative value.
NB_SAMPLES = 360000


In [6]:
# Load the balanced positive/negative train/test reviews.
# The file read is <data_path><NB_SAMPLES><file_name>.
# Other input file name, kept for reference:
# file_name = '360000_balanced_train_test_reviews.pkl'
file_name = '_balanced_pos_neg_train_test_reviews.pkl'

# The context manager closes the handle even if unpickling fails.
# NOTE(security): pickle executes arbitrary code on load -- only point this
# at files produced by this project.
with open(data_path + str(NB_SAMPLES) + file_name, "rb") as pickle_in:
    # Indexed below as train_test_dic0[<'train'|'test'>][<'positive'|'negative'>].
    train_test_dic0 = pickle.load(pickle_in)

In [None]:
# Fraction of each (split, polarity) frame that is kept for this experiment.
SAMPLE_FRACTION = 0.04

# Same two-level layout as train_test_dic0: split -> polarity -> DataFrame.
test_dic = {'train': {}, 'test': {}}

for split in ['train', 'test']:
    for polarity in ['positive', 'negative']:
        source_df = train_test_dic0[split][polarity]

        # Keep only the leading SAMPLE_FRACTION of the rows.
        n_kept = math.floor(len(source_df.index) * SAMPLE_FRACTION)
        sampled = source_df.iloc[:n_kept, :]

        # Reset the index twice. This relies on the first reset producing an
        # 'index' column and the second a 'level_0' column, which presumably
        # holds the 0-based row position within the sampled frame.
        sampled = sampled.reset_index()
        sampled = sampled.reset_index()

        # Drop the columns not used downstream, including the original 'asin'.
        sampled = sampled.drop(
            ['reviewerName', 'helpful', 'asin', 'index',
             'summary', 'unixReviewTime', 'reviewTime'],
            axis=1)

        # 'level_0' takes over the name 'asin' and is used as the review id.
        test_dic[split][polarity] = sampled.rename(columns={'level_0': 'asin'})


In [None]:
# Preview the first three rows of the sampled train/positive frame
# (bare trailing expression, so the notebook renders it).
test_dic['train']['positive'].head(3)

In [None]:
# Print the shape of every sampled frame and add up their row counts.
total = 0
for split in ['train', 'test']:
    for polarity in ['positive', 'negative']:
        frame_shape = test_dic[split][polarity].shape
        print(frame_shape)
        total += frame_shape[0]
# Bare trailing expression: the notebook displays the overall row count.
total

## Remove objective sentences for case B

In [None]:
%reload_ext autoreload
# Imported here so this cell also works on a top-to-bottom run: the file's
# only other import of SubjectiveFilter sits in a later cell, which made this
# cell fail with NameError unless the cells were executed out of order.
from subjective_filter import SubjectiveFilter

# Directory holding the fitted artefacts used for objective/subjective
# sentence classification.
obj_path = '../obj_subj_dev/'
# Pickled fitted TF-IDF vectorizer (per the file name).
fit_obj_tf = obj_path + 'fit_tfidf_vectorizer_for_obj_subj_sentences_classification.pkl'
# Pickled fitted classifier (per the file name).
fit_obj_model = obj_path + 'GBC_300_0.5_5_0.88cv.pkl'
# Filter object used below to split reviews into sentences and drop a
# fraction of them.
subj_filter = SubjectiveFilter(fit_obj_tf, fit_obj_model)

In [None]:
# Sanity check: display the shape of every frame in test_dic, then print the
# overall number of rows.
shapes = [frame.shape
          for frames_by_polarity in test_dic.values()
          for frame in frames_by_polarity.values()]
for frame_shape in shapes:
    display(frame_shape)
total = sum(frame_shape[0] for frame_shape in shapes)
print(total)

In [None]:
%reload_ext autoreload
from subjective_filter import SubjectiveFilter

In [None]:
# Number of reviews (rows of a test_dic frame) handed to the filter per call.
CHUNK_SZ = 1


def _filter_reviews(df, remove, remove_fraction, chunk_sz):
    """Run ``subj_filter`` over ``df`` chunk by chunk and stack the results.

    Args:
        df: DataFrame of reviews (one frame of ``test_dic``).
        remove: forwarded as ``remove`` to ``subj_filter.transform``
            ('subj' or 'obj' in this notebook).
        remove_fraction: forwarded as ``remove_fraction`` to
            ``subj_filter.transform``.
        chunk_sz: number of rows of ``df`` processed per filter call.

    Returns:
        A tuple ``(sentences, removed_total)``. ``sentences`` is the row-wise
        concatenation of all filtered chunks, or ``None`` when no chunk
        produced any output. ``removed_total`` is the sum of the counts
        returned by ``subj_filter.transform`` for the chunks that were kept.
    """
    kept_chunks = []
    removed_total = 0
    for start in range(0, df.shape[0], chunk_sz):
        chunk = df.iloc[start:start + chunk_sz, :]
        one_sent_per_row = subj_filter.to_one_sent_per_row(chunk)
        filtered, removed = subj_filter.transform(
            one_sent_per_row,
            'sentence',
            remove_fraction=remove_fraction,
            debug_level=0,
            remove=remove)
        # transform returns -1 as the count for chunks that must be skipped
        # (exact meaning is defined in SubjectiveFilter -- TODO confirm).
        if removed == -1:
            continue
        kept_chunks.append(filtered)
        removed_total += removed

    if not kept_chunks:
        return None, removed_total
    # A single concat replaces the former chain of pairwise outer merges,
    # which re-scanned the growing result for every chunk. Rows now stay in
    # input order; the merge-based result may have been ordered differently.
    return pd.concat(kept_chunks, ignore_index=True), removed_total


def _save_pickle(obj, remove, remove_fraction, suffix):
    """Pickle ``obj`` to data_path/reviews_wout_top_<pct>pct_<remove>_<suffix>.pkl."""
    file_name = 'reviews_wout_top_{}pct_{}_{}.pkl'.format(
        int(round(remove_fraction * 100)), remove, suffix)
    # The context manager closes the file even if pickling fails.
    with open(data_path + file_name, "wb") as pickle_out:
        pickle.dump(obj, pickle_out)


for REMOVE in ['subj', 'obj']:
    for REMOVE_FRACTION in [0.8, 0.6, 0.4, 0.2]:
        print('Starting computations for {} {}'.format(REMOVE, REMOVE_FRACTION))
        print(str(datetime.datetime.now()))
        sent_dfs = {'train': {}, 'test': {}}
        # Running sum of the counts reported by the filter; kept as a
        # notebook-level variable so it can be inspected after the run.
        nb_sentences_removed = 0

        # Case B: reviews split into sentences, with a fraction removed.
        for ttname, tt in test_dic.items():
            for pn, df in tt.items():
                sent_df, removed = _filter_reviews(
                    df, REMOVE, REMOVE_FRACTION, CHUNK_SZ)
                nb_sentences_removed += removed
                sent_dfs[ttname][pn] = sent_df
                if sent_df is None:
                    print('No reviews for {} {}'.format(ttname, pn))

        # Save B
        print('Saving B for {} {}'.format(REMOVE, REMOVE_FRACTION))
        print(str(datetime.datetime.now()))
        _save_pickle(sent_dfs, REMOVE, REMOVE_FRACTION, 'B')

        # Create A with same number of reviews: the unfiltered reviews whose
        # 'asin' id still occurs in B.
        A_dic = {'train': {}, 'test': {}}

        for tt in ['train', 'test']:
            for pn in ['positive', 'negative']:
                df = test_dic[tt][pn]
                sent_df = sent_dfs[tt][pn]
                if sent_df is None:
                    # B kept nothing for this frame; previously this case
                    # crashed on None['asin']. Use an empty selection that
                    # keeps the columns of df.
                    A_dic[tt][pn] = df.iloc[0:0]
                else:
                    A_dic[tt][pn] = df[df['asin'].isin(sent_df['asin'])]

        # Save A
        print('Saving A for {} {}'.format(REMOVE, REMOVE_FRACTION))
        print(str(datetime.datetime.now()))
        _save_pickle(A_dic, REMOVE, REMOVE_FRACTION, 'A')