# Creation of shortened reviews with reviewer or plot emotions removed

In [1]:
import numpy as np
import pandas as pd
import pickle
import math
from nltk.tokenize import sent_tokenize
import datetime

from IPython.display import Markdown, display

import sys
sys.path.append('..')

# Show complete cell contents (e.g. whole review texts) instead of truncating them.
# `None` is the supported "no limit" value; passing a negative width was
# deprecated in pandas 1.0. Older pandas versions only accept integers for this
# option, so fall back to the legacy -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# From this project
from src.NLP import WordBag, AboutMovie

## Configuration

In [4]:
# Subsampling from Amazon reviews.
# NB_SAMPLES is also the numeric prefix of the input pickle's file name, so it
# must match an existing file in `data_path`.
NB_SAMPLES = 164000 # alternative value noted by the author: 4000; up to 200k, then change the input file

# Directory that holds the input pickle and receives the pickles written by this notebook.
data_path = '../datasets/'


In [5]:
# Alternative input file (kept for reference):
# file_name = '360000_balanced_train_test_reviews.pkl'
file_name = '_balanced_pos_neg_train_test_reviews_5_no_support.pkl'

# Load the nested dictionary of review tables, indexed first by split
# ('train'/'test') and then by polarity ('positive'/'negative').
# The `with` block guarantees the file handle is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
with open(data_path + str(NB_SAMPLES) + file_name, "rb") as pickle_in:
    train_test_dic0 = pickle.load(pickle_in)

In [6]:
# Sanity check: (rows, columns) of the positive training table as loaded.
train_test_dic0['train']['positive'].shape

(65600, 10)

In [7]:
# NOTE(review): SAMPLE_FRACTION is not referenced by any cell shown in this notebook.
SAMPLE_FRACTION = 1

# Working copy of the data: same split/polarity layout as the loaded dictionary,
# restricted to reviews with exactly five sentences and trimmed to the columns
# needed downstream.
test_dic = {'train': {}, 'test': {}}

for split in ['train', 'test']:
    for polarity in ['positive', 'negative']:
        all_reviews = train_test_dic0[split][polarity]
        five_sentence_reviews = all_reviews[all_reviews['nb_sentences'] == 5]

        # Two consecutive resets: the code relies on the first one producing an
        # 'index' column (dropped below) and the second one producing a
        # 'level_0' column holding a fresh 0..n-1 counter.
        renumbered = five_sentence_reviews.reset_index().reset_index()

        trimmed = renumbered.drop(
            columns=['reviewerName', 'helpful', 'asin', 'index', 'nb_sentences',
                     'summary', 'unixReviewTime', 'reviewTime'])

        # The counter replaces the original 'asin' column under the same name.
        test_dic[split][polarity] = trimmed.rename(columns={'level_0': 'asin'})


In [8]:
# Preview the first three rows of the reshaped positive training table.
test_dic['train']['positive'].head(3)

Unnamed: 0,asin,reviewerID,reviewText,overall
0,0,A3OPUUL9DQP8QL,"It started out so bad that I nearly left the theatre! After having positively loved the first one, I was downright shocked to see such a mess made of the second. But after 15 minutes it started to improve, and so much that by the end of the movie we were all howling with laughter and really enjoying ourselves. Silly beginning but incredible improvement. So to make a long story short: quite a good sequel.",4.0
1,1,AFWFAFKZZTCVW,"This is either a love it or hate it movie. It has a bit of a cliched and predictable plot, but it will none-the-less keep you watching. If you're a fan of the novels each characters to come from, its especially good. The special effects really add to the already classic stories in this lovely cross over. I really enjoyed it, and everyone should at least give it a try.",4.0
2,2,A265CL5S3XTVT7,"I would have put this at a 3 but the effects were amazing enough for another star. I enjoyed the movie and found the characters riviting BUT the ending was missing something. I cannot really put my finger on it but I was left with a puzzled look on my face. I think the most intriguing character is Jekyll and Hyde. Overall, it was not a disappointing buy, but I don't see it as one I would watch over and over again.",4.0


In [9]:
# Print the shape of each split/polarity table, then show the overall row count.
table_shapes = [test_dic[split][polarity].shape
                for split in ['train', 'test']
                for polarity in ['positive', 'negative']]
for table_shape in table_shapes:
    print(table_shape)
total = sum(n_rows for n_rows, _ in table_shapes)
total

(13757, 4)
(11364, 4)
(3428, 4)
(2857, 4)


31406

## Remove objective sentences for case B

In [17]:
from src.subjective_filter import SubjectiveFilter

In [18]:
# (Re)load IPython's autoreload extension.
# NOTE(review): no `%autoreload` mode is set in the cells shown here — confirm
# whether `%autoreload 2` is also needed for edits to `src` to be picked up.
%reload_ext autoreload
# Directory with the pickled artefacts handed to SubjectiveFilter.
# NOTE(review): the other paths in this notebook point at the parent directory
# ('..'); confirm this relative path resolves from the notebook's working directory.
obj_path = 'src/obj_subj_dev/'
# Judging by the file names: a fitted TF-IDF vectorizer and a fitted classifier
# for objective/subjective sentence classification — TODO confirm.
fit_obj_tf = obj_path + 'fit_tfidf_vectorizer_for_obj_subj_sentences_classification.pkl'
fit_obj_model = obj_path + 'GBC_300_0.5_5_0.88cv.pkl'
# Filter object used below to split reviews into sentences and remove a fraction of them.
subj_filter = SubjectiveFilter(fit_obj_tf, fit_obj_model)

In [19]:
# Re-check the size of every split/polarity table before filtering:
# display each table's shape, then print the combined number of reviews.
total = 0
for tables_by_polarity in test_dic.values():
    for reviews in tables_by_polarity.values():
        n_rows = reviews.shape[0]
        display(reviews.shape)
        total = total + n_rows
print(total)

(13757, 4)

(11364, 4)

(3428, 4)

(2857, 4)

31406


# Save A

In [20]:
# Case A: the five-sentence reviews as prepared above, before any sentence removal.
# The `with` block closes the file even if pickling raises part-way through.
with open(data_path
          + 'reviews_A.pkl'
          , "wb") as pickle_out:
    pickle.dump(test_dic, pickle_out)

# Create & save various B cases

In [None]:
# Number of reviews handed to the subjectivity filter per call.
CHUNK_SZ = 1

# For each kind of sentence to remove ('subj' / 'obj') and each removal fraction,
# filter every split/polarity table chunk by chunk and pickle the result as one
# "B" data set.
for REMOVE in ['subj', 'obj']:
    for REMOVE_FRACTION in [0.8, 0.6, 0.4, 0.2]:
        print('Starting computations for {} {}'.format(REMOVE, REMOVE_FRACTION))
        print(datetime.datetime.now())
        sent_dfs = {'train':{},'test':{}}
        # Running total of the counts reported by the filter for this configuration.
        nb_sentences_removed = 0

        for ttname, tt in test_dic.items():
            for pn, df in tt.items():
                df_list = []
                for start in range(0, df.shape[0], CHUNK_SZ):
                    chunk = df.iloc[start:start + CHUNK_SZ, :]
                    # Presumably expands each review into one row per sentence
                    # (see SubjectiveFilter.to_one_sent_per_row) — TODO confirm.
                    sentences = subj_filter.to_one_sent_per_row(chunk)
                    filtered, removed = subj_filter.transform(
                            sentences,
                            'sentence',
                            remove_fraction = REMOVE_FRACTION,
                            debug_level=0,
                            remove=REMOVE)
                    # The filter signals "could not process this chunk" with -1.
                    if removed == -1:
                        print('Warning: skipping a review, not enough sentences')
                        continue
                    df_list.append(filtered)
                    nb_sentences_removed += removed

                if len(df_list) == 0:
                    sent_dfs[ttname][pn] = None
                    print('No reviews for {} {}'.format(ttname, pn))
                    continue
                # Stack all per-chunk frames in a single pass. The previous
                # implementation outer-merged them one at a time into a growing
                # frame, which is quadratic in the number of reviews.
                # NOTE(review): rows now stay in review/sentence order; the
                # merge-based version may have returned them in a different
                # (key-sorted) order — confirm nothing downstream relied on that.
                sent_dfs[ttname][pn] = pd.concat(df_list, ignore_index=True)

        # Save B
        print('Saving B for {} {}'.format(REMOVE, REMOVE_FRACTION))
        print(datetime.datetime.now())
        # `with` closes the file even if pickling fails part-way through.
        with open(data_path
                  + 'reviews_wout_top_' + str(int(round(REMOVE_FRACTION * 100)))
                  + 'pct_' + REMOVE + '_B.pkl'
                  , "wb") as pickle_out:
            pickle.dump(sent_dfs, pickle_out)

Starting computations for subj 0.8
2019-08-30 17:31:28.155274
