In [5]:
import numpy as np
import pandas as pd
import pickle
import gzip
import math
import random
from sklearn.model_selection import train_test_split

# Relative path of the directory the input pickle is read from and the
# balanced output pickle is written to.
data_path = '../../datasets/'

In [6]:
# Load the pickled reviews object into memory.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at a file you produced yourself or otherwise trust.
# The `with` block guarantees the file handle is closed even if loading fails.
with open(data_path + "amzn_2M_skip2M.pkl", "rb") as pickle_in:
    comments_df = pickle.load(pickle_in)

# Configuration

In [31]:
NB_EACH_CLASS = 90000 # Rows kept per star rating (train + test combined)
TEST_RATIO = 0.2 # Fraction of each class held out as the test set
POS_NEG = True  # True: merge 1-2 stars into negative and 4-5 stars into positive, dropping 3; False: keep all five star classes

## Check that balanced classes can be reached for the requested configuration

In [32]:
# Bucket the reviews by star rating (1 to 5). The name `all` shadows the
# builtin, but it is kept because later cells index this dict by rating.
all = {stars: comments_df[comments_df['overall'] == stars] for stars in range(1, 6)}

# Row count of the smallest bucket, capped at NB_EACH_CLASS + 10.
minimum = min(NB_EACH_CLASS + 10, *(bucket.shape[0] for bucket in all.values()))

# Report whether every rating has enough rows for the requested class size.
if minimum >= NB_EACH_CLASS:
    print('No problem!')
else:
    print('Maximum for balanced classes is ', minimum)

No problem!


In [33]:
# Preview the first two rows to see which columns are available.
comments_df.head(2)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A3R5OBKS7OM2IR,143502,Rebecca L. Johnson,"[0, 0]",This has some great tips as always and is help...,5.0,Alton... nough said,1358380800,"01 17, 2013"
1,A3R5OBKS7OM2IR,143529,Rebecca L. Johnson,"[0, 0]",This is a great pastry guide. I love how Alto...,5.0,Ah Alton...,1380672000,"10 2, 2013"


## Create train and test sets at random, from most recent reviews

In [34]:
random.seed(0)  # fixed seed so the same rows are held out on every run

# Draw the positions (within a window of NB_EACH_CLASS rows) that go to the test set.
n_test = math.floor(NB_EACH_CLASS * TEST_RATIO)
test_positions = random.sample(range(NB_EACH_CLASS), n_test)

# Boolean mask over the window: True -> train row, False -> test row.
train_idx_filter = np.ones((NB_EACH_CLASS,), dtype=bool)
train_idx_filter[test_positions] = False

# The same mask is applied to every star rating, so all classes share one split.
train_test = {'train': {}, 'test': {}}
for i in range(1, 6):
    # Keep the last NB_EACH_CLASS rows of this rating.
    # NOTE(review): these are the most recent reviews only if comments_df is
    # in chronological order -- confirm against how the pickle was built.
    newest = all[i][-NB_EACH_CLASS:]
    train_test['train'][i] = newest[train_idx_filter]
    train_test['test'][i] = newest[~train_idx_filter]
    

In [35]:

if POS_NEG:
    # Star ratings folded into each sentiment label; 3-star reviews are left out.
    stars_by_label = {'positive': (4, 5), 'negative': (1, 2)}
    train_test_pos_neg = {
        split: {
            label: pd.concat([train_test[split][stars] for stars in star_group])
            for label, star_group in stars_by_label.items()
        }
        for split in ('train', 'test')
    }
else:
    # Keep all five ratings: stack them into one frame per split, in rating order.
    train_test_merged_classes = {
        split: pd.concat([train_test[split][stars] for stars in range(1, 6)])
        for split in ('train', 'test')
    }

## Result check

In [36]:
# Sanity-check the sizes of the sets that were just built.
#
# Expected per-rating sizes use the same integer arithmetic as the split mask
# (floor(NB_EACH_CLASS * TEST_RATIO) test rows, the remainder train rows), so
# the comparison never depends on floating-point equality.
n_test_per_class = math.floor(NB_EACH_CLASS * TEST_RATIO)
n_train_per_class = NB_EACH_CLASS - n_test_per_class
expected_per_class = {'train': n_train_per_class, 'test': n_test_per_class}

pb = False
if POS_NEG:
    # Each sentiment label merges two star ratings; verify all four subsets.
    for split, per_class in expected_per_class.items():
        for label in ('positive', 'negative'):
            if train_test_pos_neg[split][label].shape[0] != per_class * 2:
                pb = True
else:
    # Each split stacks all five star ratings. Flag a problem when a size is
    # WRONG (the previous condition flagged the case where sizes were right).
    for split, per_class in expected_per_class.items():
        if train_test_merged_classes[split].shape[0] != per_class * 5:
            pb = True

if pb:
    print('@@@ There was a problem during creation @@@')
else:
    print('No problems found')

No problems found


## Save the balanced train and test sets

In [37]:
# Pick the object to persist and the matching file name. The numeric prefix is
# the total number of rows across train and test: four star ratings are used
# in positive/negative mode, five otherwise.
if POS_NEG:
    samples_nb = str(NB_EACH_CLASS * 4)
    out_name = samples_nb + "_balanced_pos_neg_train_test_reviews.pkl"
    payload = train_test_pos_neg
else:
    samples_nb = str(NB_EACH_CLASS * 5)
    out_name = samples_nb + "_balanced_train_test_reviews.pkl"
    payload = train_test_merged_classes

# `with` closes the file even if pickle.dump raises, so the handle never leaks.
with open(data_path + out_name, "wb") as pickle_out:
    pickle.dump(payload, pickle_out)