In [1]:
import numpy as np
import pandas as pd
import pickle
import gzip
import math
import random
from sklearn.model_selection import train_test_split

# Directory (relative path) that the input pickle is read from and the
# balanced output pickle is written to.
data_path = '../../datasets/'

In [2]:
# Load the pickled reviews into `comments_df`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# The `with` block closes the file handle as soon as loading finishes
# (or fails), instead of leaving it open for the kernel's lifetime.
with open(data_path + "amzn_2M_skip2M.pkl", "rb") as pickle_in:
    comments_df = pickle.load(pickle_in)

# Configuration

In [3]:
# --- Review filtering ---
# Reviews need at least this many sentences to be kept.
MIN_NB_SENTENCES = 5
# True: merge 1-2 stars into NEG and 4-5 stars into POS, dropping 3 stars.
# False: keep the five star ratings as separate classes.
POS_NEG = True

# --- Sampling ---
# Share of the samples of each class that goes to the test set.
TEST_RATIO = 0.2
# Number of samples (training + test together) taken from each class.
NB_EACH_CLASS = 50000

## Check that balanced classes can be reached for the requested configuration

In [4]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
# (Comments stay on their own lines: text after a line magic would be
# passed to the magic as an argument.)
%load_ext autoreload
# Mode 2: reload modules before each cell is executed.
%autoreload 2

In [None]:
# takes less than 15 minutes
%reload_ext autoreload
from utils import nb_sentences, not_about_support

minimum = NB_EACH_CLASS + 10
all = {}
for i in range(1,6):
    all[i] = comments_df[comments_df['overall'] == i]
    all[i]['nb_sentences'] = nb_sentences(all[i]['reviewText'])
    all[i] = all[i][all[i]['nb_sentences'] >= MIN_NB_SENTENCES]
    all[i] = all[i][not_about_support(all[i]['reviewText'])]
    
    minimum = min(all[i].shape[0], minimum)
if minimum < NB_EACH_CLASS:
    print('Maximum for balanced classes is ', minimum)
else:
    print('No problem!')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [None]:
# Show the first rows of every rating's filtered reviews with untruncated text.
from IPython.display import display

# `None` is the supported way to disable column-width truncation; the former
# sentinel -1 is deprecated and rejected by recent pandas releases.
# Old releases that only accept integers raise ValueError for None, so fall
# back to -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

for i in range(1,6):
    display(all[i].head(5))

## Create train and test sets at random, from most recent reviews

In [None]:
# Split the last NB_EACH_CLASS rows of every rating into train and test parts,
# using one random boolean mask shared by all ratings.
train_test = {'train': {}, 'test':{}}
random.seed(0) # for reproducibility
train_idx_filter = np.ones((NB_EACH_CLASS,), dtype=bool)

# Positions drawn here are switched off in the mask and therefore end up in
# the test set; all remaining positions form the training set.
for i in random.sample(range(NB_EACH_CLASS),
                       math.floor(NB_EACH_CLASS * TEST_RATIO)):
    train_idx_filter[i] = False

for i in range(1,6):
    # Fail early with a clear message: a shorter frame would otherwise break
    # on the mask-length mismatch below with a far less helpful error.
    if all[i].shape[0] < NB_EACH_CLASS:
        raise ValueError(
            'Rating {} has only {} usable reviews, fewer than NB_EACH_CLASS={}'
            .format(i, all[i].shape[0], NB_EACH_CLASS))
    # Keep the last rows -- presumably the most recent reviews; this relies on
    # the source data being in chronological order (TODO confirm).
    latest = all[i].iloc[-NB_EACH_CLASS:]
    train_test['train'][i] = latest[train_idx_filter]
    train_test['test'][i] = latest[~train_idx_filter]
    

In [None]:

# Merge the per-rating train/test frames into the final class layout.
if POS_NEG:
    # Binary layout: 4-5 stars form 'positive', 1-2 stars form 'negative';
    # 3-star reviews are left out.
    stars_by_label = (('positive', (4, 5)), ('negative', (1, 2)))
    train_test_pos_neg = {
        split: {
            label: pd.concat([train_test[split][star] for star in stars])
            for label, stars in stars_by_label
        }
        for split in ('train', 'test')
    }
else:
    # Five-class layout: stack the ratings 1 to 5, in that order, per split.
    train_test_merged_classes = {
        split: pd.concat([train_test[split][star] for star in range(1, 6)])
        for split in ('train', 'test')
    }

## Result check

In [None]:
# Sanity check: compare the size of every generated set with the size implied
# by the configuration, and report any mismatch.
#
# Expected sizes are computed as integers, the same way the split mask is
# built (floor of NB_EACH_CLASS * TEST_RATIO rows per rating go to test), so
# no float equality is involved.
nb_test_per_rating = math.floor(NB_EACH_CLASS * TEST_RATIO)
nb_train_per_rating = NB_EACH_CLASS - nb_test_per_rating

if POS_NEG:
    # Each label merges two star ratings.
    expected_rows = {'train': 2 * nb_train_per_rating,
                     'test': 2 * nb_test_per_rating}
    pb = any(train_test_pos_neg[tt][pn].shape[0] != expected_rows[tt]
             for tt in ('train', 'test')
             for pn in ('positive', 'negative'))
else:
    # Each split stacks the five star ratings.
    expected_rows = {'train': 5 * nb_train_per_rating,
                     'test': 5 * nb_test_per_rating}
    pb = any(train_test_merged_classes[tt].shape[0] != expected_rows[tt]
             for tt in ('train', 'test'))

if pb:
    print('@@@ There was a problem during creation @@@')
else:
    print('No problems found')

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
fig, axs = plt.subplots(2,2,figsize=(12,7))
for i, tt in enumerate(['train', 'test']):
    for j, pn in enumerate(['positive', 'negative']):
        axs[i][j].hist(train_test_pos_neg[tt][pn]['nb_sentences'],bins=100, range=(4,30));

## Save the balanced train and test sets

In [None]:
# Pickle the generated train/test sets into data_path; the file name starts
# with the total number of samples taken (train + test, all classes).
if POS_NEG:
    # 4 star ratings are used (1, 2, 4 and 5).
    samples_nb = str(NB_EACH_CLASS * 4)
    out_name = samples_nb + "_balanced_pos_neg_train_test_reviews.pkl"
    to_save = train_test_pos_neg
else:
    # All 5 star ratings are used.
    samples_nb = str(NB_EACH_CLASS * 5)
    out_name = samples_nb + "_balanced_train_test_reviews.pkl"
    to_save = train_test_merged_classes

# The context manager closes (and flushes) the file even if pickle.dump raises.
with open(data_path + out_name, "wb") as pickle_out:
    pickle.dump(to_save, pickle_out)

In [None]:
# Preview two rows of the saved training data. The frame is chosen according
# to POS_NEG because only one of the two result dicts exists in a given run.
(train_test_pos_neg['train']['positive'] if POS_NEG
 else train_test_merged_classes['train']).head(2)