## Overview

This notebook loads by-book HVAR values from the separate train/dev/test sets, combines them, and uses the combined lists to generate per-book min-max HVAR values, as well as per-book HVAR mean and standard deviation values.

These min-max, standard deviation, and mean values are then used in the cluster-labeling notebooks to perform min-max rescaling, and z-score normalization of HVAR values.

Once those scaling and standardization operations are performed, labels can be assigned through 1-dimensional clustering.

**Note:** this entire process could be avoided by re-joining the separate dataframes, which were originally one large dataset, but such an operation proved too memory intensive.

In [1]:
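# Use per-book annual_hvar lists to determine per-book global min-max, mean, and standard deviation
# NOTE(review): the original plan was to use train/dev only and label the test set on its own,
# but the cells below also load and join test_hvar_list -- confirm which behavior is intended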

import pickle
import pandas as pd
import numpy as np


In [2]:
# Load by-book hvar lists for train/dev/test sets
# NOTE: pickle.load can execute arbitrary code -- only load pickles produced by this project.
# Each file is opened in its own `with` block so the handle is closed even if loading fails.

with open('train_hvar_list.pkl', 'rb') as train_pkl:
    train_hvar_list = pickle.load(train_pkl)

with open('dev_hvar_list.pkl', 'rb') as dev_pkl:
    dev_hvar_list = pickle.load(dev_pkl)

# The test split is loaded as well; it is joined into `combined` further down
with open('test_hvar_list.pkl', 'rb') as test_pkl:
    test_hvar_list = pickle.load(test_pkl)

In [3]:
# Rows (books, indexed by asin) and columns in each split's frame: train, dev, test
for split_frame in (train_hvar_list, dev_hvar_list, test_hvar_list):
    print(split_frame.shape)

(197298, 1)
(116942, 1)
(116607, 1)


In [4]:
# Join them all on asin
# Start from a copy of the train frame so later column additions do not touch train_hvar_list

combined = train_hvar_list.copy()
combined.head()

Unnamed: 0_level_0,annual_HVAR
asin,Unnamed: 1_level_1
000100039X,"[0.0, 0.5932203389830508, 0.6684981684981685, ..."
0001055178,"[0.0, 0.6228668941979523, 0.41351963746223563,..."
0001712772,"[1.0633132647115944, 0.08782483156881617, 1.40..."
0001714538,"[0.14158262218774242, 0.14807302231237324, 0.2..."
0002005395,"[0.32863145258103243, 1.0067422617223414, 0.73..."


In [5]:
# NaN indicates a book with reviews in one or more group but not all three
# NOTE(review): DataFrame.join defaults to a left join, so only asins already in `combined`
# (the train frame) are kept -- a book that appears only in dev gets no row; confirm intended
combined = combined.join(dev_hvar_list, rsuffix='dev')
combined.head()

Unnamed: 0_level_0,annual_HVAR,annual_HVARdev
asin,Unnamed: 1_level_1,Unnamed: 2_level_1
000100039X,"[0.0, 0.5932203389830508, 0.6684981684981685, ...","[0.14907085971002654, 0.07027339237581826, 0.9..."
0001055178,"[0.0, 0.6228668941979523, 0.41351963746223563,...",
0001712772,"[1.0633132647115944, 0.08782483156881617, 1.40...",[0.7479508196721312]
0001714538,"[0.14158262218774242, 0.14807302231237324, 0.2...",
0002005395,"[0.32863145258103243, 1.0067422617223414, 0.73...","[1.0080341451167463, 0.477124183006536, 0.2864..."


In [34]:
# Now how many unique?
# (a left join keeps the left frame's index, so the row count stays that of the train frame)
combined.shape

(197298, 2)

In [6]:
# Join the test-set lists as a third column (suffix "test"), again as a left join on asin
# NOTE(review): the note that used to sit here read "Ignore annual hvar values in calculations
# for train/dev sets", yet the test column IS folded into the statistics below -- confirm intent

combined = combined.join(test_hvar_list, rsuffix="test")
combined.head()

Unnamed: 0_level_0,annual_HVAR,annual_HVARdev,annual_HVARtest
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000100039X,"[0.0, 0.5932203389830508, 0.6684981684981685, ...","[0.14907085971002654, 0.07027339237581826, 0.9...","[0.34531693472090824, 0.10195530726256984, 0.4..."
0001055178,"[0.0, 0.6228668941979523, 0.41351963746223563,...",,[0.18770892260221136]
0001712772,"[1.0633132647115944, 0.08782483156881617, 1.40...",[0.7479508196721312],
0001714538,"[0.14158262218774242, 0.14807302231237324, 0.2...",,
0002005395,"[0.32863145258103243, 1.0067422617223414, 0.73...","[1.0080341451167463, 0.477124183006536, 0.2864...",[0.29137839276210753]


In [36]:
#combined.shape

In [7]:
# Add a column for the row min_max

# Scratch look at one row's raw cell values; as a non-final expression it displays nothing
combined.iloc[1].values



def group_values(row):
    """Flatten one row of per-split HVAR cells into a single flat list.

    Parameters
    ----------
    row : iterable
        The cells of one book's row. Each cell is either a list of HVAR
        values or a scalar. After the joins, a split the book does not
        appear in shows up as a NaN (or None) placeholder cell.

    Returns
    -------
    list
        The elements of every list cell, in row order, plus any non-missing
        scalar cells in place. Placeholder cells (NaN/None) are dropped so
        they cannot leak into a later min()/max(), whose result with NaN
        depends on element order. NaN values *inside* a list cell are kept.
    """
    flattened_list = []
    for x in row:
        if isinstance(x, list):
            flattened_list.extend(x)
        elif x is None or (isinstance(x, float) and x != x):
            # NaN is the only float that is not equal to itself: this cell is
            # the placeholder for a split with no reviews of this book
            continue
        else:
            flattened_list.append(x)

    return flattened_list
            


#[inner for outer in combined.iloc[1].values for inner in outer]

In [38]:
#print(group_values(combined.iloc[1].values))

In [8]:
# Flatten only the three per-split HVAR columns. Selecting them explicitly keeps this cell
# re-runnable: applying over the whole frame would, on a second run, also fold the existing
# 'global_values' lists (and any later derived columns) back into the result.
hvar_columns = ['annual_HVAR', 'annual_HVARdev', 'annual_HVARtest']
combined['global_values'] = combined[hvar_columns].apply(group_values, axis=1)

In [9]:
# Preview the frame with the new 'global_values' column
combined.head()

Unnamed: 0_level_0,annual_HVAR,annual_HVARdev,annual_HVARtest,global_values
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
000100039X,"[0.0, 0.5932203389830508, 0.6684981684981685, ...","[0.14907085971002654, 0.07027339237581826, 0.9...","[0.34531693472090824, 0.10195530726256984, 0.4...","[0.0, 0.5932203389830508, 0.6684981684981685, ..."
0001055178,"[0.0, 0.6228668941979523, 0.41351963746223563,...",,[0.18770892260221136],"[0.0, 0.6228668941979523, 0.41351963746223563,..."
0001712772,"[1.0633132647115944, 0.08782483156881617, 1.40...",[0.7479508196721312],,"[1.0633132647115944, 0.08782483156881617, 1.40..."
0001714538,"[0.14158262218774242, 0.14807302231237324, 0.2...",,,"[0.14158262218774242, 0.14807302231237324, 0.2..."
0002005395,"[0.32863145258103243, 1.0067422617223414, 0.73...","[1.0080341451167463, 0.477124183006536, 0.2864...",[0.29137839276210753],"[0.32863145258103243, 1.0067422617223414, 0.73..."


In [10]:
# Per-book (min, max) over all flattened values.
# np.nanmin/np.nanmax replace the built-in min()/max(): the built-ins never see NaN as smaller
# or larger, so their result depends on where a NaN sits in the list (a leading NaN wins).
# This also matches the NaN-aware mean/std used for 'mean_sd'. float() keeps plain Python floats.
combined['min_max'] = combined['global_values'].apply(
    lambda x: (float(np.nanmin(x)), float(np.nanmax(x)))
)

In [11]:
def _mean_and_sd(values):
    """Return (mean, standard deviation) of `values`, ignoring NaN entries."""
    return (np.nanmean(values), np.nanstd(values))


# np.nanstd is called with its default ddof (0), i.e. the population standard deviation
combined['mean_sd'] = combined['global_values'].apply(_mean_and_sd)

In [12]:
# Preview the frame with the derived 'min_max' and 'mean_sd' tuple columns
combined.head()

Unnamed: 0_level_0,annual_HVAR,annual_HVARdev,annual_HVARtest,global_values,min_max,mean_sd
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
000100039X,"[0.0, 0.5932203389830508, 0.6684981684981685, ...","[0.14907085971002654, 0.07027339237581826, 0.9...","[0.34531693472090824, 0.10195530726256984, 0.4...","[0.0, 0.5932203389830508, 0.6684981684981685, ...","(0.0, 24.302026375040207)","(0.9683401157041025, 2.6272423654684296)"
0001055178,"[0.0, 0.6228668941979523, 0.41351963746223563,...",,[0.18770892260221136],"[0.0, 0.6228668941979523, 0.41351963746223563,...","(0.0, 1.0454458659537904)","(0.3033125129582897, 0.30804662529148785)"
0001712772,"[1.0633132647115944, 0.08782483156881617, 1.40...",[0.7479508196721312],,"[1.0633132647115944, 0.08782483156881617, 1.40...","(0.08782483156881617, 1.4056482670089858)","(0.7099407251829095, 0.49265711624864933)"
0001714538,"[0.14158262218774242, 0.14807302231237324, 0.2...",,,"[0.14158262218774242, 0.14807302231237324, 0.2...","(0.0, 0.2407651715039578)","(0.13260520400101836, 0.08602698143460721)"
0002005395,"[0.32863145258103243, 1.0067422617223414, 0.73...","[1.0080341451167463, 0.477124183006536, 0.2864...",[0.29137839276210753],"[0.32863145258103243, 1.0067422617223414, 0.73...","(0.20402459474566798, 1.0080341451167463)","(0.5565624088756161, 0.27842222517588405)"


In [13]:
# "global" statistics: keep only the two per-book summary columns. They were computed from
# 'global_values', i.e. from every HVAR column joined into `combined` above.
# NOTE(review): that includes the test column, although this cell was previously described as
# covering the train and dev sets only -- confirm which is intended
stat_columns = ['min_max', 'mean_sd']
global_min_max = pd.DataFrame(combined[stat_columns])

In [14]:
# Preview the per-book statistics frame
global_min_max.head()

Unnamed: 0_level_0,min_max,mean_sd
asin,Unnamed: 1_level_1,Unnamed: 2_level_1
000100039X,"(0.0, 24.302026375040207)","(0.9683401157041025, 2.6272423654684296)"
0001055178,"(0.0, 1.0454458659537904)","(0.3033125129582897, 0.30804662529148785)"
0001712772,"(0.08782483156881617, 1.4056482670089858)","(0.7099407251829095, 0.49265711624864933)"
0001714538,"(0.0, 0.2407651715039578)","(0.13260520400101836, 0.08602698143460721)"
0002005395,"(0.20402459474566798, 1.0080341451167463)","(0.5565624088756161, 0.27842222517588405)"


In [15]:
# Count missing entries per column (this checks the tuple cells themselves, not their contents)
global_min_max.isnull().sum()

min_max    0
mean_sd    0
dtype: int64

In [16]:
# Summarize the two tuple-valued columns: count, number of distinct tuples, most frequent tuple
global_min_max.describe()

Unnamed: 0,min_max,mean_sd
count,197298,197298
unique,157843,197291
top,"(0.0, 3.7244897959183674)","(0.23127648663033926, 0.000284259970479301)"
freq,38,3


In [17]:
# Persist the per-book (min, max) and (mean, sd) tuples as a pickle in the working directory
global_min_max.to_pickle('global_min_max.pkl')