<!-- # Overview
- Project
    - data engineer 
        - keep acceptable sessions
        - create prediction classes and eliminate end of each session
        - make same input size for rnn
        - add features if this is suitable
    - code each rnn
        - vanilla rnn
        - lstm
        - gru -->

# Overview
- Data Engineering
    - Load Data
    - Initial Data Examination
    - Dataset for Sessions for Entire Session Prediction Problem
        - Remove inacceptable sessions
        - add label
    - (Dataset for Sessions for Early Session Prediction Problem)
        - Remove inacceptable sessions
        - add label
    - Handle NaN Values
    - Feature Encoding
    - Feature Creation
    - Put into Acceptable Input Form
        - Scale Features
        - Padding
        - (Word Embedding)
    - Create Dataloader
- Data Analysis
    - plots

# Imports

In [51]:
# manipulating data
import pandas as pd
import numpy as np

# Neural Networks
import torch
from torch.utils.data import  TensorDataset
from torch import Tensor

# handling time data
import time # for timestamps
import datetime

# data analysis
import matplotlib.pyplot as plt 

#
import math



# Data Engineering
#### Will be done with Pandas

## Load Data

In [2]:
# global path to data (varies for user)
data_path = './shopper_data/release_10_23_2020.csv'

# load dataset
df = pd.read_csv(data_path)


In [3]:
# get column names
col_names = [c for c in df.columns]
features = col_names[:-1]

# sepearate column names into categorical and numerical
cat_cols = []
numer_cols = []

for col in col_names:
    if df[col].dtype == 'O':
        cat_cols.append(col)
    else:
        numer_cols.append(col)

## Initial Data Examination

In [4]:
df.head()

Unnamed: 0,session_id_hash,event_type,product_action,product_skus_hash,server_timestamp_epoch_ms,hashed_url
0,00000005c19449b57d8d05dfc8b5474be0654032,pageview,,,1544469996111,da99729886aff70a02733b6cd69ee7df35622d9302347e...
1,00006a0ada94a5186163a25e9ed9c94481c820d9,pageview,,,1545737959865,e2f7e0cee4272e804f0d323a3513dd01716a5a40ab9abf...
2,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,,,1544908835402,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...
3,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,,,1544908847263,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...
4,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,90add4dd072e160034e98ddea257e0b59441eae00d8955...,1544909035879,8fa1ecf31ececb27ebe9c529966f3d1f907542fe138d5d...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5433611 entries, 0 to 5433610
Data columns (total 6 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   session_id_hash            object
 1   event_type                 object
 2   product_action             object
 3   product_skus_hash          object
 4   server_timestamp_epoch_ms  int64 
 5   hashed_url                 object
dtypes: int64(1), object(5)
memory usage: 248.7+ MB


#### NAN values in each column

In [6]:
df.isna().sum()

session_id_hash                    0
event_type                         0
product_action               2918789
product_skus_hash            3547557
server_timestamp_epoch_ms          0
hashed_url                         0
dtype: int64

#### Value Counts (for reasonable number of classes)

In [7]:
print("Number of Unique Values for each Categoical Column:\n")
num_unique = []
for col in cat_cols:
    num_unique.append(len(df[col].unique()))
    print("{}: ".format(col), num_unique[-1])
    if num_unique[-1] < 100:
        print('\t')
        print(df[col].value_counts())
    print('\n')

Number of Unique Values for each Categoical Column:

session_id_hash:  443660


event_type:  2
	
pageview    4565253
event        868358
Name: event_type, dtype: int64


product_action:  6
	
detail      1640190
add          743363
click         69831
remove        51512
purchase       9926
Name: product_action, dtype: int64


product_skus_hash:  38345


hashed_url:  256598




## Dataset for Sessions for Whole Session Prediction Problem
- Keep sessons with:
    - length >= 5 clicks
    - length <= 155 clicks
- set nan values in actions = 'pageview'
- remove last few actions and add labels

In [8]:
df

Unnamed: 0,session_id_hash,event_type,product_action,product_skus_hash,server_timestamp_epoch_ms,hashed_url
0,00000005c19449b57d8d05dfc8b5474be0654032,pageview,,,1544469996111,da99729886aff70a02733b6cd69ee7df35622d9302347e...
1,00006a0ada94a5186163a25e9ed9c94481c820d9,pageview,,,1545737959865,e2f7e0cee4272e804f0d323a3513dd01716a5a40ab9abf...
2,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,,,1544908835402,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...
3,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,,,1544908847263,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...
4,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,90add4dd072e160034e98ddea257e0b59441eae00d8955...,1544909035879,8fa1ecf31ececb27ebe9c529966f3d1f907542fe138d5d...
...,...,...,...,...,...,...
5433606,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,detail,81001bca02486bcecc944bc88412be729260f00a54d583...,1545571237554,a498ddf07e784d7d00df18a8f51a81e45aedf57ef8a45a...
5433607,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,,,1545571243349,aaf8c8581bfc7373f63dbdde27304c619bc7db1755ec50...
5433608,ffffc776defb14e6ba4c2635ba87251cb414573c,pageview,,,1545748104469,54f2670e3703a7b85cf5015dc130bc6c1011d7f2fce07c...
5433609,ffffce103f74909ba29cc8f4e40d82583b2e1898,pageview,,,1544536113737,f0ba8800a3e7fc0a3ea6904ad219ec44b964658817c52d...


### Keep sessions with length 5 <= L <= 155

In [9]:
maxSeqCutoffSize = 155
minSeqCutoffSize = 5

dfW = df.groupby('session_id_hash')
dfW = dfW.filter(lambda x: (len(x) >= minSeqCutoffSize and len(x) <= maxSeqCutoffSize))
dfW = dfW.reset_index()
dfW.drop('index', axis=1, inplace = True)

In [10]:
# in same statement
dfW

Unnamed: 0,session_id_hash,event_type,product_action,product_skus_hash,server_timestamp_epoch_ms,hashed_url
0,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,,,1544908835402,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...
1,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,,,1544908847263,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...
2,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,90add4dd072e160034e98ddea257e0b59441eae00d8955...,1544909035879,8fa1ecf31ececb27ebe9c529966f3d1f907542fe138d5d...
3,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,,,1544909049946,5481ac8074c7ecec6818281b0d12cfddfd005667637925...
4,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,bd81fd361f0ae5438cb712c3a4e0344d05c6ebd4b7aae6...,1544909231588,05b9fd85f8201a96bd8dd102ae83288aae394c91fd53d1...
...,...,...,...,...,...,...
4198650,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,,,1545570913853,6ddd53c2e8a4513d63b5393db16475c8aefcfbcea0e7c5...
4198651,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,,,1545571133716,a2494e01b9fdc0cb2ac36472f19f6ace950ba23d043990...
4198652,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,detail,81001bca02486bcecc944bc88412be729260f00a54d583...,1545571171266,a498ddf07e784d7d00df18a8f51a81e45aedf57ef8a45a...
4198653,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,detail,81001bca02486bcecc944bc88412be729260f00a54d583...,1545571237554,a498ddf07e784d7d00df18a8f51a81e45aedf57ef8a45a...


In [11]:
print('Number of unique sessions: {} \n'.format(len(dfW['session_id_hash'].unique())))

Number of unique sessions: 203127 



## Handle NaN Values
- product action
- product_skus_hash

In [12]:
# change to pageview in product_action
dfW.loc[dfW['product_action'].isna(), "product_action"] = 'pageview'

In [13]:
# change NaN to 'None'
dfW.loc[dfW['product_skus_hash'].isna(), "product_skus_hash"] = 'None'


In [14]:
# No values should be NaN
dfW.isna().sum()

session_id_hash              0
event_type                   0
product_action               0
product_skus_hash            0
server_timestamp_epoch_ms    0
hashed_url                   0
dtype: int64

In [15]:
dfW.head()

Unnamed: 0,session_id_hash,event_type,product_action,product_skus_hash,server_timestamp_epoch_ms,hashed_url
0,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,pageview,,1544908835402,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...
1,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,pageview,,1544908847263,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...
2,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,90add4dd072e160034e98ddea257e0b59441eae00d8955...,1544909035879,8fa1ecf31ececb27ebe9c529966f3d1f907542fe138d5d...
3,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,pageview,,1544909049946,5481ac8074c7ecec6818281b0d12cfddfd005667637925...
4,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,bd81fd361f0ae5438cb712c3a4e0344d05c6ebd4b7aae6...,1544909231588,05b9fd85f8201a96bd8dd102ae83288aae394c91fd53d1...


## Feature Creation

### boolean of whether a product is being looked at (for Nan Values in product_skus_hash)

In [16]:
dfW['viewing_product'] = 0
dfW.loc[dfW['product_skus_hash'] != 'None', 'viewing_product'] = 1

### Time sitting on page (time between timestamps)

In [17]:
dfW['time_sitting'] = df.groupby('session_id_hash')['server_timestamp_epoch_ms'].diff().fillna(0)

In [18]:
dfW.head()

Unnamed: 0,session_id_hash,event_type,product_action,product_skus_hash,server_timestamp_epoch_ms,hashed_url,viewing_product,time_sitting
0,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,pageview,,1544908835402,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...,0,0.0
1,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,pageview,,1544908847263,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...,0,0.0
2,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,90add4dd072e160034e98ddea257e0b59441eae00d8955...,1544909035879,8fa1ecf31ececb27ebe9c529966f3d1f907542fe138d5d...,1,0.0
3,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,pageview,,1544909049946,5481ac8074c7ecec6818281b0d12cfddfd005667637925...,0,11861.0
4,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,bd81fd361f0ae5438cb712c3a4e0344d05c6ebd4b7aae6...,1544909231588,05b9fd85f8201a96bd8dd102ae83288aae394c91fd53d1...,1,188616.0


### Timestamps to Categorical Time Features (i.e. month, day, hour, minutes)

In [19]:
def timestampToFeatures(data, time_col='server_timestamp_epoch_ms'):
    new_date_cols = ['year', 'month', 'day']
    new_time_cols = ['hour', 'minute', 'second']
    
    data['datetime_string'] = data[time_col].apply(lambda x: time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(x/1000)))
    
    data[['date', 'time']] = data['datetime_string'].str.split(' ', expand = True)
    data[new_date_cols] = data['date'].str.split('-', expand = True).astype(int)
    data[new_time_cols] = data['time'].str.split(':', expand = True).astype(int)
    
    data.drop(['datetime_string', 'date', 'time'], axis=1, inplace=True)
    
    return data

In [20]:
dfW = timestampToFeatures(dfW, 'server_timestamp_epoch_ms')

In [21]:
dfW

Unnamed: 0,session_id_hash,event_type,product_action,product_skus_hash,server_timestamp_epoch_ms,hashed_url,viewing_product,time_sitting,year,month,day,hour,minute,second
0,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,pageview,,1544908835402,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...,0,0.0,2018,12,15,21,20,35
1,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,pageview,,1544908847263,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...,0,0.0,2018,12,15,21,20,47
2,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,90add4dd072e160034e98ddea257e0b59441eae00d8955...,1544909035879,8fa1ecf31ececb27ebe9c529966f3d1f907542fe138d5d...,1,0.0,2018,12,15,21,23,55
3,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,pageview,,1544909049946,5481ac8074c7ecec6818281b0d12cfddfd005667637925...,0,11861.0,2018,12,15,21,24,9
4,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,bd81fd361f0ae5438cb712c3a4e0344d05c6ebd4b7aae6...,1544909231588,05b9fd85f8201a96bd8dd102ae83288aae394c91fd53d1...,1,188616.0,2018,12,15,21,27,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4198650,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,pageview,,1545570913853,6ddd53c2e8a4513d63b5393db16475c8aefcfbcea0e7c5...,0,23421.0,2018,12,23,13,15,13
4198651,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,pageview,,1545571133716,a2494e01b9fdc0cb2ac36472f19f6ace950ba23d043990...,0,127644.0,2018,12,23,13,18,53
4198652,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,detail,81001bca02486bcecc944bc88412be729260f00a54d583...,1545571171266,a498ddf07e784d7d00df18a8f51a81e45aedf57ef8a45a...,1,50187.0,2018,12,23,13,19,31
4198653,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,detail,81001bca02486bcecc944bc88412be729260f00a54d583...,1545571237554,a498ddf07e784d7d00df18a8f51a81e45aedf57ef8a45a...,1,17317.0,2018,12,23,13,20,37


### Frequency Encoding of the Product

In [22]:
def FreqEncodeCol(data, columns):
    freq_enc_str = 'freq_enc_'
    for col in columns:
        new_col_name = freq_enc_str + col
        freq_enc_smap = (data.groupby(col).size()) / len(data[col])# series mapping
        data[new_col_name] = dfW[col].apply(lambda x : freq_enc_smap[x])
        
        if ((data[col].unique() == 'None').any()):
            data.loc[data[col] != 'None', new_col_name] = data.loc[data[col] != 'None', new_col_name] / (1 - freq_enc_smap['None'])
            data.loc[data[col] == 'None', new_col_name] = 0
    return data

In [23]:
dfW = FreqEncodeCol(dfW, ['product_skus_hash', 'hashed_url'])

In [24]:
dfW

Unnamed: 0,session_id_hash,event_type,product_action,product_skus_hash,server_timestamp_epoch_ms,hashed_url,viewing_product,time_sitting,year,month,day,hour,minute,second,freq_enc_product_skus_hash,freq_enc_hashed_url
0,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,pageview,,1544908835402,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...,0,0.0,2018,12,15,21,20,35,0.000000,0.002244
1,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,pageview,,1544908847263,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...,0,0.0,2018,12,15,21,20,47,0.000000,0.002244
2,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,90add4dd072e160034e98ddea257e0b59441eae00d8955...,1544909035879,8fa1ecf31ececb27ebe9c529966f3d1f907542fe138d5d...,1,0.0,2018,12,15,21,23,55,0.000137,0.000052
3,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,pageview,,1544909049946,5481ac8074c7ecec6818281b0d12cfddfd005667637925...,0,11861.0,2018,12,15,21,24,9,0.000000,0.000308
4,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,bd81fd361f0ae5438cb712c3a4e0344d05c6ebd4b7aae6...,1544909231588,05b9fd85f8201a96bd8dd102ae83288aae394c91fd53d1...,1,188616.0,2018,12,15,21,27,11,0.000159,0.000061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4198650,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,pageview,,1545570913853,6ddd53c2e8a4513d63b5393db16475c8aefcfbcea0e7c5...,0,23421.0,2018,12,23,13,15,13,0.000000,0.003274
4198651,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,pageview,,1545571133716,a2494e01b9fdc0cb2ac36472f19f6ace950ba23d043990...,0,127644.0,2018,12,23,13,18,53,0.000000,0.000047
4198652,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,detail,81001bca02486bcecc944bc88412be729260f00a54d583...,1545571171266,a498ddf07e784d7d00df18a8f51a81e45aedf57ef8a45a...,1,50187.0,2018,12,23,13,19,31,0.000670,0.000257
4198653,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,detail,81001bca02486bcecc944bc88412be729260f00a54d583...,1545571237554,a498ddf07e784d7d00df18a8f51a81e45aedf57ef8a45a...,1,17317.0,2018,12,23,13,20,37,0.000670,0.000257


## Feature Encoding

### Function to ordinally encode desired columns

In [25]:
def ord_encoder(data, enc_cols, series=False):
    maps = {}
    for col in enc_cols:
        # encoding value for NaN values

        # put all unique values into list
        myList = data[col].unique()

        # dictionary to map values to encoded integers by index
        mapper = dict(map(reversed,enumerate(myList))) 

        # encode values with value > 0 (0 is value used for padding)
        for key in mapper:
            mapper[key] = mapper[key] + 1

        maps[col] = mapper

        # map to encoded values
        data[col] = data[col].apply(lambda x: mapper[x])        
        
    return maps, data

In [26]:
enc_cols = ['session_id_hash', 'event_type', 'product_action', 'product_skus_hash', 'hashed_url']
col_maps, dfW = ord_encoder(dfW, enc_cols)

## Put into Acceptable Input Form

In [27]:
maps = {}
enc_cols = ['product_action']
for col in enc_cols:
    # encoding value for NaN values

    # put all unique values into list
    myList = df[col].unique()

#     # dictionary to map values to encoded integers by index
    mapper = dict(map(reversed,enumerate(myList))) 

    # encode values with value > 0 (0 is value used for padding)
    for key in mapper:
        mapper[key] = mapper[key] + 1

    maps[col] = mapper

#     # map to encoded values
#     data[col] = data[col].apply(lambda x: mapper[x])

### Create Labels

#### Create Labels for Purchase (True) or No Purchase (False)

In [28]:
## this code was here, but not sure what it did

#df['Expected'] = (g['Value1'].transform(lambda x: x.eq(7).any()))&(g['Value2'].transform(lambda x: x.eq(9).any()))

In [29]:
def CreateTwoLabels(data):
    data['Label'] = data.groupby('session_id_hash')['product_action'].transform(lambda x: x.eq(col_maps['product_action']['purchase']).any())
    labels = data.groupby('session_id_hash')['Label'].unique().apply(list)
    data.drop('Label', axis=1, inplace=True)
    
    new_labels = np.empty([labels.shape[0],1])
    for i, v in enumerate(labels):
        new_labels[i] = v[0]
    return new_labels.astype(int)
    

In [30]:
my2Labels = CreateTwoLabels(dfW)

assert(len(my2Labels) == len(dfW['session_id_hash'].unique()))

In [31]:
np.unique(my2Labels, return_counts=True)

(array([0, 1]), array([194755,   8372]))

#### Create Labels for Purchase, No Purchase, or Abandon Cart

In [32]:
def check_df(data):
  if (data['Label1'] == False and data['Label2'] == False):
    return 0
  elif (data['Label1'] == False and data['Label2'] == True):
    return 1
  else:
    return 2

In [33]:
def CreateThreeLabels(data):
    data['Label1'] = data.groupby('session_id_hash')['product_action'].transform(lambda x: x.eq(col_maps['product_action']['purchase']).any())
    data['Label2'] = data['Label1']
    data.loc[~data['Label1'], 'Label2'] = data.loc[~data['Label1'], ['session_id_hash', 'product_action', 'Label2']].groupby('session_id_hash')['product_action'].transform(lambda x: x.eq(col_maps['product_action']['add']).any())
    data['Label3'] = data.apply(check_df, axis=1)
    labels = data.groupby('session_id_hash')['Label3'].unique().apply(list)
    data.drop(['Label1', 'Label2', 'Label3'], axis=1, inplace=True)
    new_labels = np.empty([labels.shape[0],1])
    for i, v in enumerate(labels):
        new_labels[i] = v[0]
    return new_labels.astype(int)
    

In [34]:
my3Labels = CreateThreeLabels(dfW)
assert(len(my3Labels) == len(dfW['session_id_hash'].unique()))

In [35]:
np.unique(my3Labels, return_counts = True)

(array([0, 1, 2]), array([164674,  30081,   8372]))

### Scale Desired Columns

In [36]:
def NormalizeCols(data, norm_cols):
    data[norm_cols] = (data[norm_cols]-data[norm_cols].min())/(data[norm_cols].max() - data[norm_cols].min())
    return data


In [37]:
norm_cols = ['time_sitting', 'day', 'hour', 'minute']
dfW = NormalizeCols(dfW, norm_cols)

## Remove Unecessary Columns

In [38]:
cols_to_drop = ['server_timestamp_epoch_ms', 'product_skus_hash', 'hashed_url', 'year', 'month', 'second']
dfW.drop(cols_to_drop, axis=1, inplace=True)

### Convert to Data Numpy Array (Later to Tensor)

In [39]:
# find the maximum sequence length to pad to
maxSeqLen = dfW.groupby('session_id_hash').count().max().max() # should be 155 or a bit less than 155

# convert to numpy array
# https://stackoverflow.com/questions/65767833/pandas-dataframe-to-tensor
seq_arrays = np.asarray(dfW.groupby(['session_id_hash']).apply(np.array))


### Get Rid of Purchase Actions and all actions after

In [40]:
# arr3D[x][y][z], x = session, y = sequence_row, z is column... z=1 is 'product action'
# this function removes all rows after purchase is made (including purchase row) for 3d numpy array with above dims
def removeRowsAfterPurchase(arr3D, column, value):
    for i, x in enumerate(arr3D):
        for j, y in enumerate(arr3D[i]):
            if (arr3D[i][j][column] == value):
                arr3D[i] = arr3D[i][1:j]
                break
    return arr3D

In [41]:
colName = 'product_action'
colInd = dfW.columns.get_loc(colName)
seq_arrays = removeRowsAfterPurchase(seq_arrays, colInd, col_maps[colName]['purchase'])


In [42]:
# check that all purchase actions have been removed
colName = 'product_action'
colInd = dfW.columns.get_loc(colName)
maxSeqLen = 0

for i, x in enumerate(seq_arrays):
    for j, y in enumerate(seq_arrays[i]):
        if (j > maxSeqLen):
            maxSeqLen = j
        assert(seq_arrays[i][j][colInd] != col_maps[colName]['purchase'])
maxSeqLen = maxSeqLen + 1
print(maxSeqLen)

155


### Put into Correct Input Shape
- Input data: RNN should have 3 dimensions. (Batch Size, Sequence Length and Input Dimension) (https://www.analyticsvidhya.com/blog/2021/07/understanding-rnn-step-by-step-with-pytorch/#:~:text=Here%20input%20size%20is%202,otherwise%20it%20will%20be%201.)
    - Batch Size is the number of samples we send to the model at a time. (this is chosen)
    - Sequence Length is the length of the sequence of input data (should be about <= 155)
    - Input Dimension is the number of features or dimensions you are using in your data set (number of columns)

#### Padding and Removing SessionID

In [43]:
# drop first column (session_id_hash)
# pad with zeros at end
for i, seq in enumerate(seq_arrays):
    seq_arrays[i] = np.delete(seq_arrays[i], 0, axis = 1)
    seq_arrays[i] = np.pad(seq_arrays[i], ((0, maxSeqLen - seq_arrays[i].shape[0]),(0,0)), 'constant')

In [44]:
seq_arrays = np.asarray([i for i in seq_arrays])

### Convert to Shuffled Numpy Array to Split into Train and Test

In [45]:
train_ratio = 0.8
test_ratio = (1 - train_ratio)

# number of samples in train, val, test
tot_num_samp = len(seq_arrays)
train_sz = math.ceil(train_ratio * tot_num_samp)
test_sz = math.floor(test_ratio * tot_num_samp)

# set seed for randomized shuffling
np.random.seed(2)

# randomize data for splitting
randomize = np.arange(len(seq_arrays))
np.random.shuffle(randomize)
seq_arrays = seq_arrays[randomize]
my2Labels = my2Labels[randomize]
my3Labels = my3Labels[randomize]

# split into train val test (for 2 labels and 3 labels)
train_x, test_x = seq_arrays[:train_sz, :, :], seq_arrays[train_sz:train_sz+test_sz, :, :]
train_y_2, test_y_2 = my2Labels[:train_sz, :], my2Labels[train_sz:train_sz+test_sz, :]
train_y_3, test_y_3 = my3Labels[:train_sz, :], my3Labels[train_sz:train_sz+test_sz, :]

assert(len(train_x) == train_sz & len(train_y_2) == train_sz & len(train_y_3) == train_sz)
assert(len(test_x) == test_sz & len(test_y_2) == test_sz & len(test_y_3) == test_sz)



In [46]:
# see class splits 
print(np.unique(train_y_2, return_counts=True), "should be similar to [136329, 21056, 5860]")
print(np.unique(test_y_2, return_counts=True), "should be similar to [29213, 1255]")

print(np.unique(train_y_3, return_counts=True), "should be similar to [115272, 21056, 5860]")
print(np.unique(test_y_3, return_counts=True),  "should be similar to [24701, 4512, 1255]")

print("Data split evenly")

(array([0, 1]), array([155800,   6702])) should be similar to [136329, 21056, 5860]
(array([0, 1]), array([38955,  1670])) should be similar to [29213, 1255]
(array([0, 1, 2]), array([131761,  24039,   6702])) should be similar to [115272, 21056, 5860]
(array([0, 1, 2]), array([32913,  6042,  1670])) should be similar to [24701, 4512, 1255]
Data split evenly


In [47]:
def checkEvenSplit(tot_lbl, split_lbl, samp_ratio, thresh):

    # count of each class in total dataset vs split dataset
    tot_lbl_counts = np.unique(tot_lbl, return_counts=True)[1]
    split_lbl_counts = np.unique(split_lbl, return_counts=True)[1]
    
    print("Total Unique Labels: ", np.unique(tot_lbl, return_counts=True))
    print("Split Unique Labels: ", np.unique(split_lbl, return_counts=True))
    print("\n")
    
    for i in np.unique(my2Labels, return_counts=True)[0]:
        split_cts = split_lbl_counts[i]
        tot_cts = tot_lbl_counts[i]
        
        lbound = tot_cts * samp_ratio - tot_cts * thresh
        ubound = tot_cts * samp_ratio + tot_cts * thresh
        
        assert((split_cts > lbound) & (split_cts < ubound))
        

In [48]:
# assert that the classes are split well
checkEvenSplit(my2Labels, train_y_2, train_ratio, 0.02)
checkEvenSplit(my2Labels, test_y_2, test_ratio, 0.01)
checkEvenSplit(my3Labels, train_y_3, train_ratio, 0.02)
checkEvenSplit(my3Labels, test_y_3, test_ratio, 0.01)


Total Unique Labels:  (array([0, 1]), array([194755,   8372]))
Split Unique Labels:  (array([0, 1]), array([155800,   6702]))


Total Unique Labels:  (array([0, 1]), array([194755,   8372]))
Split Unique Labels:  (array([0, 1]), array([38955,  1670]))


Total Unique Labels:  (array([0, 1, 2]), array([164674,  30081,   8372]))
Split Unique Labels:  (array([0, 1, 2]), array([131761,  24039,   6702]))


Total Unique Labels:  (array([0, 1, 2]), array([164674,  30081,   8372]))
Split Unique Labels:  (array([0, 1, 2]), array([32913,  6042,  1670]))




In [49]:
# create torch variables
train_dataset_2 = TensorDataset( Tensor(train_x), Tensor(train_y_2) )
test_dataset_2 = TensorDataset( Tensor(test_x), Tensor(test_y_2) )

train_dataset_3 = TensorDataset( Tensor(train_x), Tensor(train_y_3) )
test_dataset_3 = TensorDataset( Tensor(test_x), Tensor(test_y_3) )

In [53]:
# save pytorch datasets for loading
save_datasets = [train_dataset_2, test_dataset_2, train_dataset_3, test_dataset_3]
save_dataset_names = ["train_dataset_2.pt", "test_dataset_2.pt", "train_dataset_3.pt", "test_dataset_3.pt"]

for i, d in enumerate(save_datasets):
    torch.save(d, "./cleaned_data/" + save_dataset_names[i])