<!-- # Overview
- Project
    - data engineer 
        - keep acceptable sessions
        - create prediction classes and eliminate end of each session
        - make same input size for rnn
        - add features if this is suitable
    - code each rnn
        - vanilla rnn
        - lstm
        - gru -->

# Overview
- Data Engineering
    - Load Data
    - Initial Data Examination
    - Dataset for Sessions for Entire Session Prediction Problem
        - Remove inacceptable sessions and add label
    - (Dataset for Sessions for Early Session Prediction Problem)
        - Remove inacceptable sessions and add label
    - Convert dataset to be readable for model (i.e. numerical)
    - Feature Creation
    - Put into Acceptable Input Form
        - Padding
        - (Word Embedding)
- Data Analysis
    - 

- Modelling
    - Train-Test-Split
    - RNN
        - Model Def
        - Model Training
    - LSTM
        - Model Def
        - Model Training
    - GRU
        - Model Def
        - Model Training
    - (Transformer Model Def)
        - Transfer Learning/Model Def
        - Model Training
        
- Evaluation and Comparison of Models
    - Numerical Evalutaion Metrics
        - AUC and F! Score
       - (Plotting number results comparing each as parameters change)
    - Plotted Evaluation Metrics
        - ROC Curve

# Imports

In [9]:
import pandas as pd
import numpy as np
import torch

# Data Engineering
#### Will be done with Pandas

## Load Data

In [10]:
# global path to data (varies for user)
data_path = '/Users/jonathanwozny/Documents/GraduateSchool/Fall2022/CAP6617/Project/shopper_intent_prediction/release_10_23_2020.csv'

# load dataset
df = pd.read_csv(data_path)


In [11]:
# get column names
col_names = [c for c in df.columns]
features = col_names[:-1]

# sepearate column names into categorical and numerical
cat_cols = []
numer_cols = []

for col in col_names:
    if df[col].dtype == 'O':
        cat_cols.append(col)
    else:
        numer_cols.append(col)

## Initial Data Examination

In [12]:
df.head()

Unnamed: 0,session_id_hash,event_type,product_action,product_skus_hash,server_timestamp_epoch_ms,hashed_url
0,00000005c19449b57d8d05dfc8b5474be0654032,pageview,,,1544469996111,da99729886aff70a02733b6cd69ee7df35622d9302347e...
1,00006a0ada94a5186163a25e9ed9c94481c820d9,pageview,,,1545737959865,e2f7e0cee4272e804f0d323a3513dd01716a5a40ab9abf...
2,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,,,1544908835402,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...
3,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,,,1544908847263,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...
4,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,90add4dd072e160034e98ddea257e0b59441eae00d8955...,1544909035879,8fa1ecf31ececb27ebe9c529966f3d1f907542fe138d5d...


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5433611 entries, 0 to 5433610
Data columns (total 6 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   session_id_hash            object
 1   event_type                 object
 2   product_action             object
 3   product_skus_hash          object
 4   server_timestamp_epoch_ms  int64 
 5   hashed_url                 object
dtypes: int64(1), object(5)
memory usage: 248.7+ MB


#### NAN values in each column

In [14]:
df.isna().sum()

session_id_hash                    0
event_type                         0
product_action               2918789
product_skus_hash            3547557
server_timestamp_epoch_ms          0
hashed_url                         0
dtype: int64

#### Value Counts (for reasonable number of classes)

In [15]:
print("Number of Unique Values for each Categoical Column:\n")
num_unique = []
for col in cat_cols:
    num_unique.append(len(df[col].unique()))
    print("{}: ".format(col), num_unique[-1])
    if num_unique[-1] < 100:
        print('\t')
        print(df[col].value_counts())
    print('\n')

Number of Unique Values for each Categoical Column:

session_id_hash:  443660


event_type:  2
	
pageview    4565253
event        868358
Name: event_type, dtype: int64


product_action:  6
	
detail      1640190
add          743363
click         69831
remove        51512
purchase       9926
Name: product_action, dtype: int64


product_skus_hash:  38345


hashed_url:  256598




## Dataset for Sessions for Whole Session Prediction Problem
- Keep sessons with:
    - length >= 5 clicks
    - length <= 155 clicks
- set nan values in actions = 'pageview'
- remove last few actions and add labels

In [16]:
df

Unnamed: 0,session_id_hash,event_type,product_action,product_skus_hash,server_timestamp_epoch_ms,hashed_url
0,00000005c19449b57d8d05dfc8b5474be0654032,pageview,,,1544469996111,da99729886aff70a02733b6cd69ee7df35622d9302347e...
1,00006a0ada94a5186163a25e9ed9c94481c820d9,pageview,,,1545737959865,e2f7e0cee4272e804f0d323a3513dd01716a5a40ab9abf...
2,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,,,1544908835402,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...
3,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,,,1544908847263,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...
4,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,90add4dd072e160034e98ddea257e0b59441eae00d8955...,1544909035879,8fa1ecf31ececb27ebe9c529966f3d1f907542fe138d5d...
...,...,...,...,...,...,...
5433606,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,detail,81001bca02486bcecc944bc88412be729260f00a54d583...,1545571237554,a498ddf07e784d7d00df18a8f51a81e45aedf57ef8a45a...
5433607,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,,,1545571243349,aaf8c8581bfc7373f63dbdde27304c619bc7db1755ec50...
5433608,ffffc776defb14e6ba4c2635ba87251cb414573c,pageview,,,1545748104469,54f2670e3703a7b85cf5015dc130bc6c1011d7f2fce07c...
5433609,ffffce103f74909ba29cc8f4e40d82583b2e1898,pageview,,,1544536113737,f0ba8800a3e7fc0a3ea6904ad219ec44b964658817c52d...


### Keep sessions with length 5 <= L <= 155

In [17]:
dfW = df.groupby('session_id_hash')
dfW = dfW.filter(lambda x: (len(x) >= 5 and len(x) <=155))
dfW = dfW.reset_index()
dfW.drop('index', axis=1, inplace = True)

In [18]:
# in same statement
dfW

Unnamed: 0,session_id_hash,event_type,product_action,product_skus_hash,server_timestamp_epoch_ms,hashed_url
0,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,,,1544908835402,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...
1,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,,,1544908847263,ea7b2493be61ff454f8cce412f9dc281e605daec8c43b5...
2,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,90add4dd072e160034e98ddea257e0b59441eae00d8955...,1544909035879,8fa1ecf31ececb27ebe9c529966f3d1f907542fe138d5d...
3,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,,,1544909049946,5481ac8074c7ecec6818281b0d12cfddfd005667637925...
4,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,bd81fd361f0ae5438cb712c3a4e0344d05c6ebd4b7aae6...,1544909231588,05b9fd85f8201a96bd8dd102ae83288aae394c91fd53d1...
...,...,...,...,...,...,...
4198650,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,,,1545570913853,6ddd53c2e8a4513d63b5393db16475c8aefcfbcea0e7c5...
4198651,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,,,1545571133716,a2494e01b9fdc0cb2ac36472f19f6ace950ba23d043990...
4198652,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,detail,81001bca02486bcecc944bc88412be729260f00a54d583...,1545571171266,a498ddf07e784d7d00df18a8f51a81e45aedf57ef8a45a...
4198653,ffffc2bce69fd5da3e9998fd5c788450a726ba4d,pageview,detail,81001bca02486bcecc944bc88412be729260f00a54d583...,1545571237554,a498ddf07e784d7d00df18a8f51a81e45aedf57ef8a45a...


In [19]:
print('Number of unique sessions: {} \n'.format(len(dfW['session_id_hash'].unique())))

Number of unique sessions: 203127 



### Set nan values to pageview in actions

In [20]:
df[df['product_action'].isna()] = 'pageview'

In [21]:
df.head()

Unnamed: 0,session_id_hash,event_type,product_action,product_skus_hash,server_timestamp_epoch_ms,hashed_url
0,pageview,pageview,pageview,pageview,pageview,pageview
1,pageview,pageview,pageview,pageview,pageview,pageview
2,pageview,pageview,pageview,pageview,pageview,pageview
3,pageview,pageview,pageview,pageview,pageview,pageview
4,00007d15aeb741b3cdd873cb3933351d699cc320,pageview,detail,90add4dd072e160034e98ddea257e0b59441eae00d8955...,1544909035879,8fa1ecf31ececb27ebe9c529966f3d1f907542fe138d5d...


### Create Labels