In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load (or reload) the autoreload extension and use mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
# fastai wildcard imports. The helpers called later in this notebook without
# any other import (proc_df, apply_cats) presumably come from these modules.
from fastai.structured import *
from fastai.column_data import *

# Explicit imports for every name the notebook uses directly, so the cell does
# not depend on what the wildcard imports happen to re-export.
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt

# Configure array printing only after numpy has been imported explicitly
# (previously this line ran before `import numpy as np` and worked only
# because the wildcard import had already bound `np`).
np.set_printoptions(threshold=50, edgeitems=20)

# Directory holding the input CSVs; the artifacts saved below are written here too.
PATH='data/trivago/'

  from numpy.core.umath_tests import inner1d


In [3]:
# Base names (without the .csv extension) of the files to load.
# Order matters: the training table comes first, the test table second.
table_names = [
    'trivago_cleaned',
    'trivago_cleaned_test',
]

In [4]:
# Read every CSV listed in table_names from PATH into a DataFrame, keeping
# the same order as table_names.
tables = [
    pd.read_csv(f'{PATH}{fname}.csv', low_memory=False)
    for fname in table_names
]

In [5]:
from IPython.display import HTML

In [6]:
# Show the first rows of each loaded table as a quick sanity check.
for t in tables:
    display(t.head())

Unnamed: 0,row_num,locale,day_of_week,hour_of_day,agent_id,entry_page,traffic_type,session_duration,path_1,path_2,path_3,path_4,path_5,no_of_ids,hits
0,988680,2,4,22,10,2113,2,49,31965,0,-1,-1,-1,2.0,14
1,988679,4,6,21,2,2100,1,1892,0,78464,-1,-1,-1,2.0,14
2,988678,3,6,19,8,2113,6,0,51462,-1,-1,-1,-1,1.0,1
3,988677,2,2,6,10,2116,1,2,31931,0,-1,-1,-1,2.0,3
4,988676,3,1,1,8,2100,1,0,0,-1,-1,-1,-1,1.0,2


Unnamed: 0,row_num,locale,day_of_week,hour_of_day,agent_id,entry_page,traffic_type,session_duration,path_1,path_2,path_3,path_4,path_5,no_of_ids
0,988681,6,1,17,1,2111,6,7037,31672,0,-1,-1,-1,2.0
1,988666,3,4,16,10,2700,1,5189,0,34387,84765,-1,-1,3.0
2,988665,4,7,16,10,2113,4,5,79148,0,-1,-1,-1,2.0
3,988664,3,2,19,6,2100,1,8041,0,34602,34604,-1,-1,3.0
4,988663,3,5,20,10,2111,2,117,34287,0,60579,-1,-1,3.0


In [8]:
# Unpack the two loaded tables; the order follows table_names (train, then test).
train, test = tables

In [9]:
# Row counts of the training and test tables.
len(train),len(test)

(619235, 369446)

## Create features

In [10]:
# Replace missing values with 0 in both tables. The calls stay in place so
# the DataFrame objects held in `tables` are the ones being modified, exactly
# as before.
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

# Index both tables by row_num. Once row_num has been moved into the index it
# is no longer a column, so an unguarded second run of this cell would raise
# KeyError; the guard makes the cell safe to re-run.
if train.index.name != 'row_num':
    train.set_index('row_num', inplace=True)
if test.index.name != 'row_num':
    test.set_index('row_num', inplace=True)

In [11]:
# Transposed preview of the first training rows: one line per column
# (at most 40 columns shown), one column per sample row.
train.head().T.head(40)

row_num,988680,988679,988678,988677,988676
locale,2.0,4.0,3.0,2.0,3.0
day_of_week,4.0,6.0,6.0,2.0,1.0
hour_of_day,22.0,21.0,19.0,6.0,1.0
agent_id,10.0,2.0,8.0,10.0,8.0
entry_page,2113.0,2100.0,2113.0,2116.0,2100.0
traffic_type,2.0,1.0,6.0,1.0,1.0
session_duration,49.0,1892.0,0.0,2.0,0.0
path_1,31965.0,0.0,51462.0,31931.0,0.0
path_2,0.0,78464.0,-1.0,0.0,-1.0
path_3,-1.0,-1.0,-1.0,-1.0,-1.0


Now that we've engineered all our features, we need to convert them into input compatible with a neural network.

This includes converting categorical variables into contiguous integers or one-hot encodings, normalizing continuous features to standard normal, etc.

In [12]:
# Columns handled as categorical variables (converted to category dtype and
# sized for embeddings further down).
cat_vars = [
    'locale', 'day_of_week', 'agent_id', 'traffic_type', 'entry_page',
    'path_1', 'path_2', 'path_3', 'path_4', 'path_5',
]

# Columns handled as continuous variables (cast to float32 further down).
contin_vars = ['hour_of_day', 'session_duration', 'no_of_ids']

# Number of training rows; the bare name echoes it as the cell output.
n = len(train)
n

619235

In [13]:
# Name of the target (dependent) column.
dep = 'hits'

# Keep only the model inputs plus the target, in a fixed column order:
# categoricals, then continuous variables, then the target.
train = train[[*cat_vars, *contin_vars, dep]].copy()

In [14]:
# The test table is loaded without a target column, so add a placeholder of 0
# to give it the same column layout as the training frame.
test[dep] = 0

# Same column selection and order as the training frame.
test = test[[*cat_vars, *contin_vars, dep]].copy()

In [15]:
# Preview the training frame after column selection.
train.head()

Unnamed: 0_level_0,locale,day_of_week,agent_id,traffic_type,entry_page,path_1,path_2,path_3,path_4,path_5,hour_of_day,session_duration,no_of_ids,hits
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
988680,2,4,10,2,2113,31965,0,-1,-1,-1,22,49,2.0,14
988679,4,6,2,1,2100,0,78464,-1,-1,-1,21,1892,2.0,14
988678,3,6,8,6,2113,51462,-1,-1,-1,-1,19,0,1.0,1
988677,2,2,10,1,2116,31931,0,-1,-1,-1,6,2,2.0,3
988676,3,1,8,1,2100,0,-1,-1,-1,-1,1,0,1.0,2


In [16]:
# Turn each categorical column of the training frame into an ordered
# pandas category.
for v in cat_vars:
    train[v] = train[v].astype('category').cat.as_ordered()

In [17]:
# fastai helper called on the test frame with the training frame as reference.
# NOTE(review): presumably re-encodes test's categorical columns using the
# categories established on train, modifying test in place — confirm against
# the fastai source.
apply_cats(test, train)

In [18]:
# Give every continuous column the same treatment in both frames:
# fill any remaining missing values with 0, then store as float32.
for v in contin_vars:
    for frame in (train, test):
        frame[v] = frame[v].fillna(0).astype('float32')

In [19]:
# Sample size used for the train/validation split: all n training rows.
samp_size = n

In [20]:
# Preview two rows after the categorical / float32 conversions.
train.head(2)

Unnamed: 0_level_0,locale,day_of_week,agent_id,traffic_type,entry_page,path_1,path_2,path_3,path_4,path_5,hour_of_day,session_duration,no_of_ids,hits
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
988680,2,4,10,2,2113,31965,0,-1,-1,-1,22.0,49.0,2.0,14
988679,4,6,2,1,2100,0,78464,-1,-1,-1,21.0,1892.0,2.0,14


In [21]:
# Run fastai's proc_df on the training frame with scaling enabled. It returns
# the processed inputs (df), the target values (y), and two objects (nas,
# mapper) that are passed back in when the test frame is processed.
df, y, nas, mapper = proc_df(train, 'hits', do_scale=True)

# yl is the log-transformed target. np.log turns 0 into -inf and negative
# values into NaN while emitting only a warning, and missing values were
# filled with 0 earlier in the notebook, so fail loudly here instead of
# letting non-finite targets leak into the saved array.
assert np.all(np.asarray(y) > 0), (
    "target 'hits' must be strictly positive before taking the log"
)
yl = np.log(y)

In [22]:
# Store the log-target as float32 and persist it next to the input data.
yl = yl.astype('float32')
np.save(file=f'{PATH}yl.npy', arr=yl)

In [23]:
# Keep the raw (untransformed) target as double-precision floats.
y = y.astype(np.float64)

In [24]:
# Process the test frame with the same settings as the training frame,
# feeding in the mapper and NA dictionary returned by the training call.
# The placeholder target that comes back is discarded.
df_test, _, nas, mapper = proc_df(
    test, 'hits',
    do_scale=True,
    mapper=mapper,
    na_dict=nas,
)

In [25]:
# Preview the processed training inputs (categoricals as integer codes,
# continuous columns scaled).
df.head(2)

Unnamed: 0_level_0,locale,day_of_week,agent_id,traffic_type,entry_page,path_1,path_2,path_3,path_4,path_5,hour_of_day,session_duration,no_of_ids
row_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
988680,2,4,10,2,3,3763,2,1,1,1,1.301594,-0.197157,-0.06404
988679,4,6,3,1,1,2,7802,1,1,1,1.153811,0.560436,-0.06404


In [26]:
# Write the processed frames to feather files under PATH. reset_index() moves
# row_num out of the index and back into an ordinary column before writing.
df1 = df.reset_index()
df1.to_feather(f'{PATH}train')

df2 = df_test.reset_index()
df2.to_feather(f'{PATH}test')

In [27]:
# Fraction of the rows used for training; the remainder is held out.
train_ratio = 0.8

# Position of the first validation row.
train_size = int(train_ratio * samp_size)

# Validation set = the trailing block of rows, from train_size to the end of df.
val_idx = list(range(train_size, len(df)))

In [28]:
# Number of rows held out for validation.
len(val_idx)

123847

In [30]:
# (rows, columns) of the processed training inputs.
df.shape

(619235, 13)

In [32]:
# For every categorical variable, pair its name with the number of categories
# seen in the training frame plus one.
# NOTE(review): the extra slot is presumably reserved for missing/unseen
# values — confirm against how the codes are consumed downstream.
cat_sz = [
    (col, len(train[col].cat.categories) + 1)
    for col in cat_vars
]

In [33]:
# Show the (variable name, number of levels) pairs.
cat_sz

[('locale', 7),
 ('day_of_week', 8),
 ('agent_id', 16),
 ('traffic_type', 8),
 ('entry_page', 138),
 ('path_1', 21880),
 ('path_2', 12738),
 ('path_3', 12578),
 ('path_4', 7045),
 ('path_5', 4351)]

We use the *cardinality* of each variable (that is, its number of unique values) to decide how large to make its *embeddings*. Each level will be associated with a vector whose length is defined below: roughly half the cardinality, capped at 50.

In [34]:
# One (number of levels, embedding width) pair per categorical variable.
# The width is half the number of levels, rounded up, and never more than 50.
emb_szs = [
    (n_levels, min(50, (n_levels + 1) // 2))
    for _name, n_levels in cat_sz
]

In [35]:
# Show the (number of levels, embedding width) pairs.
emb_szs

[(7, 4),
 (8, 4),
 (16, 8),
 (8, 4),
 (138, 50),
 (21880, 50),
 (12738, 50),
 (12578, 50),
 (7045, 50),
 (4351, 50)]

In [36]:
# Persist the embedding sizes as an array with one
# (number of levels, embedding width) row per categorical variable.
np_emb = np.asarray(emb_szs)
np.save(file=f'{PATH}emb_szs.npy', arr=np_emb)

In [37]:
# (rows, columns) of the processed test inputs.
df_test.shape

(369446, 13)