# Trying a modified MLP with embeddings

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm

In [2]:
# Load the raw training table; the path is relative to the notebook's working directory.
data = pd.read_csv("data/train.csv")

In [None]:
# Keep only rows not flagged as MISSING_DATA. The explicit .copy() makes
# `training` an independent frame, so the column assignments below are
# guaranteed to land on it (and do not raise SettingWithCopyWarning).
training = data[~data['MISSING_DATA']].copy()

# adding date time, modified from given feature engineering notebook
def getdate(x):
    """Return (year, month, day, hour, weekday) for a row's TIMESTAMP.

    `datetime.fromtimestamp` converts using the machine's local timezone, so
    the derived fields depend on where the notebook is executed.
    """
    stamp = datetime.fromtimestamp(x["TIMESTAMP"])
    return (stamp.year, stamp.month, stamp.day, stamp.hour, stamp.weekday())

training[["YR", "MON", "DAY", "HR", "WK"]] = training[["TIMESTAMP"]].apply(getdate, axis=1, result_type="expand")
training = training.drop(columns = ['TIMESTAMP'])

# adding trip time, using the given 15 second formula on polyline
SECONDS_PER_POINT = 15
# Each polyline point contributes two "." characters (one per coordinate), so
# count(".")/2 is the number of points; the first point marks t=0, and an empty
# polyline is clamped to a trip time of 0.
training['TRIP_TIME'] = training['POLYLINE'].apply(lambda x : max(x.count(".")/2-1,0)*SECONDS_PER_POINT)
training = training.drop(columns = ['POLYLINE'])

In [None]:
# Keep a second name for the engineered frame. This is an alias (no copy is
# made); it stays bound to this frame even if `training` is rebound later.
og = training

In [None]:
# Shuffle all rows, then cut 80% / 20% into train / test.
# A fixed seed makes the shuffle (and therefore the split and every metric
# computed from it) reproducible across kernel restarts.
SPLIT_SEED = 42
ogs = og.sample(frac = 1, random_state = SPLIT_SEED)
n_train = ogs.shape[0]*8//10  # integer row count of the training portion
training, testing = ogs[:n_train], ogs[n_train:]

In [11]:
# Display the current `training` frame for a visual sanity check.
training

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,DAY_TYPE,MISSING_DATA,YR,MON,DAY,HR,WK,TRIP_TIME
1432637,1399464281620000476,C,,,20000476,A,False,2014,5,7,5,2,930.0
413802,1380498370620000648,B,,15.0,20000648,A,False,2013,9,29,16,6,975.0
879676,1388867466620000002,A,4376.0,,20000002,A,False,2014,1,4,12,5,615.0
631016,1384334213620000118,A,2002.0,,20000118,A,False,2013,11,13,1,2,1515.0
1300963,1397116745620000337,B,,32.0,20000337,A,False,2014,4,10,0,3,1260.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1190713,1394960385620000657,B,,18.0,20000657,A,False,2014,3,16,1,6,180.0
1478047,1400208724620000671,C,,,20000671,A,False,2014,5,15,19,3,360.0
1167542,1394535523620000159,B,,15.0,20000159,A,False,2014,3,11,3,1,855.0
619125,1384061752620000430,C,,,20000430,A,False,2013,11,9,21,5,1140.0


In [15]:
# (month, day) pairs to filter on; a pair's position in this list becomes the
# value of the `date` column added to its matching rows.
days = [(8,14),(9,30),(10,6),(10,31),(12,21)]

narrows, tnarrows = [], []
for label, (month, day) in enumerate(days):
    # Rows of each split that fall on this calendar day.
    train_hit = training[(training['MON'] == month) & (training['DAY'] == day)]
    test_hit = testing[(testing['MON'] == month) & (testing['DAY'] == day)]
    # MON/DAY are replaced by the single ordinal `date` label.
    narrows.append(train_hit.drop(columns = ['MON','DAY']).assign(date=label))
    tnarrows.append(test_hit.drop(columns = ['MON','DAY']).assign(date=label))

# Stack the per-day pieces: train only, test only, and both together.
narrowed = pd.concat(narrows)
tnarrowed = pd.concat(tnarrows)
anarrowed = pd.concat([narrowed, tnarrowed])

In [16]:
# Disabled alternative: train only on the rows narrowed to the selected days.
# mlpset = narrowed.drop(columns = ['TRIP_ID','ORIGIN_CALL','MISSING_DATA','YR','DAY_TYPE'])
# Active choice: the whole `training` frame is the MLP dataset (alias, not a copy).
mlpset = training

In [17]:
# Display the concatenation of `narrowed` and `tnarrowed`.
# NOTE(review): the saved output below may come from a different run (execution
# counts are non-sequential) - re-run on a fresh kernel to confirm.
anarrowed

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,DAY_TYPE,MISSING_DATA,YR,HR,WK,TRIP_TIME,date
200799,1376464254620000618,B,,9.0,20000618,A,False,2013,0,2,0.0,0
200816,1376463844620000281,A,23692.0,,20000281,A,False,2013,0,2,405.0,0
200824,1376463841620000271,B,,21.0,20000271,A,False,2013,0,2,405.0,0
200828,1376464905620000085,A,34648.0,,20000085,A,False,2013,0,2,0.0,0
200854,1376464260620000312,C,,,20000312,A,False,2013,0,2,1140.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
315,T323,A,70885.0,,20000430,A,False,2014,6,6,885.0,4
316,T324,B,,53.0,20000020,A,False,2014,6,6,2970.0,4
317,T325,C,,,20000207,A,False,2014,6,6,3660.0,4
318,T326,A,76232.0,,20000667,A,False,2014,6,6,1200.0,4


In [18]:
# Model inputs: everything except the target, with missing values turned into
# the sentinel -1 so they can be treated as their own category.
mlpin = (
    mlpset
    .drop(columns = 'TRIP_TIME')
    .replace(np.nan, -1)
)

In [19]:
# Drop identifier / constant-like columns; what remains are the categorical
# features that will be embedded.
mlpin = mlpin.drop(columns = ['TRIP_ID','ORIGIN_CALL','MISSING_DATA','YR','DAY_TYPE'])
cnames = mlpin.columns

# For every remaining column, map each distinct value to a dense integer code
# 0..k-1, in order of first appearance.
mappings = {
    col: dict(zip(mlpin[col].unique(), np.arange(mlpin[col].nunique())))
    for col in cnames
}

# Disabled alternative: build the mappings from `anarrowed` (train + test
# narrowed rows) instead of from the training inputs only.
# mappings = {}
# mapset = anarrowed.replace(np.nan, -1)
# cnames = mapset.drop(columns = ['TRIP_ID','ORIGIN_CALL','MISSING_DATA','YR','DAY_TYPE','TRIP_TIME']).columns
# for col in cnames:
#     mappings[col] = dict(zip(mapset[col].unique(), np.arange(mapset[col].nunique())))

In [20]:
# Show which columns received a value -> code mapping.
mappings.keys()

dict_keys(['CALL_TYPE', 'ORIGIN_STAND', 'TAXI_ID', 'MON', 'DAY', 'HR', 'WK'])

In [21]:
# Name of the encoded counterpart of each categorical column.
encoded_cols = [f'{name}_e' for name in cnames]

# Translate every raw column through its mapping, attach the results under the
# `_e` names, and discard the raw columns in the same step.
encoded = {
    target: mlpin[source].map(mappings[source])
    for source, target in zip(cnames, encoded_cols)
}
mlpin = mlpin.assign(**encoded).drop(columns = cnames)

In [22]:
# Number of codes in each column's mapping, listed in `cnames` order.
num_unique = list(map(len, (mappings[name] for name in cnames)))
num_unique

[3, 64, 448, 12, 31, 24, 7]

In [23]:
# Model

In [24]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, Flatten, concatenate, Dense, Dropout
from tensorflow.keras.models import Model

In [25]:
# List the GPU devices visible to TensorFlow (an empty list means none were found).
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [26]:
# Distinct-code count of every encoded column, as a plain Python list.
num_uni = mlpin.nunique().tolist()

In [27]:
# Display the per-column distinct counts (used as the Embedding input sizes).
num_uni

[3, 64, 448, 12, 31, 24, 7]

In [28]:
# Hyperparameters for the embedding model.
embed_dim = 128  # output width given to every Embedding layer
lr = 0.01  # learning rate handed to the Adam optimizer
optim = keras.optimizers.Adam(learning_rate=lr)

In [29]:
# --- Inputs: one scalar integer code per encoded feature column -------------
n_features = len(mlpin.columns)
input_lay = [Input(shape=(1,)) for _ in range(n_features)]

# --- Embeddings: each feature gets its own table, sized by its code count ---
embed_lay = [
    Embedding(vocab_size, embed_dim)(feature_in)
    for vocab_size, feature_in in zip(num_uni, input_lay)
]

# Each embedding output is (1, embed_dim) per sample; flatten to (embed_dim,).
flat_lay = [Flatten()(embedded) for embedded in embed_lay]

# --- Trunk: concatenate all feature vectors, regularise, then an MLP --------
concat = concatenate(flat_lay)
drop_lay1 = Dropout(0.5)(concat)
dense_lay1 = Dense(512, activation = 'relu')(drop_lay1)
dense_lay2 = Dense(128, activation = 'relu')(dense_lay1)
dense_lay3 = Dense(32, activation = 'relu')(dense_lay2)

# Single regression output. ReLU clamps predictions to >= 0.
# NOTE(review): a ReLU output unit gets zero gradient whenever its
# pre-activation is negative - consider a linear output if training stalls.
out = Dense(1, activation = 'relu')(dense_lay3)

model = Model(inputs = input_lay, outputs = out)
model.compile(optimizer=optim, loss='mean_squared_error')

In [30]:
# The multi-input model takes a list with one array per feature column,
# in the same order as the Input layers were created.
X = [column.values for _, column in mlpin.items()]
# Regression target: trip duration.
y = mlpset['TRIP_TIME'].values
model.fit(X, y, epochs = 20, batch_size = 512)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
 193/3342 [>.............................] - ETA: 22s - loss: 387685.6875

KeyboardInterrupt: 

In [32]:
# Print the layer-by-layer structure and parameter counts of the model.
model.summary() 

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                            
______________________________________________________________________________________________

In [445]:
# Encode the held-out split with the value -> code mappings fitted on the
# training inputs, mirroring the preprocessing applied to `mlpin`.
dropped_cols = ['TRIP_ID','ORIGIN_CALL','DAY_TYPE','MISSING_DATA','YR','TRIP_TIME']
mlptestset = testing.drop(columns = dropped_cols) #change testing to tnarrowed
mlptestset = mlptestset.replace(np.nan, -1)
for col, e_col in zip(cnames, encoded_cols):
    codes = mlptestset[col].map(mappings[col])
    # Series.map yields NaN for any value absent from the mapping, i.e. a
    # category never seen during training. NaN turns the column into float and
    # is not a valid Embedding index, so report it and fall back to code 0.
    n_unseen = int(codes.isna().sum())
    if n_unseen:
        print(f"{col}: {n_unseen} value(s) not seen in training; encoded as 0")
    mlptestset[e_col] = codes.fillna(0).astype('int64')
mlptestset = mlptestset.drop(columns = cnames)

In [447]:
# Held-out inputs as a list of per-feature arrays, matching the model's inputs.
Xtest = [column.values for _, column in mlptestset.items()]
# Held-out targets (switch to tnarrowed if the narrowed test set is used).
Ytest = testing['TRIP_TIME'].values

# Loss (mean squared error) on the held-out split.
model.evaluate(Xtest, Ytest)



2224513.75

In [398]:
# Loss on the data the model was fitted on, for comparison with the held-out loss.
model.evaluate(X,y)



436404.4375

In [392]:
# Display the encoded test inputs.
# NOTE(review): the saved output shows a `date_e` column and no MON_e/DAY_e,
# so it appears to come from a run on the narrowed data - re-run to confirm.
mlptestset

Unnamed: 0,CALL_TYPE_e,ORIGIN_STAND_e,TAXI_ID_e,HR_e,WK_e,date_e
0,0,10,212,10,3,0
1,0,3,375,10,3,0
2,0,10,104,10,3,0
3,0,53,31,10,3,0
4,0,29,111,10,3,0
...,...,...,...,...,...,...
315,1,1,154,6,2,4
316,0,53,194,6,2,4
317,2,1,245,6,2,4
318,1,1,285,6,2,4


In [451]:
# Display the list of per-feature input arrays that was passed to model.fit.
X

[array([0, 1, 0, ..., 0, 1, 1]),
 array([ 0,  1,  0, ...,  0, 17,  9]),
 array([  0,   1,   2, ..., 439, 443, 444]),
 array([0, 0, 0, ..., 7, 0, 0]),
 array([0, 0, 0, ..., 3, 0, 0]),
 array([ 0,  0,  0, ...,  9, 15, 19]),
 array([0, 0, 0, ..., 5, 1, 1])]