### 1 Importing libraries and data

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
np.random.seed(0)

import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV

In [2]:
df_train = pd.read_hdf('tram.train.h5')
df_test = pd.read_hdf('tram.test.h5')

In [3]:
# function based on DataWorkshop masterclass excercise
def check_model(df, target, feats, model, cv=5, scoring='neg_mean_absolute_error'):
    
    df_train = df[ df["delay"].notnull() ].copy()
    df_test = df[ df["delay"].isnull() ].copy()

    X_train = df_train[feats].values
    y_train = df_train[target].values
    X_test = df_test[feats].values
    
    
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring)
    result = np.mean(scores)

    return result

In [4]:
df_train.head()

Unnamed: 0,id,delay,datetime,stop,stop_name,number,direction,planned_time,vehicle_id,trip_id,seq_num
0,0,0,2018-07-23 06:00:47,612,Borsucza,22,Walcownia,2018-07-23 06:00:00,6.352185295672181e+18,6351558574044899587,7.0
1,1,0,2018-07-23 06:00:48,572,Smolki,11,Czerwone Maki P+R,2018-07-23 06:00:00,6.352185295672181e+18,6351558574044670211,10.0
2,2,0,2018-07-23 06:00:49,322,Filharmonia,8,Bronowice Małe,2018-07-23 06:01:00,6.352185295672182e+18,6351558574044592386,15.0
3,3,0,2018-07-23 06:00:51,363,Hala Targowa,1,Salwator,2018-07-23 06:01:00,6.352185295672181e+18,6351558574044379394,24.0
4,4,0,2018-07-23 06:00:52,78,Batorego,24,Bronowice Małe,2018-07-23 06:00:00,6.352185295672181e+18,6351558574044948738,19.0


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175986 entries, 0 to 308151
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            175986 non-null  int64  
 1   delay         175986 non-null  int64  
 2   datetime      175986 non-null  object 
 3   stop          175986 non-null  int64  
 4   stop_name     175986 non-null  object 
 5   number        175986 non-null  int64  
 6   direction     175986 non-null  object 
 7   planned_time  175986 non-null  object 
 8   vehicle_id    175986 non-null  float64
 9   trip_id       175986 non-null  int64  
 10  seq_num       174651 non-null  float64
dtypes: float64(2), int64(5), object(4)
memory usage: 16.1+ MB


In [6]:
df_train.describe()

Unnamed: 0,id,delay,stop,number,vehicle_id,trip_id,seq_num
count,175986.0,175986.0,175986.0,175986.0,175986.0,175986.0,174651.0
mean,141894.414,61.517,933.999,19.22,6.352185295672181e+18,6.351558574044353e+18,13.775
std,86137.04,82.488,1022.539,16.328,724.31,1134585.605,8.398
min,0.0,0.0,61.0,1.0,6.352185295672181e+18,6.351558574044353e+18,1.0
25%,43996.25,0.0,319.0,6.0,6.352185295672181e+18,6.351558574044619e+18,7.0
50%,133238.5,60.0,561.0,14.0,6.352185295672182e+18,6.351558574044836e+18,13.0
75%,220507.75,120.0,1049.0,24.0,6.352185295672182e+18,6.351558574045065e+18,20.0
max,308151.0,1140.0,3176.0,52.0,6.352185295672182e+18,6.351558574047342e+18,37.0


In [7]:
df_test.head()

Unnamed: 0,id,stop,stop_name,number,direction,planned_time,vehicle_id,trip_id,seq_num
47215,47215,681,Teligi,13,Nowy Bieżanów P+R,2018-07-24 00:01:00,6.352185295672182e+18,6351558574044715284,25.0
47216,47216,2582,Nowy Prokocim,13,Nowy Bieżanów P+R,2018-07-24 00:02:00,6.352185295672182e+18,6351558574044715284,26.0
47217,47217,112,Stella-Sawickiego,10,Kopiec Wandy,2018-07-24 00:00:00,6.352185295672181e+18,6351558574044641557,18.0
47218,47218,679,Ćwiklińskiej,13,Nowy Bieżanów P+R,2018-07-24 00:04:00,6.352185295672182e+18,6351558574044715284,27.0
47219,47219,407,Czyżyny,10,Kopiec Wandy,2018-07-24 00:01:00,6.352185295672181e+18,6351558574044641557,19.0


In [8]:
missing_columns = [x for x in df_train.columns if x not in df_test.columns]

missing_columns

['delay', 'datetime']

In [9]:
df_train.shape, df_test.shape

((175986, 11), (132166, 9))

### 2 Preprocessing data & feature engineering

In [10]:
# create combined df from test and train data to perform cleaning simultaneously

df = pd.concat([df_train, df_test])
print(df.shape)

(308152, 11)


In [11]:
df.columns

Index(['id', 'delay', 'datetime', 'stop', 'stop_name', 'number', 'direction',
       'planned_time', 'vehicle_id', 'trip_id', 'seq_num'],
      dtype='object')

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 308152 entries, 0 to 308145
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            308152 non-null  int64  
 1   delay         175986 non-null  float64
 2   datetime      175986 non-null  object 
 3   stop          308152 non-null  int64  
 4   stop_name     308152 non-null  object 
 5   number        308152 non-null  int64  
 6   direction     308152 non-null  object 
 7   planned_time  308152 non-null  object 
 8   vehicle_id    308152 non-null  float64
 9   trip_id       308152 non-null  int64  
 10  seq_num       305770 non-null  float64
dtypes: float64(3), int64(4), object(4)
memory usage: 28.2+ MB


In [13]:
df.seq_num.isnull().sum()

2382

In [14]:
missing_seq = df[df.seq_num.isnull()].trip_id.unique()

In [15]:
# imputing sequece_nr with numbers in order of stop occurrence per trip_id
for i in missing_seq:
    mask = df.loc[df.trip_id == i]
    mask['seq_num_impute'] = range(1, 1+mask.shape[0])

    df.loc[df.trip_id == i, 'seq_num_impute'] = mask['seq_num_impute']

df.seq_num = np.where(df.seq_num.isnull(), df.seq_num_impute, df.seq_num).astype('int')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask['seq_num_impute'] = range(1, 1+mask.shape[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask['seq_num_impute'] = range(1, 1+mask.shape[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask['seq_num_impute'] = range(1, 1+mask.shape[0])
A value is trying to be set on a copy of a slice fro

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask['seq_num_impute'] = range(1, 1+mask.shape[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask['seq_num_impute'] = range(1, 1+mask.shape[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask['seq_num_impute'] = range(1, 1+mask.shape[0])
A value is trying to be set on a copy of a slice fro

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask['seq_num_impute'] = range(1, 1+mask.shape[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask['seq_num_impute'] = range(1, 1+mask.shape[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask['seq_num_impute'] = range(1, 1+mask.shape[0])
A value is trying to be set on a copy of a slice fro

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask['seq_num_impute'] = range(1, 1+mask.shape[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask['seq_num_impute'] = range(1, 1+mask.shape[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask['seq_num_impute'] = range(1, 1+mask.shape[0])
A value is trying to be set on a copy of a slice fro

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask['seq_num_impute'] = range(1, 1+mask.shape[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask['seq_num_impute'] = range(1, 1+mask.shape[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask['seq_num_impute'] = range(1, 1+mask.shape[0])
A value is trying to be set on a copy of a slice fro

In [16]:
# check
df[df.trip_id == 6351558574044985611]

Unnamed: 0,id,delay,datetime,stop,stop_name,number,direction,planned_time,vehicle_id,trip_id,seq_num,seq_num_impute
35942,35942,0.0,2018-07-23 18:23:51,89,Bronowice,44,Kombinat,2018-07-23 18:24:00,6.352185295672181e+18,6351558574044985611,1,1.0
36091,36091,0.0,2018-07-23 18:26:38,88,Uniwersytet Pedagogiczny,44,Kombinat,2018-07-23 18:26:00,6.352185295672181e+18,6351558574044985611,2,2.0
36161,36161,0.0,2018-07-23 18:28:07,84,Biprostal,44,Kombinat,2018-07-23 18:28:00,6.352185295672181e+18,6351558574044985611,3,3.0
36308,36308,0.0,2018-07-23 18:30:55,79,Plac Inwalidów,44,Kombinat,2018-07-23 18:31:00,6.352185295672181e+18,6351558574044985611,4,4.0
36602,36602,0.0,2018-07-23 18:36:54,3032,Stary Kleparz,44,Kombinat,2018-07-23 18:37:00,6.352185295672181e+18,6351558574044985611,5,5.0
36817,36817,0.0,2018-07-23 18:41:10,126,Lubicz,44,Kombinat,2018-07-23 18:41:00,6.352185295672181e+18,6351558574044985611,6,6.0
36916,36916,0.0,2018-07-23 18:43:15,125,Rondo Mogilskie,44,Kombinat,2018-07-23 18:43:00,6.352185295672181e+18,6351558574044985611,7,7.0
36987,36987,0.0,2018-07-23 18:44:50,129,Cystersów,44,Kombinat,2018-07-23 18:44:00,6.352185295672181e+18,6351558574044985611,8,8.0
37042,37042,0.0,2018-07-23 18:46:01,130,Białucha,44,Kombinat,2018-07-23 18:46:00,6.352185295672181e+18,6351558574044985611,9,9.0
37148,37148,0.0,2018-07-23 18:47:57,3040,TAURON Arena Kraków Wieczysta,44,Kombinat,2018-07-23 18:48:00,6.352185295672181e+18,6351558574044985611,10,10.0


In [17]:
# change datatypes for vehicle_id, datetime, planned_time
df[['vehicle_id']] = df[['vehicle_id']].astype('object')

In [18]:
df[['datetime', 'planned_time']] = df[['datetime', 'planned_time']].apply(pd.to_datetime)

In [19]:
# check
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 308152 entries, 0 to 308145
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   id              308152 non-null  int64         
 1   delay           175986 non-null  float64       
 2   datetime        175986 non-null  datetime64[ns]
 3   stop            308152 non-null  int64         
 4   stop_name       308152 non-null  object        
 5   number          308152 non-null  int64         
 6   direction       308152 non-null  object        
 7   planned_time    308152 non-null  datetime64[ns]
 8   vehicle_id      308152 non-null  object        
 9   trip_id         308152 non-null  int64         
 10  seq_num         308152 non-null  int32         
 11  seq_num_impute  2387 non-null    float64       
dtypes: datetime64[ns](2), float64(2), int32(1), int64(4), object(3)
memory usage: 29.4+ MB


In [20]:
# extreme delay values with one occurrence - 1140 & 960
df[df.delay == 1140]

Unnamed: 0,id,delay,datetime,stop,stop_name,number,direction,planned_time,vehicle_id,trip_id,seq_num,seq_num_impute
20704,20704,1140.0,2018-07-23 13:22:41,3036,Szpital Narutowicza,5,Krowodrza Górka,2018-07-23 13:03:00,6.352185295672181e+18,6351558574044494090,24,


In [21]:
(df.loc[20704, 'datetime'] - df.loc[20704, 'planned_time'])

Timedelta('0 days 00:19:41')

this delay is correct, however it is an outlier, so the row will be removed

In [22]:
df[df.delay == 960]

Unnamed: 0,id,delay,datetime,stop,stop_name,number,direction,planned_time,vehicle_id,trip_id,seq_num,seq_num_impute
32374,32374,960.0,2018-07-23 17:16:09,409,Centralna,14,Mistrzejowice,2018-07-23 17:00:00,6.352185295672181e+18,6351558574044760329,26,


In [23]:
(df.loc[32374, 'datetime'] - df.loc[32374, 'planned_time'])

Timedelta('0 days 00:16:09')

this delay is correct, however it is an outlier, so the row will be removed

In [24]:
# removing rows with values 1140 and 960
df = df.loc[ ~ df.delay.isin([1140, 960])]

#check
df.delay.isin([1140, 960]).sum()

0

In [25]:
# adding features representing time, day of week and weekend
df['hour'] = df['planned_time'].dt.hour
df['weekday'] = df['planned_time'].dt.weekday
weekend = [5, 6]
df['weekend'] = df['weekday'].isin(weekend).factorize()[0] 

df.head()

Unnamed: 0,id,delay,datetime,stop,stop_name,number,direction,planned_time,vehicle_id,trip_id,seq_num,seq_num_impute,hour,weekday,weekend
0,0,0.0,2018-07-23 06:00:47,612,Borsucza,22,Walcownia,2018-07-23 06:00:00,6.352185295672181e+18,6351558574044899587,7,,6,0,0
1,1,0.0,2018-07-23 06:00:48,572,Smolki,11,Czerwone Maki P+R,2018-07-23 06:00:00,6.352185295672181e+18,6351558574044670211,10,,6,0,0
2,2,0.0,2018-07-23 06:00:49,322,Filharmonia,8,Bronowice Małe,2018-07-23 06:01:00,6.352185295672182e+18,6351558574044592386,15,,6,0,0
3,3,0.0,2018-07-23 06:00:51,363,Hala Targowa,1,Salwator,2018-07-23 06:01:00,6.352185295672181e+18,6351558574044379394,24,,6,0,0
4,4,0.0,2018-07-23 06:00:52,78,Batorego,24,Bronowice Małe,2018-07-23 06:00:00,6.352185295672181e+18,6351558574044948738,19,,6,0,0


In [26]:
# factorize categoric columns and create features to train model
obj_feats = df.select_dtypes(object).columns
for feat in obj_feats:
        df["{}_cat".format(feat)] = df[feat].factorize()[0]
cat_feats = [x for x in df.columns if "_cat" in x]
cat_feats

['stop_name_cat', 'direction_cat', 'vehicle_id_cat']

In [27]:
# adding feature that calculates exact delay from planned time and datetime columns
df['delay_calc'] = np.where((df.datetime - df.planned_time).astype('timedelta64[s]') >0 , (df.datetime - df.planned_time).astype('timedelta64[s]'), 0)
df.head()

Unnamed: 0,id,delay,datetime,stop,stop_name,number,direction,planned_time,vehicle_id,trip_id,seq_num,seq_num_impute,hour,weekday,weekend,stop_name_cat,direction_cat,vehicle_id_cat,delay_calc
0,0,0.0,2018-07-23 06:00:47,612,Borsucza,22,Walcownia,2018-07-23 06:00:00,6.352185295672181e+18,6351558574044899587,7,,6,0,0,0,0,0,47.0
1,1,0.0,2018-07-23 06:00:48,572,Smolki,11,Czerwone Maki P+R,2018-07-23 06:00:00,6.352185295672181e+18,6351558574044670211,10,,6,0,0,1,1,0,48.0
2,2,0.0,2018-07-23 06:00:49,322,Filharmonia,8,Bronowice Małe,2018-07-23 06:01:00,6.352185295672182e+18,6351558574044592386,15,,6,0,0,2,2,1,0.0
3,3,0.0,2018-07-23 06:00:51,363,Hala Targowa,1,Salwator,2018-07-23 06:01:00,6.352185295672181e+18,6351558574044379394,24,,6,0,0,3,3,0,0.0
4,4,0.0,2018-07-23 06:00:52,78,Batorego,24,Bronowice Małe,2018-07-23 06:00:00,6.352185295672181e+18,6351558574044948738,19,,6,0,0,4,2,0,52.0


In [28]:
# checking if delays have differences in average delay
df.groupby('direction')['delay_calc'].agg('mean').sort_values()

direction
Cm. Rakowicki       28.801
Cichy Kącik         35.281
Łagiewniki          37.086
Czerwone Maki P+R   38.408
Kombinat            38.835
Krowodrza Górka     38.898
Bronowice           42.975
Os.Piastów          43.889
Dworzec Tow.        45.653
Wzgórza K.          47.415
Mistrzejowice       47.853
Nowy Bieżanów P+R   50.396
Salwator            51.346
Mały Płaszów        53.404
Bronowice Małe      54.918
Kurdwanów P+R       59.500
Kopiec Wandy        60.469
Prokocim            64.933
Borek Fałęcki       67.775
Walcownia           95.946
Name: delay_calc, dtype: float64

In [33]:
# defining high delay directions as directions with delay >60
high_delay_direction = ['Kurdwanów P+R', 'Kopiec Wandy', 'Prokocim', 'Borek Fałęcki', 'Walcownia']

In [34]:
# creating binary feature to distinguish between high and low delay directions
df['high_delay_direction'] = df.direction.apply(lambda x: x in high_delay_direction).factorize()[0] 
df.head()

Unnamed: 0,id,delay,datetime,stop,stop_name,number,direction,planned_time,vehicle_id,trip_id,seq_num,seq_num_impute,hour,weekday,weekend,stop_name_cat,direction_cat,vehicle_id_cat,delay_calc,high_delay_direction
0,0,0.0,2018-07-23 06:00:47,612,Borsucza,22,Walcownia,2018-07-23 06:00:00,6.352185295672181e+18,6351558574044899587,7,,6,0,0,0,0,0,47.0,0
1,1,0.0,2018-07-23 06:00:48,572,Smolki,11,Czerwone Maki P+R,2018-07-23 06:00:00,6.352185295672181e+18,6351558574044670211,10,,6,0,0,1,1,0,48.0,1
2,2,0.0,2018-07-23 06:00:49,322,Filharmonia,8,Bronowice Małe,2018-07-23 06:01:00,6.352185295672182e+18,6351558574044592386,15,,6,0,0,2,2,1,0.0,1
3,3,0.0,2018-07-23 06:00:51,363,Hala Targowa,1,Salwator,2018-07-23 06:01:00,6.352185295672181e+18,6351558574044379394,24,,6,0,0,3,3,0,0.0,1
4,4,0.0,2018-07-23 06:00:52,78,Batorego,24,Bronowice Małe,2018-07-23 06:00:00,6.352185295672181e+18,6351558574044948738,19,,6,0,0,4,2,0,52.0,1


In [35]:
# function to create additional features based on aggregation of a defined column (from dataworkshop's starter 5)
def df_group_delay(df_train, groupby_feats):
    agg_params = {
        "mean_{}_delay".format("_".join(groupby_feats)): ("delay_calc", "mean"),
        "median_{}_delay".format("_".join(groupby_feats)): ("delay_calc", "median"),
        "count_{}_delay".format("_".join(groupby_feats)): ("delay_calc", "count"),
        "std_{}_delay".format("_".join(groupby_feats)): ("delay_calc", "std"),
        "count_zeros_{}_delay".format("_".join(groupby_feats)): ("delay_calc", lambda vals: len([x for x in vals if x == 0]) ),
        "prob_zeros_{}_delay".format("_".join(groupby_feats)): ("delay_calc", lambda vals: np.mean([x == 0 for x in vals]) ),
    }
    
    return df_train[groupby_feats + ["delay_calc"]].groupby(groupby_feats).agg(
        **agg_params
    ).reset_index()

In [36]:
df_train = df[ df["delay"].notnull() ].copy()

In [37]:
# adding features representing mean delay, median delay, number of delays, std and probability of delays for a given stop name and direction
df_tmp = df_group_delay(df_train, ["stop_name", "direction"])
if "mean_stopname_direction_delay" not in df:
    df = pd.merge(df, df_tmp, on=["stop_name", "direction"], how="left")

In [38]:
#creating list of features to train model
black_list = ["id", "delay", "datetime", "planned_time", "trip_id", "seq_num_impute", "delay_calc", "weekend"]
feats = df.select_dtypes(['number', 'datetime64[ns]'])
feats = [x for x in feats if x not in black_list]
feats

['stop',
 'number',
 'seq_num',
 'hour',
 'weekday',
 'stop_name_cat',
 'direction_cat',
 'vehicle_id_cat',
 'high_delay_direction',
 'mean_stop_name_direction_delay',
 'median_stop_name_direction_delay',
 'count_stop_name_direction_delay',
 'std_stop_name_direction_delay',
 'count_zeros_stop_name_direction_delay',
 'prob_zeros_stop_name_direction_delay']

### 3 Parameter tuning

In [39]:
# model will be trained on delay_calc as target as it is more precise than the initial delay columns and yielded better results in tests
target="delay_calc"
# calculating baseline model score with default parameters
model = xgb.XGBRegressor(random_state=0)
check_model(df, target, feats, model)

-51.46561312738196

In [40]:
# tuning hyperparameters
feats = feats
target="delay_calc"

df_train = df[ df["delay"].notnull() ].copy()
df_test = df[ df["delay"].isnull() ].copy()

X_train = df_train[feats].values
y_train = df_train[target].values
X_test = df_test[feats].values

param_grid = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(50, 510, 50),
    'learning_rate': [0.1, 0.01, 0.05],
    'min_child_weight': range(1,6,2),
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)],
    'gamma':[i/10.0 for i in range(0,5)]
}

In [43]:
model = xgb.XGBRegressor(random_state=0)
grid_search = RandomizedSearchCV(model, param_grid, n_iter=10, scoring='neg_mean_absolute_error', verbose=1, refit=True, random_state=0)
grid_search.fit(X_train, y_train)

best_score = grid_search.best_score_
best_params = grid_search.best_params_

best_score, best_params

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 210.3min finished


(-48.182685649126356,
 {'subsample': 0.8,
  'n_estimators': 250,
  'min_child_weight': 5,
  'max_depth': 9,
  'learning_rate': 0.01,
  'gamma': 0.0,
  'colsample_bytree': 0.7})

### 4 Model Training

In [None]:
target = "delay_calc"
params = best_params
model = xgb.XGBRegressor(**params, random_state=0)

In [None]:
df_train = df[ df["delay"].notnull() ].copy()
df_test = df[ df["delay"].isnull() ].copy()

X_train = df_train[feats].values
y_train = df_train[target].values
X_test = df_test[feats].values

In [None]:
model = model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred[ y_pred < 0 ] = 0
df_test["delay"] = y_pred

In [None]:
feat_importances = pd.Series(model.feature_importances_, index=feats)
plt.figure(figsize=(10,10))
feat_importances.nlargest(23).plot(kind='barh');

In [None]:
# function that rounds values to 60 second multiples to comply with initial delay formatting
def round_60(x):
    if x<60:
        return 0
    if x>=60:
        return x-(x%60)

In [None]:
df_test['delay_2'] = df_test['delay'].apply(round_60)
df_test.sample(5)

In [None]:
df_test[ ["id", "delay_2"] ].to_csv('output/final.csv', header=["id", "delay"],  index=False) 