In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

import pickle
from geopy.geocoders import Nominatim
from sklearn.model_selection import train_test_split

# Never truncate the column list when a DataFrame is rendered
pd.options.display.max_columns = None

## Import Data

In [2]:
# Load the training trips; the path is resolved relative to the working directory
train_csv_path = 'Taxi-Trip-Duration-Data/train.csv'
train_df = pd.read_csv(train_csv_path)
train_df.shape

(1458644, 11)

In [3]:
# Peek at the first five raw rows
train_df.head(n=5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


## Data Pre-Processing

In [4]:
# Encode the store-and-forward flag as an integer: 'N' -> 0, anything else -> 1.
# The comparison is vectorised instead of applying a Python lambda to every row.
# The dtype guard keeps the cell idempotent: once the column is numeric, a second
# run would otherwise compare 0/1 against 'N' and turn every row into 1.
if not pd.api.types.is_numeric_dtype(train_df["store_and_fwd_flag"]):
    train_df["store_and_fwd_flag"] = (train_df["store_and_fwd_flag"] != 'N').astype("int64")
train_df["store_and_fwd_flag"].value_counts()

0    1450599
1       8045
Name: store_and_fwd_flag, dtype: int64

## Feature Engineering

In [5]:
# Parse both timestamp columns into datetime values using one explicit format
for timestamp_column in ("dropoff_datetime", "pickup_datetime"):
    train_df[timestamp_column] = pd.to_datetime(
        train_df[timestamp_column], format='%Y-%m-%d %H:%M:%S'
    )

In [6]:
# Derive calendar and clock features from the pickup timestamp
pickup_parts = train_df["pickup_datetime"].dt
train_df["pickup_month"] = pickup_parts.month
train_df["pickup_day"] = pickup_parts.day
train_df["pickup_weekday"] = pickup_parts.weekday  # Monday is 0
train_df["pickup_hour"] = pickup_parts.hour
train_df["pickup_minute"] = pickup_parts.minute

In [7]:
# Signed coordinate deltas (drop-off minus pick-up), in the same units as the inputs
train_df["latitude_difference"] = train_df["dropoff_latitude"].sub(train_df["pickup_latitude"])
train_df["longitude_difference"] = train_df["dropoff_longitude"].sub(train_df["pickup_longitude"])

In [8]:
# Express the target in whole minutes (rounded) rather than seconds, for readability.
# NOTE: the column is overwritten from itself, so run this cell once per fresh load.
train_df["trip_duration"] = train_df["trip_duration"].div(60).round().astype("int64")

In [9]:
# Approximate each trip's length as the sum of a north-south leg and an east-west
# leg ("Manhattan" style). Each leg is a haversine-form central angle; the scale
# factor 0.621371 * 6371 presumably converts Earth's mean radius in km to miles.
# NOTE(review): the east-west leg is not scaled by cos(latitude), so it is measured
# as if along the equator. Confirm this is intended before changing it, because the
# model saved at the end of this notebook is trained on this exact feature.
def _central_angle(delta_degrees):
    """Return the central angle (radians) spanned by a coordinate difference in degrees."""
    haversine_term = np.square(np.sin((abs(delta_degrees) * np.pi / 180) / 2))
    return abs(2 * np.arctan2(np.sqrt(haversine_term), np.sqrt(1 - haversine_term)))

north_south_leg = _central_angle(train_df["latitude_difference"])
east_west_leg = _central_angle(train_df["longitude_difference"])
train_df["trip_distance"] = 0.621371 * 6371 * (north_south_leg + east_west_leg)

In [10]:
# Inspect the first five rows with the engineered columns added
train_df.head(n=5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,latitude_difference,longitude_difference,trip_distance
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,8,3,14,0,17,24,-0.002335,0.017525,1.372146
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,11,6,12,6,0,43,-0.007412,-0.019066,1.82944
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,35,1,19,1,11,35,-0.053852,-0.026306,5.538397
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,7,4,6,2,19,32,-0.013252,-0.002228,1.069567
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,7,3,26,5,13,30,-0.010689,0.00013,0.747485


## Predictive Modeling

In [11]:
# Features are every column except the target, the ids and the raw timestamps
# (dropoff_datetime together with pickup_datetime would reveal the duration).
excluded_columns = ["trip_duration", "id", "vendor_id", "pickup_datetime", "dropoff_datetime"]
X = train_df.drop(columns=excluded_columns)
y = train_df["trip_duration"]

In [12]:
# Split the data into training, validation, and test sets: hold out 20% for testing,
# then take 20% of the remainder for validation (about 64% / 16% / 20% overall).
# The fixed random_state keeps both splits repeatable.
split_options = dict(test_size=0.2, random_state=2019)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_options)

In [13]:
# Define evaluation metric
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error between targets and predictions.

    Both inputs are converted to float NumPy arrays first, so plain lists are
    accepted and pandas objects are compared position by position rather than
    being aligned on their index labels (label alignment yields NaN whenever the
    two indexes differ).

    Parameters
    ----------
    y_true : array-like of numbers greater than -1
        Observed values.
    y_pred : array-like of numbers greater than -1
        Predicted values, same length as ``y_true``.

    Returns
    -------
    numpy.float64
        sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2)).

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    assert len(y_true) == len(y_pred)
    # log1p(x) is the numerically stable form of log(x + 1) for values near zero
    return np.square(np.log1p(y_pred) - np.log1p(y_true)).mean() ** 0.5

In [14]:
# XGBoost booster parameters.
# - 'reg:squarederror' is the current name of the squared-error objective
#   (the older alias 'reg:linear' is deprecated).
# - 'verbosity': 0 replaces the deprecated 'silent': 1.
# - No 'feval' entry: a custom metric is an argument of xgb.train, not a booster
#   parameter, so a string placed here is never called. The labels passed to the
#   DMatrix objects are log(duration + 1), so the built-in rmse reported during
#   training already equals RMSLE on the original scale.
params = {
    'booster':            'gbtree',
    'objective':          'reg:squarederror',
    'learning_rate':      0.05,
    'max_depth':          14,
    'subsample':          0.9,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    'verbosity':          0,
}

In [15]:
# Wrap the training and validation splits as DMatrix objects; the label is
# log(duration + 1), so the model is fitted on the log scale
dtrain = xgb.DMatrix(data=X_train, label=np.log(y_train + 1))
dval = xgb.DMatrix(data=X_val, label=np.log(y_val + 1))

# Datasets whose error is reported while training
watchlist = [(dval, 'eval'), (dtrain, 'train')]

In [16]:
# Number of boosting rounds
nrounds = 1000

# Fit the booster, printing the watchlist metrics as training progresses
gbm = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=nrounds,
    evals=watchlist,
    verbose_eval=True,
)

[0]	eval-rmse:2.00944	train-rmse:2.00596
[1]	eval-rmse:1.91338	train-rmse:1.90994
[2]	eval-rmse:1.82191	train-rmse:1.81841
[3]	eval-rmse:1.73527	train-rmse:1.73171
[4]	eval-rmse:1.65351	train-rmse:1.64992
[5]	eval-rmse:1.57753	train-rmse:1.5739
[6]	eval-rmse:1.50361	train-rmse:1.49983
[7]	eval-rmse:1.4336	train-rmse:1.42978
[8]	eval-rmse:1.36829	train-rmse:1.3644
[9]	eval-rmse:1.30584	train-rmse:1.30174
[10]	eval-rmse:1.24711	train-rmse:1.24293
[11]	eval-rmse:1.19076	train-rmse:1.18649
[12]	eval-rmse:1.13898	train-rmse:1.13457
[13]	eval-rmse:1.08854	train-rmse:1.0839
[14]	eval-rmse:1.04129	train-rmse:1.03637
[15]	eval-rmse:0.996349	train-rmse:0.991182
[16]	eval-rmse:0.954038	train-rmse:0.948487
[17]	eval-rmse:0.914116	train-rmse:0.908204
[18]	eval-rmse:0.877326	train-rmse:0.871156
[19]	eval-rmse:0.841526	train-rmse:0.83486
[20]	eval-rmse:0.80813	train-rmse:0.801134
[21]	eval-rmse:0.776705	train-rmse:0.76925
[22]	eval-rmse:0.747169	train-rmse:0.739316
[23]	eval-rmse:0.719725	train-rmse:

[187]	eval-rmse:0.334466	train-rmse:0.234712
[188]	eval-rmse:0.334385	train-rmse:0.234232
[189]	eval-rmse:0.334386	train-rmse:0.234014
[190]	eval-rmse:0.334356	train-rmse:0.23377
[191]	eval-rmse:0.334358	train-rmse:0.233545
[192]	eval-rmse:0.334352	train-rmse:0.233408
[193]	eval-rmse:0.334336	train-rmse:0.233274
[194]	eval-rmse:0.334333	train-rmse:0.23318
[195]	eval-rmse:0.33429	train-rmse:0.232826
[196]	eval-rmse:0.334235	train-rmse:0.232543
[197]	eval-rmse:0.334228	train-rmse:0.23237
[198]	eval-rmse:0.334234	train-rmse:0.232278
[199]	eval-rmse:0.334223	train-rmse:0.232187
[200]	eval-rmse:0.33421	train-rmse:0.231961
[201]	eval-rmse:0.334186	train-rmse:0.231716
[202]	eval-rmse:0.334178	train-rmse:0.23152
[203]	eval-rmse:0.334165	train-rmse:0.23142
[204]	eval-rmse:0.334117	train-rmse:0.231167
[205]	eval-rmse:0.334113	train-rmse:0.231109
[206]	eval-rmse:0.334066	train-rmse:0.230908
[207]	eval-rmse:0.334053	train-rmse:0.230573
[208]	eval-rmse:0.333972	train-rmse:0.230348
[209]	eval-rmse:0

[371]	eval-rmse:0.330998	train-rmse:0.201988
[372]	eval-rmse:0.330982	train-rmse:0.201881
[373]	eval-rmse:0.330974	train-rmse:0.201633
[374]	eval-rmse:0.330967	train-rmse:0.201566
[375]	eval-rmse:0.330967	train-rmse:0.201507
[376]	eval-rmse:0.330957	train-rmse:0.201406
[377]	eval-rmse:0.330958	train-rmse:0.201389
[378]	eval-rmse:0.330958	train-rmse:0.201382
[379]	eval-rmse:0.330954	train-rmse:0.201285
[380]	eval-rmse:0.330953	train-rmse:0.201224
[381]	eval-rmse:0.330934	train-rmse:0.201098
[382]	eval-rmse:0.330903	train-rmse:0.200911
[383]	eval-rmse:0.330884	train-rmse:0.200731
[384]	eval-rmse:0.330883	train-rmse:0.200651
[385]	eval-rmse:0.330853	train-rmse:0.200408
[386]	eval-rmse:0.330852	train-rmse:0.200324
[387]	eval-rmse:0.330849	train-rmse:0.200223
[388]	eval-rmse:0.330839	train-rmse:0.200147
[389]	eval-rmse:0.330841	train-rmse:0.200123
[390]	eval-rmse:0.330761	train-rmse:0.199674
[391]	eval-rmse:0.33076	train-rmse:0.199556
[392]	eval-rmse:0.330753	train-rmse:0.199464
[393]	eval-

[555]	eval-rmse:0.329905	train-rmse:0.181889
[556]	eval-rmse:0.329907	train-rmse:0.181829
[557]	eval-rmse:0.329905	train-rmse:0.181744
[558]	eval-rmse:0.329902	train-rmse:0.181682
[559]	eval-rmse:0.3299	train-rmse:0.181599
[560]	eval-rmse:0.329896	train-rmse:0.181539
[561]	eval-rmse:0.329894	train-rmse:0.181458
[562]	eval-rmse:0.329881	train-rmse:0.181253
[563]	eval-rmse:0.329879	train-rmse:0.181153
[564]	eval-rmse:0.329869	train-rmse:0.181018
[565]	eval-rmse:0.329868	train-rmse:0.180964
[566]	eval-rmse:0.329868	train-rmse:0.180865
[567]	eval-rmse:0.329862	train-rmse:0.18077
[568]	eval-rmse:0.329861	train-rmse:0.180677
[569]	eval-rmse:0.329861	train-rmse:0.180633
[570]	eval-rmse:0.329858	train-rmse:0.180482
[571]	eval-rmse:0.329858	train-rmse:0.180439
[572]	eval-rmse:0.329857	train-rmse:0.180353
[573]	eval-rmse:0.329855	train-rmse:0.180224
[574]	eval-rmse:0.329856	train-rmse:0.180217
[575]	eval-rmse:0.329861	train-rmse:0.180102
[576]	eval-rmse:0.329861	train-rmse:0.180064
[577]	eval-rm

[739]	eval-rmse:0.329349	train-rmse:0.166723
[740]	eval-rmse:0.329349	train-rmse:0.166695
[741]	eval-rmse:0.329346	train-rmse:0.166644
[742]	eval-rmse:0.329348	train-rmse:0.166493
[743]	eval-rmse:0.329347	train-rmse:0.166468
[744]	eval-rmse:0.329344	train-rmse:0.166441
[745]	eval-rmse:0.329343	train-rmse:0.16631
[746]	eval-rmse:0.329345	train-rmse:0.166206
[747]	eval-rmse:0.329344	train-rmse:0.166154
[748]	eval-rmse:0.329344	train-rmse:0.166129
[749]	eval-rmse:0.329346	train-rmse:0.166093
[750]	eval-rmse:0.329349	train-rmse:0.166016
[751]	eval-rmse:0.329347	train-rmse:0.165956
[752]	eval-rmse:0.329349	train-rmse:0.16587
[753]	eval-rmse:0.329346	train-rmse:0.165796
[754]	eval-rmse:0.329349	train-rmse:0.165692
[755]	eval-rmse:0.329346	train-rmse:0.16559
[756]	eval-rmse:0.329345	train-rmse:0.165518
[757]	eval-rmse:0.329342	train-rmse:0.165406
[758]	eval-rmse:0.32934	train-rmse:0.165371
[759]	eval-rmse:0.329336	train-rmse:0.1653
[760]	eval-rmse:0.329324	train-rmse:0.165196
[761]	eval-rmse:

[922]	eval-rmse:0.329105	train-rmse:0.154125
[923]	eval-rmse:0.329106	train-rmse:0.154083
[924]	eval-rmse:0.329106	train-rmse:0.154037
[925]	eval-rmse:0.329109	train-rmse:0.153934
[926]	eval-rmse:0.329112	train-rmse:0.153884
[927]	eval-rmse:0.329113	train-rmse:0.153793
[928]	eval-rmse:0.329112	train-rmse:0.153743
[929]	eval-rmse:0.329112	train-rmse:0.153711
[930]	eval-rmse:0.329111	train-rmse:0.153648
[931]	eval-rmse:0.329111	train-rmse:0.153591
[932]	eval-rmse:0.329115	train-rmse:0.153554
[933]	eval-rmse:0.329113	train-rmse:0.153526
[934]	eval-rmse:0.32911	train-rmse:0.153405
[935]	eval-rmse:0.329111	train-rmse:0.15338
[936]	eval-rmse:0.329111	train-rmse:0.153373
[937]	eval-rmse:0.329112	train-rmse:0.15335
[938]	eval-rmse:0.329109	train-rmse:0.153257
[939]	eval-rmse:0.329106	train-rmse:0.153204
[940]	eval-rmse:0.329102	train-rmse:0.153119
[941]	eval-rmse:0.329105	train-rmse:0.153065
[942]	eval-rmse:0.329105	train-rmse:0.153005
[943]	eval-rmse:0.329101	train-rmse:0.152919
[944]	eval-rm

In [17]:
# Predict on the held-out test set and undo the log(duration + 1) transform
dtest = xgb.DMatrix(X_test)
log_scale_pred = gbm.predict(dtest)
y_pred = np.exp(log_scale_pred) - 1

In [19]:
# Mean absolute error on the test set as a quick, easily interpreted error estimate
# (same units as trip_duration)
absolute_errors = (y_pred - y_test).abs()
mae = absolute_errors.mean()
mae

5.0934283983094115

In [20]:
# Feature importance as raw split counts ("weight": how often each feature is
# used to split across all trees)
feature_scores = gbm.get_score(importance_type='weight')
feature_scores

{'dropoff_latitude': 408961,
 'dropoff_longitude': 398115,
 'latitude_difference': 321800,
 'longitude_difference': 315388,
 'passenger_count': 138235,
 'pickup_day': 246727,
 'pickup_hour': 186841,
 'pickup_latitude': 466207,
 'pickup_longitude': 499024,
 'pickup_minute': 247127,
 'pickup_month': 132498,
 'pickup_weekday': 125545,
 'store_and_fwd_flag': 3881,
 'trip_distance': 279769}

In [21]:
# Raw counts are hard to compare, so rescale each score to its share of the total
total_score = sum(feature_scores.values())
feature_scores.update(
    {feature: score / total_score for feature, score in feature_scores.items()}
)

feature_scores

{'dropoff_latitude': 0.10847432361533511,
 'dropoff_longitude': 0.10559749058252288,
 'latitude_difference': 0.08535541858371541,
 'longitude_difference': 0.0836546760605371,
 'passenger_count': 0.03666596111845836,
 'pickup_day': 0.06544277924457537,
 'pickup_hour': 0.04955839578495952,
 'pickup_latitude': 0.12365846374039222,
 'pickup_longitude': 0.1323629658275948,
 'pickup_minute': 0.06554887671950851,
 'pickup_month': 0.035144258084229725,
 'pickup_weekday': 0.033300018726204327,
 'store_and_fwd_flag': 0.0010294107505388426,
 'trip_distance': 0.07420696116142785}

## Model Saved

In [22]:
# Persist the trained booster with pickle.
# The context manager guarantees the file handle is flushed and closed, even if
# pickling raises; the previous bare open() call left the handle open.
# NOTE(review): a pickled Booster is tied to the library versions used to create it,
# so load it with the same xgboost version. Only unpickle files you trust.
filename = "xgb_model.sav"
with open(filename, 'wb') as model_file:
    pickle.dump(gbm, model_file)