In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

import pickle
from geopy.geocoders import Nominatim
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)

Load the data:

In [2]:
# Read train file
train_df = pd.read_csv('Taxi-Trip-Duration-Data/train.csv')
train_df.shape

(1458644, 11)

In [6]:
train_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,435


In [3]:
# Read test file
test_df = pd.read_csv("Taxi-Trip-Duration-Data/test.csv")
test_df.shape

(625134, 9)

In [7]:
test_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,0
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,0
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,0
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,0
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,0


Engineer the features:

In [4]:
# Convert character variables to numeric 
f = lambda x: 0 if x == 'N' else 1
train_df["store_and_fwd_flag"] = train_df["store_and_fwd_flag"].apply(lambda x: f(x))
test_df["store_and_fwd_flag"] = test_df["store_and_fwd_flag"].apply(lambda x: f(x))

In [8]:
# First, convert datetime strings into datetime
train_df["dropoff_datetime"] = pd.to_datetime(train_df["dropoff_datetime"], format='%Y-%m-%d %H:%M:%S')
train_df["pickup_datetime"] = pd.to_datetime(train_df["pickup_datetime"], format='%Y-%m-%d %H:%M:%S')
test_df["pickup_datetime"] = pd.to_datetime(test_df["pickup_datetime"], format='%Y-%m-%d %H:%M:%S')

In [9]:
# Now construct other variables, like month, date, etc.
train_df["pickup_month"] = train_df["pickup_datetime"].dt.month
train_df["pickup_day"] = train_df["pickup_datetime"].dt.day
train_df["pickup_weekday"] = train_df["pickup_datetime"].dt.weekday
train_df["pickup_hour"] = train_df["pickup_datetime"].dt.hour
train_df["pickup_minute"] = train_df["pickup_datetime"].dt.minute

test_df["pickup_month"] = test_df["pickup_datetime"].dt.month
test_df["pickup_day"] = test_df["pickup_datetime"].dt.day
test_df["pickup_weekday"] = test_df["pickup_datetime"].dt.weekday
test_df["pickup_hour"] = test_df["pickup_datetime"].dt.hour
test_df["pickup_minute"] = test_df["pickup_datetime"].dt.minute

In [10]:
# Get latitude and longitude differences 
train_df["latitude_difference"] = train_df["dropoff_latitude"] - train_df["pickup_latitude"]
train_df["longitude_difference"] = train_df["dropoff_longitude"] - train_df["pickup_longitude"]

test_df["latitude_difference"] = test_df["dropoff_latitude"] - test_df["pickup_latitude"]
test_df["longitude_difference"] = test_df["dropoff_longitude"] - test_df["pickup_longitude"]

In [12]:
# Convert duration to minutes for easier interpretation
train_df["trip_duration"] = train_df["trip_duration"].apply(lambda x: round(x/60))

In [13]:
# Convert trip distance from longitude and latitude differences to Manhattan distance.
train_df["trip_distance"] = 0.621371 * 6371 * \
(abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(train_df["latitude_difference"]) * np.pi / 180) / 2))), 
np.sqrt(1-(np.square(np.sin((abs(train_df["latitude_difference"]) * np.pi / 180) / 2)))))) + \
abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(train_df["longitude_difference"]) * np.pi / 180) / 2))), 
np.sqrt(1-(np.square(np.sin((abs(train_df["longitude_difference"]) * np.pi / 180) / 2)))))))

In [14]:
test_df["trip_distance"] = 0.621371 * 6371 * \
(abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(test_df["latitude_difference"]) * np.pi / 180) / 2))), 
np.sqrt(1-(np.square(np.sin((abs(test_df["latitude_difference"]) * np.pi / 180) / 2)))))) + \
abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(test_df["longitude_difference"]) * np.pi / 180) / 2))), 
np.sqrt(1-(np.square(np.sin((abs(test_df["longitude_difference"]) * np.pi / 180) / 2)))))))

In [15]:
train_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,latitude_difference,longitude_difference,trip_distance
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,0,3,14,0,17,24,-0.002335,0.017525,1.372146
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,0,6,12,6,0,43,-0.007412,-0.019066,1.82944
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,1,1,19,1,11,35,-0.053852,-0.026306,5.538397
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,0,4,6,2,19,32,-0.013252,-0.002228,1.069567
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,0,3,26,5,13,30,-0.010689,0.00013,0.747485


In [16]:
test_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,latitude_difference,longitude_difference,trip_distance
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,0,6,30,3,23,59,0.024651,-0.002045,1.844463
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,0,6,30,3,23,59,-0.02459,0.004395,2.002605
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,0,6,30,3,23,59,-0.00806,0.011276,1.336036
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,0,6,30,3,23,59,-0.041431,-0.030357,4.960124
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,0,6,30,3,23,59,-0.005585,0.008705,0.987333


Modeling

In [18]:
X_train = train_df.drop(["trip_duration", "id", "vendor_id", "pickup_datetime", "dropoff_datetime"], axis=1)
y_train = train_df["trip_duration"]

X_test = test_df.drop(["id", "vendor_id", "pickup_datetime"], axis=1)

In [19]:
# Split the train data into training and valdiation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=2019)

In [20]:
# Define evaluation metric
def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5

In [21]:
# XGBoost parameters 
params = {
    'booster':            'gbtree',
    'objective':          'reg:linear',
    'learning_rate':      0.05,
    'max_depth':          14,
    'subsample':          0.9,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    'silent':             1,
    'feval':              'rmsle'
}

In [22]:
# Define train and validation sets
dtrain = xgb.DMatrix(X_train, np.log(y_train+1))
dval = xgb.DMatrix(X_val, np.log(y_val+1))

# this is for tracking the error
watchlist = [(dval, 'eval'), (dtrain, 'train')]

In [23]:
nrounds = 100

# Train model
gbm = xgb.train(params,
                dtrain,
                num_boost_round = nrounds,
                evals = watchlist,
                verbose_eval = True
                )

[0]	eval-rmse:0.474375	train-rmse:0.472735
[1]	eval-rmse:0.453872	train-rmse:0.451405
[2]	eval-rmse:0.434623	train-rmse:0.4317
[3]	eval-rmse:0.416442	train-rmse:0.413496
[4]	eval-rmse:0.399246	train-rmse:0.395924
[5]	eval-rmse:0.383169	train-rmse:0.379363
[6]	eval-rmse:0.367996	train-rmse:0.363626
[7]	eval-rmse:0.354038	train-rmse:0.349089
[8]	eval-rmse:0.340707	train-rmse:0.335383
[9]	eval-rmse:0.328117	train-rmse:0.322262
[10]	eval-rmse:0.316416	train-rmse:0.309767
[11]	eval-rmse:0.305574	train-rmse:0.298353
[12]	eval-rmse:0.29531	train-rmse:0.287602
[13]	eval-rmse:0.285859	train-rmse:0.277382
[14]	eval-rmse:0.276914	train-rmse:0.267788
[15]	eval-rmse:0.268717	train-rmse:0.259056
[16]	eval-rmse:0.26111	train-rmse:0.250733
[17]	eval-rmse:0.254041	train-rmse:0.24305
[18]	eval-rmse:0.24747	train-rmse:0.235827
[19]	eval-rmse:0.241362	train-rmse:0.22905
[20]	eval-rmse:0.235774	train-rmse:0.222881
[21]	eval-rmse:0.230516	train-rmse:0.216964
[22]	eval-rmse:0.225597	train-rmse:0.211349
[23]	

In [25]:
# Test predictions
y_pred = np.exp(gbm.predict(xgb.DMatrix(X_test))) - 1

In [26]:
# Take a look at feature importance
feature_scores = gbm.get_fscore()
feature_scores

{'dropoff_latitude': 41872,
 'dropoff_longitude': 48314,
 'latitude_difference': 44916,
 'longitude_difference': 42558,
 'passenger_count': 17320,
 'pickup_day': 29067,
 'pickup_hour': 20909,
 'pickup_latitude': 61204,
 'pickup_longitude': 67860,
 'pickup_minute': 30661,
 'pickup_month': 15049,
 'pickup_weekday': 12279,
 'store_and_fwd_flag': 1085,
 'trip_distance': 45752}

In [27]:
# This is not very telling, so let's scale the features
summ = 0
for key in feature_scores:
    summ = summ + feature_scores[key]

for key in feature_scores:
    feature_scores[key] = feature_scores[key] / summ

feature_scores

{'dropoff_latitude': 0.08744356223086337,
 'dropoff_longitude': 0.10089673924393229,
 'latitude_difference': 0.09380051206442154,
 'longitude_difference': 0.08887617313290704,
 'passenger_count': 0.036170292745475584,
 'pickup_day': 0.060702188177409855,
 'pickup_hour': 0.04366539555514717,
 'pickup_latitude': 0.1278156233945778,
 'pickup_longitude': 0.14171570818175364,
 'pickup_minute': 0.06403102458828099,
 'pickup_month': 0.03142764061932229,
 'pickup_weekday': 0.025642899804947727,
 'store_and_fwd_flag': 0.0022658641818037533,
 'trip_distance': 0.09554637607915697}