In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import pickle
from scipy.stats import zscore

from sklearn.model_selection import train_test_split

In [2]:
def preProcess(data):
    #convert store_and_fwd_flag to number
    f = lambda x: 0 if x == 'N' else 1
    data["store_and_fwd_flag"] = data["store_and_fwd_flag"].apply(lambda x: f(x))
    
    #converting to paramters
    data['mm_pickup'] = data.pickup_datetime.dt.month
    data['dow_pickup'] = data.pickup_datetime.dt.weekday
    data['day_pickup'] = data.pickup_datetime.dt.day
    data['hour_pickup'] = data.pickup_datetime.dt.hour
    data['min_pickup'] = data.pickup_datetime.dt.minute
    
    #gather distance in terms of latitude and longitude difference
    data['latitude_difference'] = data['dropoff_latitude'] - data['pickup_latitude']
    data['longitude_difference'] = data['dropoff_longitude'] - data['pickup_longitude']
    
    #convert trip_duration to seconds from minutes  
    data['trip_duration'] = data['trip_duration'].apply(lambda x: round(x / 60))
    
    #calculate distance travelled in kilometer as per the Haversine Formula for Latitude_difference and longitude_difference
        #Haversine formula - the shortest distance over the earth’s surface – giving an ‘as-the-crow-flies’ distance between the points (ignoring any hills they fly over, of course!).
        #source: https://www.movable-type.co.uk/scripts/latlong.html
    R = 6371 #Radius of earth
    phi1 = data['dropoff_latitude'] * np.pi / 180
    phi2 = data['pickup_latitude'] * np.pi / 180
    Dlat = np.abs(data['latitude_difference'] * np.pi / 180)
    Dlong = np.abs(data['longitude_difference'] * np.pi / 180)
    
    a = np.sin(Dlat / 2) * np.sin(Dlat / 2) + np.cos(phi1) * np.cos(phi2) * np.sin(Dlong / 2) * np.sin(Dlong / 2)
    c = 2 * np.arctan2(np.sqrt(np.abs(a)), np.sqrt(np.abs(1-a)))
    data["trip_distance"] = R * c

In [3]:
def costFunction(y_actual, y_estimated):
    '''
    Evaluation metric for XGBoost
    '''
    #Dimension-tracker 3
    #source https://github.com/dmlc/xgboost/blob/master/demo/guide-python/custom_objective.py
    #source https://stackoverflow.com/questions/55001509/typeerror-when-writing-my-own-evaluation-metric-for-xgboost-in-python
    #print(type(y_actual), type(y_estimated), y_actual.size, y_estimated.num_row(), print(type(y_actual.size)), print(type(y_estimated.num_row())))
    y_estimated_float = y_estimated.get_label()

    assert y_actual.shape == y_estimated_float.shape
    return 'my-error', np.sqrt(np.square(y_estimated_float - y_actual).mean())

In [4]:
def XGBModel(X, y):
    '''
    Machine learning model
    Input: passenger count, coordinates, pickup datetime, store_and_fwd_flag
    Output: trip duration
    '''
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2020)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 2020)
    
    #Dimension-tracker 1
    #print(type(X_train), type(y_test), X_train.shape, X_test.shape, y_train.shape, y_test.shape, X_val.shape, y_val.shape)
    
    #XGBoost paramaeters
    parameters = {
        'booster': 'gbtree',
        'objective': 'reg:squarederror',
        'max_depth': 14,
        'subsample': 0.9,
        'eta': 0.05,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'n_jobs': 4,
    }
    
    #Define training and cross-validation sets for XGBoost
        #The metric of this project is RMSLE, 
        #RMSLE is by default not supported by algorithms such as LightGBM and XGBoost
        #Hence, you will need to write your own optimization function for your models if you want to use RMSLE. 
        #Using a log-transformed target will allow you to simply use the RMSE metric, which is often the default objective for a regression model.
    Dtrain = xgb.DMatrix(data = X_train, label = np.log1p(y_train))
    Dval = xgb.DMatrix(data = X_val, label = np.log1p(y_val))
    
    #Dimension-tracker 2
    #print(Dtrain.num_col(), Dtrain.num_row(), Dval.num_col(), Dval.num_row())
    
    #for tracking the error
    watchlist = [(Dval, 'eval'), (Dtrain, 'train')]
    
    #Number of training rounds
    n_rounds = 1000
    
    #Train model
    GBM = xgb.train(params = parameters, dtrain = Dtrain, num_boost_round = n_rounds, evals = watchlist, feval = costFunction, verbose_eval = True)    
    return GBM

In [13]:
taxiDB = pd.read_csv(filepath_or_buffer = 'train.csv', delimiter = ',', engine = 'c', low_memory = True, infer_datetime_format = True, parse_dates=[2,3])

preProcess(taxiDB)

#dividing data to test and train dataset
X = taxiDB.drop(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime', 'trip_duration'], axis = 1)
y = taxiDB['trip_duration']

#train model to data
model = XGBModel(X, y)

[0]	eval-rmse:2.00559	train-rmse:2.00532	eval-my-error:2.00559	train-my-error:2.00532
[1]	eval-rmse:1.90972	train-rmse:1.90939	eval-my-error:1.90972	train-my-error:1.90941
[2]	eval-rmse:1.81827	train-rmse:1.81792	eval-my-error:1.81828	train-my-error:1.81793
[3]	eval-rmse:1.73153	train-rmse:1.73112	eval-my-error:1.73153	train-my-error:1.73112
[4]	eval-rmse:1.64939	train-rmse:1.64891	eval-my-error:1.64939	train-my-error:1.64892
[5]	eval-rmse:1.57192	train-rmse:1.57139	eval-my-error:1.57192	train-my-error:1.57140
[6]	eval-rmse:1.49814	train-rmse:1.49752	eval-my-error:1.49814	train-my-error:1.49752
[7]	eval-rmse:1.43020	train-rmse:1.42949	eval-my-error:1.43020	train-my-error:1.42949
[8]	eval-rmse:1.36433	train-rmse:1.36343	eval-my-error:1.36433	train-my-error:1.36343
[9]	eval-rmse:1.30198	train-rmse:1.30100	eval-my-error:1.30198	train-my-error:1.30100
[10]	eval-rmse:1.24257	train-rmse:1.24133	eval-my-error:1.24257	train-my-error:1.24133
[11]	eval-rmse:1.18721	train-rmse:1.18580	eval-my-err

[95]	eval-rmse:0.34061	train-rmse:0.27659	eval-my-error:0.34061	train-my-error:0.27660
[96]	eval-rmse:0.34043	train-rmse:0.27564	eval-my-error:0.34043	train-my-error:0.27565
[97]	eval-rmse:0.34024	train-rmse:0.27479	eval-my-error:0.34024	train-my-error:0.27480
[98]	eval-rmse:0.34010	train-rmse:0.27392	eval-my-error:0.34010	train-my-error:0.27393
[99]	eval-rmse:0.34001	train-rmse:0.27333	eval-my-error:0.34001	train-my-error:0.27333
[100]	eval-rmse:0.33979	train-rmse:0.27248	eval-my-error:0.33979	train-my-error:0.27249
[101]	eval-rmse:0.33959	train-rmse:0.27176	eval-my-error:0.33959	train-my-error:0.27177
[102]	eval-rmse:0.33945	train-rmse:0.27105	eval-my-error:0.33946	train-my-error:0.27106
[103]	eval-rmse:0.33929	train-rmse:0.27028	eval-my-error:0.33929	train-my-error:0.27029
[104]	eval-rmse:0.33921	train-rmse:0.26979	eval-my-error:0.33921	train-my-error:0.26980
[105]	eval-rmse:0.33911	train-rmse:0.26918	eval-my-error:0.33911	train-my-error:0.26919
[106]	eval-rmse:0.33895	train-rmse:0.

[189]	eval-rmse:0.33318	train-rmse:0.23113	eval-my-error:0.33318	train-my-error:0.23114
[190]	eval-rmse:0.33317	train-rmse:0.23100	eval-my-error:0.33317	train-my-error:0.23101
[191]	eval-rmse:0.33313	train-rmse:0.23066	eval-my-error:0.33313	train-my-error:0.23067
[192]	eval-rmse:0.33312	train-rmse:0.23034	eval-my-error:0.33312	train-my-error:0.23034
[193]	eval-rmse:0.33312	train-rmse:0.23030	eval-my-error:0.33312	train-my-error:0.23031
[194]	eval-rmse:0.33310	train-rmse:0.23006	eval-my-error:0.33310	train-my-error:0.23007
[195]	eval-rmse:0.33309	train-rmse:0.22994	eval-my-error:0.33309	train-my-error:0.22995
[196]	eval-rmse:0.33306	train-rmse:0.22966	eval-my-error:0.33306	train-my-error:0.22967
[197]	eval-rmse:0.33301	train-rmse:0.22939	eval-my-error:0.33301	train-my-error:0.22940
[198]	eval-rmse:0.33299	train-rmse:0.22932	eval-my-error:0.33299	train-my-error:0.22933
[199]	eval-rmse:0.33294	train-rmse:0.22905	eval-my-error:0.33295	train-my-error:0.22906
[200]	eval-rmse:0.33288	train-rm

[283]	eval-rmse:0.33118	train-rmse:0.21268	eval-my-error:0.33119	train-my-error:0.21269
[284]	eval-rmse:0.33114	train-rmse:0.21234	eval-my-error:0.33114	train-my-error:0.21235
[285]	eval-rmse:0.33114	train-rmse:0.21232	eval-my-error:0.33114	train-my-error:0.21233
[286]	eval-rmse:0.33113	train-rmse:0.21227	eval-my-error:0.33114	train-my-error:0.21228
[287]	eval-rmse:0.33109	train-rmse:0.21213	eval-my-error:0.33109	train-my-error:0.21214
[288]	eval-rmse:0.33104	train-rmse:0.21172	eval-my-error:0.33104	train-my-error:0.21173
[289]	eval-rmse:0.33102	train-rmse:0.21149	eval-my-error:0.33102	train-my-error:0.21150
[290]	eval-rmse:0.33101	train-rmse:0.21147	eval-my-error:0.33102	train-my-error:0.21147
[291]	eval-rmse:0.33099	train-rmse:0.21112	eval-my-error:0.33099	train-my-error:0.21113
[292]	eval-rmse:0.33097	train-rmse:0.21094	eval-my-error:0.33097	train-my-error:0.21095
[293]	eval-rmse:0.33096	train-rmse:0.21087	eval-my-error:0.33096	train-my-error:0.21088
[294]	eval-rmse:0.33095	train-rm

[377]	eval-rmse:0.32995	train-rmse:0.19913	eval-my-error:0.32995	train-my-error:0.19914
[378]	eval-rmse:0.32993	train-rmse:0.19896	eval-my-error:0.32993	train-my-error:0.19897
[379]	eval-rmse:0.32993	train-rmse:0.19892	eval-my-error:0.32993	train-my-error:0.19893
[380]	eval-rmse:0.32990	train-rmse:0.19872	eval-my-error:0.32990	train-my-error:0.19873
[381]	eval-rmse:0.32989	train-rmse:0.19866	eval-my-error:0.32990	train-my-error:0.19867
[382]	eval-rmse:0.32987	train-rmse:0.19847	eval-my-error:0.32987	train-my-error:0.19848
[383]	eval-rmse:0.32986	train-rmse:0.19833	eval-my-error:0.32987	train-my-error:0.19834
[384]	eval-rmse:0.32984	train-rmse:0.19813	eval-my-error:0.32984	train-my-error:0.19814
[385]	eval-rmse:0.32983	train-rmse:0.19803	eval-my-error:0.32983	train-my-error:0.19804
[386]	eval-rmse:0.32979	train-rmse:0.19778	eval-my-error:0.32979	train-my-error:0.19779
[387]	eval-rmse:0.32978	train-rmse:0.19772	eval-my-error:0.32979	train-my-error:0.19772
[388]	eval-rmse:0.32978	train-rm

[471]	eval-rmse:0.32913	train-rmse:0.18776	eval-my-error:0.32914	train-my-error:0.18777
[472]	eval-rmse:0.32912	train-rmse:0.18764	eval-my-error:0.32913	train-my-error:0.18765
[473]	eval-rmse:0.32912	train-rmse:0.18760	eval-my-error:0.32912	train-my-error:0.18761
[474]	eval-rmse:0.32912	train-rmse:0.18743	eval-my-error:0.32912	train-my-error:0.18744
[475]	eval-rmse:0.32911	train-rmse:0.18729	eval-my-error:0.32911	train-my-error:0.18729
[476]	eval-rmse:0.32911	train-rmse:0.18723	eval-my-error:0.32911	train-my-error:0.18724
[477]	eval-rmse:0.32911	train-rmse:0.18722	eval-my-error:0.32911	train-my-error:0.18722
[478]	eval-rmse:0.32911	train-rmse:0.18717	eval-my-error:0.32911	train-my-error:0.18717
[479]	eval-rmse:0.32910	train-rmse:0.18715	eval-my-error:0.32910	train-my-error:0.18716
[480]	eval-rmse:0.32909	train-rmse:0.18701	eval-my-error:0.32910	train-my-error:0.18701
[481]	eval-rmse:0.32907	train-rmse:0.18682	eval-my-error:0.32907	train-my-error:0.18683
[482]	eval-rmse:0.32907	train-rm

[565]	eval-rmse:0.32862	train-rmse:0.17831	eval-my-error:0.32862	train-my-error:0.17832
[566]	eval-rmse:0.32861	train-rmse:0.17822	eval-my-error:0.32862	train-my-error:0.17822
[567]	eval-rmse:0.32861	train-rmse:0.17807	eval-my-error:0.32861	train-my-error:0.17808
[568]	eval-rmse:0.32859	train-rmse:0.17790	eval-my-error:0.32860	train-my-error:0.17791
[569]	eval-rmse:0.32859	train-rmse:0.17786	eval-my-error:0.32859	train-my-error:0.17786
[570]	eval-rmse:0.32859	train-rmse:0.17784	eval-my-error:0.32859	train-my-error:0.17785
[571]	eval-rmse:0.32858	train-rmse:0.17774	eval-my-error:0.32858	train-my-error:0.17775
[572]	eval-rmse:0.32858	train-rmse:0.17772	eval-my-error:0.32858	train-my-error:0.17772
[573]	eval-rmse:0.32858	train-rmse:0.17768	eval-my-error:0.32858	train-my-error:0.17769
[574]	eval-rmse:0.32858	train-rmse:0.17758	eval-my-error:0.32859	train-my-error:0.17758
[575]	eval-rmse:0.32857	train-rmse:0.17735	eval-my-error:0.32857	train-my-error:0.17736
[576]	eval-rmse:0.32857	train-rm

[659]	eval-rmse:0.32833	train-rmse:0.17086	eval-my-error:0.32834	train-my-error:0.17087
[660]	eval-rmse:0.32834	train-rmse:0.17080	eval-my-error:0.32834	train-my-error:0.17081
[661]	eval-rmse:0.32834	train-rmse:0.17073	eval-my-error:0.32834	train-my-error:0.17074
[662]	eval-rmse:0.32834	train-rmse:0.17070	eval-my-error:0.32834	train-my-error:0.17071
[663]	eval-rmse:0.32834	train-rmse:0.17068	eval-my-error:0.32834	train-my-error:0.17069
[664]	eval-rmse:0.32834	train-rmse:0.17063	eval-my-error:0.32834	train-my-error:0.17063
[665]	eval-rmse:0.32834	train-rmse:0.17044	eval-my-error:0.32834	train-my-error:0.17045
[666]	eval-rmse:0.32834	train-rmse:0.17039	eval-my-error:0.32834	train-my-error:0.17040
[667]	eval-rmse:0.32834	train-rmse:0.17037	eval-my-error:0.32834	train-my-error:0.17037
[668]	eval-rmse:0.32834	train-rmse:0.17024	eval-my-error:0.32834	train-my-error:0.17025
[669]	eval-rmse:0.32833	train-rmse:0.17012	eval-my-error:0.32834	train-my-error:0.17012
[670]	eval-rmse:0.32833	train-rm

[753]	eval-rmse:0.32817	train-rmse:0.16423	eval-my-error:0.32817	train-my-error:0.16423
[754]	eval-rmse:0.32817	train-rmse:0.16413	eval-my-error:0.32817	train-my-error:0.16414
[755]	eval-rmse:0.32817	train-rmse:0.16409	eval-my-error:0.32817	train-my-error:0.16409
[756]	eval-rmse:0.32815	train-rmse:0.16398	eval-my-error:0.32815	train-my-error:0.16398
[757]	eval-rmse:0.32815	train-rmse:0.16391	eval-my-error:0.32815	train-my-error:0.16392
[758]	eval-rmse:0.32815	train-rmse:0.16383	eval-my-error:0.32815	train-my-error:0.16384
[759]	eval-rmse:0.32815	train-rmse:0.16381	eval-my-error:0.32815	train-my-error:0.16382
[760]	eval-rmse:0.32814	train-rmse:0.16372	eval-my-error:0.32815	train-my-error:0.16372
[761]	eval-rmse:0.32814	train-rmse:0.16369	eval-my-error:0.32815	train-my-error:0.16369
[762]	eval-rmse:0.32815	train-rmse:0.16363	eval-my-error:0.32815	train-my-error:0.16364
[763]	eval-rmse:0.32814	train-rmse:0.16354	eval-my-error:0.32814	train-my-error:0.16354
[764]	eval-rmse:0.32814	train-rm

[847]	eval-rmse:0.32797	train-rmse:0.15793	eval-my-error:0.32798	train-my-error:0.15793
[848]	eval-rmse:0.32797	train-rmse:0.15790	eval-my-error:0.32797	train-my-error:0.15790
[849]	eval-rmse:0.32797	train-rmse:0.15785	eval-my-error:0.32797	train-my-error:0.15785
[850]	eval-rmse:0.32797	train-rmse:0.15775	eval-my-error:0.32797	train-my-error:0.15775
[851]	eval-rmse:0.32797	train-rmse:0.15766	eval-my-error:0.32797	train-my-error:0.15766
[852]	eval-rmse:0.32797	train-rmse:0.15764	eval-my-error:0.32797	train-my-error:0.15765
[853]	eval-rmse:0.32797	train-rmse:0.15745	eval-my-error:0.32797	train-my-error:0.15746
[854]	eval-rmse:0.32797	train-rmse:0.15742	eval-my-error:0.32797	train-my-error:0.15742
[855]	eval-rmse:0.32797	train-rmse:0.15732	eval-my-error:0.32797	train-my-error:0.15732
[856]	eval-rmse:0.32796	train-rmse:0.15723	eval-my-error:0.32797	train-my-error:0.15724
[857]	eval-rmse:0.32797	train-rmse:0.15720	eval-my-error:0.32797	train-my-error:0.15721
[858]	eval-rmse:0.32796	train-rm

[941]	eval-rmse:0.32784	train-rmse:0.15236	eval-my-error:0.32785	train-my-error:0.15237
[942]	eval-rmse:0.32784	train-rmse:0.15234	eval-my-error:0.32785	train-my-error:0.15234
[943]	eval-rmse:0.32785	train-rmse:0.15229	eval-my-error:0.32785	train-my-error:0.15229
[944]	eval-rmse:0.32785	train-rmse:0.15222	eval-my-error:0.32785	train-my-error:0.15223
[945]	eval-rmse:0.32785	train-rmse:0.15218	eval-my-error:0.32785	train-my-error:0.15218
[946]	eval-rmse:0.32785	train-rmse:0.15216	eval-my-error:0.32785	train-my-error:0.15217
[947]	eval-rmse:0.32785	train-rmse:0.15212	eval-my-error:0.32785	train-my-error:0.15213
[948]	eval-rmse:0.32784	train-rmse:0.15205	eval-my-error:0.32785	train-my-error:0.15205
[949]	eval-rmse:0.32784	train-rmse:0.15202	eval-my-error:0.32785	train-my-error:0.15203
[950]	eval-rmse:0.32785	train-rmse:0.15194	eval-my-error:0.32785	train-my-error:0.15194
[951]	eval-rmse:0.32785	train-rmse:0.15188	eval-my-error:0.32785	train-my-error:0.15189
[952]	eval-rmse:0.32785	train-rm

In [14]:
#Saving the model as a pickle file
filename = "xgb_model.sav"
pickle.dump(model, open(filename, 'wb'))

In [15]:
#Testing cells for error calculation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2020)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 2020)

#finding estimated values
y_estimated = np.exp(model.predict(xgb.DMatrix(X_test))) - 1

In [16]:
#Error calculations
S = y_estimated
A = y_test.to_numpy(dtype = float)
error = S - A

#1. Mean absolute deviation
MAD = sum(abs(error)) / len(A)
print("Mean Absolute Deviation is:", MAD, "minutes")

#2. Mean square error
MSE = sum(error**2) / len(A)
print("Mean Square Error is:", MSE, "minutes")

#3. Mean absolute percentage error
MAPE = sum(np.divide(error, A)) / len(A) * 100
print("Mean Absolute Percentage Error is:", MAPE, "%")

#4. bias
bias = sum(error)
print("bias is:", bias, "minutes")

#5. Root mean square percentage error
RMSPE = np.sqrt(np.mean(np.square((A - S) / A))) * 100
print("Root Mean Square Percentage Error is:", RMSPE, "%")

Mean Absolute Deviation is: 4.838805668043165 minutes
Mean Square Error is: 7531.875085081213 minutes
Mean Absolute Percentage Error is: nan %
bias is: -739943.4911689162 minutes
Root Mean Square Percentage Error is: inf %


  MAPE = sum(np.divide(error, A)) / len(A) * 100
  MAPE = sum(np.divide(error, A)) / len(A) * 100
  RMSPE = np.sqrt(np.mean(np.square((A - S) / A))) * 100


In [17]:
print(taxiDB.head())
print(X.head())

          id  vendor_id     pickup_datetime    dropoff_datetime  \
0  id2875421          2 2016-03-14 17:24:55 2016-03-14 17:32:30   
1  id2377394          1 2016-06-12 00:43:35 2016-06-12 00:54:38   
2  id3858529          2 2016-01-19 11:35:24 2016-01-19 12:10:48   
3  id3504673          2 2016-04-06 19:32:31 2016-04-06 19:39:40   
4  id2181028          2 2016-03-26 13:30:55 2016-03-26 13:38:10   

   passenger_count  pickup_longitude  pickup_latitude  dropoff_longitude  \
0                1        -73.982155        40.767937         -73.964630   
1                1        -73.980415        40.738564         -73.999481   
2                1        -73.979027        40.763939         -74.005333   
3                1        -74.010040        40.719971         -74.012268   
4                1        -73.973053        40.793209         -73.972923   

   dropoff_latitude  store_and_fwd_flag  trip_duration  mm_pickup  dow_pickup  \
0         40.765602                   0              8     