2. Apply a few ML algorithms.
 - We'll use the taxi data first.
      - The taxi pricing model is very clear and there is no surging
      - There is more taxi data, and the model is known to be consistent since Jan 2016 (see prev. notebook)
 - Linear model with the extra categorical variables included. 

### Machine Learning on the cleaned data + addl features
https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/


In [2]:
import pandas as pd
import numpy as np
import pickle
from feature_utils import load_hdf

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from joblib import dump, load

import xgboost as xgb
import datetime as dt

from scipy.stats import randint, uniform

# Plotting
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [3]:
class TypeSelector(BaseEstimator, TransformerMixin):
    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X.select_dtypes(include=[self.dtype])


preprocess_pipeline = make_pipeline(
    FeatureUnion(transformer_list=[
        ("numeric_features", make_pipeline(
            TypeSelector(np.number),
            StandardScaler()
        )),
        ("categorical_features", make_pipeline(
            TypeSelector("category"),
            OneHotEncoder(categories = 'auto')
        )),
        ("boolean_features", make_pipeline(
            TypeSelector("bool")
        ))
    ])
)


In [8]:
# Linear fit to taxi
X = load_hdf('data/taxi_train.h5',100)
#preprocess_pipeline.fit(X)

y = X["Final_Fare"]
X.drop(["Final_Fare"], axis = 1, inplace = True)


#lin_reg = LinearRegression(preprocess_pipeline,n_jobs = -1)
#lin_reg.fit(X,y)
#dump(lin_reg, 'taxi_lin_model.joblib') 

In [9]:
lin_reg = load('taxi_lin_model.joblib')
lin_mse = mean_squared_error(lin_reg.predict(X), y)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

2.820806646331759

In [None]:
lin_scores = cross_val_score(lin_reg, X, y,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [11]:
# Linear fit to tnp
X = load_hdf('data/tnp_train.h5',100)

#preprocess_pipeline.fit(X)

y = X["Final_Fare"]
X.drop(["Final_Fare"], axis = 1, inplace = True)


lin_reg = LinearRegression(preprocess_pipeline,n_jobs = -1)
lin_reg.fit(X,y)
dump(lin_reg, 'tnp_lin_model.joblib') 

['tnp_lin_model.joblib']

## Xgboost 

In [7]:
#taxi

X = load_hdf('data/taxi_train.h5',.2)
y = X["Final_Fare"]
X.drop(["Final_Fare"], axis = 1, inplace = True)

Xv = load_hdf('data/taxi_train.h5',.2)
yv = Xv["Final_Fare"]
Xv.drop(["Final_Fare"], axis = 1, inplace = True)


dvalid = xgb.DMatrix(Xv.values, label=yv.values)
dtrain = xgb.DMatrix(X.values, label=y.values)

watchlist = [(dtrain, 'train'), (dvalid, 'valid')]


xgb_pars = {'min_child_weight': 50, 'eta': 0.3, 'colsample_bytree': 0.3, 'max_depth': 10,
            'subsample': 0.8, 'lambda': 1., 'nthread': 6, 'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:squarederror'}

t0 = dt.datetime.now()
model = xgb.train(xgb_pars, dtrain, 500, watchlist, early_stopping_rounds=50,
                  maximize=False, verbose_eval=100)

t1 = dt.datetime.now()
print('Modeling RMSLE %.5f' % model.best_score)
print('Training time: %i seconds' % (t1 - t0).seconds)

In [45]:
feature_importance_dict = model.get_fscore()
feature_names = X.columns
fs = ['f%i' % i for i in range(len(feature_names))]
f1 = pd.DataFrame({'f': list(feature_importance_dict.keys()),
                   'importance': list(feature_importance_dict.values())})
f2 = pd.DataFrame({'f': fs, 'feature_name': feature_names})
feature_importance = pd.merge(f1, f2, how='right', on='f')
feature_importance = feature_importance.fillna(0)

In [46]:
feature_importance[['feature_name', 'importance']].sort_values(by='importance', ascending=False)

Unnamed: 0,feature_name,importance
11,vel_mph,1279
7,bearing,1250
0,x_dist_km,1197
2,hour,1144
5,l2_dist_km,1070
10,y_dist_km,1056
15,Dropoff_Centroid_Longitude,1028
9,Dropoff_Centroid_Latitude,952
8,Trip_Seconds,900
4,Trip_Miles,787


In [None]:
#tnp

X = load_hdf('data/tnp_train.h5',.2)
y = X["Final_Fare"]
X.drop(["Final_Fare"], axis = 1, inplace = True)

Xv = load_hdf('data/tnp_train.h5',.2)
yv = Xv["Final_Fare"]
Xv.drop(["Final_Fare"], axis = 1, inplace = True)


dvalid = xgb.DMatrix(Xv.values, label=yv.values)
dtrain = xgb.DMatrix(X.values, label=y.values)

watchlist = [(dtrain, 'train'), (dvalid, 'valid')]


xgb_pars = {'min_child_weight': 50, 'eta': 0.3, 'colsample_bytree': 0.3, 'max_depth': 10,
            'subsample': 0.8, 'lambda': 1., 'nthread': 6, 'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:squarederror'}

t0 = dt.datetime.now()
model = xgb.train(xgb_pars, dtrain, 500, watchlist, early_stopping_rounds=50,
                  maximize=False, verbose_eval=100)

t1 = dt.datetime.now()
print('Modeling RMSLE %.5f' % model.best_score)
print('Training time: %i seconds' % (t1 - t0).seconds)

In [None]:
feature_importance_dict = model.get_fscore()
feature_names = X.columns
fs = ['f%i' % i for i in range(len(feature_names))]
f1 = pd.DataFrame({'f': list(feature_importance_dict.keys()),
                   'importance': list(feature_importance_dict.values())})
f2 = pd.DataFrame({'f': fs, 'feature_name': feature_names})
feature_importance = pd.merge(f1, f2, how='right', on='f')
feature_importance = feature_importance.fillna(0)

In [53]:
# param search
# see https://xgboost.readthedocs.io/en/latest/parameter.html

param_dist = {'max_depth': randint(low=4, high=20),
              'n_estimators': randint(low=20, high=400),
              'min_child_weight':randint(low=5, high=200),
              'eta':uniform(0.15,.2),
              'colsample_bytree':uniform(0.2,.4),
              'subsample': uniform(0.4,.5),
              'lambda': uniform(0.5,3.)
             }
xgb_model = xgb.XGBRegressor({
              'booster' : 'gbtree', 'eval_metric': 'rmse','silent': 1, 'objective': 'reg:squarederror'})
clf =  RandomizedSearchCV(xgb_model,
                   param_dist, verbose=1,n_jobs = 7,n_iter=200, cv=4, scoring='neg_mean_squared_error')
clf.fit(X.values,y.values)
print(clf.best_score_)
print(clf.best_params_)


Fitting 4 folds for each of 200 candidates, totalling 800 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:  1.2min
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:  4.4min
[Parallel(n_jobs=7)]: Done 436 tasks      | elapsed: 10.0min
[Parallel(n_jobs=7)]: Done 786 tasks      | elapsed: 17.4min
[Parallel(n_jobs=7)]: Done 800 out of 800 | elapsed: 17.7min finished


-2.494055128976227
{'colsample_bytree': 0.4659175029639101, 'eta': 0.30900240687858116, 'lambda': 3.335261321413178, 'max_depth': 7, 'min_child_weight': 8, 'n_estimators': 355, 'subsample': 0.7367235778794784}


In [54]:
cvres = clf.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

1.7123291437145671 {'colsample_bytree': 0.5567229659693154, 'eta': 0.2643172145557544, 'lambda': 0.6145159608205376, 'max_depth': 11, 'min_child_weight': 115, 'n_estimators': 318, 'subsample': 0.7646243506947745}
1.7767354941591014 {'colsample_bytree': 0.4188809148764538, 'eta': 0.3299295153083107, 'lambda': 2.761794126461167, 'max_depth': 18, 'min_child_weight': 184, 'n_estimators': 397, 'subsample': 0.6665119794338152}
1.9121555538543245 {'colsample_bytree': 0.5202082492247373, 'eta': 0.32545965223976264, 'lambda': 1.241946634658322, 'max_depth': 6, 'min_child_weight': 197, 'n_estimators': 233, 'subsample': 0.41743556955877004}
1.722405998095129 {'colsample_bytree': 0.4638224522464749, 'eta': 0.15189091750876899, 'lambda': 1.0742024443090579, 'max_depth': 14, 'min_child_weight': 136, 'n_estimators': 381, 'subsample': 0.7288450390694795}
1.7845215242331083 {'colsample_bytree': 0.5493204407872195, 'eta': 0.16299402175828062, 'lambda': 2.788994943025087, 'max_depth': 14, 'min_child_weig

Full model

In [None]:
{'colsample_bytree': 0.4659175029639101, 'eta': 0.30900240687858116, 'lambda': 3.335261321413178, 'max_depth': 7, 'min_child_weight': 8, 'n_estimators': 355, 'subsample': 0.7367235778794784}

In [2]:
X = load_hdf('data/taxi_train.h5',100)

y = X["Final_Fare"].copy()
X.drop(["Final_Fare"], axis = 1, inplace = True)


X_train, Xv, y_train, yv = train_test_split(X, y, test_size=0.2, random_state=42)

dvalid = xgb.DMatrix(Xv.values, label=yv.values)
dtrain = xgb.DMatrix(X_train.values, label=y_train.values)


In [5]:
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]



xgb_pars = {'colsample_bytree': 0.4659175029639101, 'eta': 0.30900240687858116, 
            'lambda': 3.335261321413178, 'max_depth': 7, 'min_child_weight': 8, 
            'n_estimators': 355, 'subsample': 0.7367235778794784, 'nthread': 6, 
            'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:squarederror'}


t0 = dt.datetime.now()
model = xgb.train(xgb_pars, dtrain, 500, watchlist, early_stopping_rounds=50,
                  maximize=False, verbose_eval=1)

t1 = dt.datetime.now()
print('Modeling RMSLE %.5f' % model.best_score)
print('Training time: %i seconds' % (t1 - t0).seconds)

[0]	train-rmse:13.4387	valid-rmse:13.4353
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[1]	train-rmse:9.67493	valid-rmse:9.67632
[2]	train-rmse:6.84618	valid-rmse:6.84827
[3]	train-rmse:4.91189	valid-rmse:4.91369
[4]	train-rmse:3.63353	valid-rmse:3.635
[5]	train-rmse:2.90893	valid-rmse:2.91197
[6]	train-rmse:2.46996	valid-rmse:2.4737
[7]	train-rmse:2.23058	valid-rmse:2.23484
[8]	train-rmse:2.10806	valid-rmse:2.11241
[9]	train-rmse:1.92299	valid-rmse:1.92635
[10]	train-rmse:1.88641	valid-rmse:1.88981
[11]	train-rmse:1.79633	valid-rmse:1.79814
[12]	train-rmse:1.72469	valid-rmse:1.72623
[13]	train-rmse:1.68272	valid-rmse:1.68319
[14]	train-rmse:1.67338	valid-rmse:1.67409
[15]	train-rmse:1.65009	valid-rmse:1.6504
[16]	train-rmse:1.63574	valid-rmse:1.63621
[17]	train-rmse:1.62128	valid-rmse:1.62129
[18]	train-rmse:1.61167	valid-rmse:1.61154
[19]	train-rmse:1.60974	valid-rmse:1.60957
[20]	trai

[187]	train-rmse:1.42883	valid-rmse:1.45521
[188]	train-rmse:1.42873	valid-rmse:1.45515
[189]	train-rmse:1.42842	valid-rmse:1.45492
[190]	train-rmse:1.42809	valid-rmse:1.45467
[191]	train-rmse:1.42796	valid-rmse:1.45456
[192]	train-rmse:1.42782	valid-rmse:1.4545
[193]	train-rmse:1.42769	valid-rmse:1.45448
[194]	train-rmse:1.42757	valid-rmse:1.45442
[195]	train-rmse:1.42707	valid-rmse:1.45412
[196]	train-rmse:1.4269	valid-rmse:1.45402
[197]	train-rmse:1.42653	valid-rmse:1.45404
[198]	train-rmse:1.42608	valid-rmse:1.45378
[199]	train-rmse:1.42593	valid-rmse:1.45369
[200]	train-rmse:1.42579	valid-rmse:1.45358
[201]	train-rmse:1.42547	valid-rmse:1.45343
[202]	train-rmse:1.42535	valid-rmse:1.45335
[203]	train-rmse:1.42514	valid-rmse:1.45327
[204]	train-rmse:1.42502	valid-rmse:1.45319
[205]	train-rmse:1.42486	valid-rmse:1.45311
[206]	train-rmse:1.42469	valid-rmse:1.45306
[207]	train-rmse:1.42448	valid-rmse:1.45291
[208]	train-rmse:1.42428	valid-rmse:1.45278
[209]	train-rmse:1.42402	valid-rms

[374]	train-rmse:1.38985	valid-rmse:1.4378
[375]	train-rmse:1.3898	valid-rmse:1.43777
[376]	train-rmse:1.38975	valid-rmse:1.43781
[377]	train-rmse:1.38969	valid-rmse:1.4378
[378]	train-rmse:1.38951	valid-rmse:1.43783
[379]	train-rmse:1.38931	valid-rmse:1.43787
[380]	train-rmse:1.38925	valid-rmse:1.43784
[381]	train-rmse:1.38916	valid-rmse:1.43779
[382]	train-rmse:1.38908	valid-rmse:1.43776
[383]	train-rmse:1.38895	valid-rmse:1.43776
[384]	train-rmse:1.38888	valid-rmse:1.43776
[385]	train-rmse:1.38875	valid-rmse:1.4377
[386]	train-rmse:1.3887	valid-rmse:1.43769
[387]	train-rmse:1.38866	valid-rmse:1.43766
[388]	train-rmse:1.38833	valid-rmse:1.43748
[389]	train-rmse:1.38829	valid-rmse:1.43746
[390]	train-rmse:1.38816	valid-rmse:1.43742
[391]	train-rmse:1.38802	valid-rmse:1.43749
[392]	train-rmse:1.38797	valid-rmse:1.43747
[393]	train-rmse:1.3877	valid-rmse:1.43716
[394]	train-rmse:1.38758	valid-rmse:1.43717
[395]	train-rmse:1.38747	valid-rmse:1.43715
[396]	train-rmse:1.38737	valid-rmse:1.

In [6]:
#model.save_model("taxi_xgb_full")

In [None]:
import pickle

# The sklearn API models are picklable
print("Pickling sklearn API models")
# must open in binary format to pickle
pickle.dump(clf, open("best_boston.pkl", "wb"))
clf2 = pickle.load(open("best_boston.pkl", "rb"))
print(np.allclose(clf.predict(X), clf2.predict(X)))

In [2]:
# TNP param tuning

tnp = load_hdf('data/tnp_train.h5',.2)

y = tnp["Final_Fare"].copy()
X = tnp.drop(["Final_Fare"], axis = 1)


X_train, Xv, y_train, yv = train_test_split(X, y, test_size=0.2, random_state=42)

dvalid = xgb.DMatrix(Xv.values, label=yv.values)
dtrain = xgb.DMatrix(X_train.values, label=y_train.values)

param_dist = {'max_depth': randint(low=4, high=20),
              'n_estimators': randint(low=20, high=400),
              'min_child_weight':randint(low=5, high=200),
              'eta':uniform(0.15,.2),
              'colsample_bytree':uniform(0.2,.4),
              'subsample': uniform(0.4,.5),
              'lambda': uniform(0.5,3.)
             }

xgb_model = xgb.XGBRegressor({
              'booster' : 'gbtree', 'eval_metric': 'rmse','silent': 1, 'objective': 'reg:squarederror'})
clf =  RandomizedSearchCV(xgb_model,
                   param_dist, verbose=1,n_jobs = 7,n_iter=300, cv=5, scoring='neg_mean_squared_error')
clf.fit(X.values,y.values)
print(clf.best_score_)
print(clf.best_params_)


In [3]:
X = load_hdf('data/tnp_train.h5',100)

y = X["Final_Fare"].copy()
X.drop(["Final_Fare"], axis = 1, inplace = True)


X_train, Xv, y_train, yv = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
dvalid = xgb.DMatrix(Xv.values, label=yv.values)
dtrain = xgb.DMatrix(X_train.values, label=y_train.values)

In [7]:
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]


xgb_pars = {'colsample_bytree': 0.5994699901820792, 'eta': 0.16180028105758762, 
            'lambda': 0.697099727254144, 'max_depth': 5, 'min_child_weight': 25, 
            'n_estimators': 125, 'subsample': 0.8213364096819988, 'nthread': 7, 
            'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:squarederror'}


t0 = dt.datetime.now()
model = xgb.train(xgb_pars, dtrain, 100, watchlist, early_stopping_rounds=50,
                  maximize=False, verbose_eval=1)

t1 = dt.datetime.now()
print('Modeling RMSLE %.5f' % model.best_score)
print('Training time: %i seconds' % (t1 - t0).seconds)

model.save_model("tnp_xgb_full")

[0]	train-rmse:11.8992	valid-rmse:11.9038
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[1]	train-rmse:10.1238	valid-rmse:10.1291
[2]	train-rmse:8.65287	valid-rmse:8.65893
[3]	train-rmse:7.49376	valid-rmse:7.50048
[4]	train-rmse:6.55277	valid-rmse:6.5601
[5]	train-rmse:5.7504	valid-rmse:5.75818
[6]	train-rmse:5.14227	valid-rmse:5.15029
[7]	train-rmse:4.58652	valid-rmse:4.59449
[8]	train-rmse:4.20653	valid-rmse:4.21434
[9]	train-rmse:3.91328	valid-rmse:3.92052
[10]	train-rmse:3.62797	valid-rmse:3.6352
[11]	train-rmse:3.45539	valid-rmse:3.46236
[12]	train-rmse:3.28572	valid-rmse:3.2925
[13]	train-rmse:3.16147	valid-rmse:3.16806
[14]	train-rmse:3.06217	valid-rmse:3.06853
[15]	train-rmse:2.98881	valid-rmse:2.99492
[16]	train-rmse:2.9364	valid-rmse:2.94227
[17]	train-rmse:2.90733	valid-rmse:2.91309
[18]	train-rmse:2.87656	valid-rmse:2.88216
[19]	train-rmse:2.85638	valid-rmse:2.86189
[20]	train

[187]	train-rmse:2.6802	valid-rmse:2.68716
[188]	train-rmse:2.68008	valid-rmse:2.68708
[189]	train-rmse:2.67993	valid-rmse:2.68694
[190]	train-rmse:2.67984	valid-rmse:2.68688
[191]	train-rmse:2.67957	valid-rmse:2.68662
[192]	train-rmse:2.6793	valid-rmse:2.68637
[193]	train-rmse:2.67903	valid-rmse:2.6861
[194]	train-rmse:2.67888	valid-rmse:2.68594
[195]	train-rmse:2.67874	valid-rmse:2.68582
[196]	train-rmse:2.67864	valid-rmse:2.68572
[197]	train-rmse:2.67843	valid-rmse:2.68554
[198]	train-rmse:2.67828	valid-rmse:2.6854
[199]	train-rmse:2.67803	valid-rmse:2.68518
Modeling RMSLE 2.68518
Training time: 2595 seconds


### Test set evaluation

Here we predict on the witheld data to confirm the quality of our estimators.

In [18]:
taxi_linmod = load('taxi_lin_model.joblib')
taxi_treemod = xgb.Booster()
taxi_treemod.load_model("taxi_xgb_full")

tnp_linmod = load('tnp_lin_model.joblib')
tnp_treemod = xgb.Booster()
tnp_treemod.load_model("tnp_xgb_full")

In [33]:
errors = []
samples = []
features = []

X = load_hdf('data/taxi_test.h5',100)
#these should have been dropped earlier...
X.drop(X[X["Final_Fare"]>500].index, inplace = True)
y = X["Final_Fare"]
X.drop(["Final_Fare"], axis = 1, inplace = True)
dtest = xgb.DMatrix(X.values, label=y.values)

pred_tree = taxi_treemod.predict(dtest)
pred_lin = taxi_linmod.predict(X)
samples.append(X.size)
features.append(len(X.columns))

errors.append(np.sqrt(mean_squared_error(y, pred_tree)))
errors.append(np.sqrt(mean_squared_error(y, pred_lin)))

print("XG tree RMSE: ",np.sqrt(mean_squared_error(y, pred_tree)))
print("Linear RMSE :", np.sqrt(mean_squared_error(y, pred_lin)))

XG tree RMSE:  1.6650221
Linear RMSE : 2.9674565977800547


In [34]:
X = load_hdf('data/tnp_test.h5',100)
y = X["Final_Fare"]
X.drop(["Final_Fare"], axis = 1, inplace = True)
dtest = xgb.DMatrix(X.values, label=y.values)

pred_tree = tnp_treemod.predict(dtest)
pred_lin = tnp_linmod.predict(X)

errors.append(np.sqrt(mean_squared_error(y, pred_tree)))
errors.append(np.sqrt(mean_squared_error(y, pred_lin)))
samples.append(X.size)
features.append(len(X.columns))



print("XG tree RMSE: ",np.sqrt(mean_squared_error(y, pred_tree)))
print("Linear RMSE :", np.sqrt(mean_squared_error(y, pred_lin)))

XG tree RMSE:  2.6830482
Linear RMSE : 3.1459173678591887


In [65]:
X = load_hdf('data/tnp_test.h5',100)
y = X["Final_Fare"]
X.drop(["Final_Fare","Shared_Trip_Authorized"], axis = 1, inplace = True)
dtest = xgb.DMatrix(X.values, label=y.values)

taxi_treemod.predict(dtest)

array([23.455196 , 17.450554 , 19.038605 , ..., 46.223186 , 40.795383 ,
        7.0205503], dtype=float32)

In [22]:
X = load_hdf('data/tnp_test.h5',1)
y = X["Final_Fare"]
X.drop(["Final_Fare","Shared_Trip_Authorized"], axis = 1, inplace = True)
dtest = xgb.DMatrix(X.values, label=y.values)

taxi_treemod.predict(dtest)
taxi_on_tnp = taxi_treemod.predict(dtest)
diff = taxi_on_tnp - y
diff[diff>0].count()/diff[diff<0].count()

3.9685308984183925

In [49]:
errors

[1.6650221, 2.9674565977800547, 2.6830482, 3.1459173678591887]

In [71]:
diff[diff>0].count()/diff[diff<0].count()

4.063948908675753

In [28]:
X.count()

Trip_Seconds                  3561937
Trip_Miles                    3561937
Pickup_Community_Area         3561937
Dropoff_Community_Area        3561937
Pickup_Centroid_Latitude      3561937
Pickup_Centroid_Longitude     3561937
Dropoff_Centroid_Latitude     3561937
Dropoff_Centroid_Longitude    3561937
vel_mph                       3561937
l2_dist_km                    3561937
x_dist_km                     3561937
y_dist_km                     3561937
bearing                       3561937
bAirport                      3561937
day_of_wk                     3561937
hour                          3561937
dtype: int64

In [73]:
diff[diff<0].count()

601746