We create another price prediction model that doesn't use as many of the dataset features (eg.. community area) since the information should be redundant and I either don't have/ don't want to compute the features for an active user.

In [1]:
import pandas as pd
import numpy as np
import pickle
from Fair_Fare.feature_utils import load_hdf

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from joblib import dump, load

import xgboost as xgb
import datetime as dt

from scipy.stats import randint, uniform

# Plotting
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
mpl.rc('axes', labelsize=18)
mpl.rc('xtick', labelsize=16)
mpl.rc('ytick', labelsize=16)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

### TNP Dataset:

Parameter tuining

In [13]:
tnp = load_hdf('data/tnp_train.h5',.02)
tnp.drop(['Pickup_Community_Area', 'Dropoff_Community_Area','l2_dist_km','x_dist_km','y_dist_km','bearing'], axis = 1, inplace = True)
tnp.head()
y = tnp["Final_Fare"].copy()
X = tnp.drop(["Final_Fare"], axis = 1)

In [16]:
tnp = load_hdf('data/tnp_train.h5',.1)

y = tnp["Final_Fare"].copy()
X = tnp.drop(["Final_Fare"], axis = 1)


X_train, Xv, y_train, yv = train_test_split(X, y, test_size=0.2, random_state=42)

dvalid = xgb.DMatrix(Xv.values, label=yv.values)
dtrain = xgb.DMatrix(X_train.values, label=y_train.values)

param_dist = {'max_depth': randint(low=4, high=20),
              'n_estimators': randint(low=20, high=400),
              'min_child_weight':randint(low=5, high=200),
              'eta':uniform(0.15,.2),
              'colsample_bytree':uniform(0.2,.4),
              'subsample': uniform(0.4,.5),
              'lambda': uniform(0.5,3.)
             }

xgb_model = xgb.XGBRegressor({
              'booster' : 'gbtree', 'eval_metric': 'rmse','silent': 1, 'objective': 'reg:squarederror'})
clf =  RandomizedSearchCV(xgb_model,
                   param_dist, verbose=1,n_jobs = 6,n_iter=100, cv=5, scoring='neg_mean_squared_error')
clf.fit(X.values,y.values)
print(clf.best_score_)
print(clf.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   21.8s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:  2.2min
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  4.6min
[Parallel(n_jobs=6)]: Done 500 out of 500 | elapsed:  5.4min finished


-7.523053428443643
{'colsample_bytree': 0.439881349485437, 'eta': 0.2913248799253698, 'lambda': 1.0784417148624983, 'max_depth': 5, 'min_child_weight': 61, 'n_estimators': 131, 'subsample': 0.8144003761476369}


##### Full model:

In [22]:
X = load_hdf('data/tnp_train.h5',100)
X.drop(['Pickup_Community_Area', 'Dropoff_Community_Area','l2_dist_km','x_dist_km','y_dist_km','bearing'], axis = 1, inplace = True)

y = X["Final_Fare"].copy()
X.drop(["Final_Fare"], axis = 1, inplace = True)


X_train, Xv, y_train, yv = train_test_split(X, y, test_size=0.2, random_state=42)

dvalid = xgb.DMatrix(Xv.values, label=yv.values)
dtrain = xgb.DMatrix(X_train.values, label=y_train.values)

watchlist = [(dtrain, 'train'), (dvalid, 'valid')]


xgb_pars = {'colsample_bytree': 0.439881349485437, 'eta': 0.2913248799253698, 'lambda': 1.0784417148624983, 'max_depth': 5, 
            'min_child_weight': 61, 'n_estimators': 131, 'subsample': 0.8144003761476369, 'nthread': 7, 
            'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:squarederror'}


t0 = dt.datetime.now()
model = xgb.train(xgb_pars, dtrain, 100, watchlist, early_stopping_rounds=50,
                  maximize=False, verbose_eval=1)

t1 = dt.datetime.now()
print('Modeling RMSLE %.5f' % model.best_score)
print('Training time: %i seconds' % (t1 - t0).seconds)

#model.save_model("tnp_xgb_full")
model.save_model("tnp_xgb_full_reduced_params")

[0]	train-rmse:11.0648	valid-rmse:11.0693
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[1]	train-rmse:8.39363	valid-rmse:8.39986
[2]	train-rmse:6.53617	valid-rmse:6.54333
[3]	train-rmse:5.43437	valid-rmse:5.44189
[4]	train-rmse:4.69678	valid-rmse:4.70428
[5]	train-rmse:4.31623	valid-rmse:4.3237
[6]	train-rmse:4.12529	valid-rmse:4.13286
[7]	train-rmse:3.95834	valid-rmse:3.96551
[8]	train-rmse:3.86461	valid-rmse:3.87145
[9]	train-rmse:3.80887	valid-rmse:3.81534
[10]	train-rmse:3.77869	valid-rmse:3.78483
[11]	train-rmse:3.47718	valid-rmse:3.48341
[12]	train-rmse:3.29924	valid-rmse:3.30518
[13]	train-rmse:3.20531	valid-rmse:3.21116
[14]	train-rmse:3.1291	valid-rmse:3.13505
[15]	train-rmse:3.10212	valid-rmse:3.10756
[16]	train-rmse:2.98065	valid-rmse:2.98598
[17]	train-rmse:2.91328	valid-rmse:2.91865
[18]	train-rmse:2.89693	valid-rmse:2.90256
[19]	train-rmse:2.88912	valid-rmse:2.89477
[20]	tr

In [23]:
X.columns

Index(['Trip_Seconds', 'Trip_Miles', 'Shared_Trip_Authorized',
       'Pickup_Centroid_Latitude', 'Pickup_Centroid_Longitude',
       'Dropoff_Centroid_Latitude', 'Dropoff_Centroid_Longitude', 'vel_mph',
       'bAirport', 'day_of_wk', 'hour'],
      dtype='object')