### Reduced-feature models.

We create another price prediction model that uses fewer of the dataset features (e.g., community area is dropped), since that information is either redundant or difficult to compute in real time for the average user.

In [1]:
import pandas as pd
import numpy as np
import pickle
from Fair_Fare.feature_utils import load_hdf

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from joblib import dump, load

import xgboost as xgb
import datetime as dt

from scipy.stats import randint, uniform

# Plotting
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# Enlarge axis-label and tick-label fonts for every figure drawn in this notebook.
mpl.rcParams.update({
    'axes.labelsize': 18,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
})

def display_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores : object exposing ``mean()`` and ``std()`` methods
        The scores to summarise (for example a NumPy array).
    """
    print("Scores:", scores)
    # Each summary line is a label plus the result of the same-named method on `scores`.
    for label, method_name in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(label, getattr(scores, method_name)())

### TNP Dataset:

Parameter tuning

In [2]:
# Load a sample of the TNP training data and remove the features that the
# reduced model does not use.
# NOTE(review): the second argument of load_hdf presumably controls how much of
# the file is read -- confirm against Fair_Fare.feature_utils.
tnp = load_hdf('data/tnp_train.h5', .02).drop(
    columns=['Pickup_Community_Area', 'Dropoff_Community_Area',
             'l2_dist_km', 'x_dist_km', 'y_dist_km', 'bearing'])

# Target vector and feature matrix.
y = tnp["Final_Fare"].copy()
X = tnp.drop(columns=["Final_Fare"])

# Preview the reduced frame. It must be the cell's last expression to be
# rendered; in the middle of the cell the result is silently discarded.
tnp.head()

In [7]:
# Randomized hyper-parameter search for the reduced-feature TNP model.
tnp = load_hdf('data/tnp_train.h5', .01)
# drop extra features
tnp = tnp.drop(columns=['Pickup_Community_Area', 'Dropoff_Community_Area',
                        'l2_dist_km', 'x_dist_km', 'y_dist_km', 'bearing'])

y = tnp["Final_Fare"].copy()
X = tnp.drop(columns=["Final_Fare"])

# The search below cross-validates on the full (X, y) sample, so no separate
# train/validation split or DMatrix is built in this cell.

# scipy conventions: randint(low, high) draws integers from [low, high);
# uniform(loc, scale) draws floats from [loc, loc + scale].
# NOTE(review): 'eta' and 'lambda' are the native booster names; the
# scikit-learn wrapper documents them as learning_rate / reg_lambda -- confirm
# the installed xgboost version forwards these names as intended.
param_dist = {'max_depth': randint(low=4, high=20),
              'n_estimators': randint(low=20, high=400),
              'min_child_weight': randint(low=5, high=200),
              'eta': uniform(0.15, .2),
              'colsample_bytree': uniform(0.2, .4),
              'subsample': uniform(0.4, .5),
              'lambda': uniform(0.5, 3.)
              }

# Settings are passed as keyword arguments: a dict handed over positionally is
# never unpacked into booster/objective/eval_metric.
# `verbosity=0` replaces the deprecated `silent` flag.
xgb_model = xgb.XGBRegressor(booster='gbtree',
                             objective='reg:squarederror',
                             eval_metric='rmse',
                             verbosity=0)

# random_state makes the sampled candidates reproducible between runs.
clf = RandomizedSearchCV(xgb_model, param_dist,
                         n_iter=100, cv=5,
                         scoring='neg_mean_squared_error',
                         n_jobs=6, verbose=1, random_state=42)
clf.fit(X.values, y.values)

# best_score_ is the negated mean squared error (higher is better).
print(clf.best_score_)
print(clf.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    2.1s
[Parallel(n_jobs=6)]: Done 409 tasks      | elapsed:   12.9s
[Parallel(n_jobs=6)]: Done 489 out of 500 | elapsed:   15.3s remaining:    0.3s
[Parallel(n_jobs=6)]: Done 500 out of 500 | elapsed:   15.6s finished


-10.699642221720152
{'colsample_bytree': 0.5830943566473055, 'eta': 0.29138688164777615, 'lambda': 1.3572576723966736, 'max_depth': 18, 'min_child_weight': 10, 'n_estimators': 247, 'subsample': 0.5195772003626937}


##### Full model:

In [6]:
# Train the reduced-feature TNP model with the native xgboost API and save it.
# NOTE(review): the meaning of the second load_hdf argument (10 here, fractions
# in the tuning cells) is not visible in this notebook -- confirm.
X = load_hdf('data/tnp_train.h5', 10)
X = X.drop(columns=['Pickup_Community_Area', 'Dropoff_Community_Area',
                    'l2_dist_km', 'x_dist_km', 'y_dist_km', 'bearing'])

y = X["Final_Fare"].copy()
X = X.drop(columns=["Final_Fare"])


X_train, Xv, y_train, yv = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature names are passed as a plain list of strings rather than a pandas Index.
dvalid = xgb.DMatrix(Xv.values, label=yv.values, feature_names=list(Xv.columns))
dtrain = xgb.DMatrix(X_train.values, label=y_train.values, feature_names=list(X_train.columns))

# The last entry of the watchlist is the one used for early stopping.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# NOTE(review): these values differ from the best_params_ shown in the saved
# output of the TNP search cell in this notebook (e.g. max_depth 5 here vs 18
# there); presumably they come from an earlier search run -- confirm.
# The searched 'n_estimators' (131) is intentionally not listed: xgb.train takes
# the number of rounds from num_boost_round, not from the params dict.
# 'verbosity': 0 replaces the deprecated 'silent': 1.
xgb_pars = {'colsample_bytree': 0.439881349485437, 'eta': 0.2913248799253698,
            'lambda': 1.0784417148624983, 'max_depth': 5,
            'min_child_weight': 61, 'subsample': 0.8144003761476369, 'nthread': 7,
            'booster': 'gbtree', 'verbosity': 0,
            'eval_metric': 'rmse', 'objective': 'reg:squarederror'}


t0 = dt.datetime.now()
# Up to 500 rounds; stop once valid-rmse has not improved for 50 rounds.
model = xgb.train(xgb_pars, dtrain, num_boost_round=500, evals=watchlist,
                  early_stopping_rounds=50, maximize=False, verbose_eval=2)

t1 = dt.datetime.now()
# The evaluation metric is 'rmse', so label the score accordingly.
print('Modeling RMSE %.5f' % model.best_score)
# total_seconds() keeps the full duration; .seconds would drop whole days.
print('Training time: %i seconds' % (t1 - t0).total_seconds())

#model.save_model("tnp_xgb_full")

# this is a way of saving metadata like feature names to the model
if hasattr(model, 'feature_names'):
    model.set_attr(feature_names='|'.join(model.feature_names))

model.save_model("tnp_xgb_full_reduced_params")

[0]	train-rmse:10.3148	valid-rmse:10.2749
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[1]	train-rmse:8.17851	valid-rmse:8.13749
[2]	train-rmse:6.49763	valid-rmse:6.45791
[3]	train-rmse:5.24322	valid-rmse:5.20372
[4]	train-rmse:4.50248	valid-rmse:4.46311
[5]	train-rmse:4.11955	valid-rmse:4.08209
[6]	train-rmse:3.82896	valid-rmse:3.79323
[7]	train-rmse:3.58179	valid-rmse:3.54614
[8]	train-rmse:3.50313	valid-rmse:3.46806
[9]	train-rmse:3.42995	valid-rmse:3.3951
[10]	train-rmse:3.4105	valid-rmse:3.37678
[11]	train-rmse:3.39192	valid-rmse:3.35882
[12]	train-rmse:3.37445	valid-rmse:3.34226
[13]	train-rmse:3.35115	valid-rmse:3.31985
[14]	train-rmse:3.3245	valid-rmse:3.29334
[15]	train-rmse:3.14238	valid-rmse:3.10958
[16]	train-rmse:3.12631	valid-rmse:3.09346
[17]	train-rmse:3.0273	valid-rmse:2.99245
[18]	train-rmse:3.01966	valid-rmse:2.98514
[19]	train-rmse:3.01429	valid-rmse:2.98007
[20]	trai

### Taxi Dataset:

In [16]:
# Randomized hyper-parameter search for the reduced-feature taxi model.
taxi = load_hdf('data/taxi_train.h5', 1)
# drop extra features
taxi = taxi.drop(columns=['Pickup_Community_Area', 'Dropoff_Community_Area',
                          'l2_dist_km', 'x_dist_km', 'y_dist_km', 'bearing'])

y = taxi["Final_Fare"].copy()
X = taxi.drop(columns=["Final_Fare"])

# The search below cross-validates on the full (X, y) sample, so no separate
# train/validation split or DMatrix is built in this cell.

# parameter search
# scipy conventions: randint(low, high) draws integers from [low, high);
# uniform(loc, scale) draws floats from [loc, loc + scale].
# NOTE(review): 'eta' and 'lambda' are the native booster names; the
# scikit-learn wrapper documents them as learning_rate / reg_lambda -- confirm
# the installed xgboost version forwards these names as intended.
param_dist = {'max_depth': randint(low=4, high=20),
              'n_estimators': randint(low=20, high=400),
              'min_child_weight': randint(low=5, high=200),
              'eta': uniform(0.15, .2),
              'colsample_bytree': uniform(0.2, .4),
              'subsample': uniform(0.4, .5),
              'lambda': uniform(0.5, 3.)
              }

# Settings are passed as keyword arguments: a dict handed over positionally is
# never unpacked into booster/objective/eval_metric.
# `verbosity=0` replaces the deprecated `silent` flag.
xgb_model = xgb.XGBRegressor(booster='gbtree',
                             objective='reg:squarederror',
                             eval_metric='rmse',
                             verbosity=0)

# random_state makes the sampled candidates reproducible between runs.
clf = RandomizedSearchCV(xgb_model, param_dist,
                         n_iter=200, cv=4,
                         scoring='neg_mean_squared_error',
                         n_jobs=7, verbose=1, random_state=42)
clf.fit(X.values, y.values)

# best_score_ is the negated mean squared error (higher is better).
print(clf.best_score_)
print(clf.best_params_)

Fitting 4 folds for each of 200 candidates, totalling 800 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:  5.7min
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed: 26.6min
[Parallel(n_jobs=7)]: Done 436 tasks      | elapsed: 65.1min
[Parallel(n_jobs=7)]: Done 786 tasks      | elapsed: 120.4min
[Parallel(n_jobs=7)]: Done 800 out of 800 | elapsed: 121.2min finished


-2.3565539554551655
{'colsample_bytree': 0.518631511987256, 'eta': 0.2964347911729374, 'lambda': 0.6257690011998417, 'max_depth': 19, 'min_child_weight': 111, 'n_estimators': 392, 'subsample': 0.8593893731535742}


In [2]:
# Train the reduced-feature taxi model with the native xgboost API and save it.
# NOTE(review): the meaning of the second load_hdf argument (10 here, 1 in the
# tuning cell) is not visible in this notebook -- confirm.
X = load_hdf('data/taxi_train.h5', 10)

y = X["Final_Fare"].copy()
X = X.drop(columns=['Pickup_Community_Area', 'Dropoff_Community_Area',
                    'l2_dist_km', 'x_dist_km', 'y_dist_km', 'bearing',
                    'Final_Fare'])


X_train, Xv, y_train, yv = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature names are passed as a plain list of strings rather than a pandas Index.
dvalid = xgb.DMatrix(Xv.values, label=yv.values, feature_names=list(Xv.columns))
dtrain = xgb.DMatrix(X_train.values, label=y_train.values, feature_names=list(X_train.columns))

# The last entry of the watchlist is the one used for early stopping.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]


# Values taken from the randomized search on the taxi data.
# The searched 'n_estimators' (392) is intentionally not listed: xgb.train takes
# the number of rounds from num_boost_round, not from the params dict.
# 'verbosity': 0 replaces the deprecated 'silent': 1.
xgb_pars = {'colsample_bytree': 0.518631511987256, 'eta': 0.2964347911729374,
            'lambda': 0.6257690011998417, 'max_depth': 19,
            'min_child_weight': 111, 'subsample': 0.8593893731535742, 'nthread': 6,
            'booster': 'gbtree', 'verbosity': 0,
            'eval_metric': 'rmse', 'objective': 'reg:squarederror'}


t0 = dt.datetime.now()
# Up to 500 rounds; stop once valid-rmse has not improved for 50 rounds.
model = xgb.train(xgb_pars, dtrain, num_boost_round=500, evals=watchlist,
                  early_stopping_rounds=50, maximize=False, verbose_eval=5)

t1 = dt.datetime.now()
# The evaluation metric is 'rmse', so label the score accordingly.
print('Modeling RMSE %.5f' % model.best_score)
# total_seconds() keeps the full duration; .seconds would drop whole days.
print('Training time: %i seconds' % (t1 - t0).total_seconds())

# this is a way of saving metadata like feature names to the model
if hasattr(model, 'feature_names'):
    model.set_attr(feature_names='|'.join(model.feature_names))

model.save_model("taxi_xgb_full_reduced_params")

[0]	train-rmse:13.3868	valid-rmse:13.3996
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[5]	train-rmse:3.09134	valid-rmse:3.11081
[10]	train-rmse:1.60107	valid-rmse:1.63803
[15]	train-rmse:1.44672	valid-rmse:1.50294
[20]	train-rmse:1.39969	valid-rmse:1.47021
[25]	train-rmse:1.38467	valid-rmse:1.46501
[30]	train-rmse:1.36936	valid-rmse:1.45584
[35]	train-rmse:1.35112	valid-rmse:1.44598
[40]	train-rmse:1.34401	valid-rmse:1.44189
[45]	train-rmse:1.33927	valid-rmse:1.44068
[50]	train-rmse:1.3352	valid-rmse:1.43856
[55]	train-rmse:1.33091	valid-rmse:1.43677
[60]	train-rmse:1.32727	valid-rmse:1.43504
[65]	train-rmse:1.32464	valid-rmse:1.43458
[70]	train-rmse:1.31881	valid-rmse:1.43324
[75]	train-rmse:1.31506	valid-rmse:1.43311
[80]	train-rmse:1.31189	valid-rmse:1.43127
[85]	train-rmse:1.30737	valid-rmse:1.42853
[90]	train-rmse:1.30093	valid-rmse:1.42611
[95]	train-rmse:1.29785	valid-rmse:1.4252

In [15]:
# Display the feature names stored on the most recently trained booster
# (whichever training cell last assigned `model`).
model.feature_names

['Trip_Seconds',
 'Trip_Miles',
 'Pickup_Centroid_Latitude',
 'Pickup_Centroid_Longitude',
 'Dropoff_Centroid_Latitude',
 'Dropoff_Centroid_Longitude',
 'vel_mph',
 'bAirport',
 'day_of_wk',
 'hour']