### Reduced-feature models.

We create another price prediction model that doesn't use as many of the dataset features (eg.. community area) since the information is either redundant or difficult to compute in real time for the average user.

In [None]:
import pandas as pd
import numpy as np
import pickle
from Fair_Fare.feature_utils import load_hdf

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from joblib import dump, load

import xgboost as xgb
import datetime as dt

from scipy.stats import randint, uniform

# Plotting
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
mpl.rc('axes', labelsize=18)
mpl.rc('xtick', labelsize=16)
mpl.rc('ytick', labelsize=16)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

### TNP Dataset:

Parameter tuning

In [2]:
tnp = load_hdf('data/tnp_train.h5',.02)
tnp.drop(['Pickup_Community_Area', 'Dropoff_Community_Area','l2_dist_km','x_dist_km','y_dist_km','bearing'], axis = 1, inplace = True)
tnp.head()
y = tnp["Final_Fare"].copy()
X = tnp.drop(["Final_Fare"], axis = 1)

In [None]:
tnp = load_hdf('data/tnp_train.h5',15)
# drop extra features
tnp.drop(['Pickup_Community_Area', 'Dropoff_Community_Area','l2_dist_km','x_dist_km','y_dist_km','bearing'], axis = 1, inplace = True)

y = tnp["Final_Fare"].copy()
X = tnp.drop(["Final_Fare"], axis = 1)


X_train, Xv, y_train, yv = train_test_split(X, y, test_size=0.2, random_state=42)

dvalid = xgb.DMatrix(Xv.values, label=yv.values)
dtrain = xgb.DMatrix(X_train.values, label=y_train.values)

param_dist = {'max_depth': randint(low=4, high=20),
              'n_estimators': randint(low=20, high=400),
              'min_child_weight':randint(low=5, high=200),
              'eta':uniform(0.15,.2),
              'colsample_bytree':uniform(0.2,.4),
              'subsample': uniform(0.4,.5),
              'lambda': uniform(0.5,3.)
             }

xgb_model = xgb.XGBRegressor({
              'booster' : 'gbtree', 'eval_metric': 'rmse','silent': 1, 'objective': 'reg:squarederror'})
clf =  RandomizedSearchCV(xgb_model,
                   param_dist, verbose=1,n_jobs = 6,n_iter=1000, cv=5, scoring='neg_mean_squared_error')
clf.fit(X.values,y.values)
print(clf.best_score_)
print(clf.best_params_)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


##### Full model:

In [2]:
X = load_hdf('data/tnp_train.h5',20)
X.drop(['Pickup_Community_Area', 'Dropoff_Community_Area','l2_dist_km','x_dist_km','y_dist_km','bearing'], axis = 1, inplace = True)

y = X["Final_Fare"].copy()
X.drop(["Final_Fare"], axis = 1, inplace = True)


X_train, Xv, y_train, yv = train_test_split(X, y, test_size=0.2, random_state=42)

dvalid = xgb.DMatrix(Xv.values, label=yv.values,feature_names = Xv.columns)
dtrain = xgb.DMatrix(X_train.values, label=y_train.values, feature_names = X_train.columns)

watchlist = [(dtrain, 'train'), (dvalid, 'valid')]


xgb_pars = {'colsample_bytree': 0.439881349485437, 'eta': 0.2913248799253698, 'lambda': 1.0784417148624983, 'max_depth': 5, 
            'min_child_weight': 61, 'n_estimators': 131, 'subsample': 0.8144003761476369, 'nthread': 7, 
            'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:squarederror'}


t0 = dt.datetime.now()
model = xgb.train(xgb_pars, dtrain, 1000, watchlist, early_stopping_rounds=50,
                  maximize=False, verbose_eval=2)

t1 = dt.datetime.now()
print('Modeling RMSLE %.5f' % model.best_score)
print('Training time: %i seconds' % (t1 - t0).seconds)

#model.save_model("tnp_xgb_full")

# this is a way of saving metadata like feature names to the model
if hasattr(model, 'feature_names'): model.set_attr(feature_names = '|'.join(model.feature_names))

model.save_model("tnp_xgb_full_reduced_params")

[0]	train-rmse:10.3373	valid-rmse:9.59544
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[2]	train-rmse:6.70169	valid-rmse:5.9826
[4]	train-rmse:4.78332	valid-rmse:4.08334
[6]	train-rmse:4.12017	valid-rmse:3.48801
[8]	train-rmse:3.57784	valid-rmse:3.03341
[10]	train-rmse:3.35888	valid-rmse:2.892
[12]	train-rmse:3.25541	valid-rmse:2.84395
[14]	train-rmse:3.20795	valid-rmse:2.83448
[16]	train-rmse:3.16551	valid-rmse:2.82939
[18]	train-rmse:3.11897	valid-rmse:2.83603
[20]	train-rmse:3.06205	valid-rmse:2.84422
[22]	train-rmse:3.03675	valid-rmse:2.84141
[24]	train-rmse:2.99979	valid-rmse:2.81795
[26]	train-rmse:2.96648	valid-rmse:2.80324
[28]	train-rmse:2.93414	valid-rmse:2.81304
[30]	train-rmse:2.9137	valid-rmse:2.80004
[32]	train-rmse:2.89662	valid-rmse:2.81526
[34]	train-rmse:2.86698	valid-rmse:2.80902
[36]	train-rmse:2.84091	valid-rmse:2.81207
[38]	train-rmse:2.81631	valid-rmse:2.8131
[40]	

### Taxi Dataset:

In [16]:
taxi = load_hdf('data/taxi_train.h5',1)
# drop extra features
taxi.drop(['Pickup_Community_Area', 'Dropoff_Community_Area','l2_dist_km','x_dist_km','y_dist_km','bearing'], axis = 1, inplace = True)

y = taxi["Final_Fare"].copy()
X = taxi.drop(["Final_Fare"], axis = 1)


X_train, Xv, y_train, yv = train_test_split(X, y, test_size=0.2, random_state=42)

dvalid = xgb.DMatrix(Xv.values, label=yv.values)
dtrain = xgb.DMatrix(X_train.values, label=y_train.values)


# parameter search
param_dist = {'max_depth': randint(low=4, high=20),
              'n_estimators': randint(low=20, high=400),
              'min_child_weight':randint(low=5, high=200),
              'eta':uniform(0.15,.2),
              'colsample_bytree':uniform(0.2,.4),
              'subsample': uniform(0.4,.5),
              'lambda': uniform(0.5,3.)
             }
xgb_model = xgb.XGBRegressor({
              'booster' : 'gbtree', 'eval_metric': 'rmse','silent': 1, 'objective': 'reg:squarederror'})
clf =  RandomizedSearchCV(xgb_model,
                   param_dist, verbose=1,n_jobs = 7,n_iter=200, cv=4, scoring='neg_mean_squared_error')
clf.fit(X.values,y.values)
print(clf.best_score_)
print(clf.best_params_)

Fitting 4 folds for each of 200 candidates, totalling 800 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:  5.7min
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed: 26.6min
[Parallel(n_jobs=7)]: Done 436 tasks      | elapsed: 65.1min
[Parallel(n_jobs=7)]: Done 786 tasks      | elapsed: 120.4min
[Parallel(n_jobs=7)]: Done 800 out of 800 | elapsed: 121.2min finished


-2.3565539554551655
{'colsample_bytree': 0.518631511987256, 'eta': 0.2964347911729374, 'lambda': 0.6257690011998417, 'max_depth': 19, 'min_child_weight': 111, 'n_estimators': 392, 'subsample': 0.8593893731535742}


In [None]:
X = load_hdf('data/taxi_train.h5',100)
print('data loaded')

y = X["Final_Fare"].copy()
X.drop(['Pickup_Community_Area', 'Dropoff_Community_Area','l2_dist_km','x_dist_km','y_dist_km','bearing'], axis = 1, inplace = True)
X.drop(["Final_Fare"], axis = 1, inplace = True)


X_train, Xv, y_train, yv = train_test_split(X, y, test_size=0.2, random_state=42)

dvalid = xgb.DMatrix(Xv.values, label=yv.values,feature_names = Xv.columns)
dtrain = xgb.DMatrix(X_train.values, label=y_train.values, feature_names = X_train.columns)

watchlist = [(dtrain, 'train'), (dvalid, 'valid')]



xgb_pars = {'colsample_bytree': 0.518631511987256, 'eta': 0.2964347911729374, 'lambda': 0.6257690011998417, 'max_depth': 19, 
            'min_child_weight': 111, 'n_estimators': 392, 'subsample': 0.8593893731535742, 'nthread': 6, 
            'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:squarederror'}


t0 = dt.datetime.now()
model = xgb.train(xgb_pars, dtrain, 1000, watchlist, early_stopping_rounds=50,
                  maximize=False, verbose_eval=5)

t1 = dt.datetime.now()
print('Modeling RMSLE %.5f' % model.best_score)
print('Training time: %i seconds' % (t1 - t0).seconds)

# this is a way of saving metadata like feature names to the model
if hasattr(model, 'feature_names'): model.set_attr(feature_names = '|'.join(model.feature_names))

model.save_model("taxi_xgb_full_reduced_params")

data loaded


In [15]:
model.feature_names

['Trip_Seconds',
 'Trip_Miles',
 'Pickup_Centroid_Latitude',
 'Pickup_Centroid_Longitude',
 'Dropoff_Centroid_Latitude',
 'Dropoff_Centroid_Longitude',
 'vel_mph',
 'bAirport',
 'day_of_wk',
 'hour']