In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

### Gradient Boosted Regression on Taxi data (features from Zhen)

In [18]:
# Read the partial training CSV into a DataFrame and list its columns so the
# feature/target layout is visible before any slicing is done.
csv_path = 'partial_train_10000.csv'
df = pd.read_csv(csv_path)
df.columns

Index(['Unnamed: 0', 'vendor_id', 'passenger_count', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'store_and_fwd_flag', 'Manhattan distance (miles)', 'pickup_hour_min',
       'pickup_hour', 'pickup_month', 'pickup_day', 'pickup_weekday',
       'pickup_area_0', 'dropoff_area_0', 'pickup_area_1', 'dropoff_area_1',
       'pickup_area_2', 'dropoff_area_2', 'pickup_area_3', 'dropoff_area_3',
       'pickup_area_4', 'dropoff_area_4', 'pickup_area_5', 'dropoff_area_5',
       'pickup_area_6', 'dropoff_area_6', 'pickup_area_7', 'dropoff_area_7',
       'pickup_area_8', 'dropoff_area_8', 'pickup_area_9', 'dropoff_area_9',
       'pickup_area_10', 'dropoff_area_10', 'pickup_area_11',
       'dropoff_area_11', 'pickup_area_12', 'dropoff_area_12',
       'pickup_area_13', 'dropoff_area_13', 'pickup_area_14',
       'dropoff_area_14', 'trip_duration'],
      dtype='object')

First, fit a gradient boosted regressor using its default parameters.

### Train test split

In [19]:
# Feature matrix: every column except the first ('Unnamed: 0', presumably the
# index written out with the CSV) and the last ('trip_duration', the target).
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
X = df.iloc[:, 1:-1].values
y = df['trip_duration']

# Hold out a test set (scikit-learn's default split sizes). The seed is fixed
# so the split, and therefore the reported errors, are reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [30]:
# Baseline: a gradient boosted regressor with default hyper-parameters,
# trained on the training portion of the split only.
GBR = GradientBoostingRegressor()
GBR_tts = GBR.fit(X=X_train, y=y_train)

In [31]:
# Report held-out and in-sample RMSE side by side; a large gap between the two
# would indicate over-fitting.
# mean_squared_error is documented as (y_true, y_pred), so the ground truth is
# passed first (the value is the same either way for MSE, but this matches the API).
rmse_test = np.sqrt(mean_squared_error(y_test, GBR.predict(X_test)))
rmse_train = np.sqrt(mean_squared_error(y_train, GBR.predict(X_train)))

print('Root mean squared error on test set: {}'.format(rmse_test))
# This line was previously mislabelled as "test set" although it reports the
# error on the training data.
print('Root mean squared error on training set: {}'.format(rmse_train))

Root mean squared error on test set: 348.3322480387723
Root mean squared error on test set: 324.45099574257125


### 5-fold CV

In [36]:
# 5-fold cross-validation of a default-parameter model on the full data set.
# The scorer returns *negated* MSE (higher is better), so values are <= 0.
cv_scores = cross_val_score(
    estimator=GradientBoostingRegressor(),
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [43]:
# cv_scores holds negated MSE values; flip the sign and take the square root
# to obtain one RMSE per fold, then summarise with the mean across folds.
scores = np.sqrt(np.negative(cv_scores))
mean_score = scores.mean()
print('Cross_val_scores: {}'.format(scores))
print('Mean score: {}'.format(mean_score))

Cross_val_scores: [ 343.92343011  375.77368376  367.74449734  380.20638674  333.98504752]
Mean score: 360.32660909466824


The RMSE is about 6 minutes (≈360 seconds) using default parameters and 5-fold CV.

Next use grid search to optimize.

### Grid search

In [56]:
# Exhaustive grid: 5 * 4 * 8 * 4 = 640 hyper-parameter combinations, each
# evaluated with 5-fold CV (3200 model fits in total).
# `params` is bound to the same dict as `param_list`; the alias is kept in
# case other cells reference it.
param_list = params = {
    'n_estimators': np.arange(100, 200, 20),
    'max_depth': np.arange(4, 8, 1),
    'max_features': np.arange(6, 14, 1),
    'learning_rate': [0.10, 0.20, 0.30, 0.4],
}

# random_state pins the feature sub-sampling triggered by `max_features`, so
# the selected parameters are reproducible across runs.
# n_jobs=-1 spreads the independent fits over all CPU cores; it does not
# change the results, only the wall-clock time of the search.
gs_cv = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_list,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
search_results = gs_cv.fit(X, y)

In [57]:
# The search was scored with 'neg_mean_squared_error', so best_score_ is a
# negated MSE; undo the sign and take the root to express it as an RMSE.
best_mse = -search_results.best_score_
np.sqrt(best_mse)

337.72863317094027

In [58]:
# Show the hyper-parameter combination selected by the grid search.
search_results.best_params_

{'learning_rate': 0.1, 'max_depth': 7, 'max_features': 10, 'n_estimators': 180}

The error is still fairly large. To improve it, we probably need a larger training set. This grid search took about 30 minutes on this sample, so we will need a faster implementation such as XGBoost in order to train on more data.