In [47]:
import pandas as pd
import numpy as np
import pickle
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score , mean_squared_error , median_absolute_error
from scipy.stats import randint

In [48]:
with open('Datasets/recs_processed_otliers.pickle','rb') as handle:
    recs_processed_otliers = pickle.load(handle)

In [49]:
X = recs_processed_otliers.iloc[:,:-2]
y = recs_processed_otliers['TOTALBTU']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 7)

## Gradient Boosting Model

In [16]:
gbr = GradientBoostingRegressor(random_state = 7)

In [18]:
 param_grid = {'n_estimators': [1500,2200],
               'max_features': [50,80],
               'max_depth': [6,8],
               'min_samples_split': [4],
               }
cv_gbr = GridSearchCV(estimator=gbr, param_grid=param_grid, n_jobs=2, cv=5, scoring = 'neg_mean_squared_error')
cv_gbr.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...ate=7, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'n_estimators': [1500, 2200], 'max_features': [50, 80], 'max_depth': [6, 8], 'min_samples_split': [4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [19]:
cv_gbr.best_params_

{'max_depth': 6,
 'max_features': 80,
 'min_samples_split': 4,
 'n_estimators': 2200}

In [20]:
gbr1 = GradientBoostingRegressor(max_depth = 6, max_features = 80, n_estimators = 2200, min_samples_split = 4,
                                 learning_rate = 0.1, random_state = 7)
gbr1.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=6, max_features=80,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=4, min_weight_fraction_leaf=0.0,
             n_estimators=2200, n_iter_no_change=None, presort='auto',
             random_state=7, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [21]:
print("Gradient Boosting Test Set R-Squared =", gbr1.score(X_test,y_test))

Gradient Boosting Test Set R-Squared = 0.7280281831863015


In [22]:
print("Gradient Boosting Test Set RMSE =", 
      np.sqrt(mean_squared_error(y_test, gbr1.predict(X_test))))

Gradient Boosting Test Set RMSE = 25124.231014946403


In [12]:
param_dist = {"max_depth": randint(6,16),
              "max_features": randint(50, 100),
              "min_samples_split": randint(2, 11),
              'n_estimators': randint(500,2500)
             }
rcv_gbr = RandomizedSearchCV(estimator=gbr, param_distributions=param_dist, cv = 5, random_state = 7, 
                             scoring = 'neg_mean_squared_error' )
rcv_gbr.fit(X_train,y_train)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...ate=7, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a13e79320>, 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a13e79278>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a13e79128>, 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a12097518>},
          pre_dispatch='2*n_jobs', random_state=7, refit=True,
          return_train_

In [16]:
rcv_gbr.best_params_

{'max_depth': 6,
 'max_features': 50,
 'min_samples_split': 4,
 'n_estimators': 812}

In [17]:
gbr2 = GradientBoostingRegressor(max_depth = 6, max_features = 50, n_estimators = 812, 
                                 min_samples_split = 4, random_state = 7)
gbr2.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=6, max_features=50,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=4, min_weight_fraction_leaf=0.0,
             n_estimators=812, n_iter_no_change=None, presort='auto',
             random_state=7, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [18]:
print("Gradient Boosting Regression Test Set R-Squared =", gbr2.score(X_test,y_test))

Gradient Boosting Regression Test Set R-Squared = 0.7203284235485394


In [19]:
print("Gradient Boosting Regression Test Set RMSE =", 
      np.sqrt(mean_squared_error(y_test, gbr2.predict(X_test))))

Gradient Boosting Regression Test Set RMSE = 25644.403705088545


## XgBoost Model

In [23]:
xgr = XGBRegressor(random_state = 7)

In [50]:
parameters = {
              'eval_metric':['rmse'],
              'eta': [.07],
              'max_depth': [4,6,8],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [2200]}

xgb_grid = GridSearchCV(xgr, parameters, cv = 5, n_jobs = 5)
xgb_grid.fit(X_train,y_train)

  if getattr(data, 'base', None) is not None and \


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=7, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1),
       fit_params=None, iid='warn', n_jobs=5,
       param_grid={'eval_metric': ['rmse'], 'eta': [0.07], 'max_depth': [4, 6, 8], 'subsample': [0.7], 'colsample_bytree': [0.7], 'n_estimators': [2200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [51]:
xgb_grid.best_params_

{'colsample_bytree': 0.7,
 'eta': 0.07,
 'eval_metric': 'rmse',
 'max_depth': 4,
 'n_estimators': 2200,
 'subsample': 0.7}

In [56]:
xgr1 = XGBRegressor(n_estimators = 2200, max_depth = 4 ,eval_metric = 'rmse',
                   learning_rate=  0.07, colsample_bytree = 0.7, random_state = 7)
xgr1.fit(X_train,y_train)

  if getattr(data, 'base', None) is not None and \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, importance_type='gain',
       learning_rate=0.07, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=2200, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=7, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [57]:
print("XGBoosting Regression Test Set R-Squared =", xgr1.score(X_test,y_test))

XGBoosting Regression Test Set R-Squared = 0.7487812230504696


In [58]:
print("XGBoosting Regression Test Set RMSE =", 
      np.sqrt(mean_squared_error(y_test, xgr1.predict(X_test))))

XGBoosting Regression Test Set RMSE = 21710.906516194387


In [59]:
print("XGBoosting Regression Test Set MAE  =", 
      median_absolute_error(y_test, xgr1.predict(X_test)))

XGBoosting Regression Test Set MAE  = 11325.434093749995
