In [2]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split ,GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score , mean_squared_error, make_scorer
from scipy.stats import randint
%load_ext watermark

In [3]:
%watermark -a "Random Forest Models" -d -t -v -p numpy,pandas,sklearn,scipy -g

Random Forest Models 2019-07-06 11:06:47 

CPython 3.7.1
IPython 7.2.0

numpy 1.16.4
pandas 0.23.4
sklearn 0.20.1
scipy 1.1.0
Git hash: 948b5557e1f3c2ee8a78c09e952e7215486d8527


In [2]:
with open('Preprocessing/Datasets/recs_processed_otliers.pickle','rb') as handle:
    recs_processed_otliers = pickle.load(handle)

In [3]:
X = recs_processed_otliers.iloc[:,:-2]
y = recs_processed_otliers['TOTALBTU']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 7)

## Random Forest Model

In [4]:
rfr  = RandomForestRegressor(random_state = 7)

In [5]:
param_grid = {'n_estimators': [1900,2100,2300],
              'max_features': ['sqrt'],
              'max_depth' : [18,20,22]
             }

In [6]:
cv_rfr1 = GridSearchCV(estimator = rfr, param_grid = param_grid, cv = 5, n_jobs = 3, 
                       scoring = 'neg_mean_squared_error')
_ = cv_rfr1.fit(X_train,y_train)

In [7]:
cv_rfr1.best_params_

{'max_depth': 22, 'max_features': 'sqrt', 'n_estimators': 2300}

In [8]:
rfr1 = RandomForestRegressor(max_depth = 20, max_features = 'sqrt', n_estimators = 2100, random_state = 7)
_ = rfr1.fit(X_train,y_train)

In [9]:
print("Random Forest Regression Test Set R-Squared =", rfr1.score(X_test,y_test))

Random Forest Regression Test Set R-Squared = 0.6791019179869215


In [10]:
print("Random Forest Regression Test Set RMSE =", 
      np.sqrt(mean_squared_error(y_test, rfr1.predict(X_test))))

Elastic Regression Test Set RMSE = 24537.79136764059


In [11]:
param_dist = {'n_estimators': randint(500,2500),
              'max_features': ['sqrt'],
              'max_depth' : randint(10,20)
             }
             
rcv_rfr = RandomizedSearchCV(estimator=rfr, param_distributions=param_dist, cv = 5, random_state = 7, 
                             scoring = 'neg_mean_squared_error' )
_ = rcv_rfr.fit(X_train,y_train)

In [12]:
rcv_rfr.best_params_

{'max_depth': 19, 'max_features': 'sqrt', 'n_estimators': 1922}

In [13]:
rfr2 = RandomForestRegressor(max_depth = 18, max_features = 'sqrt', n_estimators = 1691, random_state = 7)
_ = rfr2.fit(X_train,y_train)

In [14]:
print("Random Forest Regression Test Set R-Squared =", rfr2.score(X_test,y_test))

Random Forest Regression Test Set R-Squared = 0.678716146384943


In [15]:
print("Random Forest Regression Test Set RMSE =", 
      np.sqrt(mean_squared_error(y_test, rfr2.predict(X_test))))

Elastic Regression Test Set RMSE = 24552.53614240225
