### Module 6: Random Forest #2
Tune Two Random Forest Hyperparameters

In [2]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor # Here is the RF regressor
from sklearn.metrics import r2_score,mean_squared_error  # Use our past metrics
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [3]:
# Load the Boston housing dataset.
#     https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html
data = datasets.load_boston() # loads Boston dataset from datasets library 
# Load all the possible features
df = pd.DataFrame(data.data, columns=data.feature_names)
df.head(2)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14


### Isolate the target variable

In [4]:
# The Target will be the median value "MEDV" in thousands of dollars
y = pd.DataFrame(data.target, columns=["MEDV"])
y.head(3)

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7


### Isolate the features matrix

In [5]:
# Decide which features to include
print('Possible Features:\n',list(df.columns))
# Lets drop 'B'
X = df.drop(['B'], axis = 1)
print('\nSelected Features:\n',list(X.columns))
X.head(2)

Possible Features:
 ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

Selected Features:
 ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,9.14


### Split the data into training and test sets

In [6]:
# Split into train/test
# Reserve 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
# Verify the sizes of the split datasets
print('X_train:', X_train.shape)
print('y_train:', y_train.shape)
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

X_train: (404, 12)
y_train: (404, 1)
X_test: (102, 12)
y_test: (102, 1)


### Create a baseline model

In [14]:
# Create the RF regressor object
# This algorithm is from the sklearn.ensemble module
rfr = RandomForestRegressor(max_depth = None, n_estimators = 100) # default parameters
#
# Train the model using the training data
fit_rfr = rfr.fit(X_train, y_train.values.ravel())
# Show hyperparameters
fit_rfr 

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

### Use the random forest regressor to predict values for the test set

In [15]:
# Predict new MEDV values using the X_test data
y_pred = rfr.predict(X_test)

### Evaluate the performance of RF regressor


In [16]:
# Calculate the metrics
#
r2 = round(r2_score(y_test, y_pred),2)
mse = round(mean_squared_error(y_test, y_pred),2)
#
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))
print("MSE: ", mse)
print("RMSE: ", round(mse**(1/2.0),3)) # Root Mean Squared Error

Coefficient of determination: 0.90
MSE:  10.74
RMSE:  3.277


### Tune max_depth and n_estimators by "brute force"

In [10]:
#  Setup the search space
max_depths = np.linspace(1, 12, 12, endpoint=True)
print(max_depths)
n_estimators = np.array([10,50,100,150,200])
print(n_estimators)

[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12.]
[ 10  50 100 150 200]


In [18]:
# The goal is to minimize mse
best_mse = float('inf') # set this value very high. We will try to minimize it
best_n = 0  # Will keep track of n_estimators
best_md = 0 # Will keep track of max_depth

# Let's time how long this takes
import time
st = time.time() # time right now
# 
# Do an embedded for loop and search though each combination of n_estimator and max_depth
for md in max_depths:
    for n in n_estimators:
        # Do the workflow
        rfr = RandomForestRegressor(max_depth = md, n_estimators = n)
        fit_rfr = rfr.fit(X_train, y_train.values.ravel() )
        y_pred = rfr.predict(X_test)
        mse = round(mean_squared_error(y_test, y_pred),2)
        if mse < best_mse: # If the mse is lower, then update current variable values
            best_mse = mse 
            best_n = n
            best_md = md
#
# Get the end time
et = time.time()
#
# get the elapsed time
elapsed_time = et - st
# Report results of search
print('Execution time:', elapsed_time, 'seconds')
print('Best MSE:', best_mse)
print('Best Max Depth:', best_md)
print('Best n_estimators:', best_n)

Execution time: 12.811979293823242 seconds
Best MSE: 9.95
Best Max Depth: 12.0
Best n_estimators: 10


### Use the best hyperparameters and create a new model

In [19]:
# Run this cell several times and discuss output.
rfr = RandomForestRegressor(max_depth = best_md, n_estimators = best_n)
fit_rfr = rfr.fit(X_train, y_train.values.ravel() )
y_pred = rfr.predict(X_test)
#
# Calculate the metrics
r2 = round(r2_score(y_test, y_pred),2)
mse = round(mean_squared_error(y_test, y_pred),2)
#
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))
print("MSE: ", mse)
print("RMSE: ", round(mse**(1/2.0),3)) # Root Mean Squared Error

Coefficient of determination: 0.88
MSE:  13.2
RMSE:  3.633


### Predict new values

In [20]:
print(list(X.columns))
# Define house features
h1 = [18.811, 0.0, 18.1, 0.0, 0.597, 4.628, 100.0, 1.5539, 24.0, 666.0, 20.2, 34.37]
h2 = [5.66998, 0.0, 18.1, 1.0, 0.631, 6.683, 96.8, 1.3567, 24.0, 666.0, 20.2, 3.73]
h3 = [7.99248, 0.0, 18.1, 0.0, 0.7, 5.52, 100.0, 1.5331, 24.0, 666.0, 20.2, 24.56]
h4 = [0.30347, 0.0, 7.38, 0.0, 0.493, 6.312, 28.9, 5.4159, 5.0, 287.0, 19.6, 6.15]
h5 = [0.15086, 0.0, 27.74, 0.0, 0.609, 5.454, 92.7, 1.8209, 4.0, 711.0, 20.1, 18.06]
#
h_lst = [h1, h2, h3, h4, h5]
#
for house in h_lst:
    print('House features:', house)
    df = pd.DataFrame(data=house)
    pv = round(rfr.predict(df.T).item(),3) # Watch out for the form of input to the predictor
    print('Predicted Value in thousands of dollars:', pv , '\n')

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']
House features: [18.811, 0.0, 18.1, 0.0, 0.597, 4.628, 100.0, 1.5539, 24.0, 666.0, 20.2, 34.37]
Predicted Value in thousands of dollars: 16.65 

House features: [5.66998, 0.0, 18.1, 1.0, 0.631, 6.683, 96.8, 1.3567, 24.0, 666.0, 20.2, 3.73]
Predicted Value in thousands of dollars: 37.97 

House features: [7.99248, 0.0, 18.1, 0.0, 0.7, 5.52, 100.0, 1.5331, 24.0, 666.0, 20.2, 24.56]
Predicted Value in thousands of dollars: 11.58 

House features: [0.30347, 0.0, 7.38, 0.0, 0.493, 6.312, 28.9, 5.4159, 5.0, 287.0, 19.6, 6.15]
Predicted Value in thousands of dollars: 23.846 

House features: [0.15086, 0.0, 27.74, 0.0, 0.609, 5.454, 92.7, 1.8209, 4.0, 711.0, 20.1, 18.06]
Predicted Value in thousands of dollars: 15.891 

