# Random Forest Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
dataset = pd.read_csv('Housing.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training the Random Forest Regression model on the whole dataset

In [4]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

## Predicting the Test set results

In [5]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[425040. 417900.]
 [642600. 632100.]
 [328860. 281400.]
 [279930. 577500.]
 [507990. 474600.]
 [263550. 359100.]
 [322140. 247800.]
 [439320. 474600.]
 [450030. 449400.]
 [344400. 411600.]
 [223650. 214200.]
 [292740. 214200.]
 [454440. 428400.]
 [237510. 375900.]
 [531300. 577500.]
 [334950. 327600.]
 [426930. 420000.]
 [529200. 489300.]
 [431130. 470400.]
 [616560. 640500.]
 [535920. 651000.]
 [753060. 690900.]
 [244440. 147000.]
 [423360. 441000.]
 [436170. 367500.]
 [896490. 966000.]
 [732060. 756000.]
 [905730. 888300.]
 [433860. 455700.]
 [452550. 445200.]
 [432810. 323400.]
 [395640. 455700.]
 [485520. 430500.]
 [687960. 867300.]
 [430290. 445200.]
 [894180. 875700.]
 [577710. 415800.]
 [222390. 268800.]
 [528360. 590100.]
 [556080. 497700.]
 [234570. 231000.]
 [360150. 315000.]
 [439950. 388500.]
 [497700. 449400.]
 [402780. 413700.]
 [339780. 352800.]
 [507990. 453600.]
 [300300. 306600.]
 [835380. 898800.]
 [425670. 514500.]
 [706440. 743400.]
 [395850. 474600.]
 [569100. 60

## Evaluating the Model Performance

In [6]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.7958582987098185

## Analyzing Model’s Performance

Graphing the models performance with varying criteria can help in the analysis process. We can identity valuable insights such as visualizing behavior that would not be apparent with the R2 scores alone.

In [7]:
# Here we are able to see all of our paramater for our randomforest regressor and the values they are set to

from pprint import pprint #pprint allow the output to be printed in a structured format 
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(regressor.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}


In order to improve our model we will tune the folowing parameters:
- n_estimators = number of trees in the foreset
- max_features = max number of features considered for splitting a node
- max_depth = max number of levels in each decision tree
- min_samples_split = min number of data points placed in a node before the node is split
- min_samples_leaf = min number of data points allowed in a leaf node
- bootstrap = method for sampling data points (with or without replacement)


## Random Hyperparameter Grid

On each iteration, the algorithm will choose a difference combination of the features.The benefit of random search is that we are not using every combination, but selecting at random to sample a wide range of values

To use RandomizedSearchCV, it is important to create a parameter grid to sample from during fitting:

In [8]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


## Random Search Training 

Now we are going to fit the random search module 

In [9]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune 
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=0, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   36.8s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  8.1min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

The 2 most important arguments in RandomizedSearchCV are:
- n_iter, which tells us the number of different combinations to try
- cv which is the number of folds to use for cross validation (we use 100 and 3 respectively)

More iterations will cover a wider space for our search and the more cv folds will prevent  the chances of the our model overfitting. The downside to increasing cv is that it increases the run time.

In [10]:
# This will allow us to view the best parameters from fitting the random search. This will allow us to narrow the range for
# each hyperparameter

rf_random.best_params_

{'n_estimators': 1800,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': True}

## Evaluate Random Search

It is now important to determine if the random search yielded a better model. we compare the original regression model, 
with the best random search model

In [11]:
def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    errors = abs(predictions - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

# notice this exactly the same as our regressor model from earlier on
base_model = RandomForestRegressor(n_estimators = 10, random_state = 0)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

Model Performance
Average Error: 59271.4286 degrees.
Accuracy = 85.21%.


In [12]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Model Performance
Average Error: 55649.1429 degrees.
Accuracy = 85.58%.


In [13]:
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Improvement of 0.44%.


As you can see the improvement in accuracy improved by 0.44%. We can improve this result even further using grid search

## Grid Search with Cross Validation

Random search gives us the opportunity to narrow the range of each hyperparameter. We can now concentrate our search, using the GridSearchCV. This method is much better than sampling randomly from a distribution.

We will make another grid based on the best values provided by random search:

In [21]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}


# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [23]:
# Fit the grid search to the dataset
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  8.6min finished


{'bootstrap': True,
 'max_depth': 90,
 'max_features': 2,
 'min_samples_leaf': 4,
 'min_samples_split': 8,
 'n_estimators': 100}

In [26]:
# This will give us Model performace of the GridSearchCV
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)

Model Performance
Average Error: 53389.2454 degrees.
Accuracy = 86.32%.


In [27]:
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

Improvement of 1.30%.


## Making Predictions

In [30]:
# Produce a matrix for client data
client_data = [[5, 17, 15], # Client 1
               [4, 32, 22], # Client 2
               [8, 3, 12]]  # Client 3

In [28]:
# Show predicitons on the original best_grid model
for i, price in enumerate(best_grid.predict(client_data)):
    print("Predicted selling price for Client {}'s home: ${:,.2f}".format(i+1, price))

Predicted selling price for Client 1's home: $365,135.80
Predicted selling price for Client 2's home: $228,913.45
Predicted selling price for Client 3's home: $914,684.11


Given these values we can conclude:
    
- Client 3 selling price is the near million dollars. This price seems very correct because of its features (8 rooms, very low poverty level and low student-teacher ratio). We can assume it is a wealthy neighborhood 

- Client 2 has the lowest price and this is because it has the least amount of rooms(RM), but the highest LSTAT, PTRATIO  

- For client 1, we can see that its features are intermediate between the latter 2, and therefore, its price is quite near the mean and median.


We can conclude that our hypothesis in the explanatory data analysis was correct