# Peforming model training using sklearn's RandomForestRegressor model

In this notebook, the synthetic data generated in the synthetic_data_ntbk was ocne again imported and used to train sklearn's RandomForestRegressor model. the model is a meta estimator that fits a number of classifying decision trees on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.

Reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

---

In [None]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Import synthetic training and validation data from corresponding directory and verify data integrity
---

In [2]:
training_df = pd.read_csv('data/training_data.csv')
validation_df = pd.read_csv('data/validation_data.csv')

In [3]:
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       640 non-null    float64
 1   1       640 non-null    float64
 2   2       640 non-null    float64
 3   3       640 non-null    float64
 4   4       640 non-null    float64
 5   5       640 non-null    float64
 6   6       640 non-null    float64
 7   target  640 non-null    float64
dtypes: float64(8)
memory usage: 40.1 KB


In [4]:
validation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       200 non-null    float64
 1   1       200 non-null    float64
 2   2       200 non-null    float64
 3   3       200 non-null    float64
 4   4       200 non-null    float64
 5   5       200 non-null    float64
 6   6       200 non-null    float64
 7   target  200 non-null    float64
dtypes: float64(8)
memory usage: 12.6 KB


## Model training and Hyperparameter tuning

In an attempt to increase the accuracy of our prediction results, hyperparameter tuning was performed in this notebook using sklearn's RandomizedSearchCV.

---

In [10]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

In [14]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=random_grid, 
    n_iter=100, 
    cv=3, 
    verbose=2, 
    random_state=42, 
    n_jobs=-1
)

# train RandomForestRegressor using RandomizedSearchCV
rf_random.fit(
    training_df[['0', '1', '2', '3', '4', '5', '6']], 
    training_df['target']
)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [15]:
rf_random.best_params_

{'n_estimators': 1600,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 90,
 'bootstrap': True}

In [16]:
# helper function for evaluating a model
def evaluate(model, features, labels):
    predictions = model.predict(features)
    mean_squared_error = np.mean((labels - predictions)**2)
    score = model.score(features, labels)

    print(f'\nMSE = {mean_squared_error}')
    print(f'Model accuracy (R^2 score) = {score * 100}\n')
    
    return mean_squared_error, score

## Train and evaluate both base and tuned RandomForestRegressor models

---

In [17]:
# train base RandomForestRegressor
base_model = RandomForestRegressor(random_state=0)
base_model.fit(
    training_df[['0', '1', '2', '3', '4', '5', '6']], 
    training_df['target']
)

# evaluate using validation data
base_mse, base_accuracy = evaluate(
    base_model, 
    validation_df[['0', '1', '2', '3', '4', '5', '6']], 
    validation_df['target']
)


MSE = 3943.902885011237
Model accuracy (R^2 score) = 75.40195760042964



In [18]:
best_random = rf_random.best_estimator_
random_mse, random_accuracy = evaluate(
    best_random, 
    validation_df[['0', '1', '2', '3', '4', '5', '6']], 
    validation_df['target']
)

print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))


MSE = 3877.7746257594035
Model accuracy (R^2 score) = 75.81439821377951

Improvement of 0.55%.


From the generated results, the tuned model saw an improvement of 0.55%. As such, we will be pickling this model 

---

## Pickle tuned RandomForestRegressor model
---

In [19]:
import joblib

# Save to file in the current working directory
joblib_file = 'models/capstone_rf_model.pkl'
joblib.dump(best_random, joblib_file)

['models/capstone_rf_model.pkl']

## Import the validation and evaluation csv files
---

In [20]:
# import validation and evaluation csv files  
valid_df = pd.read_csv('data/valid_data.csv')
eval_df = pd.read_csv('data/evaluation_data.csv')

## Store all of the model performance results in the imported dataframes
---

In [21]:
rf_data = {
    'algorithm': 'RandomForestRegressor',
    'MSE': random_mse,
    'r2_score': random_accuracy*100
}

#append row to the dataframe
eval_df = eval_df.append(rf_data, ignore_index=True)

eval_df.head()

Unnamed: 0,algorithm,MSE,r2_score
0,LinearRegression,2568.494309,83.980353
1,RandomForestRegressor,3877.774626,75.814398


In [22]:
rf_predictions = best_random.predict(validation_df[['0', '1', '2', '3', '4', '5', '6']])

valid_df['rf_predictions'] = rf_predictions
valid_df.head()

Unnamed: 0,actual,lr_predictions,rf_predictions
0,223.013403,249.381526,184.263197
1,103.557634,50.058942,26.280123
2,208.844585,189.456792,168.070799
3,-65.937306,-36.324588,40.217284
4,-37.186189,-57.440259,-37.501712


## Export model evaluation dataframes into csv files
---

In [23]:
# export data 
valid_df.to_csv('data/valid_data.csv', index=False)
eval_df.to_csv('data/evaluation_data.csv', index=False)