# Random Forests
* **Multiple Decision Trees** making up a forest
* Versatile, able to handle large datasets with higher dimensionality, able to handle missing values
* **Feature Importance**
    * Can rank importance of input values    
* **Avoid overfitting**
* **Doesn't require Feature Scaling** like SVM or KNN    
* Useful for:
    * Classification
    * Regression (predicting continous values)
    * **Non-linear Relationships**
    * Bioinformatics
    * Image and Voice recognition
* May run slower than other models
* Many of its uses can be better modeled by Deep Learning Models (neural networks)

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
#plt.rcParams["figure.figsize"] = (15,11)

# This lets us see all of the columns, preventing Jupyter from redacting them
pd.set_option('display.max_columns', None)

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

# This module lets us save our models once we fit them.
import pickle

# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [2]:
file_location = "Churn_Modelling.csv"
df_original = pd.read_csv(file_location)
df_original.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Drop useless and sensitive (Gender) cols
churn_df = df_original.drop(['RowNumber', 'CustomerId', 'Surname', 'Gender'], axis=1)
churn_df.head()

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,41,1,83807.86,1,0,1,112542.58,0
2,502,France,42,8,159660.8,3,1,0,113931.57,1
3,699,France,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Dummy encode categoricals
churn_df2 = pd.get_dummies(churn_df, drop_first=True)
churn_df2.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain
0,619,42,2,0.0,1,1,1,101348.88,1,False,False
1,608,41,1,83807.86,1,0,1,112542.58,0,False,True
2,502,42,8,159660.8,3,1,0,113931.57,1,False,False
3,699,39,1,0.0,2,0,0,93826.63,0,False,False
4,850,43,2,125510.82,1,1,1,79084.1,0,False,True


In [5]:
# Split data
y = churn_df2["Exited"]

X = churn_df2.copy()
X = X.drop("Exited", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

In [6]:
%%time

# Train model
max_depth = [2,3,4,5, None]
min_samlpes_leaf = [1,2,3]
min_samples_split = [2,3,4]
max_features = [75, 100, 125, 150]

cv_params = {'max_depth': max_depth,
             'min_samples_leaf': min_samlpes_leaf,
             'min_samples_split': min_samples_split,
             'max_features':max_features
            }

rf = RandomForestClassifier(random_state=0)

scoring = ['accuracy', 'precision', 'recall', 'f1']

rf_cv = GridSearchCV(rf, cv_params, scoring=scoring, cv=5, refit='f1')

rf_cv.fit(X_train, y_train)

CPU times: total: 13min 39s
Wall time: 13min 39s


In [7]:
path = ''

# Pickle
* Saves a lot of time because you wouldn't have to run (train) the model from the beginning every time
* Pickles are compressed to improve memory storage and speed

In [8]:
# Pickle the model
with open(path+'rf_cv_model_p.pickle', 'wb') as to_write:
    pickle.dump(rf_cv, to_write)

In [16]:
# Open pickled model
with open(path+'rf_cv_model_p.pickle', 'rb') as to_read:
    rf_cv = pickle.load(to_read)

In [17]:
rf_cv.best_params_

{'max_depth': None,
 'max_features': 75,
 'min_samples_leaf': 3,
 'min_samples_split': 2}

In [18]:
rf_cv.best_score_

0.5799582189787593

# Results Table
<div style="text-align: left;">

**F1:**  
$$ F1 = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}} $$


<br>

**Recall (aka Sensitivity || True Positive Rate):**  
$$ \text{Recall} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}} $$


<br>

**Precision:**  
$$ \text{Precision} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Positives}} $$


<br>

**Accuracy:**  
$$ \text{Accuracy} = \frac{\text{True Positives} + \text{True Negatives}}{\text{Total Predictions}} $$

</div>

In [19]:
def make_results(model_name, model_object):
    '''
    Accepts as arguments a model name (your choice - string) and
    a fit GridSearchCV model object.
  
    Returns a pandas df with the F1, recall, precision, and accuracy scores
    for the model with the best mean F1 score across all validation folds.  
    '''

    # Get all the results from the CV and put them in a df
    cv_results = pd.DataFrame(model_object.cv_results_)

    # Isolate the row of the df with the max(mean f1 score)
    best_estimator_results = cv_results.iloc[cv_results['mean_test_f1'].idxmax(), :]

    # Extract accuracy, precision, recall, and f1 score from that row
    f1 = best_estimator_results.mean_test_f1
    recall = best_estimator_results.mean_test_recall
    precision = best_estimator_results.mean_test_precision
    accuracy = best_estimator_results.mean_test_accuracy
  
    # Create table of results
    table = pd.DataFrame(
         {'model': [model_name],
          'precision': [precision],
          'recall': [recall],
          'F1': [f1],
          'accuracy': [accuracy],
         },
     )
  
    return table

In [20]:
rf_cv_results = make_results('Random Forest CV', rf_cv)
rf_cv_results

Unnamed: 0,model,precision,recall,F1,accuracy
0,Random Forest CV,0.730583,0.482338,0.579958,0.858


In [21]:
# Read in master results table
results = pd.read_csv('results1.csv', index_col=0)
results

Unnamed: 0,Model,F1,Recall,Precision,Accuracy
0,Tuned Decision Tree,0.560655,0.469255,0.701608,0.8504


In [22]:
# Concatenate the random forest results to the master table
results = pd.concat([rf_cv_results, results])
results

Unnamed: 0,model,precision,recall,F1,accuracy,Model,Recall,Precision,Accuracy
0,Random Forest CV,0.730583,0.482338,0.579958,0.858,,,,
0,,,,0.560655,,Tuned Decision Tree,0.469255,0.701608,0.8504


# Validation Set
* Used to evaluate the model during the training process
    * **Hyperparameter tuning**
    * Helps in selecting the best version of the model and in **preventing overfitting**

In [23]:
# Create separate validation data
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, 
                                            stratify=y_train, random_state=10)

In [24]:
# Create list of split indices
split_index = [0 if x in X_val.index else -1 for x in X_train.index]

In [25]:
from sklearn.model_selection import PredefinedSplit

# provides train/test indices to split data into training
# and test sets using a predefined scheme

In [29]:
rf = RandomForestClassifier(random_state=0)

max_depth = [2,3,4,5, None]
min_samlpes_leaf = [1,2,3]
min_samples_split = [2,3,4]
max_features = [75, 100, 125, 150]

cv_params = {'max_depth': max_depth,
             'min_samples_leaf': min_samlpes_leaf,
             'min_samples_split': min_samples_split,
             'max_features':max_features
            } 

rf = RandomForestClassifier(random_state=0)

scoring = ['accuracy', 'precision', 'recall', 'f1']

custom_split = PredefinedSplit(split_index)

rf_val = GridSearchCV(rf, cv_params, scoring=scoring, cv=custom_split, refit='f1')

In [30]:
%%time
rf_val.fit(X_train, y_train)

CPU times: total: 2min 47s
Wall time: 2min 47s


In [31]:
# Pickle the model
with open(path+'rf_val_model.pickle', 'wb') as to_write:
    pickle.dump(rf_val, to_write)

In [32]:
# Open pickled model
with open(path+'rf_val_model.pickle', 'rb') as to_read:
    rf_val = pickle.load(to_read)

In [36]:
rf_val.best_params_

{'max_depth': None,
 'max_features': 75,
 'min_samples_leaf': 3,
 'min_samples_split': 2}

In [37]:
# Create model results table
rf_val_results = make_results('Random Forest Validated', rf_val)

# Concatentate model results table with master results table
results = pd.concat([rf_val_results, results])

# Sort master results by F1 score in descending order
results.sort_values(by=['F1'], ascending=False)

Unnamed: 0,model,precision,recall,F1,accuracy,Model,Recall,Precision,Accuracy
0,Random Forest CV,0.730583,0.482338,0.579958,0.858,,,,
0,,,,0.560655,,Tuned Decision Tree,0.469255,0.701608,0.8504
0,Random Forest Validated,0.714286,0.457516,0.557769,0.852,,,,
0,Random Forest Validated,0.714286,0.457516,0.557769,0.852,,,,


In [38]:
# Save the master results table
results.to_csv(path+'results2.csv', index=False);