# Model: Random Forest
---

# 1. Setting up the notebook

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.metrics import make_scorer, recall_score, fbeta_score, roc_auc_score

In [3]:
df_train = pd.read_csv("../Data/train.csv")
y_train = df_train[["attrition_flag"]]
x_train = df_train.drop("attrition_flag", axis=1)

# 2. Running base model

In [4]:
def cv_evaluate_model(rf_clf):
    scale_features = x_train.drop("gender", axis=1).columns
    scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ], remainder='passthrough')
    pipeline = Pipeline(steps = [['scaler', scaler ],
                                 ['smote', SMOTE(random_state=2021)],
                                 ['classifier', rf_clf]
                                ])

    stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

    scoring = {"accuracy": "accuracy",
               "recall": 'recall',
               "precision": "precision",
               "fbeta_2": make_scorer(fbeta_score, beta=2),
               "roc_auc": make_scorer(roc_auc_score),
              }

    scores = cross_validate(pipeline, x_train, y_train.values.ravel(), cv=stratified_kfold,
                           scoring = scoring)

    accuracy = [ val for val in scores['test_accuracy'] ]
    recall = [ val for val in scores['test_recall'] ]
    precision = [ val for val in scores['test_precision'] ]
    fbeta_2 = [ val for val in scores['test_fbeta_2'] ]
    auc = [ val for val in scores['test_roc_auc'] ]

    accuracy.append( sum(accuracy) / len(accuracy) )
    recall.append( sum(recall) / len(recall) )
    precision.append( sum(precision) / len(precision) )
    fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
    auc.append( sum(auc) / len(auc) )

    score_df = pd.DataFrame(data=[accuracy, recall, precision, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3', 'Average'],
                                index=['Accuracy', 'Recall', 'Precision', 'Fbeta2', 'AUC'])
    return score_df

In [5]:
rf_clf = RandomForestClassifier(random_state=2021)
base_model = cv_evaluate_model(rf_clf)
display(base_model)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Accuracy,0.929285,0.924815,0.924074,0.926058
Recall,0.769585,0.781106,0.730415,0.760369
Precision,0.785882,0.758389,0.782716,0.775663
Fbeta2,0.77279,0.776454,0.740308,0.763184
AUC,0.864722,0.866722,0.84579,0.859078


# 3. Hyper parameter tuning with GridSearchCV

## 3.1 First Grid Search

In [4]:
# Creating parameter grid to search
n_estimators = [100, 300, 500, 700, 900]
max_features = ['sqrt', 'log2']
max_depth = [5, 10, 15, 20]
min_samples_split = [3, 5, 8, 10, 13]
min_samples_leaf = [1, 3, 5, 8, 10]

params_grid = {
                'classifier__n_estimators': n_estimators,
                'classifier__max_features': max_features,
                'classifier__min_samples_split': min_samples_split,
                'classifier__min_samples_leaf': min_samples_leaf,
                'classifier__max_depth': max_depth
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)
print('Across 5-folds =',total_combi*5)

classifier__n_estimators [100, 300, 500, 700, 900]
classifier__max_features ['sqrt', 'log2']
classifier__min_samples_split [3, 5, 8, 10, 13]
classifier__min_samples_leaf [1, 3, 5, 8, 10]
classifier__max_depth [5, 10, 15, 20]
-----------------
Total combinations: 1000
Across 5-folds = 5000


In [5]:
smote_sampler = SMOTE(random_state=2021)
rf_clf = RandomForestClassifier(random_state=2021)

scale_features = x_train.columns

scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ], remainder='passthrough')

pipeline = Pipeline(steps = [['scaler', scaler],
                             ['smote', smote_sampler],
                             ['classifier', rf_clf]])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=2021)

rf_gridsearch = GridSearchCV(estimator = pipeline,
                           param_grid = params_grid,
                           scoring = 'recall',
                           cv = stratified_kfold,
                           refit = True,
                           n_jobs = -1)

rf_gridsearch.fit(x_train, y_train.values.ravel() )

best_parameters = rf_gridsearch.best_params_
print(best_parameters)

{'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 8, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 100}

## 3.2 Second Grid Search
This second parameter search space aims to narrow down the search space based on the results obtained in the first grid search.

**Previous parameter search space:**
- n_estimators [100, 300, 500, 700, 900]
- max_features ['sqrt', 'log2']
- min_samples_split [3, 5, 8, 10, 13]
- min_samples_leaf [1, 3, 5, 8, 10]
- max_depth [5, 10, 15, 20]

**Obtained results:**

| Feature           | Value | New search space |
|-------------------|:-----:|------------|
| n_estimators      | 100   | [50, 75, 100, 125, 150] |
| max_features      |'sqrt' | Leave as sqrt |
| min_samples_split | 3     | [2, 3, 4] |
| min_samples_leaf  | 8     | [6, 7, 8, 9] |
| max_depth         | 10    | [8, 9, 10, 11, 12] |


In [6]:
# Creating parameter grid to search
n_estimators = [50, 75, 100, 125, 150]
min_samples_split = [2, 3, 4]
min_samples_leaf = [6, 7, 8, 9, 10]
max_depth = [8, 9, 10, 11, 12]

params_grid = {
                'classifier__n_estimators': n_estimators,
                'classifier__min_samples_split': min_samples_split,
                'classifier__min_samples_leaf': min_samples_leaf,
                'classifier__max_depth': max_depth
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)
print('Across 5-folds =',total_combi*5)

classifier__n_estimators [50, 75, 100, 125, 150]
classifier__min_samples_split [2, 3, 4]
classifier__min_samples_leaf [6, 7, 8, 9, 10]
classifier__max_depth [8, 9, 10, 11, 12]
-----------------
Total combinations: 375
Across 5-folds = 1875


In [7]:
smote_sampler = SMOTE(random_state=2021)
rf_clf = RandomForestClassifier(max_features='sqrt', random_state=2021)

scale_features = x_train.columns

scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ], remainder='passthrough')

pipeline = Pipeline(steps = [['scaler', scaler],
                             ['smote', smote_sampler],
                             ['classifier', rf_clf]])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=2021)

rf_gridsearch = GridSearchCV(estimator = pipeline,
                           param_grid = params_grid,
                           scoring = 'recall',
                           cv = stratified_kfold,
                           refit = True,
                           n_jobs = -1)

rf_gridsearch.fit(x_train, y_train.values.ravel() )

best_parameters = rf_gridsearch.best_params_
print(best_parameters)

{'classifier__max_depth': 10, 'classifier__min_samples_leaf': 8, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 125}

## 3.3 Third Grid Search
This third parameter search space aims to narrow down the search space based on the results obtained in the first grid search.

**Previous parameter search spaces:**
1. GridSearch 1
- n_estimators = [100, 300, 500, 700, 900]
- max_features = ['sqrt', 'log2']
- min_samples_split = [3, 5, 8, 10, 13]
- min_samples_leaf = [1, 3, 5, 8, 10]
- max_depth = [5, 10, 15, 20]

2. GridSearch 2
- n_estimators = [50, 75, 100, 125, 150]
- min_samples_split = [2, 3, 4]
- min_samples_leaf = [6, 7, 8, 9, 10]
- max_depth = [8, 9, 10, 11, 12]

**Obtained results:**

| Feature           | GridSearch1 | GridSearch2 |      New search space     |
|-------------------|:-----------:|:-----------:|:-------------------------:|
| n_estimators      |     100     |     125     | [100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150] |
| min_samples_split |      3      |      2      |         [2,3]             |
| min_samples_leaf  |      8      |      8      |         Leave as 8        |
|    max_depth      |     10      |     10      |         Leave as 10       |



In [8]:
# Creating parameter grid to search
n_estimators = [100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150]
min_samples_split = [2, 3]

params_grid = {
                'classifier__n_estimators': n_estimators,
                'classifier__min_samples_split': min_samples_split
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)
print('Across 5-folds =', total_combi*5)

classifier__n_estimators [100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150]
classifier__min_samples_split [2, 3]
-----------------
Total combinations: 22
Across 5-folds = 110


In [9]:
# running grid search 3
smote_sampler = SMOTE(random_state=2021)
rf_clf = RandomForestClassifier(min_samples_leaf=8, max_depth=10, max_features='sqrt', random_state=2021)

scale_features = x_train.columns

scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ], remainder='passthrough')

pipeline = Pipeline(steps = [['scaler', scaler],
                             ['smote', smote_sampler],
                             ['classifier', rf_clf]])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=2021)

rf_gridsearch = GridSearchCV(estimator = pipeline,
                           param_grid = params_grid,
                           scoring = 'recall',
                           cv = stratified_kfold,
                           refit = True,
                           n_jobs = -1)

rf_gridsearch.fit(x_train, y_train.values.ravel() )

best_parameters = rf_gridsearch.best_params_
print(best_parameters)

{'classifier__min_samples_split': 2, 'classifier__n_estimators': 105}

# 4. Conclusion

#### Best params from First Grid Search:
{'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 8, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 100}
#### Best params from Second Grid Search:
{'classifier__max_depth': 10, 'classifier__min_samples_leaf': 8, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 125}
#### Best params from Third Grid Search:
{'classifier__min_samples_split': 2, 'classifier__n_estimators': 105}