In [1]:
# Import the main packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from imblearn.ensemble import BalancedRandomForestClassifier
from prettytable import PrettyTable
%matplotlib inline

In [2]:
# Read the dataset and take a quick look

df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Investigate the response variable for data imbalance

count0, count1 = df['Outcome'].value_counts()

In [4]:
print(f'The percentage of diabetics in the dataset is only {100*count1/(count0+count1):.2f}%')

The percentage of diabetics in the dataset is only 34.90%


In [5]:
# Assign the predictor and response variables.

# Outcome is the response and all the other columns are the predictors

X = df.drop("Outcome", axis=1)
y = df['Outcome']

In [6]:
# Fix a random_state and split the data into train and validation sets
random_state = 22

X_train, X_val, y_train,y_val = train_test_split(X,y,train_size = 0.8,random_state =random_state)


In [7]:
# We fix the max_depth variable to 20 for all trees

max_depth = 20

## Strategy 1 - Vanilla Random Forest

- No correction for imbalance

In [8]:
# Define a Random Forest classifier with randon_state as above

# Set the maximum depth to be max_depth and use 10 estimators

random_forest = RandomForestClassifier(max_depth=max_depth, random_state=random_state, n_estimators=10)

# Fit the model on the training set

random_forest.fit(X_train, y_train)

RandomForestClassifier(max_depth=20, n_estimators=10, random_state=22)

In [9]:
# We make predictions on the validation set 

predictions = random_forest.predict(X_val)

# We also compute two metrics that better represent misclassification of minority classes i.e `f1 score` and `AUC`

# compute the f1-score and assign it to variable score1

score1 = round(f1_score(predictions,y_val),2)

# compute the `auc` and assign it to variable auc1

auc1 = round(roc_auc_score(predictions,y_val),2)

## Strategy 2 - Random Forest with class weighting
- Balancing the class imbalance in each bootstrap

In [10]:
# Again Define a Random Forest classifier with randon_state as above

# Set the maximum depth to be max_depth and use 10 estimators

random_forest = RandomForestClassifier(max_depth=max_depth, random_state=random_state, n_estimators=10,class_weight='balanced_subsample')

# Fit the model on the training set

random_forest.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced_subsample', max_depth=20,
                       n_estimators=10, random_state=22)

In [11]:
# We make predictions on the validation set 

predictions = random_forest.predict(X_val)

# Again we also compute two metrics that better represent misclassification of minority classes i.e `f1 score` and `AUC`

# compute the f1-score and assign it to variable score2

score2 = round(f1_score(predictions,y_val),2)

# compute the `auc` and assign it to variable auc2

auc2 = round(roc_auc_score(predictions,y_val),2)

## Strategy 3 - Balanced Random Forest with SMOTE 

- Using the **imblearn** `BalancedRandomForestClassifier()` 
- Read more about this implementation [here](https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.ensemble.BalancedRandomForestClassifier.html)

In [12]:
# This define, define a `Balanced Random Forest Classifier` which is superior to sklearn's implementation

random_forest = BalancedRandomForestClassifier(max_depth=max_depth, random_state=random_state, n_estimators=10, class_weight='balanced_subsample')

# Fit the model on the entire data

random_forest.fit(X_train, y_train)


BalancedRandomForestClassifier(class_weight='balanced_subsample', max_depth=20,
                               n_estimators=10, random_state=22)

In [13]:
# We make predictions on the validation set 

predictions = random_forest.predict(X_val)

# compute the f1-score and assign it to variable score3

score3 = round(f1_score(predictions,y_val),2)

# compute the `auc` and assign it to variable auc3

auc3 = round(roc_auc_score(predictions,y_val),2)

In [15]:
# Finally, we compare the results from the three implementations above

pt = PrettyTable()
pt.field_names = ["Strategy","F1 Score","AUC score"]
pt.add_row(["Random Forest - No imbalance correction",score1,auc1])
pt.add_row(["Random Forest - balanced_subsamples",score2,auc2])
pt.add_row(["Random Forest - SMOTE balancing",score3,auc3])
print(pt)

+-----------------------------------------+----------+-----------+
|                 Strategy                | F1 Score | AUC score |
+-----------------------------------------+----------+-----------+
| Random Forest - No imbalance correction |   0.44   |    0.68   |
|   Random Forest - balanced_subsamples   |   0.51   |    0.7    |
|     Random Forest - SMOTE balancing     |   0.63   |    0.71   |
+-----------------------------------------+----------+-----------+


## Mindchow 🍲

- How is the imblearn implementation different from sklearn?
- How is it giving you superior results?

Read more about **imblearn**'s implementation of `BalancedRandomForestClassifier` and try tweaking it to get a higher f1 and AUC score.
