<a href="https://colab.research.google.com/github/liuy01510/portfolio/blob/master/Python/ML/Decision_Trees/Decision_Tree_Classifer_(Diabetes).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing modules

In [267]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

In [268]:
try:
    data
except:
    data=fetch_openml(data_id=37) # fetching the diabetes dataset

X_frame=pd.DataFrame(data=data['data'],columns=data['feature_names'])
classes=sorted(list(set(data['target'])))
data['binary_target']=[0 if i==classes[0] else 1 for i in data['target']]
y_frame=pd.DataFrame(data=data['binary_target'],columns=data['target_names'])

# Initial analysis of the Decision Tree (DT) model

In [269]:
#Default model#
rnd_num=10 # default number used for the random_state arg during the splitting of the training/test sets.
dtc=DecisionTreeClassifier() # unlimited tree growth, guaranteed to overfit the data (Max variance)
X_train,X_test,y_train,y_test=train_test_split(X_frame.to_numpy(),y_frame.to_numpy(),test_size=0.2,random_state=rnd_num)

#Default model training#
dtc.fit(X_train,y_train)
print(export_text(dtc,feature_names=data['feature_names'],max_depth=3)) # truncated form of the decision tree
original_tree_depth=dtc.get_depth()
original_tree_leaves=dtc.get_n_leaves()
print(f"Depth of tree = {original_tree_depth}")
print(f"Number of leaves = {original_tree_leaves}")

|--- plas <= 154.50
|   |--- mass <= 26.35
|   |   |--- age <= 53.50
|   |   |   |--- class: 0
|   |   |--- age >  53.50
|   |   |   |--- pres <= 93.00
|   |   |   |   |--- truncated branch of depth 3
|   |   |   |--- pres >  93.00
|   |   |   |   |--- class: 1
|   |--- mass >  26.35
|   |   |--- age <= 30.50
|   |   |   |--- plas <= 118.50
|   |   |   |   |--- truncated branch of depth 10
|   |   |   |--- plas >  118.50
|   |   |   |   |--- truncated branch of depth 10
|   |   |--- age >  30.50
|   |   |   |--- plas <= 99.50
|   |   |   |   |--- truncated branch of depth 5
|   |   |   |--- plas >  99.50
|   |   |   |   |--- truncated branch of depth 14
|--- plas >  154.50
|   |--- age <= 62.50
|   |   |--- insu <= 544.00
|   |   |   |--- pedi <= 0.13
|   |   |   |   |--- truncated branch of depth 2
|   |   |   |--- pedi >  0.13
|   |   |   |   |--- truncated branch of depth 9
|   |   |--- insu >  544.00
|   |   |   |--- pres <= 69.00
|   |   |   |   |--- class: 1
|   |   |   |--- pres

In [270]:
#Model evaluation#
dtc=DecisionTreeClassifier() # default dtc

def Model_Evaluation(model,X_train,y_train):
    y_predict=cross_val_predict(model,X_train,y_train,cv=5,n_jobs=-1)
    eval={}
    eval['Accuracy']=accuracy_score(y_train,y_predict)
    eval['Precision']=precision_score(y_train,y_predict)
    eval['Recall']=recall_score(y_train,y_predict)
    eval['F1']=f1_score(y_train,y_predict)
    eval['AUC']=roc_auc_score(y_train,y_predict)

    for k,v in eval.items():
        print(f"{k} score: {v:.3f}")

Model_Evaluation(dtc,X_train,y_train)

Accuracy score: 0.689
Precision score: 0.543
Recall score: 0.545
F1 score: 0.544
AUC score: 0.654


# Optimal hyperparameter determination

In [271]:
# Creating the parameter grid
params_grid={}
params_grid['max_depth']=list(range(2,original_tree_depth+1))
params_grid['min_samples_split']=[i/10 for i in range(1,11)]
params_grid['max_leaf_nodes']=list(range(2,original_tree_leaves,10))

#Perfoming grid search#
dtc=DecisionTreeClassifier()
gscv=GridSearchCV(dtc,params_grid,scoring='f1',cv=5,n_jobs=-1)
gscv.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
 

In [272]:
#Checking the optimal hyperparameters#
print(f"Optimal hyperparameters = {gscv.best_params_}")
print(f"F1 score from the optimal parameters = {gscv.best_score_}")

dtc_optimal=gscv.best_estimator_
Model_Evaluation(dtc_optimal,X_train,y_train)

Optimal hyperparameters = {'max_depth': 5, 'max_leaf_nodes': 12, 'min_samples_split': 0.2}
F1 score from the optimal parameters = 0.6407635500654845
Accuracy score: 0.751
Precision score: 0.628
Recall score: 0.656
F1 score: 0.642
AUC score: 0.728


# Ensemble Training

## Bagging Classifier Model

In [273]:
#Creation of the bagging classifier model#
dtc=DecisionTreeClassifier()
dtc_bag=BaggingClassifier(dtc,n_estimators=300,max_samples=0.2,bootstrap=True,random_state=rnd_num,n_jobs=-1) # random hyperparams

In [274]:
#Evaluating bagging classifier performance#
Model_Evaluation(dtc_bag,X_train,y_train.ravel()) # 1d array required for the y_train array

Accuracy score: 0.769
Precision score: 0.694
Recall score: 0.574
F1 score: 0.628
AUC score: 0.722


In [275]:
#Optimal hyperparameters determination#
params_grid={}
params_grid['n_estimators']=[i for i in range(100,1100,200)]
params_grid['max_depth']=[i for i in range(2,original_tree_depth+1,2)]


gscv_rfc=GridSearchCV(RandomForestClassifier(max_samples=0.2),param_grid=params_grid,scoring='f1',cv=5,n_jobs=-1)
# Note that the max_samples hyperparameter is randomly selected due to processing time constraints during GridSearch
gscv_rfc.fit(X_train,y_train.ravel())

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=0.2,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              random

In [276]:
#Evaluation of the optimal hyperparameters#
print(f"Optimal Hyperparameters: {gscv_rfc.best_params_}, Score: {gscv_rfc.best_score_:.3f}")

params_grid_2={}
#2nd round of hyperparameter tuning#
x='max_depth'
i=params_grid[x].index(gscv_rfc.best_params_[x])
if i!=0 and i!=len(params_grid[x])-1:
    params_grid_2[x]=[j for j in range(params_grid[x][i-1],params_grid[x][i+1]+1,1)]

x='n_estimators'
i=params_grid[x].index(gscv_rfc.best_params_[x])
if i!=0 and i!=len(params_grid[x])-1:
    params_grid_2[x]=[j for j in range(params_grid[x][i-1],params_grid[x][i+1]+100,100)]

gscv_rfc=GridSearchCV(RandomForestClassifier(max_samples=0.2,**gscv_rfc.best_params_),param_grid=params_grid_2,scoring='f1',cv=5,n_jobs=-1)
gscv_rfc.fit(X_train,y_train.ravel())

Optimal Hyperparameters: {'max_depth': 8, 'n_estimators': 100}, Score: 0.643


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=8,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=0.2,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              random_st

In [277]:
#Obtaining the optimal hyperparameters after the 2nd round of tuning#
print(f"Optimal hyperparameters: {gscv_rfc.best_params_}, Score: {gscv_rfc.best_score_:.3f}") # not much further improvement

#Performing final round of model evaluation#
rfc_optimal=RandomForestClassifier(max_samples=0.2,**gscv_rfc.best_params_)

Model_Evaluation(rfc_optimal,X_train,y_train)

Optimal hyperparameters: {'max_depth': 8}, Score: 0.626
Accuracy score: 0.779
Precision score: 0.713
Recall score: 0.584
F1 score: 0.642
AUC score: 0.731
