In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyforest
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
# Load the cleaned diabetes dataset from disk and print a schema summary
# (column names, non-null counts, dtypes) to check what was read.

csv_path = '../data/clean_Diabetes.csv'
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 532 entries, 0 to 531
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                532 non-null    int64  
 1   Pregnancies               532 non-null    int64  
 2   Glucose                   532 non-null    int64  
 3   BloodPressure             532 non-null    int64  
 4   SkinThickness             532 non-null    int64  
 5   Insulin                   532 non-null    int64  
 6   BMI                       532 non-null    float64
 7   DiabetesPedigreeFunction  532 non-null    float64
 8   Age                       532 non-null    int64  
 9   Outcome                   532 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 41.7 KB


In [3]:
# Preview the first rows of the loaded frame to eyeball the values
df.head()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,6,148,72,35,0,33.6,0.627,50,1
1,1,1,85,66,29,0,26.6,0.351,31,0
2,3,1,89,66,23,94,28.1,0.167,21,0
3,4,0,137,40,35,168,43.1,2.288,33,1
4,6,3,78,50,32,88,31.0,0.248,26,1


In [4]:
# List the column names as loaded (still includes 'Unnamed: 0' at this point)
df.columns

Index(['Unnamed: 0', 'Pregnancies', 'Glucose', 'BloodPressure',
       'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
       'Outcome'],
      dtype='object')

In [5]:
# Remove the 'Unnamed: 0' column (it appears to be a row index saved into the CSV;
# it is not used as a feature). Reassigning the result instead of using
# inplace=True, together with errors='ignore', keeps this cell idempotent:
# re-running it after the column is already gone no longer raises a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Confirm the remaining columns after the 'Unnamed: 0' drop
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

### Lazy Predict

In [7]:
# Separate the predictor columns (X) from the target column (y)

feature_columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]
X = df[feature_columns]
y = df['Outcome']

In [8]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# train/test partition the same on every run.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [9]:
# Modeling with LazyPredict
# LazyClassifier is created with no arguments here (library defaults); the call
# that actually trains and scores the candidate models is in the next cell.
# NOTE(review): the bare `import lazypredict` is not referenced directly in the
# cells shown — only LazyClassifier is used.

import lazypredict
from lazypredict.Supervised import LazyClassifier

clf = LazyClassifier()

In [10]:
# Fit the models and do predictions
# clf.fit is given the train/test features first, then the train/test labels, and
# returns two objects; the first (models) is the score table shown in the next cell.
# NOTE(review): LazyClassifier() was built without predictions=True, so the second
# return value may simply repeat the score table rather than hold per-model
# predictions — confirm against the lazypredict documentation.

models, predictions = clf.fit(X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 33.19it/s]


In [11]:
# Show the LazyPredict score table (one row per model). Ending the cell with the
# bare DataFrame uses Jupyter's rich table display, which avoids the wrapped,
# width-limited plain-text layout that print() produces for wide frames.
models

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
NearestCentroid                    0.77               0.75     0.75      0.77   
AdaBoostClassifier                 0.79               0.74     0.74      0.79   
GaussianNB                         0.78               0.74     0.74      0.78   
LGBMClassifier                     0.79               0.73     0.73      0.78   
CalibratedClassifierCV             0.78               0.72     0.72      0.78   
BernoulliNB                        0.75               0.72     0.72      0.75   
LinearDiscriminantAnalysis         0.78               0.72     0.72      0.77   
SVC                                0.78               0.72     0.72      0.77   
RidgeClassifierCV                  0.78               0.72     0.72      0.77   
RidgeClassifier                    0.78               0.72     0.72      0.77   
QuadraticDiscriminantAnalysi

In [12]:
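# Observation: By accuracy, the best model is AdaBoostClassifier (0.79), matched by the LightGBM classifier (LGBMClassifier);
# note that the table is ordered by balanced accuracy, where NearestCentroid ranks first.
# Random Forest Classifier and SVC have done well too!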

### Ada Boost Classifier with Default values

In [13]:
# AdaBoost classifier with the default base estimator (a decision tree)

from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics

In [14]:
# Create the AdaBoost classifier with the default settings written out explicitly:
# decision-tree base estimator, n_estimators = 50, learning_rate = 1.
# random_state is pinned (same seed value as the train/test split) so the fitted
# model and its reported accuracy are reproducible after a kernel restart.

abc = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=101)

In [15]:
# Train the AdaBoost classifier on the training split
# NOTE(review): scikit-learn estimators conventionally return themselves from
# fit(), so `model` is presumably the same object as `abc` — confirm.

model = abc.fit(X_train, y_train)

In [16]:
# Predict response for the test dataset
# y_pred is compared against y_test in the next cell to score the model.

y_pred = model.predict(X_test)

In [17]:
# Score the held-out predictions: accuracy of y_pred against the true y_test labels

test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", test_accuracy)

Accuracy:  0.79375
