In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv("16-diabetes.csv")

In [4]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [9]:
columns_to_check  = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

for col in columns_to_check:
    zero_count = (df[col] == 0).sum()
    zero_per = 100*zero_count / len(df)
    print(f"{col}: {zero_count} %{zero_per:.2f}")


Glucose: 5 %0.65
BloodPressure: 35 %4.56
SkinThickness: 227 %29.56
Insulin: 374 %48.70
BMI: 11 %1.43


In [10]:
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15) 

In [12]:
columns_to_fill  = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

medians = {}
for col in columns_to_fill:
    median_value = X_train[X_train[col] !=0][col].median()
    medians[col] = median_value
    X_train[col]= X_train[col].replace(0, median_value)

for col in columns_to_fill:
    X_test[col]= X_test[col].replace(0, medians[col])

In [13]:
X_train.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0
mean,3.907166,121.560261,72.612378,29.040717,142.477199,32.448208,0.469948,33.285016
std,3.385438,29.974412,12.165642,8.312217,80.87933,6.862948,0.328516,11.678337
min,0.0,44.0,24.0,7.0,14.0,18.2,0.084,21.0
25%,1.0,100.0,64.0,25.0,125.0,27.6,0.23825,24.0
50%,3.0,117.0,72.0,29.0,129.5,32.3,0.3705,29.0
75%,6.0,139.75,80.0,32.0,130.0,36.5,0.63075,40.0
max,17.0,199.0,122.0,63.0,680.0,67.1,2.42,81.0


In [14]:
X_test.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,154.0,154.0,154.0,154.0,154.0,154.0,154.0,154.0
mean,3.597403,122.038961,71.487013,29.376623,144.402597,32.483117,0.479565,33.064935
std,3.304818,32.320876,11.813495,10.513035,104.291511,6.946159,0.343303,12.118519
min,0.0,61.0,30.0,7.0,23.0,18.4,0.078,21.0
25%,1.0,95.25,64.0,23.25,108.25,26.925,0.254,24.0
50%,3.0,117.0,72.0,29.0,129.5,32.3,0.3765,28.0
75%,5.75,142.75,80.0,33.75,129.5,36.95,0.60375,41.0
max,13.0,197.0,106.0,99.0,846.0,55.0,2.329,69.0


In [18]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [20]:
ada = AdaBoostClassifier()
ada.fit(X_train,y_train)
y_pred = ada.predict(X_test)

In [21]:
print (classification_report(y_test,y_pred))
print (accuracy_score(y_test,y_pred))
print (confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.84      0.81      0.82       108
           1       0.58      0.63      0.60        46

    accuracy                           0.75       154
   macro avg       0.71      0.72      0.71       154
weighted avg       0.76      0.75      0.76       154

0.7532467532467533
[[87 21]
 [17 29]]


In [23]:
# hyperparameter tuning

In [24]:
from sklearn.model_selection import GridSearchCV

In [25]:
ada_params = {
    "n_estimators" : [50,70,100,120,150,200],
    "learning_rate" : [0.001, 0.01, 0.1,1,10]
}

In [32]:
grid = GridSearchCV(estimator=AdaBoostClassifier(), param_grid= ada_params, cv=3, verbose=1, n_jobs=-1)

In [33]:
grid.fit(X_train, y_train)


Fitting 3 folds for each of 30 candidates, totalling 90 fits


0,1,2
,estimator,AdaBoostClassifier()
,param_grid,"{'learning_rate': [0.001, 0.01, ...], 'n_estimators': [50, 70, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,estimator,
,n_estimators,200
,learning_rate,1
,algorithm,'deprecated'
,random_state,


In [34]:
ada = AdaBoostClassifier(n_estimators = 200, learning_rate=1)
ada.fit(X_train,y_train)
y_pred = ada.predict(X_test)
print (classification_report(y_test,y_pred))
print (accuracy_score(y_test,y_pred))
print (confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.80      0.82       108
           1       0.57      0.63      0.60        46

    accuracy                           0.75       154
   macro avg       0.70      0.71      0.71       154
weighted avg       0.76      0.75      0.75       154

0.7467532467532467
[[86 22]
 [17 29]]


In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB


models = {
    "Decision Tree": DecisionTreeClassifier(),
    "K-Neighbors Regressor" : KNeighborsClassifier(),
    "Random Forest" : RandomForestClassifier(),
    "SVC" : SVC(),
    "Bayes" : GaussianNB()
}

def calculate_model_metrics(true,predicted):
    acc =accuracy_score(true,predicted) 
    con =confusion_matrix(true,predicted) 
    clas =classification_report(true,predicted) 
    return acc, con, clas

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_acc, model_train_con, model_train_clas = calculate_model_metrics(y_train, y_train_pred)
    model_test_acc, model_test_con, model_test_clas = calculate_model_metrics(y_test, y_test_pred)

    print(list(models.values())[i])

    print("TRAİNİNG")
    print("A: ", model_train_acc)
    print("B: ", model_train_con)
    print("C: ", model_train_clas)

    print("---------------------------")
    print("TEST")
    print("A: ", model_test_acc)
    print("C: ", model_test_con)
    print("Cl: ", model_test_clas)

    print("---------------------------")
    print("\n")

DecisionTreeClassifier()
TRAİNİNG
A:  1.0
B:  [[392   0]
 [  0 222]]
C:                precision    recall  f1-score   support

           0       1.00      1.00      1.00       392
           1       1.00      1.00      1.00       222

    accuracy                           1.00       614
   macro avg       1.00      1.00      1.00       614
weighted avg       1.00      1.00      1.00       614

---------------------------
TEST
A:  0.6948051948051948
C:  [[78 30]
 [17 29]]
Cl:                precision    recall  f1-score   support

           0       0.82      0.72      0.77       108
           1       0.49      0.63      0.55        46

    accuracy                           0.69       154
   macro avg       0.66      0.68      0.66       154
weighted avg       0.72      0.69      0.70       154

---------------------------


KNeighborsClassifier()
TRAİNİNG
A:  0.8306188925081434
B:  [[346  46]
 [ 58 164]]
C:                precision    recall  f1-score   support

           0      

In [40]:
rf_params = {
    "n_estimators" : [100, 200, 500, 1000],
    "max_depth" : [5, 8, 15, 10, None],
    "max_features" : ["sqrt", "log2", 5,6,7,8],
    "min_samples_split" : [2,8,15,20]
}

svc_params = {
    "C" : [0.1, 1, 10, 100, 1000],
    "kernel" : ["linear", "poly", "rbf", "sigmoid", "precomputed"],
    "gamma" : ["scale", "auto"]
}

svc_params = {
    "C" : [0.1, 1, 10, 100, 1000],
    "kernel" : ["linear", "rbf", "sigmoid"],
    "gamma" : ["scale", "auto"]
}
tree_params = {
    "criterion" : ["gini","entropy","log_loss"],
    "splitter" : ["best", "random"],
    "max_depth" : [1,2,3,4,5,15,None],
    "max_features" : ["sqrt", "log2", None]    
}
knn_params = {
    "n_neighbors": [3, 5, 7, 9, 11, 15],    
    "weights": ["uniform", "distance"],     
    "metric": ["euclidean", "manhattan"]    
}

from sklearn.model_selection import RandomizedSearchCV
randomcv_models = [
    ("KNN", KNeighborsClassifier(), knn_params),
    ("RF", RandomForestClassifier(), rf_params),
    ("DT", DecisionTreeClassifier(), tree_params),
    ("SVC", SVC(), svc_params),
]
for name , model, params in randomcv_models:
    randomcv = RandomizedSearchCV(estimator=model , param_distributions=params, n_iter = 100,cv=3, n_jobs=-1)
    randomcv.fit(X_train,y_train)
    print( "best params: ", name, randomcv.best_params_)





best params:  KNN {'weights': 'distance', 'n_neighbors': 9, 'metric': 'euclidean'}
best params:  RF {'n_estimators': 100, 'min_samples_split': 8, 'max_features': 'log2', 'max_depth': None}
best params:  DT {'splitter': 'best', 'max_features': None, 'max_depth': 5, 'criterion': 'log_loss'}




best params:  SVC {'kernel': 'linear', 'gamma': 'scale', 'C': 0.1}


In [41]:
models = {
    "Decision Tree": DecisionTreeClassifier(splitter= "best", max_features=None,max_depth=5,criterion="log_loss"),
    "K-Neighbors Regressor" : KNeighborsClassifier(weights="distance", n_neighbors=9, metric="euclidean"),
    "Random Forest" : RandomForestClassifier(n_estimators=100,min_samples_split=8, max_features="log2" , max_depth = None),
    "SVC" : SVC(kernel= "linear",gamma= "scale", C=0.1)
}

def calculate_model_metrics(true,predicted):
    acc =accuracy_score(true,predicted) 
    con =confusion_matrix(true,predicted) 
    clas =classification_report(true,predicted) 
    return acc, con, clas

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_acc, model_train_con, model_train_clas = calculate_model_metrics(y_train, y_train_pred)
    model_test_acc, model_test_con, model_test_clas = calculate_model_metrics(y_test, y_test_pred)

    print(list(models.values())[i])

    print("TRAİNİNG")
    print("A: ", model_train_acc)
    print("B: ", model_train_con)
    print("C: ", model_train_clas)

    print("---------------------------")
    print("TEST")
    print("A: ", model_test_acc)
    print("C: ", model_test_con)
    print("Cl: ", model_test_clas)

    print("---------------------------")
    print("\n")

DecisionTreeClassifier(criterion='log_loss', max_depth=5)
TRAİNİNG
A:  0.8485342019543974
B:  [[367  25]
 [ 68 154]]
C:                precision    recall  f1-score   support

           0       0.84      0.94      0.89       392
           1       0.86      0.69      0.77       222

    accuracy                           0.85       614
   macro avg       0.85      0.81      0.83       614
weighted avg       0.85      0.85      0.84       614

---------------------------
TEST
A:  0.7337662337662337
C:  [[87 21]
 [20 26]]
Cl:                precision    recall  f1-score   support

           0       0.81      0.81      0.81       108
           1       0.55      0.57      0.56        46

    accuracy                           0.73       154
   macro avg       0.68      0.69      0.68       154
weighted avg       0.74      0.73      0.73       154

---------------------------


KNeighborsClassifier(metric='euclidean', n_neighbors=9, weights='distance')
TRAİNİNG
A:  1.0
B:  [[392   0]
 [ 