In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("./diabetes.csv")

In [3]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
xData = df.iloc[:,:-1]
yData = df.iloc[:,-1]

In [5]:
xData

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [6]:
yData

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [7]:
yData.value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [8]:
xData = xData.map(lambda x: x if x!=0 else np.nan)

In [9]:
xData.isnull().sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

In [23]:
xTrain, xTest, yTrain, yTest = train_test_split(xData, yData, shuffle=True, test_size=0.2)
yTrain

625    0
32     0
734    0
587    0
524    0
      ..
140    0
266    1
186    1
633    0
378    1
Name: Outcome, Length: 614, dtype: int64

In [70]:
from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

In [71]:
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5)

In [73]:
acc_list = []
for i, item in enumerate(kf.split(xData, yData)):
    train_idx, test_idx = item
    
    xTrain = xData.loc[train_idx]
    xTest = xData.loc[test_idx]
    
    yTrain = yData.loc[train_idx]
    yTest = yData.loc[test_idx]
    
    
    imputer = KNNImputer(n_neighbors=5)
    imputer.fit(xTrain)
    
    xTrain = pd.DataFrame(imputer.transform(xTrain), columns=xTrain.columns)
    xTest = pd.DataFrame(imputer.transform(xTest), columns=xTrain.columns)
    
    model = DecisionTreeClassifier()
    model.fit(xTrain, yTrain)
    y_pred = model.predict(xTest)
    
    acc = accuracy_score(yTest, y_pred)
    print(f"Fold {i} --> acc: {acc}")
    acc_list.append(acc)

Fold 0 --> acc: 0.6883116883116883
Fold 1 --> acc: 0.6363636363636364
Fold 2 --> acc: 0.6493506493506493
Fold 3 --> acc: 0.6993464052287581
Fold 4 --> acc: 0.7124183006535948
Fold 5 --> acc: 0.7012987012987013
Fold 6 --> acc: 0.6298701298701299
Fold 7 --> acc: 0.7272727272727273
Fold 8 --> acc: 0.6928104575163399
Fold 9 --> acc: 0.7189542483660131
Fold 10 --> acc: 0.6753246753246753
Fold 11 --> acc: 0.7402597402597403
Fold 12 --> acc: 0.7272727272727273
Fold 13 --> acc: 0.7516339869281046
Fold 14 --> acc: 0.6993464052287581
Fold 15 --> acc: 0.7597402597402597
Fold 16 --> acc: 0.7012987012987013
Fold 17 --> acc: 0.6948051948051948
Fold 18 --> acc: 0.6470588235294118
Fold 19 --> acc: 0.673202614379085
Fold 20 --> acc: 0.6948051948051948
Fold 21 --> acc: 0.6493506493506493
Fold 22 --> acc: 0.6948051948051948
Fold 23 --> acc: 0.673202614379085
Fold 24 --> acc: 0.6666666666666666


In [74]:
print(f"overal model performance: {np.mean(acc_list)} +/-({2*np.std(acc_list)})")

overal model performance: 0.6921908157202276 +/-(0.06772090056515088)
