In [8]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split,GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
# Display configuration: show every column and allow wide console output so
# that printed frames are not truncated or wrapped early.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.width", 500),
):
    pd.set_option(option_name, option_value)

In [3]:
# Read the diabetes dataset from a path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="datasets/diabetes.csv")

In [5]:
# Quick look at the loaded frame: first rows, dimensions, summary statistics
# (transposed so each column becomes a row) and the counts of each "Outcome" value.
overview = {
    "Head": df.head(),
    "Shape": df.shape,
    "Describe": df.describe().T,
    "Value_counts": df["Outcome"].value_counts(),
}
for section_title, section_value in overview.items():
    print(f" {section_title} \n {section_value}")

 Head 
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  Age  Outcome
0            6      148             72             35        0  33.6                     0.627   50        1
1            1       85             66             29        0  26.6                     0.351   31        0
2            8      183             64              0        0  23.3                     0.672   32        1
3            1       89             66             23       94  28.1                     0.167   21        0
4            0      137             40             35      168  43.1                     2.288   33        1
 Shape 
 (768, 9)
 Describe 
                           count        mean         std     min       25%       50%        75%     max
Pregnancies               768.0    3.845052    3.369578   0.000   1.00000    3.0000    6.00000   17.00
Glucose                   768.0  120.894531   31.972618   0.000  99.00000  117.0000  140.25000  199.00

In [7]:
# Select every feature column, i.e. everything except the "Outcome" label.
# The names are compared with `!=` on purpose: `col not in 'Outcome'` is a
# substring test against the string and would also exclude any column whose
# name happens to be a substring of "Outcome" (e.g. "Out" or "come").
cols = [col for col in df.columns if col != "Outcome"]

# Standardise the feature columns in place (the label column is left as is).
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split performed further down, so statistics of the test rows influence the
# scaled training features. Consider fitting on the training rows only.
scaler = StandardScaler()
df[cols] = scaler.fit_transform(df[cols])

In [10]:
# Separate the feature matrix from the "Outcome" label, then hold out 40% of
# the rows for testing. The fixed random_state keeps the split reproducible.
X = df.drop(columns="Outcome")
y = df["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=17,
)

In [12]:
# Baseline: a KNN classifier with default hyperparameters, fitted on the
# training rows and used to predict labels for the held-out rows.
knn_model = KNeighborsClassifier().fit(X_train, y_train)
predicts = knn_model.predict(X_test)

In [13]:
#random_user = X_test.sample(1)

In [15]:
# Per-class precision / recall / F1 of the baseline predictions on the test rows.
report_text = classification_report(y_true=y_test, y_pred=predicts)
print(report_text)

              precision    recall  f1-score   support

           0       0.76      0.85      0.80       202
           1       0.63      0.48      0.55       106

    accuracy                           0.72       308
   macro avg       0.69      0.67      0.67       308
weighted avg       0.71      0.72      0.71       308



In [16]:
# Take column index 1 of the predicted probabilities as the score for each
# test row and compute the ROC AUC of the baseline model from it.
test_probabilities = knn_model.predict_proba(X_test)
y_prob = test_probabilities[:, 1]
test_auc = roc_auc_score(y_test, y_prob)
print(test_auc)

0.7481085372688212


In [17]:
knn_model2 = KNeighborsClassifier()
cv_result = cross_validate(knn_model2, X_train, y_train, cv=5, scoring=["accuracy", "f1", "roc_auc"])
cv_result

{'fit_time': array([0.00238013, 0.0045433 , 0.00200462, 0.00149012, 0.00201297]),
 'score_time': array([0.04173541, 0.01907802, 0.01455164, 0.01456738, 0.01405287]),
 'test_accuracy': array([0.73913043, 0.72826087, 0.7826087 , 0.72826087, 0.72826087]),
 'test_f1': array([0.61290323, 0.59016393, 0.65517241, 0.61538462, 0.63768116]),
 'test_roc_auc': array([0.725     , 0.7234375 , 0.81848958, 0.80713919, 0.81997946])}

In [21]:
# Report the mean of each cross-validated metric across the folds.
for metric_key in ("test_accuracy", "test_f1", "test_roc_auc"):
    fold_mean = cv_result[metric_key].mean()
    print(f"{metric_key} \n {fold_mean}")

test_accuracy 
 0.7413043478260869
test_f1 
 0.6222610697661379
test_roc_auc 
 0.778809145480226


In [23]:
# Hyperparameter optimization
# Create the estimator that will be tuned and display its current parameters.
# The parameters are read from `knn_model3` itself (not from the earlier
# baseline `knn_model`), so this cell inspects the object it just created.
knn_model3 = KNeighborsClassifier()
knn_model3.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [24]:
# Search grid: candidate neighbour counts 2 through 49 (the range end is exclusive).
knn_params = dict(n_neighbors=range(2, 50))

In [25]:
# Exhaustive 5-fold grid search over `knn_params` on the training rows,
# using all available cores. `fit` returns the fitted search object itself.
grid_search = GridSearchCV(
    estimator=knn_model3,
    param_grid=knn_params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
knn_gs_best = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [27]:
# Display the parameter setting selected by the grid search.
knn_gs_best.best_params_

{'n_neighbors': 24}

In [32]:
# Build a fresh KNN configured with the parameters chosen by the grid search
# and cross-validate it (5 folds) on the training rows with the same three
# metrics as the baseline, then print the mean of each metric over the folds.
tuned_params = knn_gs_best.best_params_
knn_model4 = KNeighborsClassifier()
knn_final = knn_model4.set_params(**tuned_params)

scoring_names = ["accuracy", "f1", "roc_auc"]
cv_results = cross_validate(knn_final, X_train, y_train, cv=5, scoring=scoring_names)

for scoring_name in scoring_names:
    result_key = f"test_{scoring_name}"
    print(f"{result_key} \n {cv_results[result_key].mean()}")

test_accuracy 
 0.7782608695652175
test_f1 
 0.6179515084722415
test_roc_auc 
 0.8356916891371341
