In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Load the raw dataset, then discard the first 83000 rows (in file order)
# whose `diabetes` value is 0.
# NOTE(review): presumably this undersamples the negative class to balance the
# data; the cut is positional rather than random -- confirm that is intended.
df = pd.read_csv('diabetes_prediction_dataset.csv')
negative_rows = df.index[df['diabetes'] == 0]
df = df.drop(index=negative_rows[:83000])

# Columns that are replaced in place by integer codes.
dataset_features = ['gender', 'smoking_history']

# A fresh encoder is fitted for every column, so codes are assigned per column.
# After the loop, `label_encoder` holds the encoder of the last column only.
# NOTE(review): the integer codes impose an ordering on these columns that a
# distance-based model will treat as meaningful -- confirm this is acceptable.
for col in dataset_features:
    label_encoder = LabelEncoder().fit(df[col])
    df[col] = label_encoder.transform(df[col])

# Target is the `diabetes` column; every remaining column is used as a feature.
y = df['diabetes']
X = df.drop('diabetes', axis=1)

# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# reproducible from run to run.
split = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split

# Fit a 10-nearest-neighbours classifier on the training split (`fit` returns
# the estimator itself, so the fitted model can be bound directly).
# NOTE(review): no feature scaling is applied before this distance-based
# model -- confirm that the raw feature ranges are comparable.
model = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = model.predict(X_test)

# Confusion matrix of the held-out labels against the predictions. Unpacking
# the flattened matrix into four counts requires it to be 2x2; scikit-learn
# documents the flattened order for the binary case as tn, fp, fn, tp.
cf = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cf.ravel()


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    Guards the metrics below against degenerate cases (e.g. no positive
    predictions), where a plain division would yield NaN and a RuntimeWarning.
    """
    return numerator / denominator if denominator else 0.0


# Standard binary-classification metrics derived from the four counts.
accuracy = _safe_ratio(tn + tp, tn + fp + fn + tp)
precision = _safe_ratio(tp, tp + fp)
recall = _safe_ratio(tp, tp + fn)
f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('F1 Score: ', f1_score)

Accuracy:  0.8635294117647059
Precision:  0.8784530386740331
Recall:  0.8432527990571597
F1 Score:  0.8604930847865303
