In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Load the CSV, treat '?' entries as missing values, and keep only the rows
# that have no missing entries at all.
kidney_disease_data = (
    pd.read_csv("kidney_disease.csv")
    .replace('?', pd.NA)
    .dropna()
)

# The "classification" column is the target; every other column is a feature.
# Non-numeric feature columns are one-hot encoded by get_dummies.
y = kidney_disease_data["classification"]
X = pd.get_dummies(kidney_disease_data.drop(columns=["classification"]))

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit a K-Nearest Neighbors classifier (k = 5) on the training split.
# fit() returns the fitted estimator, so it can be bound directly.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predicted class label for every row of the held-out test split.
y_pred = knn_model.predict(X_test)

# Compute and display the confusion matrix.
# Rows are the true labels and columns are the predicted labels; with no
# explicit `labels` argument scikit-learn orders the classes by sorted label.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Compute Accuracy, Precision, Recall and F1-score.
# 'ckd' is treated as the positive class for the three class-specific metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='ckd')
recall = recall_score(y_test, y_pred, pos_label='ckd')
f1 = f1_score(y_test, y_pred, pos_label='ckd')

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
# The Recall row must show `recall`, not `precision`; the two only coincide
# by chance (e.g. when the classifier is perfect on the test split).
print(f"Recall:    {recall:.4f}")
print(f"F1-score:  {f1:.4f}")


# In this kidney disease task, a True Positive (TP) means the patient truly has CKD and the model predicts CKD correctly.

# A True Negative (TN) means the patient truly does not have CKD and the model predicts notckd correctly.

# A False Positive (FP) means the model predicts CKD but the patient actually does not have CKD, which could cause unnecessary stress or follow-up tests.

# A False Negative (FN) means the model predicts notckd but the patient actually has CKD, which is dangerous because the disease case is missed.

# Accuracy alone may be misleading because it can look high even when the model performs poorly on the more important class, especially if the classes are imbalanced.

# If missing a kidney disease case is very serious, recall (sensitivity) for CKD is the most important metric because it measures how many real CKD cases we successfully detect.

# Maximizing recall helps minimize false negatives, which reduces the chance of failing to identify a patient who actually has CKD.


Confusion Matrix:
 [[13  0]
 [ 0 35]]
Accuracy:  1.0000
Precision: 1.0000
Recall:    1.0000
F1-score:  1.0000
