In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("kidney_disease.csv")

# Match the PDF: 0 = CKD (positive), 1 = notckd (negative)
df["classification"] = df["classification"].map({"ckd": 0, "notckd": 1})

# Drop rows where the label is missing
df = df.dropna(subset=["classification"])

# Numeric features only (drop id if it exists)
X = df.select_dtypes(include=["number"]).drop(columns=["classification", "id"], errors="ignore")
y = df["classification"]

# Split first
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Impute using TRAINING means only (no leakage)
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test  = X_test.fillna(train_means)

# KNN k=5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Metrics: CKD is the positive class => pos_label=0
cm = confusion_matrix(y_test, y_pred)  # labels order [0,1] => [[TP? nope], it's [[TN,FP],[FN,TP]] for pos=1; so interpret carefully
acc  = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, pos_label=0)
rec  = recall_score(y_test, y_pred, pos_label=0)
f1   = f1_score(y_test, y_pred, pos_label=0)

print("Confusion Matrix:\n", cm)
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")

# In this kidney disease task, a True Positive (TP) means the patient truly has CKD and the model predicts CKD correctly.

# A True Negative (TN) means the patient truly does not have CKD and the model predicts notckd correctly.

# A False Positive (FP) means the model predicts CKD but the patient actually does not have CKD, which could cause unnecessary stress or follow-up tests.

# A False Negative (FN) means the model predicts notckd but the patient actually has CKD, which is dangerous because the disease case is missed.

# Accuracy alone may be misleading because it can look high even when the model performs poorly on the more important class, especially if the classes are imbalanced.

# If missing a kidney disease case is very serious, **recall (sensitivity) for CKD** is the most important metric because it measures how many real CKD cases we successfully detect.

# Maximizing recall helps minimize false negatives, which reduces the chance of failing to identify a patient who actually has CKD.


Confusion Matrix:
 [[55 21]
 [ 2 42]]
Accuracy : 0.8083
Precision: 0.9649
Recall   : 0.7237
F1-score : 0.8271
