In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("kidney_disease.csv")

# Match the PDF: 0 = CKD (positive), 1 = notckd (negative)
df["classification"] = df["classification"].map({"ckd": 0, "notckd": 1})

# Drop rows where the label is missing
df = df.dropna(subset=["classification"])

# Numeric features only (drop id if it exists)
X = df.select_dtypes(include=["number"]).drop(columns=["classification", "id"], errors="ignore")
y = df["classification"]

# Split first
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Impute using TRAINING means only (no leakage)
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test  = X_test.fillna(train_means)

k_values = [1, 3, 5, 7, 9]
accuracy_results = []

for k in k_values:
    knn_model = KNeighborsClassifier(n_neighbors=5)
    knn_model.fit(X_train, y_train)

    predicted_y = knn_model.predict(X_test)

    cm = confusion_matrix(y_test, predicted_y)
    acc = accuracy_score(y_test, predicted_y)
    accuracy_results.append({"k": k, "Test_Accuracy": acc})

# Results table
results_df = pd.DataFrame(accuracy_results)
print(results_df.to_string(index=False))

# Best k
best_row = results_df.loc[results_df["Test_Accuracy"].idxmax()]
best_k = int(best_row["k"])
best_accuracy = float(best_row["Test_Accuracy"])
print(f"\nBest k = {best_k} with test accuracy = {best_accuracy:.4f}")

# Changing k controls how “local” the decision boundary is: small k uses very nearby neighbors, large k averages across more points..Very small k can overfit because the model becomes sensitive to noise or unusual training points and may memorize quirks.Very large k can underfit because it smooths too much and may ignore meaningful local patterns that separate CKD vs notckd. In practice, we compare several k values on a test (or validation) set to find a balance that generalizes well.

 k  Test_Accuracy
 1       0.808333
 3       0.808333
 5       0.808333
 7       0.808333
 9       0.808333

Best k = 1 with test accuracy = 0.8083
