In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw table; the CSV is read from the current working directory.
df = pd.read_csv("kidney_disease.csv")

# Match the PDF: 0 = CKD (positive), 1 = notckd (negative).
# Strip surrounding whitespace before mapping: Series.map turns every value
# that is not an exact dictionary key into NaN, so a label such as "ckd\t"
# would otherwise be treated as missing and silently dropped below.
labels = df["classification"].astype(str).str.strip()
df["classification"] = labels.map({"ckd": 0, "notckd": 1})

# Drop rows where the label is missing (or is not one of the two known classes)
df = df.dropna(subset=["classification"])

# Numeric features only. The label itself is numeric after the mapping, so it
# must be removed from X; "id" is dropped too when present (errors="ignore"
# keeps this from failing if the column does not exist).
X = df.select_dtypes(include=["number"]).drop(columns=["classification", "id"], errors="ignore")
y = df["classification"]

# Split first: 70% train / 30% test, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Impute using TRAINING means only (no leakage): the test set is filled with
# statistics computed from the training rows, never from its own values.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test  = X_test.fillna(train_means)

# We should not train and test a model on the same data: because the model has already seen that data during training, it can memorize the patterns. Testing on the same data makes the results look artificially high and doesn't tell us how the model will perform on new, unseen cases.

# The testing set is held out and not used for training. Its purpose is to simulate new patient data and give an unbiased estimate of how well the trained model generalizes in the real world.