In [1]:
# STEP 1: INSTALL AND IMPORT LIBRARIES
%pip install numpy pandas scikit-learn matplotlib seaborn joblib

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


DEPRECATION: Loading egg at c:\program files\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 25.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# STEP 2: LOAD AND INSPECT THE DATA
file_path = '../data/kidney_disease.csv'
df = pd.read_csv(file_path)

print("\n--- First 5 rows of the dataset ---")
print(df.head())

print("\n--- Dataset Info ---")
df.info()



--- First 5 rows of the dataset ---
   id   age    bp     sg   al   su     rbc        pc         pcc          ba  \
0   0  48.0  80.0  1.020  1.0  0.0     NaN    normal  notpresent  notpresent   
1   1   7.0  50.0  1.020  4.0  0.0     NaN    normal  notpresent  notpresent   
2   2  62.0  80.0  1.010  2.0  3.0  normal    normal  notpresent  notpresent   
3   3  48.0  70.0  1.005  4.0  0.0  normal  abnormal     present  notpresent   
4   4  51.0  80.0  1.010  2.0  0.0  normal    normal  notpresent  notpresent   

   ...  pcv    wc   rc  htn   dm  cad appet   pe  ane classification  
0  ...   44  7800  5.2  yes  yes   no  good   no   no            ckd  
1  ...   38  6000  NaN   no   no   no  good   no   no            ckd  
2  ...   31  7500  NaN   no  yes   no  poor   no  yes            ckd  
3  ...   32  6700  3.9  yes   no   no  poor  yes  yes            ckd  
4  ...   35  7300  4.6   no   no   no  good   no   no            ckd  

[5 rows x 26 columns]

--- Dataset Info ---
<class 'pan

In [3]:
# STEP 3: DATA CLEANING

# The 'id' column is just an identifier, so we don't need it.
df.drop('id', axis=1, inplace=True)

# The target column 'classification' has text 'ckd' or 'notckd'.
# Machine learning models only understand numbers, so we'll convert them:
# 'ckd' will become 1 (meaning "disease present")
# 'notckd' will become 0 (meaning "disease not present")
df['classification'] = df['classification'].map({'ckd': 1, 'notckd': 0})

# Some columns that should be numbers are stored as text ('object').
# We need to convert them to numeric types. 'coerce' will turn any bad values into 'NaN' (Not a Number).
numeric_cols_to_fix = ['pcv', 'wc', 'rc']
for col in numeric_cols_to_fix:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Other columns have 'yes'/'no', 'present'/'notpresent', etc.
# We will convert all of these into 1s and 0s.
binary_map_cols = ['htn', 'dm', 'cad', 'pe', 'ane']
for col in binary_map_cols:
    df[col] = df[col].map({'yes': 1, 'no': 0})
df['appet'] = df['appet'].map({'good': 1, 'poor': 0})
df['pcc'] = df['pcc'].map({'present': 1, 'notpresent': 0})
df['ba'] = df['ba'].map({'present': 1, 'notpresent': 0})
df['rbc'] = df['rbc'].map({'normal': 1, 'abnormal': 0})
df['pc'] = df['pc'].map({'normal': 1, 'abnormal': 0})

# Now we need to handle the missing values (NaNs).
# We will fill them with the median (the middle value) of each column.
# This is a good strategy because the median isn't skewed by very high or low values.
for col in df.columns:
    if df[col].dtype == 'object': # If it's a text column (which it shouldn't be now)
        df[col].fillna(df[col].mode()[0], inplace=True)
    else: # If it's a number column
        df[col].fillna(df[col].median(), inplace=True)

print("\n--- Data after cleaning ---")
print(df.head())
print(f"\nTotal missing values after cleaning: {df.isnull().sum().sum()}")


--- Data after cleaning ---
    age    bp     sg   al   su  rbc   pc  pcc   ba    bgr  ...   pcv      wc  \
0  48.0  80.0  1.020  1.0  0.0  1.0  1.0  0.0  0.0  121.0  ...  44.0  7800.0   
1   7.0  50.0  1.020  4.0  0.0  1.0  1.0  0.0  0.0  121.0  ...  38.0  6000.0   
2  62.0  80.0  1.010  2.0  3.0  1.0  1.0  0.0  0.0  423.0  ...  31.0  7500.0   
3  48.0  70.0  1.005  4.0  0.0  1.0  0.0  1.0  0.0  117.0  ...  32.0  6700.0   
4  51.0  80.0  1.010  2.0  0.0  1.0  1.0  0.0  0.0  106.0  ...  35.0  7300.0   

    rc  htn   dm  cad  appet   pe  ane  classification  
0  5.2  1.0  1.0  0.0    1.0  0.0  0.0             1.0  
1  4.8  0.0  0.0  0.0    1.0  0.0  0.0             1.0  
2  4.8  0.0  1.0  0.0    0.0  0.0  1.0             1.0  
3  3.9  1.0  0.0  0.0    0.0  1.0  1.0             1.0  
4  4.6  0.0  0.0  0.0    1.0  0.0  0.0             1.0  

[5 rows x 25 columns]

Total missing values after cleaning: 0


In [4]:
# STEP 4: SEPARATE FEATURES (X) AND TARGET (y)
# 'X' is our input features (the patient's data).
# 'y' is our target output (whether they have the disease or not).
X = df.drop('classification', axis=1)
y = df['classification']

In [5]:
# STEP 5: SPLIT DATA INTO TRAINING AND TESTING SETS
# We will use 80% of the data to teach the model (training set)
# and 20% to test how well it learned (testing set).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [6]:
# STEP 6: FEATURE SCALING
# We need to put all our features on the same scale.
# For example, 'age' (e.g., 50) is much larger than 'blood_urea' (e.g., 1.2).
# Scaling fixes this, which helps the model learn better.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Use the same scaler for the test data
X_test_scaled = scaler.transform(X_test)

In [7]:
# STEP 7: TRAIN THE MODEL
# Model: RandomForestClassifier
print("\n--- Training the model... ---")
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)
print("Model training complete!")



--- Training the model... ---
Model training complete!


In [8]:
# STEP 8: EVALUATE THE MODEL
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print(f"\n--- Model Performance ---")
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))



--- Model Performance ---
Accuracy: 100.00%

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        30
         1.0       1.00      1.00      1.00        50

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80


Confusion Matrix:
 [[30  0]
 [ 0 50]]


In [9]:
# STEP 9: SAVE THE MODEL AND SCALER
model_path = '../saved_models/kidney_model.joblib'
scaler_path = '../saved_models/kidney_scaler.joblib'

joblib.dump(model, model_path)
joblib.dump(scaler, scaler_path)

print(f"\nModel saved to: {model_path}")
print(f"Scaler saved to: {scaler_path}")



Model saved to: ../saved_models/kidney_model.joblib
Scaler saved to: ../saved_models/kidney_scaler.joblib
