## CKD_Classification

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


In [20]:
#Read the CSV file
data = pd.read_csv("CKD.csv")
data.head(5)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.0,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.0,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.0,12300.0,4.705597,no,no,no,yes,poor,no,yes
2,4.0,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.0,...,34.0,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.0,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.0,50.0,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.0,12400.0,4.705597,no,no,no,yes,poor,no,yes


In [22]:
# Get the information of the feature's
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399 entries, 0 to 398
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             399 non-null    float64
 1   bp              399 non-null    float64
 2   sg              399 non-null    object 
 3   al              399 non-null    float64
 4   su              399 non-null    float64
 5   rbc             399 non-null    object 
 6   pc              399 non-null    object 
 7   pcc             399 non-null    object 
 8   ba              399 non-null    object 
 9   bgr             399 non-null    float64
 10  bu              399 non-null    float64
 11  sc              399 non-null    float64
 12  sod             399 non-null    float64
 13  pot             399 non-null    float64
 14  hrmo            399 non-null    float64
 15  pcv             399 non-null    float64
 16  wc              399 non-null    float64
 17  rc              399 non-null    flo

In [28]:
#disply the categorical data with max frequency, unique and count values
data.describe(include='object')

Unnamed: 0,sg,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane,classification
count,399,399,399,399,399,399,399,399,399,399,399,399
unique,5,2,2,2,2,2,2,2,2,2,2,2
top,a,normal,normal,notpresent,notpresent,no,no,no,yes,poor,no,yes
freq,152,352,323,357,377,253,263,365,316,322,339,249


In [34]:
data.isnull().sum()

age               0
bp                0
sg                0
al                0
su                0
rbc               0
pc                0
pcc               0
ba                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hrmo              0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64

In [32]:
data.sg.unique() # for this i'll be using mapping concept to encode the values

array(['c', 'a', 'd', 'b', 'e'], dtype=object)

In [36]:
#Print the shape of data
print("No of rows :",data.shape[0])
print("No of columns",data.shape[1])


No of rows : 399
No of columns 25


In [38]:
#Since we have one object value ('Gender') - convert to numerical value
df = pd.get_dummies(data, columns=['rbc','pc','pcc','ba','htn','dm','cad','appet','pe','ane'], drop_first=True,dtype=int)
df.head(5)


Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes
0,2.0,76.459948,c,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,...,1,0,0,0,0,0,0,1,1,0
1,3.0,76.459948,c,2.0,0.0,148.112676,22.0,0.7,137.528754,4.627244,...,1,1,0,0,0,0,0,1,0,0
2,4.0,76.459948,a,1.0,0.0,99.0,23.0,0.6,138.0,4.4,...,1,1,0,0,0,0,0,1,0,0
3,5.0,76.459948,d,1.0,0.0,148.112676,16.0,0.7,138.0,3.2,...,1,1,0,0,0,0,0,1,0,1
4,5.0,50.0,c,0.0,0.0,148.112676,25.0,0.6,137.528754,4.627244,...,1,1,0,0,0,0,0,1,0,0


In [40]:
#Convert the categorical feature ('sg') to numarical using mapping concept:
mapping = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4}
df['sg_encoded'] = df['sg'].map(mapping)


In [42]:
# Drop original 'sg' column
df.drop('sg', axis=1, inplace=True)

In [44]:
df.head(5)

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,sg_encoded
0,2.0,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,0,0,0,0,0,0,1,1,0,2
1,3.0,76.459948,2.0,0.0,148.112676,22.0,0.7,137.528754,4.627244,10.7,...,1,0,0,0,0,0,1,0,0,2
2,4.0,76.459948,1.0,0.0,99.0,23.0,0.6,138.0,4.4,12.0,...,1,0,0,0,0,0,1,0,0,0
3,5.0,76.459948,1.0,0.0,148.112676,16.0,0.7,138.0,3.2,8.1,...,1,0,0,0,0,0,1,0,1,3
4,5.0,50.0,0.0,0.0,148.112676,25.0,0.6,137.528754,4.627244,11.8,...,1,0,0,0,0,0,1,0,0,2


In [66]:
df.to_csv('encoded_data.csv',index=False)

In [46]:
#After encoded the categorical data to numerical
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399 entries, 0 to 398
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             399 non-null    float64
 1   bp              399 non-null    float64
 2   al              399 non-null    float64
 3   su              399 non-null    float64
 4   bgr             399 non-null    float64
 5   bu              399 non-null    float64
 6   sc              399 non-null    float64
 7   sod             399 non-null    float64
 8   pot             399 non-null    float64
 9   hrmo            399 non-null    float64
 10  pcv             399 non-null    float64
 11  wc              399 non-null    float64
 12  rc              399 non-null    float64
 13  classification  399 non-null    object 
 14  rbc_normal      399 non-null    int32  
 15  pc_normal       399 non-null    int32  
 16  pcc_present     399 non-null    int32  
 17  ba_present      399 non-null    int

In [48]:
# split the data into X and Y
independent = df.drop(columns=['classification'])
dependent = df['classification']

In [50]:
#Get the count of each value in classification feature
dependent.value_counts()

classification
yes    249
no     150
Name: count, dtype: int64

In [52]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(independent, dependent, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [56]:
# Models dictionary
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier()
}

best_model_name = None
best_model = None
best_accuracy = 0

print("Model Evaluation:\n")
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    clf_acc = classification_report(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")
    print("classification_report :\n",clf_acc)
    
    # Save best model
    if acc > best_accuracy:
        best_accuracy = acc
        best_model_name = name
        best_model = model

print(f"\n Best Model: {best_model_name} with Accuracy: {best_accuracy:.4f}")


Model Evaluation:

Logistic Regression Accuracy: 0.9875
classification_report :
               precision    recall  f1-score   support

          no       0.97      1.00      0.99        39
         yes       1.00      0.98      0.99        41

    accuracy                           0.99        80
   macro avg       0.99      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80

Decision Tree Accuracy: 0.9750
classification_report :
               precision    recall  f1-score   support

          no       0.97      0.97      0.97        39
         yes       0.98      0.98      0.98        41

    accuracy                           0.97        80
   macro avg       0.97      0.97      0.97        80
weighted avg       0.97      0.97      0.97        80

SVM Accuracy: 0.9625
classification_report :
               precision    recall  f1-score   support

          no       0.95      0.97      0.96        39
         yes       0.97      0.95      0.96        41


In [58]:
from sklearn.model_selection import GridSearchCV

param_grid = {}

if best_model_name == "Logistic Regression":
    param_grid = {
        'C': [0.1, 1, 10],
        'solver': ['liblinear', 'lbfgs']
    }

elif best_model_name == "Decision Tree":
    param_grid = {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10]
    }

elif best_model_name == "SVM":
    param_grid = {
        'C': [0.1, 1, 10,15,20],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    }

elif best_model_name == "Random Forest":
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    }


In [60]:
grid_search = GridSearchCV(estimator=best_model, param_grid=param_grid,refit=True,
                           cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

grid_search.fit(X_train_scaled, y_train)


Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [62]:
print("\n Best Parameters Found:")
print(grid_search.best_params_)

print("\n Best Cross-Validation Accuracy:")
print(grid_search.best_score_)

# Evaluate on test set
best_tuned_model = grid_search.best_estimator_
y_pred_tuned = best_tuned_model.predict(X_test_scaled)
print("\n Test Set Accuracy After Tuning:")
print(accuracy_score(y_test, y_pred_tuned))



 Best Parameters Found:
{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}

 Best Cross-Validation Accuracy:
0.9937003968253968

 Test Set Accuracy After Tuning:
1.0


In [64]:
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred_tuned))



Detailed Classification Report:
              precision    recall  f1-score   support

          no       1.00      1.00      1.00        39
         yes       1.00      1.00      1.00        41

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80



In [68]:
## Dump into pickle file:
import pickle

In [70]:
filename = "scaler.sav" 
pickle.dump(scaler,open(filename, 'wb'))

In [72]:
filename = "ckd_finilized_classified_gridModel.sav"

In [74]:
pickle.dump(best_tuned_model,open(filename,'wb'))