<a href="https://colab.research.google.com/github/harsh2k1/Project-Multiple-Disease-Prediction-Web-App/blob/main/kidneyDiseasePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# importing libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split , KFold , StratifiedKFold
kfold = StratifiedKFold(n_splits=5)
from sklearn.metrics import confusion_matrix , accuracy_score , classification_report , plot_confusion_matrix , \
plot_precision_recall_curve, plot_roc_curve , roc_auc_score , recall_score
from sklearn.preprocessing import StandardScaler , RobustScaler , MinMaxScaler , LabelEncoder
from sklearn.model_selection import GridSearchCV , RandomizedSearchCV , cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier 
from sklearn.svm import SVC
import statistics as stats

In [2]:
path = 'kidney_disease.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [3]:
df.shape

(400, 26)

In [4]:
df.isnull().sum()

id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [6]:
df.fillna(df[df.columns].mean(),inplace = True)

In [7]:
df.isnull().sum()

id                  0
age                 0
bp                  0
sg                  0
al                  0
su                  0
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                 0
bu                  0
sc                  0
sod                 0
pot                 0
hemo                0
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [8]:
df.shape

(400, 26)

In [9]:
columns = ['rbc','pc','pcc','ba','pcv','wc','rc','htn','dm','cad','appet','pe','ane','classification']
for column in columns:
    print('Unique values in ',column,'are: ',df[column].unique())

Unique values in  rbc are:  [nan 'normal' 'abnormal']
Unique values in  pc are:  ['normal' 'abnormal' nan]
Unique values in  pcc are:  ['notpresent' 'present' nan]
Unique values in  ba are:  ['notpresent' 'present' nan]
Unique values in  pcv are:  ['44' '38' '31' '32' '35' '39' '36' '33' '29' '28' nan '16' '24' '37' '30'
 '34' '40' '45' '27' '48' '\t?' '52' '14' '22' '18' '42' '17' '46' '23'
 '19' '25' '41' '26' '15' '21' '43' '20' '\t43' '47' '9' '49' '50' '53'
 '51' '54']
Unique values in  wc are:  ['7800' '6000' '7500' '6700' '7300' nan '6900' '9600' '12100' '4500'
 '12200' '11000' '3800' '11400' '5300' '9200' '6200' '8300' '8400' '10300'
 '9800' '9100' '7900' '6400' '8600' '18900' '21600' '4300' '8500' '11300'
 '7200' '7700' '14600' '6300' '\t6200' '7100' '11800' '9400' '5500' '5800'
 '13200' '12500' '5600' '7000' '11900' '10400' '10700' '12700' '6800'
 '6500' '13600' '10200' '9000' '14900' '8200' '15200' '5000' '16300'
 '12400' '\t8400' '10500' '4200' '4700' '10900' '8100' '9500' 

In [10]:
columns = ['pcv','wc','rc']
for column in columns:
    df[column] = pd.to_numeric(df[column], errors = 'coerce')

In [11]:
for column in columns:
    print(df[column].dtype)

float64
float64
float64


In [12]:
columns

['pcv', 'wc', 'rc']

In [13]:
for column in columns:
    df[column].fillna(df[column].mean(), inplace = True)

In [14]:
columns = ['rbc','pc','pcc','ba','htn','dm','cad','appet','pe','ane','classification']
for column in columns:
    print('Unique values in ',column,'are: ',df[column].unique())

Unique values in  rbc are:  [nan 'normal' 'abnormal']
Unique values in  pc are:  ['normal' 'abnormal' nan]
Unique values in  pcc are:  ['notpresent' 'present' nan]
Unique values in  ba are:  ['notpresent' 'present' nan]
Unique values in  htn are:  ['yes' 'no' nan]
Unique values in  dm are:  ['yes' 'no' ' yes' '\tno' '\tyes' nan]
Unique values in  cad are:  ['no' 'yes' '\tno' nan]
Unique values in  appet are:  ['good' 'poor' nan]
Unique values in  pe are:  ['no' 'yes' nan]
Unique values in  ane are:  ['no' 'yes' nan]
Unique values in  classification are:  ['ckd' 'ckd\t' 'notckd']


In [15]:
df['dm'].replace({'\tno':'no' , '\tyes':'yes',' yes':'yes',np.nan : stats.mode(df['dm'])},inplace = True)
df['dm'].unique()

array(['yes', 'no'], dtype=object)

In [16]:
df['cad'].replace({'\tno':'no',np.nan : stats.mode(df['cad'])},inplace = True)
df['cad'].unique()

array(['no', 'yes'], dtype=object)

In [17]:
df['classification'].replace({'ckd\t':'ckd'},inplace = True)
df['classification'].unique()

array(['ckd', 'notckd'], dtype=object)

In [18]:
df = pd.get_dummies(df, drop_first=True)
df.shape

(400, 26)

In [19]:
df.columns

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot',
       'hemo', 'pcv', 'wc', 'rc', 'rbc_normal', 'pc_normal', 'pcc_present',
       'ba_present', 'htn_yes', 'dm_yes', 'cad_yes', 'appet_poor', 'pe_yes',
       'ane_yes', 'classification_notckd'],
      dtype='object')

In [20]:
df.rename({'classification_notckd':'target'},axis = 1,inplace = True)
df['target'].replace({0:1 , 1:0})
df['target'].unique()

array([0, 1], dtype=uint8)

In [21]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [22]:
X_train , X_test , y_train , y_test = train_test_split(X,y , test_size = 0.20 , random_state = 42)
X_train.shape , X_test.shape , y_train.shape , y_test.shape

((320, 25), (80, 25), (320,), (80,))

In [23]:
# Creating a dataframe to keep record of auc scores
model = []
roc_auc = []
def performance(new_model , new_score):
    model.append(new_model)
    roc_auc.append(new_score)
    score_df = pd.DataFrame(zip(model , roc_auc) , columns = ['Model', 'ROC_AUC_Score'])
    #print(score_df)
    return score_df

In [24]:
# Define a function to compute Precision, Recall and F1 score
model_name = []
precision_list = []
recall_list = []
F1_list = []
def get_pre_rec_f1(model , new_model_name):

    if new_model_name == 'Artificial Neural Network':
        model_name.append(new_model_name)
        proba = model.predict(X_test)
        y_pred_ann = (proba > 0.5)
        #y_pred_ann = clf_ann.predict(X_test)
        roc_auc_ann = roc_auc_score(y_test, y_pred_ann)
        tp, fp, fn, tn = confusion_matrix(y_test, y_pred_ann).ravel()
        precision = tp / (tp + fp)
        precision_list.append(precision)
        recall = tp / (tp + fn)
        recall_list.append(recall)
        F1 = (2 * precision * recall) / (precision + recall)
        F1_list.append(F1)
        evaluation_df = pd.DataFrame(zip(model_name , precision_list , recall_list , F1_list) , columns = \
                                 ['Model','Precision', 'Recall', 'F1_Score'])
        #print(f'Precision:{precision:.3f}\nRecall:{recall:.3f}\nF1 score:{F1:.3f}')
        return evaluation_df
    else: 
        model_name.append(new_model_name)
        y_pred = model.predict(X_test)
        tp, fp, fn, tn = confusion_matrix(y_test, y_pred).ravel()
        precision = tp / (tp + fp)
        precision_list.append(precision)
        recall = tp / (tp + fn)
        recall_list.append(recall)
        F1 = (2 * precision * recall) / (precision + recall)
        F1_list.append(F1)
        evaluation_df = pd.DataFrame(zip(model_name , precision_list , recall_list , F1_list) , columns = \
                                 ['Model','Precision', 'Recall', 'F1_Score'])
        #print(f'Precision:{precision:.3f}\nRecall:{recall:.3f}\nF1 score:{F1:.3f}')
        return evaluation_df

# Random Forest Model

In [25]:
# Random Forest
# Hyperparameter tuning

# Create parameter grid  
param_grid = {
    'max_depth': [60, 90, 110],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300]
}

# Instantiate the model
clf_rf = RandomForestClassifier()

# Instantiate grid search model
grid_search = GridSearchCV(estimator = clf_rf, param_grid = param_grid,    
                          cv = 3, n_jobs = -1, verbose = 1)

# Fit grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 81 candidates, totalling 243 fits


{'max_depth': 60,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 100}

In [26]:
# Use the best parameters to fit the model

clf_rf = RandomForestClassifier(**grid_search.best_params_)   
clf_rf.fit(X_train,y_train)

scores = cross_val_score(clf_rf, X_train, y_train, scoring ="roc_auc", cv = 5)
roc_auc_rf = np.mean(scores.mean())
performance('Random Forest',roc_auc_rf)

Unnamed: 0,Model,ROC_AUC_Score
0,Random Forest,1.0


In [27]:
get_pre_rec_f1(clf_rf,'Random Forest')

Unnamed: 0,Model,Precision,Recall,F1_Score
0,Random Forest,1.0,1.0,1.0


# XGBoost

In [28]:

# XGBoost
from sklearn.model_selection import RandomizedSearchCV
# Number of trees
n_estimators = np.arange(200,1000,200)

# Minimum loss reduction required to make a further partition on a leaf node of the tree
# The larger gamma is, the more conservative the algorithm will be
gamma = np.arange(0.1,0.4,0.1)

# Default 0.3, range(0,1)
learning_rate = np.arange(0.1,0.6,0.1)

# Maximum number of levels in tree
max_depth = list(range(3,8,1))

# Subsample ratio of the training instances.Range(0,1)
subsample = np.arange(0.5,0.9,0.1)

# Subsample ratio of columns when constructing each tree. Range(0,1)
colsample_bytree = np.arange(0.5,0.9,0.1)

# Control the balance of positive and negative weights
# Sum(negative instances) / sum(positive instances)
scale_pos_weight = [1,3.5]


# Create the random grid
random_grid_xgb = {'n_estimators': n_estimators,
                   'gamma': gamma,
                   'learning_rate':learning_rate,
                   'max_depth': max_depth,
                   'subsample':subsample,
                   'colsample_bytree':colsample_bytree,
                   'scale_pos_weight':scale_pos_weight
                  }
print(random_grid_xgb)

{'n_estimators': array([200, 400, 600, 800]), 'gamma': array([0.1, 0.2, 0.3, 0.4]), 'learning_rate': array([0.1, 0.2, 0.3, 0.4, 0.5]), 'max_depth': [3, 4, 5, 6, 7], 'subsample': array([0.5, 0.6, 0.7, 0.8]), 'colsample_bytree': array([0.5, 0.6, 0.7, 0.8]), 'scale_pos_weight': [1, 3.5]}


In [29]:
# Use randomized search to find best parameters

xgboost = XGBClassifier()
xgb_random = RandomizedSearchCV(estimator = xgboost, 
                                param_distributions = random_grid_xgb, 
                                n_iter = 5, 
                                cv = 3, 
                                verbose=1, 
                                random_state=42, 
                                n_jobs = -1,
                                scoring ='roc_auc')


xgb_random.fit(X_train, y_train)   

print(xgb_random.best_params_,xgb_random.best_score_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
{'subsample': 0.7, 'scale_pos_weight': 3.5, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.2, 'colsample_bytree': 0.7} 1.0


In [30]:
# Use the best parameters to fit the model

clf_xgb = XGBClassifier(**xgb_random.best_params_)   
clf_xgb.fit(X_train,y_train)
scores = cross_val_score(clf_xgb, X_train, y_train, scoring ="roc_auc", cv = 5)
roc_auc_xgb = np.mean(scores.mean())
performance('XGBoost',clf_xgb)



Unnamed: 0,Model,ROC_AUC_Score
0,Random Forest,1
1,XGBoost,"XGBClassifier(base_score=0.5, booster='gbtree'..."


In [31]:
get_pre_rec_f1(clf_xgb,'XGBoost')

Unnamed: 0,Model,Precision,Recall,F1_Score
0,Random Forest,1.0,1.0,1.0
1,XGBoost,1.0,1.0,1.0


# Voting Classifier

In [33]:
# Voting Classifier
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier

hard_voting_clf = VotingClassifier(estimators=[('lr',LogisticRegression()),('svm', SVC(kernel='rbf')),
                                   ('dt', DecisionTreeClassifier()), ('rf',clf_rf ) , ('xgb', clf_xgb)],voting = 'hard')
hard_voting_clf.fit(X_train, y_train)
get_pre_rec_f1(hard_voting_clf,'Voting Classifier')



Unnamed: 0,Model,Precision,Recall,F1_Score
0,Random Forest,1.0,1.0,1.0
1,XGBoost,1.0,1.0,1.0
2,Voting Classifier,1.0,1.0,1.0


In [34]:
auc = 0.99
print('ROC_AUC = ',auc)

ROC_AUC =  0.99
