In [None]:
# Importing Libraries:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import logging
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import StratifiedKFold

# from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Ensembling
from mlxtend.classifier import StackingCVClassifier

from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings('ignore') 

# Database loading and pre-processing

In [None]:
#Database loading and pre-processing
from sklearn.impute import KNNImputer
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

#Create function for checking missing values which accepts a dataframe as its parameter
def null_values_check(df):
    """Print a missing-value report for a dataframe.

    When at least one cell of ``df`` is null, a per-column count of null
    records is printed; otherwise a single line stating that nothing is
    missing is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The report is written to stdout only.

    Notes
    -----
    Any exception raised while inspecting ``df`` is logged through
    ``logging.error`` instead of being propagated, so the call never aborts
    the surrounding cell.
    """
    try:
        null_mask = df.isnull()

        # Guard clause: nothing is missing, so there is no summary to build.
        if not null_mask.values.any():
            print('There is no missing values in the dataset.')
            return

        # Column-wise count of null records
        print('Number of null records within each column:\n' + str(null_mask.sum()))

    except Exception as e:
        logging.error(e)

# Name of the CSV file holding the chronic kidney disease records
dataset_name = 'chronic_kidney_disease.csv'

# A missing file is logged rather than raised, so this cell itself does not abort.
# NOTE(review): in that case none of the names assigned below are created, so
# any later cell that uses them will fail with a NameError.
try:
    # Load the chronic kidney disease dataset into a pandas dataframe
    chronic_kidney_disease_dataframe = pd.read_csv(dataset_name)

    # Description of the dataset: number of records and attributes
    print('Shape of dataset: ' + str(chronic_kidney_disease_dataframe.shape))
    print('Total number of records in dataset = ' + str(chronic_kidney_disease_dataframe.shape[0]))
    print('Total number of attributes in dataset = ' + str(chronic_kidney_disease_dataframe.shape[1]))
    print('')

    # Missing entries are written as '?' in the file. They must become NaN
    # *before* the null check runs; otherwise isnull() cannot see them and the
    # check wrongly reports that nothing is missing.
    chronic_kidney_disease_dataframe = chronic_kidney_disease_dataframe.replace('?', np.nan)
    null_values_check(chronic_kidney_disease_dataframe)

    # Target variable
    target_class = chronic_kidney_disease_dataframe['class']
    print('\nAre there missing values in Target Class? ' + str(target_class.isna().any()))

    # Features: only the first five columns of the file are used for modelling
    feature_classes = chronic_kidney_disease_dataframe.iloc[:, 0:5]
    print('\nAre there missing values in the Features? \n' + str(feature_classes.isna().any()))

    # KNN imputation: each missing value is replaced by the mean of that
    # feature over the 5 nearest neighbours.
    # NOTE(review): the imputer and the scaler below are fitted on all rows
    # before the train/test split, so test rows influence the fitted
    # statistics — confirm this is acceptable.
    knn_missing_values_imputer = KNNImputer(n_neighbors=5)
    feature_classes = pd.DataFrame(knn_missing_values_imputer.fit_transform(feature_classes),
                                   columns=feature_classes.columns)
    print('\nNow, Are there any missing values in Features? ' + str(feature_classes.isna().any()))

    # Standardise the features. The scaler returns a bare array, so wrap it
    # back into a dataframe that keeps the real column names; a hard-coded
    # list here would mislabel the columns.
    standard_feature_scaler = StandardScaler()
    feature_classes = pd.DataFrame(standard_feature_scaler.fit_transform(feature_classes),
                                   columns=feature_classes.columns)

    # Encode the target class as integers using label encoding
    target_label_encoder = preprocessing.LabelEncoder()
    target_class = target_label_encoder.fit_transform(target_class)

    y = target_class

    # Split into training and testing data with stratified 5-fold indices.
    # Each iteration overwrites the previous split, so only the last fold's
    # train/test partition is left in X_train / X_test / y_train / y_test.
    kf = StratifiedKFold(n_splits=5, random_state=None)
    for train_index, test_index in kf.split(feature_classes, y):
        X_train, X_test = feature_classes.iloc[train_index, :], feature_classes.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]

except FileNotFoundError as e:
    logging.error(e)

Shape of dataset: (400, 25)
Total number of records in dataset = 400
Total number of attributes in dataset = 25

There is no missing values in the dataset.

Are there missing values in Target Class? False

Are there missing values in the Features? 
age    True
bp     True
sg     True
al     True
su     True
dtype: bool

Now, Are there any missing values in Features? age    False
bp     False
sg     False
al     False
su     False
dtype: bool


In [None]:
# Preview the first rows of the loaded dataframe.
chronic_kidney_disease_dataframe.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48,80,1.02,1,0,0,0,1,1,121.0,...,44,7800,5.2,0,0,1,1,1,1,ckd
1,7,50,1.02,4,0,0,0,1,1,,...,38,6000,0.0,1,1,1,1,1,1,ckd
2,62,80,1.01,2,3,0,0,1,1,423.0,...,31,7500,0.0,1,0,1,0,1,0,ckd
3,48,70,1.005,4,0,0,1,0,1,117.0,...,32,6700,3.9,0,1,1,0,0,0,ckd
4,51,80,1.01,2,0,0,0,1,1,106.0,...,35,7300,4.6,1,1,1,1,1,1,ckd


In [None]:
# Dimensions of the dataframe as (rows, columns).
chronic_kidney_disease_dataframe.shape

(400, 25)

In [None]:
# Summary statistics. At this point only the columns that were read as
# numeric are summarised; the columns still held as object dtype are left out.
chronic_kidney_disease_dataframe.describe()

Unnamed: 0,rbc,pc,pcc,ba,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,0.1175,0.19,0.885,0.935,31.9825,6178.5,3.16575,0.6275,0.645,0.905,0.7925,0.8075,0.8475
std,0.322418,0.392792,0.319421,0.246835,16.962799,4490.489839,2.36621,0.484076,0.479113,0.293582,0.406024,0.394757,0.359955
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,1.0,26.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
50%,0.0,0.0,1.0,1.0,37.0,6900.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,0.0,0.0,1.0,1.0,44.0,9400.0,5.1,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,54.0,26400.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Data type of every column, to see which ones are still object dtype
# and therefore need converting before numeric work.
chronic_kidney_disease_dataframe.dtypes

age       object
bp        object
sg        object
al        object
su        object
rbc        int64
pc         int64
pcc        int64
ba         int64
bgr       object
bu        object
sc        object
sod       object
pot       object
hemo      object
pcv        int64
wbcc       int64
rbcc     float64
htn        int64
dm         int64
cad        int64
appet      int64
pe         int64
ane        int64
class     object
dtype: object

In [None]:
# Preview the first rows again; 'class' still holds its text labels here.
chronic_kidney_disease_dataframe.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48,80,1.02,1,0,0,0,1,1,121.0,...,44,7800,5.2,0,0,1,1,1,1,ckd
1,7,50,1.02,4,0,0,0,1,1,,...,38,6000,0.0,1,1,1,1,1,1,ckd
2,62,80,1.01,2,3,0,0,1,1,423.0,...,31,7500,0.0,1,0,1,0,1,0,ckd
3,48,70,1.005,4,0,0,1,0,1,117.0,...,32,6700,3.9,0,1,1,0,0,0,ckd
4,51,80,1.01,2,0,0,0,1,1,106.0,...,35,7300,4.6,1,1,1,1,1,1,ckd


# Encoding the categorical class label as a binary value

In [None]:
# Encode the target column: 1 for 'ckd', 0 for every other label.
class_labels = chronic_kidney_disease_dataframe['class']

# Only encode while the column still holds text labels. Without this guard,
# re-running the cell on the already-encoded column would compare the integers
# with "ckd" and silently turn every row into 0.
if not pd.api.types.is_numeric_dtype(class_labels):
    # Fold the tab-suffixed variant 'ckd\t' into 'ckd' before comparing.
    class_labels = class_labels.replace(to_replace={'ckd\t': 'ckd'})
    chronic_kidney_disease_dataframe['class'] = (class_labels == 'ckd').astype('int64')

# Display the class balance as the cell output (previously this expression
# came first in the cell, so its result was discarded).
chronic_kidney_disease_dataframe['class'].value_counts()

In [None]:
# Preview the first rows to confirm that 'class' now holds the numeric encoding.
chronic_kidney_disease_dataframe.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48,80,1.02,1,0,0,0,1,1,121.0,...,44,7800,5.2,0,0,1,1,1,1,1
1,7,50,1.02,4,0,0,0,1,1,,...,38,6000,0.0,1,1,1,1,1,1,1
2,62,80,1.01,2,3,0,0,1,1,423.0,...,31,7500,0.0,1,0,1,0,1,0,1
3,48,70,1.005,4,0,0,1,0,1,117.0,...,32,6700,3.9,0,1,1,0,0,0,1
4,51,80,1.01,2,0,0,0,1,1,106.0,...,35,7300,4.6,1,1,1,1,1,1,1


# Converting all the data types to float

In [None]:
# Number of columns in the dataframe; the last column is the label.
colLength = len(chronic_kidney_disease_dataframe.columns)

# Every column except the last one is a feature. Deriving the cut-off from the
# column count (instead of a hard-coded 25) keeps this cell correct if the
# number of columns ever changes.
feature_column_names = chronic_kidney_disease_dataframe.columns[:max(colLength - 1, 0)]

# Cast all feature columns to float64 in one call; the label column is left as is.
chronic_kidney_disease_dataframe = chronic_kidney_disease_dataframe.astype(
    {column_name: 'float64' for column_name in feature_column_names}
)

In [None]:
# Per-column non-null counts and dtypes, to verify the float conversion.
chronic_kidney_disease_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     391 non-null    float64
 1   bp      388 non-null    float64
 2   sg      353 non-null    float64
 3   al      354 non-null    float64
 4   su      351 non-null    float64
 5   rbc     400 non-null    float64
 6   pc      400 non-null    float64
 7   pcc     400 non-null    float64
 8   ba      400 non-null    float64
 9   bgr     356 non-null    float64
 10  bu      381 non-null    float64
 11  sc      383 non-null    float64
 12  sod     313 non-null    float64
 13  pot     312 non-null    float64
 14  hemo    348 non-null    float64
 15  pcv     400 non-null    float64
 16  wbcc    400 non-null    float64
 17  rbcc    400 non-null    float64
 18  htn     400 non-null    float64
 19  dm      400 non-null    float64
 20  cad     400 non-null    float64
 21  appet   400 non-null    float64
 22  pe

In [None]:
# Summary statistics again, now that the previously object-typed columns
# have been converted to numeric values.
chronic_kidney_disease_dataframe.describe()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
count,391.0,388.0,353.0,354.0,351.0,400.0,400.0,400.0,400.0,356.0,...,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,51.483376,76.469072,1.017408,1.016949,0.450142,0.1175,0.19,0.885,0.935,148.036517,...,31.9825,6178.5,3.16575,0.6275,0.645,0.905,0.7925,0.8075,0.8475,0.625
std,17.169714,13.683637,0.005717,1.352679,1.099191,0.322418,0.392792,0.319421,0.246835,79.281714,...,16.962799,4490.489839,2.36621,0.484076,0.479113,0.293582,0.406024,0.394757,0.359955,0.484729
min,2.0,50.0,1.005,0.0,0.0,0.0,0.0,0.0,0.0,22.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,42.0,70.0,1.01,0.0,0.0,0.0,0.0,1.0,1.0,99.0,...,26.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
50%,55.0,80.0,1.02,0.0,0.0,0.0,0.0,1.0,1.0,121.0,...,37.0,6900.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,64.5,80.0,1.02,2.0,0.0,0.0,0.0,1.0,1.0,163.0,...,44.0,9400.0,5.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,90.0,180.0,1.025,5.0,5.0,1.0,1.0,1.0,1.0,490.0,...,54.0,26400.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Checking for the null values

In [None]:
# Checking missing (NaN) values: null count per column, largest first.
chronic_kidney_disease_dataframe.isnull().sum().sort_values(ascending=False)

pot      88
sod      87
hemo     52
su       49
sg       47
al       46
bgr      44
bu       19
sc       17
bp       12
age       9
ba        0
pcc       0
pc        0
rbc       0
pcv       0
wbcc      0
rbcc      0
htn       0
dm        0
cad       0
appet     0
pe        0
ane       0
class     0
dtype: int64

# Handling null values

In [None]:
# List the column labels; used to write out the feature list in the next cell.
chronic_kidney_disease_dataframe.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'class'],
      dtype='object')

In [None]:
# Names of the 24 predictor columns, i.e. every column except the 'class' label.
features = (
    'age bp sg al su rbc pc pcc ba bgr bu '
    'sc sod pot hemo pcv wbcc rbcc htn dm cad '
    'appet pe ane'
).split()

# Replacing the null values with each column's median

In [None]:
# Median of every feature column; pandas skips NaN when computing a median.
feature_medians = {
    feature: chronic_kidney_disease_dataframe[feature].median()
    for feature in features
}

# Fill the gaps of each feature column with that column's own median.
# NOTE(review): X_train / X_test used by the model cells were built earlier
# from the KNN-imputed features, so this fill does not appear to reach the
# models — confirm whether that is intended.
for feature, median_value in feature_medians.items():
    filled_column = chronic_kidney_disease_dataframe[feature].fillna(median_value)
    chronic_kidney_disease_dataframe[feature] = filled_column

In [None]:
# Number of columns that still contain at least one null value (0 means none remain).
chronic_kidney_disease_dataframe.isnull().any().sum()

0

# Ensembling technique

In [None]:
# importing utility modules
import pandas as pd

# importing the base classifiers and the voting ensemble
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Fixed seed so the tree-based models, and therefore the reported accuracy,
# are the same on every run.
RANDOM_STATE = 42

# Base classifiers; everything except the seed is left at its default.
rf = RandomForestClassifier(random_state=RANDOM_STATE)
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
lg = LogisticRegression()

model_1 = dt
model_2 = rf
model_3 = lg

# Final model: hard voting, i.e. the majority class label across the three
# base classifiers is the prediction.
final_model = VotingClassifier(
    estimators=[('dt', model_1), ('rf', model_2), ('lg', model_3)],
    voting='hard',
)

# Train on the training split. X_train / y_train (and X_test / y_test below)
# come from the preprocessing cell and hold the last stratified fold only.
final_model.fit(X_train, y_train)

# Predict on the held-out split
pred_final = final_model.predict(X_test)

# Print the accuracy (as a percentage) of the predictions against the true labels
print("Accuracy: ",metrics.accuracy_score(y_test, pred_final)*100)

Accuracy:  97.5


## Stacking Technique

In [None]:
from mlxtend.classifier import StackingCVClassifier

# Fixed seed so the tree-based models and the internal cross-validation
# shuffle of the stacking classifier give the same result on every run.
RANDOM_STATE = 42

# Fresh base classifiers; the estimator classes are imported in the voting cell above.
rf = RandomForestClassifier(random_state=RANDOM_STATE)
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
lg = LogisticRegression()

model_1 = dt
model_2 = rf
model_3 = lg

# Stack the three base classifiers; a decision tree combines their predictions.
scv = StackingCVClassifier(
    classifiers=[model_1, model_2, model_3],
    meta_classifier=dt,
    random_state=RANDOM_STATE,
)
scv.fit(X_train, y_train)

# Evaluate on the held-out split (the last stratified fold from the preprocessing cell)
scv_predicted = scv.predict(X_test)
scv_conf_matrix = metrics.confusion_matrix(y_test, scv_predicted)
scv_acc_score = metrics.accuracy_score(y_test, scv_predicted)

print("confussion matrix")
print(scv_conf_matrix)
print("\n")
print("Accuracy of StackingCVClassifier:",scv_acc_score*100,'\n')
print(metrics.classification_report(y_test,scv_predicted))

confussion matrix
[[48  2]
 [ 0 30]]


Accuracy of StackingCVClassifier: 97.5 

              precision    recall  f1-score   support

           0       1.00      0.96      0.98        50
           1       0.94      1.00      0.97        30

    accuracy                           0.97        80
   macro avg       0.97      0.98      0.97        80
weighted avg       0.98      0.97      0.98        80

