Data Read

In [32]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

In [18]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
df=pd.read_csv("kidney_disease.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [4]:
# '?' character remove process in the dataset

for i in ['rc','wc','pcv']:
    df[i] = df[i].str.extract('(\d+)').astype(float)

In [5]:
# Filling missing numeric data in the dataset with mean

for i in ['age','bp','sg','al','su','bgr','bu','sc','sod','pot','hemo','rc','wc','pcv']:
    df[i].fillna(df[i].mean(),inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             400 non-null    float64
 2   bp              400 non-null    float64
 3   sg              400 non-null    float64
 4   al              400 non-null    float64
 5   su              400 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             400 non-null    float64
 11  bu              400 non-null    float64
 12  sc              400 non-null    float64
 13  sod             400 non-null    float64
 14  pot             400 non-null    float64
 15  hemo            400 non-null    float64
 16  pcv             400 non-null    float64
 17  wc              400 non-null    flo

In [7]:
# Removing tab spaces in the data

df['dm'] = df['dm'].replace(to_replace={'\tno':'no','\tyes':'yes',' yes':'yes'}) 
df['cad'] = df['cad'].replace(to_replace='\tno',value='no') 
df['classification'] = df['classification'].replace(to_replace='ckd\t',value='ckd')

In [8]:
# Mapping the text to 1/0 and cleaning the dataset 

df[['htn','dm','cad','pe','ane']] = df[['htn','dm','cad','pe','ane']].replace(to_replace={'yes':1,'no':0})
df[['rbc','pc']] = df[['rbc','pc']].replace(to_replace={'abnormal':1,'normal':0})
df[['pcc','ba']] = df[['pcc','ba']].replace(to_replace={'present':1,'notpresent':0})
df[['appet']] = df[['appet']].replace(to_replace={'good':1,'poor':0})
df['classification'] = df['classification'].replace(to_replace={'ckd':1,'notckd':0})

df.rename(columns={'classification':'class'},inplace=True)

In [9]:
df.drop('id',axis=1,inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     400 non-null    float64
 1   bp      400 non-null    float64
 2   sg      400 non-null    float64
 3   al      400 non-null    float64
 4   su      400 non-null    float64
 5   rbc     248 non-null    float64
 6   pc      335 non-null    float64
 7   pcc     396 non-null    float64
 8   ba      396 non-null    float64
 9   bgr     400 non-null    float64
 10  bu      400 non-null    float64
 11  sc      400 non-null    float64
 12  sod     400 non-null    float64
 13  pot     400 non-null    float64
 14  hemo    400 non-null    float64
 15  pcv     400 non-null    float64
 16  wc      400 non-null    float64
 17  rc      400 non-null    float64
 18  htn     398 non-null    float64
 19  dm      398 non-null    float64
 20  cad     398 non-null    float64
 21  appet   399 non-null    float64
 22  pe

In [11]:
# Filling the missing string data as the most repetitive (mod)
df=df.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     400 non-null    float64
 1   bp      400 non-null    float64
 2   sg      400 non-null    float64
 3   al      400 non-null    float64
 4   su      400 non-null    float64
 5   rbc     400 non-null    float64
 6   pc      400 non-null    float64
 7   pcc     400 non-null    float64
 8   ba      400 non-null    float64
 9   bgr     400 non-null    float64
 10  bu      400 non-null    float64
 11  sc      400 non-null    float64
 12  sod     400 non-null    float64
 13  pot     400 non-null    float64
 14  hemo    400 non-null    float64
 15  pcv     400 non-null    float64
 16  wc      400 non-null    float64
 17  rc      400 non-null    float64
 18  htn     400 non-null    float64
 19  dm      400 non-null    float64
 20  cad     400 non-null    float64
 21  appet   400 non-null    float64
 22  pe

Preparation of Model Data and Scaling of Data

In [13]:
features = [['age', 'bp','sg','al','su','bgr','bu', 'sc', 'sod','pot','hemo','pcv','wc', 'rc']]

# Scaling of the data
for feature in features:
    df[feature]=(df[feature]-np.min(df[feature]))/(np.max(df[feature])-np.min(df[feature]))

In [14]:
x_data=df.drop(['class'],axis=1)
y=df['class'].values

Modelling

In [15]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(x_data,y,test_size=0.3,random_state=42)

Support Vector Machine

In [16]:
from sklearn.svm import SVC

svm=SVC(random_state=1,gamma="auto")
svm.fit(x_train,y_train)

SVC(gamma='auto', random_state=1)

In [19]:
svm_pred=svm.predict(x_test)

f1_svm=f1_score(y_test,svm_pred)
precision_svm = precision_score(y_test, svm_pred)
recall_svm = recall_score(y_test, svm_pred)

print("SVM f1 score: ",f1_svm)
print("SVM Precision: ",precision_svm)
print("SVM Recall: ", recall_svm)
print("SVM accuracy score: ",svm.score(x_test,y_test))

SVM f1 score:  0.972972972972973
SVM Precision:  1.0
SVM Recall:  0.9473684210526315
SVM accuracy score:  0.9666666666666667


Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression
x_train, x_test, y_train, y_test = train_test_split(x_data,y) #, test_size=0.30, random_state=42)

In [26]:
LogisticModel = LogisticRegression( random_state=0) # solver='liblinear',

In [27]:
LogisticModel.fit(x_train, y_train)

LogisticRegression(random_state=0)

In [28]:
Logi_pred = LogisticModel.predict(x_test)

In [30]:
f1_lg=f1_score(y_test,Logi_pred)
precision_lg = precision_score(y_test, Logi_pred)
recall_lg = recall_score(y_test, Logi_pred)

print("SVM f1 score: ",f1_lg)
print("SVM Precision: ",precision_lg)
print("SVM Recall: ", recall_lg)
print("SVM accuracy score: ",LogisticModel.score(x_test,y_test))

SVM f1 score:  0.9923664122137404
SVM Precision:  1.0
SVM Recall:  0.9848484848484849
SVM accuracy score:  0.99


In [79]:
Start = time.time()

# Model code 

End = time.time()



TotalTime = End-Start
print(round(TotalTime,3),"Sec")

In [80]:
End = time.time()

0.193 Sec


**Pros and Cons associated with Support Vector Machine**

  1. Pros:
  
        - It works really well with a clear margin of separation.                               
        - It is effective in high dimensional spaces.                                    
        - It is effective in cases where the number of dimensions is greater than the number of samples.
        - It uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
  
  
  2. Cons:
  
        - It doesn’t perform well when we have large data set because the required training time is higher.      
        - It also doesn’t perform very well, when the data set has more noise i.e. target classes are overlapping.     
        - SVM doesn’t directly provide probability estimates, these are calculated using an expensive five-fold cross-validation. It is included in the related SVC method of Python scikit-learn library.

**Pros and Cons associated with Logistic Regression**

  1. Pros:
  
        - 
  
  
  2. Cons:
  
        -       