In [118]:
import pandas as pd
from IPython.display import display
from scipy import stats
import random
import math



In [90]:
# Load the kidney-disease CSV, using its 'id' column as the row index.
df = pd.read_csv(filepath_or_buffer='kidney_disease.csv', index_col='id')

# Print dtypes and non-null counts per column to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   sg              353 non-null    float64
 3   al              354 non-null    float64
 4   su              351 non-null    float64
 5   rbc             248 non-null    object 
 6   pc              335 non-null    object 
 7   pcc             396 non-null    object 
 8   ba              396 non-null    object 
 9   bgr             356 non-null    float64
 10  bu              381 non-null    float64
 11  sc              383 non-null    float64
 12  sod             313 non-null    float64
 13  pot             312 non-null    float64
 14  hemo            348 non-null    float64
 15  pcv             330 non-null    object 
 16  wc              295 non-null    object 
 17  rc              270 non-null    obj

In [91]:
# Impute missing values in the numeric columns by forward-filling: each gap
# takes the value of the closest earlier row that has one.
# `Series.ffill()` is used instead of the deprecated `fillna(method="ffill")`.
for col in ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo']:
    df[col] = df[col].ffill()

# Forward-fill leaves a gap untouched when no earlier row has a value, so
# 'sod' and 'pot' additionally fall back to the mean of the (already
# forward-filled) column. The result is assigned back rather than using
# `inplace=True` on a column selection, which may modify a temporary copy and
# leave `df` unchanged.
for col in ['sod', 'pot']:
    df[col] = df[col].fillna(df[col].mean())

In [92]:
# 'pcv', 'wc' and 'rc' are read from the CSV as object (text) columns.
# Convert them to numbers; entries that cannot be parsed become NaN
# (errors='coerce') and are then forward-filled from the previous row.
# `Series.ffill()` is used instead of the deprecated `fillna(method="ffill")`.
for col in ['pcv', 'wc', 'rc']:
    df[col] = pd.to_numeric(df[col], errors='coerce').ffill()

In [93]:
# Impute missing values in the categorical columns.
#
# Each column is forward-filled first. A gap that has no earlier value to copy
# from then falls back to a fixed level for that column. The fallback must be a
# level the column actually uses: 'normal' only exists for 'rbc' and 'pc', so
# using it everywhere would create a bogus extra category in the other columns.
CATEGORICAL_FALLBACKS = {
    'rbc': 'normal',
    'pc': 'normal',
    'pcc': 'notpresent',
    'ba': 'notpresent',
    'htn': 'no',
    'dm': 'no',
    'cad': 'no',
    'appet': 'good',
    'pe': 'no',
    'ane': 'no',
}

# Results are assigned back instead of calling fillna(..., inplace=True) on a
# column selection, which may operate on a temporary copy and leave `df` as is.
for col, fallback in CATEGORICAL_FALLBACKS.items():
    df[col] = df[col].ffill().fillna(fallback)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             400 non-null    float64
 1   bp              400 non-null    float64
 2   sg              400 non-null    float64
 3   al              400 non-null    float64
 4   su              400 non-null    float64
 5   rbc             400 non-null    object 
 6   pc              400 non-null    object 
 7   pcc             400 non-null    object 
 8   ba              400 non-null    object 
 9   bgr             400 non-null    float64
 10  bu              400 non-null    float64
 11  sc              400 non-null    float64
 12  sod             400 non-null    float64
 13  pot             400 non-null    float64
 14  hemo            400 non-null    float64
 15  pcv             400 non-null    float64
 16  wc              400 non-null    float64
 17  rc              400 non-null    flo

In [94]:
# Some labels in the raw file carry stray whitespace (e.g. "ckd\t", "\tyes",
# "\tno"). Strip it so that every variant collapses onto the clean label.
# This is done column-wise instead of looping over rows, and it does not
# silently turn unexpected 'dm' values into "yes": anything unusual stays
# visible in the value counts / crosstabs below.
for col in ['classification', 'dm', 'cad']:
    df[col] = df[col].str.strip()

df['classification'].value_counts()

ckd       250
notckd    150
Name: classification, dtype: int64

In [95]:
# Show, for every object-dtype column, how its values are distributed across
# the two class labels.
object_columns = df.select_dtypes(include=['object']).columns
for column_name in object_columns:
    table = pd.crosstab(df[column_name], df['classification'])
    display(table)

classification,ckd,notckd
rbc,Unnamed: 1_level_1,Unnamed: 2_level_1
abnormal,109,0
normal,141,150


classification,ckd,notckd
pc,Unnamed: 1_level_1,Unnamed: 2_level_1
abnormal,91,0
normal,159,150


classification,ckd,notckd
pcc,Unnamed: 1_level_1,Unnamed: 2_level_1
notpresent,208,150
present,42,0


classification,ckd,notckd
ba,Unnamed: 1_level_1,Unnamed: 2_level_1
notpresent,228,150
present,22,0


classification,ckd,notckd
htn,Unnamed: 1_level_1,Unnamed: 2_level_1
no,103,150
yes,147,0


classification,ckd,notckd
dm,Unnamed: 1_level_1,Unnamed: 2_level_1
no,113,150
yes,137,0


classification,ckd,notckd
cad,Unnamed: 1_level_1,Unnamed: 2_level_1
no,216,150
yes,34,0


classification,ckd,notckd
appet,Unnamed: 1_level_1,Unnamed: 2_level_1
good,168,150
poor,82,0


classification,ckd,notckd
pe,Unnamed: 1_level_1,Unnamed: 2_level_1
no,174,150
yes,76,0


classification,ckd,notckd
ane,Unnamed: 1_level_1,Unnamed: 2_level_1
no,190,150
yes,60,0


classification,ckd,notckd
classification,Unnamed: 1_level_1,Unnamed: 2_level_1
ckd,250,0
notckd,0,150


## Feature Extraction

Gunakan uji Chi-Square untuk menentukan kolom kategorikal mana yang signifikan terhadap kolom klasifikasi.

In [96]:
# Chi-square test of independence between each object-dtype feature column and
# the class label, based on their contingency table.
categorical_features = df.drop('classification', axis=1).select_dtypes(include=['object']).columns
for feature in categorical_features:
    contingency = pd.crosstab(df[feature], df['classification'])
    # Element 1 of the chi2_contingency result is the p-value.
    p_value = stats.chi2_contingency(contingency)[1]
    # Below 0.05 the null hypothesis (feature and class are independent) is rejected.
    template = ("%s is significant with p-value %E" if p_value < 0.05
                else "%s is not significant with p-value %E")
    print(template % (feature, p_value))

  



rbc is significant with p-value 7.575018E-21
pc is significant with p-value 1.191701E-16
pcc is significant with p-value 2.779587E-07
ba is significant with p-value 4.465875E-04
htn is significant with p-value 1.250081E-31
dm is significant with p-value 1.708208E-28
cad is significant with p-value 5.716852E-06
appet is significant with p-value 1.002805E-14
pe is significant with p-value 1.687786E-13
ane is significant with p-value 1.974729E-10


Karena uji Chi-Square menunjukkan bahwa semua kolom kategorikal tidak saling independen dengan kolom klasifikasi, kita bisa memasukkan semuanya dalam klasifikasi Naive Bayes.

In [97]:
# Two-sample t-test per numeric feature: compare the values of the 'ckd' rows
# with those of the 'notckd' rows.
is_ckd = df['classification'] == 'ckd'
is_notckd = df['classification'] == 'notckd'
numeric_features = df.drop('classification', axis=1).select_dtypes(include=['float64', 'int64']).columns
for feature in numeric_features:
    ckd_values = df.loc[is_ckd, feature]
    notckd_values = df.loc[is_notckd, feature]
    # Element 1 of the ttest_ind result is the p-value.
    p_value = stats.ttest_ind(ckd_values, notckd_values)[1]
    # Below 0.05 the null hypothesis (equal class means) is rejected.
    template = ("%s is significant (p value : %E)" if p_value < 0.05
                else "%s is not significant (p value : %E)")
    print(template % (feature, p_value))

age is significant (p value : 5.114717E-06)
bp is significant (p value : 4.674351E-08)
sg is significant (p value : 4.301695E-62)
al is significant (p value : 3.014338E-42)
su is significant (p value : 4.271137E-11)
bgr is significant (p value : 1.490430E-17)
bu is significant (p value : 6.461747E-15)
sc is significant (p value : 1.315751E-09)
sod is significant (p value : 2.685530E-15)
pot is not significant (p value : 1.685121E-01)
hemo is significant (p value : 3.408540E-75)
pcv is significant (p value : 1.700012E-67)
wc is significant (p value : 5.342427E-05)
rc is significant (p value : 2.112856E-58)


Dengan demikian, karena kolom 'pot' tidak signifikan, kita dapat menghapus kolom tersebut dari tabel untuk menghindari overfitting.

In [98]:
# Remove the 'pot' column. errors='ignore' keeps this cell re-runnable: running
# it a second time (when 'pot' is already gone) no longer raises KeyError.
df = df.drop(columns='pot', errors='ignore')

In [99]:
# Display the DataFrame to inspect its current columns and values.
df

Unnamed: 0_level_0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,48.0,80.0,1.020,1.0,0.0,normal,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4.0,0.0,normal,normal,notpresent,notpresent,121.0,...,38.0,6000.0,5.2,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,5.2,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,140.0,...,47.0,6700.0,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,75.0,...,54.0,7800.0,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,100.0,...,49.0,6600.0,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,114.0,...,51.0,7200.0,5.9,no,no,no,good,no,no,notckd


In [100]:
# Column groups used by the following cells:
#   categorical_col - object-dtype columns, without the class label
#   numerical_col   - float64 / int64 columns
object_frame = df.select_dtypes(include=['object'])
categorical_col = object_frame.columns.drop('classification')
numerical_col = df.select_dtypes(include=['float64', 'int64']).columns

In [101]:
# Replace every categorical feature by its integer category code
# (the class label column is not part of categorical_col and stays as text).
for col in categorical_col:
    df[col] = pd.Categorical(df[col]).codes
    

In [102]:
# Display the DataFrame to inspect the integer-encoded categorical columns.
df

Unnamed: 0_level_0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,48.0,80.0,1.020,1.0,0.0,1,1,0,0,121.0,...,44.0,7800.0,5.2,1,1,0,0,0,0,ckd
1,7.0,50.0,1.020,4.0,0.0,1,1,0,0,121.0,...,38.0,6000.0,5.2,0,0,0,0,0,0,ckd
2,62.0,80.0,1.010,2.0,3.0,1,1,0,0,423.0,...,31.0,7500.0,5.2,0,1,0,1,0,1,ckd
3,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,32.0,6700.0,3.9,1,0,0,1,1,1,ckd
4,51.0,80.0,1.010,2.0,0.0,1,1,0,0,106.0,...,35.0,7300.0,4.6,0,0,0,0,0,0,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,1,1,0,0,140.0,...,47.0,6700.0,4.9,0,0,0,0,0,0,notckd
396,42.0,70.0,1.025,0.0,0.0,1,1,0,0,75.0,...,54.0,7800.0,6.2,0,0,0,0,0,0,notckd
397,12.0,80.0,1.020,0.0,0.0,1,1,0,0,100.0,...,49.0,6600.0,5.4,0,0,0,0,0,0,notckd
398,17.0,60.0,1.025,0.0,0.0,1,1,0,0,114.0,...,51.0,7200.0,5.9,0,0,0,0,0,0,notckd


# Naive Bayes

In [103]:
# Sum the encoded categorical columns separately for each class label.
# Note: with 0/1 codes each sum is the number of rows of that class whose code
# is 1 - these are counts, not prior probabilities.
encoded_by_class = df[categorical_col].groupby(df['classification'])
prior = pd.DataFrame(encoded_by_class.sum())
prior

Unnamed: 0_level_0,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
classification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ckd,141.0,159.0,42.0,22.0,147.0,137.0,34.0,82.0,76.0,60.0
notckd,150.0,150.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [104]:
class NBClassifier:
    """
    Naive Bayes classifier for discrete (categorical) feature values.

    Training only stores the rows and counts the class labels; all
    probabilities are estimated from those counts when classify() is called.
    No smoothing is applied, so a feature value never seen together with a
    class gives that class a probability of 0.
    """

    def __init__(self, X, y):

        '''
        X and y denotes the features and the target labels respectively.

        X is a sequence of rows (each row a sequence of feature values) and y
        the sequence of labels, aligned with X by position. pandas objects
        (anything offering ``to_numpy``) are converted to plain lists first,
        because the positional indexing used below (``X[0]``, ``y[i]``) is
        label-based on pandas objects.
        '''
        self.X, self.y = self._as_list(X), self._as_list(y)

        self.N = len(self.X) # Length of the training set

        # Dimension of the vector of features (0 for an empty training set)
        self.dim = len(self.X[0]) if self.N else 0

        self.attrs = [[] for _ in range(self.dim)] # Distinct values seen per feature column

        self.output_dom = {} # Output classes with the number of occurrences in the training set

        self.data = [] # To store every row [Xi, yi]

        for i in range(self.N):
            row, label = self.X[i], self.y[i]
            for j in range(self.dim):
                # remember every value of feature j the first time it is seen
                if row[j] not in self.attrs[j]:
                    self.attrs[j].append(row[j])

            # count one more occurrence of this output class
            self.output_dom[label] = self.output_dom.get(label, 0) + 1
            # store the row
            self.data.append([row, label])

    @staticmethod
    def _as_list(values):
        """Return pandas/numpy-backed input as nested Python lists; pass anything else through."""
        if hasattr(values, "to_numpy"):
            return values.to_numpy().tolist()
        return values

    def classify(self, entry):
        '''
        Return the class label y that maximises P(y) * prod_i P(Xi = xi | y)
        for the given feature vector, or None if no training data was given.
        On a tie the class that appeared first in the training labels wins.
        '''
        entry = self._as_list(entry)

        solve = None # Final result
        max_arg = -1 # partial maximum

        for y, class_count in self.output_dom.items():

            prob = class_count/self.N # P(y)

            for i in range(self.dim):
                # number of training rows of class y with Xi = xi
                n = sum(1 for features, label in self.data
                        if label == y and features[i] == entry[i])
                # P(Xi = xi | y): the count is divided by the size of class y.
                # Dividing by N instead would give the joint frequency
                # P(Xi = xi, y) and bias the result towards the larger class.
                prob *= n/class_count

            # if we have a greater prob for this output than the partial maximum...
            if prob > max_arg:
                max_arg = prob
                solve = y

        return solve

In [121]:
def splitDataSet(dataSet, splitRatio):
    """
    Randomly split a dataset into a training set and a test set.

    dataSet    -- iterable of rows, or a pandas object (anything offering
                  ``to_numpy``), which is converted to a list of row lists.
                  Calling ``list()`` on a DataFrame would yield its column
                  labels instead of its rows, so it needs this special case.
    splitRatio -- fraction of the rows that go into the training set; the
                  training size is int(number_of_rows * splitRatio).

    Returns [trainSet, testSet]. Rows are drawn without replacement using the
    ``random`` module; the input itself is not modified.

    Raises ValueError if splitRatio asks for more rows than are available.
    """
    if hasattr(dataSet, "to_numpy"):
        remaining = dataSet.to_numpy().tolist()
    else:
        remaining = list(dataSet)

    trainSize = int(len(remaining) * splitRatio)
    if trainSize > len(remaining):
        raise ValueError(
            "splitRatio {0} requests {1} training rows but only {2} are available".format(
                splitRatio, trainSize, len(remaining)))

    trainSet = []
    while len(trainSet) < trainSize:
        # move one randomly chosen row from the remaining pool to the training set
        index = random.randrange(len(remaining))
        trainSet.append(remaining.pop(index))
    return [trainSet, remaining]

In [106]:
def separateByClass(dataset):
    """
    Group the rows of a dataset by class label.

    The label is the last element of each row. Returns a dict that maps every
    label to the list of rows (label still included) carrying it, in their
    original order.
    """
    groups = {}
    for position in range(len(dataset)):
        row = dataset[position]
        groups.setdefault(row[-1], []).append(row)
    return groups

In [107]:
def mean(numbers):
    """
    Return the arithmetic mean of a sequence of numbers as a float.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

In [108]:
def stdev(numbers):
    """
    Return the sample standard deviation (divisor n - 1) of a sequence of numbers.
    """
    center = mean(numbers)
    squared_deviations = sum((value - center) ** 2 for value in numbers)
    sample_variance = squared_deviations / float(len(numbers) - 1)
    return math.sqrt(sample_variance)

In [109]:
def summarize(dataset):
    """
    Summarize the feature columns of a dataset.

    dataset is a sequence of rows whose last element is the class label.
    Returns a list with one (mean, stdev) tuple per feature column, in column
    order; an empty dataset yields an empty list.

    The label column is dropped *before* the statistics are computed, so
    non-numeric labels (e.g. 'ckd' / 'notckd') do not break mean()/stdev().
    """
    feature_columns = list(zip(*dataset))[:-1]
    return [(mean(column), stdev(column)) for column in feature_columns]

In [110]:
def summerizeByClass(dataset):
    """
    Build the per-class feature summaries of a dataset.

    Rows are grouped by class with separateByClass() and every group is passed
    to summarize(); the result maps each class value to that summary.
    """
    return {
        classValue: summarize(instances)
        for classValue, instances in separateByClass(dataset).items()
    }

In [111]:
def calculateProbability(x, mean, stdev):
    """
    Gaussian likelihood of x for a normal distribution with the given mean and
    standard deviation.

    A standard deviation of 0 (every training value of the feature was
    identical) would divide by zero in the density formula. That case is
    treated as a point mass instead: 1.0 if x equals the mean, else 0.0.
    """
    if stdev == 0:
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

In [112]:
def calculateClassProbabilities(summaries, inputVector):
    """
    Score an input vector against every class.

    summaries maps each class value to a list of (mean, stdev) pairs, one per
    feature. The score of a class is the product of calculateProbability()
    over those features, starting from 1 (no class prior is multiplied in).
    Returns a dict mapping class value -> score.
    """
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        likelihood = 1
        for index, (mu, sigma) in enumerate(classSummaries):
            likelihood *= calculateProbability(inputVector[index], mu, sigma)
        probabilities[classValue] = likelihood
    return probabilities

In [113]:
def predict(summaries, inputVector):
    """
    Return the class value with the highest score for the given input.

    On a tie the class that comes first in summaries wins; with no classes at
    all the result is None.
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    return max(scores, key=scores.get, default=None)

In [114]:
def getPredictions(summaries, testSet):
    """
    Return the list of predicted class values, one per row of the test set
    and in the same order.
    """
    return [predict(summaries, testSet[position]) for position in range(len(testSet))]

In [115]:
def getAccuracy(testSet, predictions):
    """
    Return the percentage (0-100) of test rows whose last element, the true
    class, equals the prediction at the same position.
    """
    hits = sum(
        1
        for position in range(len(testSet))
        if testSet[position][-1] == predictions[position]
    )
    return (hits/float(len(testSet))) * 100.0

In [116]:
def main(splitRatio=0.67, seed=None):
    """
    Train and evaluate the Gaussian Naive Bayes helpers on the global `df`.

    splitRatio -- fraction of the rows used for training (default 0.67).
    seed       -- optional seed for the `random` module, to make the random
                  train/test split reproducible; None leaves the generator
                  untouched.

    Prints the split sizes and the accuracy (in percent) on the test rows.
    """
    if seed is not None:
        random.seed(seed)

    # The helper functions work on a list of row lists whose last element is
    # the class label. A DataFrame must be converted first: list(df) would
    # yield its column labels, not its rows.
    rows = df.values.tolist()

    trainingSet, testSet = splitDataSet(rows, splitRatio)
    print('Split {0} rows into train={1} and test={2} rows'.format(len(rows), len(trainingSet), len(testSet)))
    # prepare model
    summaries = summerizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%'.format(accuracy))

In [124]:
# Display the DataFrame to inspect its current state.
df

Unnamed: 0_level_0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,48.0,80.0,1.020,1.0,0.0,1,1,0,0,121.0,...,44.0,7800.0,5.2,1,1,0,0,0,0,ckd
1,7.0,50.0,1.020,4.0,0.0,1,1,0,0,121.0,...,38.0,6000.0,5.2,0,0,0,0,0,0,ckd
2,62.0,80.0,1.010,2.0,3.0,1,1,0,0,423.0,...,31.0,7500.0,5.2,0,1,0,1,0,1,ckd
3,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,32.0,6700.0,3.9,1,0,0,1,1,1,ckd
4,51.0,80.0,1.010,2.0,0.0,1,1,0,0,106.0,...,35.0,7300.0,4.6,0,0,0,0,0,0,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,1,1,0,0,140.0,...,47.0,6700.0,4.9,0,0,0,0,0,0,notckd
396,42.0,70.0,1.025,0.0,0.0,1,1,0,0,75.0,...,54.0,7800.0,6.2,0,0,0,0,0,0,notckd
397,12.0,80.0,1.020,0.0,0.0,1,1,0,0,100.0,...,49.0,6600.0,5.4,0,0,0,0,0,0,notckd
398,17.0,60.0,1.025,0.0,0.0,1,1,0,0,114.0,...,51.0,7200.0,5.9,0,0,0,0,0,0,notckd


In [None]:
## Create the Naive Bayes classifier from the training data and score it on
## the validation data.
# NOTE(review): X_train, y_train, X_val and y_val are not defined in the cells
# visible here - TODO confirm where they come from. NBClassifier indexes them
# by position (X[i][j], y[i]), so presumably they should be lists of rows / labels.

nbc = NBClassifier(X_train, y_train)


total_cases = len(y_val) # size of validation set

# Well classified examples and bad classified examples
good = 0
bad = 0

for i in range(total_cases):
    # Named `predicted_label` rather than `predict`, so the module-level
    # predict() function used by getPredictions() is not overwritten.
    predicted_label = nbc.classify(X_val[i])
    if y_val[i] == predicted_label:
        good += 1
    else:
        bad += 1

print('TOTAL EXAMPLES:', total_cases)
print('RIGHT:', good)
print('WRONG:', bad)
print('ACCURACY:', good/total_cases)

KeyError: 0