In [24]:
import pandas as pd
import numpy as np
from sklearn import *
from scipy.spatial import distance
import scipy.stats as st
# Loading the data and dropping the index axis
df = pd.read_csv('Datasets/kidney_disease.csv')
df  = df.drop(['id'],axis=1)


# separating data into different classes
real = ['sc','pot','hemo','rc', 'sg']
integer = ['age','bp','bgr','bu','sod','pcv','wc','su', 'al']
# binary = ['rbc', 'pc', 'pcc', 'pa', 'bgr', 'htn', 'dm','cad']
label = ['classification']
cat = list(set(df.columns) - set(real)-set(integer)-set(label))

# Removing parsing errors
df = df.replace('\t?',np.nan)
df = df.replace('\tyes','yes')
df = df.replace(' yes','yes')
df = df.replace('yes\t','yes')
df = df.replace('\tno','no')
df = df.replace('ckd\t','ckd')
df = df.replace('ckd',1)
df = df.replace('notckd',0)


# Filling the null values with mean you can also use other statistic like mode or median
for r in real:
    mean = np.array(df[r][~df[r].isna()]).astype('float').mean()
    df[r] = df[r].fillna(mean)
for i in integer:
    mean = np.array(df[i][~df[i].isna()]).astype('int').mean()
    df[i] = df[i].fillna(int(mean))

X = df.drop(label,axis=1)
y = df[label]

In [25]:
X_cat  = X[cat]
X_int = X[integer].astype('int64')
X_real = X[real]

In [26]:
X_cat = pd.get_dummies(X_cat, columns = X_cat.columns)
X_cat
#X_cat = X_cat.astype('bool')
X_cat

Unnamed: 0,dm_no,dm_yes,appet_good,appet_poor,htn_no,htn_yes,pcc_notpresent,pcc_present,pe_no,pe_yes,pc_abnormal,pc_normal,cad_no,cad_yes,ane_no,ane_yes,ba_notpresent,ba_present,rbc_abnormal,rbc_normal
0,0,1,1,0,0,1,1,0,1,0,0,1,1,0,1,0,1,0,0,0
1,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,0,1,0,0,0
2,0,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,1
3,1,0,0,1,0,1,0,1,0,1,1,0,1,0,0,1,1,0,0,1
4,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,0,1,0,0,1
396,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,0,1,0,0,1
397,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,0,1,0,0,1
398,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,0,1,0,0,1


In [27]:

def find_minkowski(a, b):
    return distance.minkowski(a,b)
def find_canberra(a, b):
    return distance.canberra(a, b)
def find_rusrao(a, b):
    return distance.russellrao(a, b)

In [28]:
def find_distance(a_real, b_real, a_int, b_int, a_cat, b_cat):
    mink = find_minkowski(a_real, b_real)
    # print(mink)
    canb = find_canberra(a_int, b_int)
    #print(canb)
    rus = find_rusrao(a_cat, b_cat)
    #print(rus)
    return mink + canb + rus

In [29]:
# X_real = X_real.to_numpy()
# X_cat = X_cat.to_numpy()
# X_int = X_int.to_numpy()
# y = y
X_real = X_real.astype('float64')
X_int = X_int.astype('int64')
X_cat = X_cat.astype('bool')

In [30]:
X_train_real, X_test_real, X_train_int, X_test_int, X_train_cat, X_test_cat, y_train, y_test = model_selection.train_test_split(X_real, X_int, X_cat, y, random_state = 42, test_size = 0.33)

In [31]:
X_train_real

Unnamed: 0,sc,pot,hemo,rc,sg
258,0.5,3.500000,13.900000,5.500000,1.020000
177,2.5,4.627244,13.200000,4.707435,1.015000
119,1.2,4.627244,12.526437,4.707435,1.010000
194,1.2,4.627244,12.526437,4.707435,1.010000
229,12.0,2.900000,9.600000,3.800000,1.010000
...,...,...,...,...,...
71,3.3,4.000000,9.800000,3.200000,1.010000
106,6.1,4.400000,6.000000,4.707435,1.017408
270,1.1,4.000000,14.300000,5.000000,1.025000
348,0.5,3.500000,13.600000,6.400000,1.020000


In [32]:
X_train_int

Unnamed: 0,age,bp,bgr,bu,sod,pcv,wc,su,al
258,42,80,98,20,140,44,8400,0,0
177,65,80,215,133,137,41,8406,1,2
119,60,70,140,27,137,38,8406,0,0
194,80,70,148,49,137,38,8406,0,2
229,59,50,241,191,114,31,15700,0,3
...,...,...,...,...,...,...,...,...,...
71,46,60,163,92,141,28,14600,0,1
106,50,90,89,118,127,17,6500,0,1
270,23,80,111,34,145,41,7200,0,0
348,38,80,99,19,147,44,7300,0,0


In [33]:
Xtrain_real,Xval_real, Xtrain_int,Xval_int,Xtrain_cat,Xval_cat,ytrain,yval = model_selection.train_test_split(X_train_real,X_train_int, X_train_cat, y_train, random_state = 42, test_size = 0.5)

In [34]:
print(Xtrain_real.shape)
print(X_test_real.shape)
print(Xval_real.shape)

(134, 5)
(132, 5)
(134, 5)


In [35]:
display(X_test_real)
X_test_real.dtypes

Unnamed: 0,sc,pot,hemo,rc,sg
209,3.072454,4.627244,11.5,4.707435,1.020000
280,0.900000,4.500000,13.3,5.200000,1.017408
33,2.500000,4.627244,10.1,4.707435,1.020000
210,12.800000,5.700000,7.3,3.900000,1.015000
93,5.600000,2.900000,9.2,3.200000,1.010000
...,...,...,...,...,...
332,1.000000,5.000000,15.3,6.100000,1.025000
167,0.900000,4.627244,12.7,4.707435,1.020000
245,5.300000,6.300000,6.3,2.600000,1.017408
311,1.100000,4.700000,13.7,5.600000,1.025000


sc      float64
pot     float64
hemo    float64
rc      float64
sg      float64
dtype: object

In [36]:
Xtrain_cat = Xtrain_cat.to_numpy()
Xval_cat = Xval_cat.to_numpy()
Xtest_cat = X_test_cat.to_numpy()
Xtrain_int = Xtrain_int.to_numpy()
Xval_int = Xval_int.to_numpy()
Xtest_int = X_test_int.to_numpy()
Xtrain_real = Xtrain_real.to_numpy()
Xval_real  = Xval_real.to_numpy()
Xtest_real = X_test_real.to_numpy()
ytrain = ytrain.to_numpy()
yval = yval.to_numpy()
ytest = y_test.to_numpy()

In [37]:
print(type(distance.russellrao(Xtest_cat[0], Xtest_cat[0])))
print(distance.minkowski(Xtest_real[1], Xtest_real[2]))
distance.canberra(Xtest_int[1], Xtest_int[2])

<class 'float'>
3.6136986570694023


1.345056872252295

In [38]:
def KNN(x_real, x_int, x_cat, X_train_real, X_train_int, X_train_cat,y_train, k=3):
    distances = []
    for i in range(X_train_real.shape[0]):
        distances.append(find_distance(x_real, X_train_real[i], x_int, X_train_int[i], x_cat, X_train_cat[i]))
    dist_array = np.array(distances)
    ind = np.argpartition(dist_array, k)
    return st.mode(y_train[ind[:k]]).mode[0][0]

In [39]:
label=  KNN(Xtest_real[1],Xtest_int[1],Xtest_cat[1], Xtrain_real, Xtrain_int, Xtrain_cat , ytrain)
print(label)
# find_distance(Xtest_real[0], Xtrain_real[0], Xtest_int[0], Xtrain_int[0], Xtest_cat[0], Xtrain_cat[0])

0


In [40]:
def find_accuracy(X_ds_real, X_ds_int, X_ds_cat, y_ds, k_nn):
    labels = []
    for i in range(X_ds_real.shape[0]):
        label = KNN(X_ds_real[i], X_ds_int[i], X_ds_cat[i], Xtrain_real, Xtrain_int, Xtrain_cat,ytrain, k=k_nn)
        labels.append(label)
    return metrics.explained_variance_score(y_ds, np.array(labels))

In [41]:
find_accuracy(Xtest_real, Xtest_int, Xtest_cat, ytest, 3)
find_accuracy(Xval_real, Xval_int, Xval_cat, yval, 3)

0.8095238095238095

In [42]:
def validate(k_start, k_end, Xval_real, Xval_int, Xval_cat, yval):
    k_best = 1
    acc_best = 0
    for k in range(k_start, k_end,2):
        acc = find_accuracy(Xval_real, Xval_int, Xval_cat, yval, k_nn=k)
        print("Test Accuracy for k = ", k, "is = ", acc)
        if(acc >= acc_best):
            k_best = k
        else:
            continue
    return k_best

In [43]:
k_best = validate(1, 11, Xval_real, Xval_int, Xval_cat, yval)
print("the best k  = ", k_best)

Test Accuracy for k =  1 is =  0.8123809523809523
Test Accuracy for k =  3 is =  0.8095238095238095
Test Accuracy for k =  5 is =  0.8095238095238095
Test Accuracy for k =  7 is =  0.7788095238095237
Test Accuracy for k =  9 is =  0.8171428571428572
the best k  =  9


### Kernel Density Estimation

---

$$ k(x, x_i) =  \frac{1}{\sqrt{2\pi}}e^{\| x - x_i\|^2}$$
the Kernel has to be a product of several other kernel density estimators. This will also represnt a kernel, we model the joint with a product of the individual densities. (independent assumption?) we can choose a poisson distribution for modelling the integer values
and we leave the binary distribution as it is, $p(x|y)$ is modelled using KDE and then the posterior is calculated.

$$ p(y|x) = \frac{ \sum_{i=1}^{i=m_i} \frac{1}{m_{y_i}} k(x - x_i)k_2(x - x_i)k_3(x - x_i)}{1} \times \frac{m_{y_i}}{m}$$

On simplifying the numerator we get the following expression.

$$ p(y = i|x) = \frac{ \frac{1}{m}\sum_{i=1}^{i = m}k_1(x - x_i)k_2(x - x_i)k_3(x - x_i)}{D}$$

The maximum value of the average product of kenrels will give me the correct class!

In [44]:
def gaussian_kernel(x_real, x_i_real):
    return (1/(np.sqrt(2* np.pi))) * np.exp(1/2 * (np.linalg.norm(x_real - x_i_real)^2))