In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

## PREPROCESSING

In [2]:
def readdata():
    data = pd.read_csv("heart.csv")
    return data

In [3]:
def normalize(X, mean, std):
    return (X - mean) / std

In [4]:
# Normalize features for L2 distance calculation
def scaleFeature(df, col):
    mean = df[col].mean()
    std = df[col].std()
    df[col] = df[col].apply(lambda x: normalize(x, mean, std))

In [5]:
# Category features
continuous_features  = ['age', 'chol', 'oldpeak', 'thalach', 'trestbps']
categorical_features = ['ca', 'cp', 'restecg', 'slope', 'thal', 'sex', 'fbs', 'exang']


## KNN CLASSIFIER


In [6]:
# kNN Load data and preprocess

# Load data and preprocess
data = readdata()

for col in continuous_features:
    scaleFeature(data, col)

# separate into training and testing sets
train=data.sample(frac=0.8,random_state=200) #random state is a seed value
test=data.drop(train.index)
newtrain = train.sample(frac=0.8,random_state=200) #random state is a seed value
validation = train.drop(newtrain.index)


# save predictions
validation['y'] = 0
test['y'] = 0

print('data shape ' + str(data.shape))
print('train shape ' + str(newtrain.shape))
print('validation shape ' + str(validation.shape))
print('test shape ' + str(test.shape))

data shape (303, 14)
train shape (194, 14)
validation shape (48, 15)
test shape (61, 15)


In [7]:
# Calculate the L2 distance between the current row and a neighbor
def getDistance(row1, row2):
    columns = continuous_features
    
    squares = 0
    for column in columns:
        value1 = row1[column]
        value2 = row2[column]
    
        square = (value1 - value2)**2
        squares = squares + square

    dist = math.sqrt(squares)
    return dist

# get the K closest neighbors
def getNeighbors(X, y, curRow, k):
    row1 = y.iloc[curRow]

    for rowNum in range(len(X.index)):
        row = X.iloc[rowNum]
        dist = getDistance(row1, row)

        if curRow != rowNum:
            X.at[rowNum, 'dist'] = dist
        else:
            X.at[rowNum, 'dist'] = -1
    
    neighbors = X[X['dist']>=0].sort_values('dist', ascending=True).head(k)
    return neighbors


def knnClassify(df, curRow, neighbors):
    class0 = len(neighbors[neighbors['target']==0].index)
    class1 = len(neighbors[neighbors['target']==1].index)
    yidx = test.columns.get_loc("y")

    # print(class0, class1)
    index = df.iloc[rowNum].index
    if class0 > class1:
        df.iat[curRow, yidx] = 0
    else:
        df.iat[curRow, yidx] = 1

In [8]:
def printConfusionMatrix(tp, fp, tn, fn):
    print("\n%15sActual" % "")
    print("%6s %7s %7s" % ("", "1", "0"))
    print("P%6s +--------+--------+" % "")
    print("r%6s | %-6s | %-6s |" % ("1", 'TP='+str(tp), 'FP='+str(fp)))
    print("e%6s +--------+--------+" % "")
    print("d%6s | %-6s | %-6s |" % ("0", 'FN='+str(fn), 'TN='+str(tn)))
    print(".%6s +--------+--------+\n" % "")


def getConfusionMatrix(df):
    tp = len(df[(df['target']==1) & (df['y']==1)])
    fp = len(df[(df['target']==0) & (df['y']==1)])
    tn = len(df[(df['target']==0) & (df['y']==0)])
    fn = len(df[(df['target']==1) & (df['y']==0)])

    return tp, fp, tn, fn


def getAccuracy(tp, fp, tn, fn):
    return (tp+tn)/(tp+tn+fp+fn)

def getPrecision(tp, fp, tn, fn):
    return tp/(tp+fp)

def getRecall(tp, fp, tn, fn):
    return tp/(tp+fn)

def getFmeasure(tp, fp, tn, fn):
    recall = getRecall(tp, fp, tn, fn)
    prec = getPrecision(tp, fp, tn, fn)
    return 2*(recall*prec)/(recall+prec)

In [9]:
# fine-tuning k value on validation set
for k in range(1,5):
    print('***********')
    print('K value is  %d' % k)
    # Get neighbors and predict class for each test case
    for rowNum in range(len(validation.index)):
        neighbors = getNeighbors(newtrain, validation, rowNum, k)
        knnClassify(validation, rowNum, neighbors)
    tp, fp, tn, fn = getConfusionMatrix(validation)
    printConfusionMatrix(tp, fp, tn, fn)
                
    print('Accuracy:  %8.5f' % getAccuracy(tp, fp, tn, fn))
    print('Precison:  %8.5f' % getPrecision(tp, fp, tn, fn))
    print('Recall:    %8.5f' % getRecall(tp, fp, tn, fn))
    print('F-Measure: %8.5f' % getFmeasure(tp, fp, tn, fn))         

***********
K value is  1

               Actual
             1       0
P       +--------+--------+
r     1 | TP=29  | FP=18  |
e       +--------+--------+
d     0 | FN=0   | TN=1   |
.       +--------+--------+

Accuracy:   0.62500
Precison:   0.61702
Recall:     1.00000
F-Measure:  0.76316
***********
K value is  2

               Actual
             1       0
P       +--------+--------+
r     1 | TP=26  | FP=18  |
e       +--------+--------+
d     0 | FN=3   | TN=1   |
.       +--------+--------+

Accuracy:   0.56250
Precison:   0.59091
Recall:     0.89655
F-Measure:  0.71233
***********
K value is  3

               Actual
             1       0
P       +--------+--------+
r     1 | TP=26  | FP=19  |
e       +--------+--------+
d     0 | FN=3   | TN=0   |
.       +--------+--------+

Accuracy:   0.54167
Precison:   0.57778
Recall:     0.89655
F-Measure:  0.70270
***********
K value is  4

               Actual
             1       0
P       +--------+--------+
r     1 | TP=28  | FP

In [10]:
# best k
k = 1

In [11]:
# testing
for rowNum in range(len(test.index)):
    neighbors = getNeighbors(newtrain, test, rowNum, k)
    knnClassify(test, rowNum, neighbors)
tp, fp, tn, fn = getConfusionMatrix(test)
printConfusionMatrix(tp, fp, tn, fn)
                
print('Accuracy:  %8.5f' % getAccuracy(tp, fp, tn, fn))
print('Precison:  %8.5f' % getPrecision(tp, fp, tn, fn))
print('Recall:    %8.5f' % getRecall(tp, fp, tn, fn))
print('F-Measure: %8.5f' % getFmeasure(tp, fp, tn, fn))     


               Actual
             1       0
P       +--------+--------+
r     1 | TP=25  | FP=32  |
e       +--------+--------+
d     0 | FN=3   | TN=1   |
.       +--------+--------+

Accuracy:   0.42623
Precison:   0.43860
Recall:     0.89286
F-Measure:  0.58824
