In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

## PREPROCESSING

In [2]:
def readdata(path="heart.csv"):
    """Read the notebook's dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to "heart.csv" (resolved
            against the current working directory), which is the file this
            function has always read, so existing ``readdata()`` calls are
            unaffected.

    Returns:
        pandas.DataFrame with the parsed contents of the file.
    """
    return pd.read_csv(path)

In [3]:
def normalize(X, mean, std):
    """Standardize `X`: subtract `mean`, then divide the result by `std`.

    Works on anything that supports ``-`` and ``/`` (plain numbers as well
    as array-like objects). No check is made for ``std == 0``.
    """
    centered = X - mean
    return centered / std

In [4]:
# Normalize features for L2 distance calculation
def scaleFeature(df, col):
    """Standardize one column of `df` in place.

    The column is replaced by ``(column - mean) / std`` where `mean` and
    `std` are the column's own ``Series.mean()`` and ``Series.std()``.

    Args:
        df: DataFrame to modify; ``df[col]`` is overwritten.
        col: Name of the column to standardize.

    Returns:
        None. The result is written back into `df`.
    """
    mean = df[col].mean()
    std = df[col].std()
    # Apply the formula to the whole Series at once; this yields the same
    # values as calling normalize() on each element, without a Python-level
    # function call per cell.
    df[col] = normalize(df[col], mean, std)

In [5]:
# Category features
# Numeric columns: standardized during preprocessing and scored with a
# per-class Gaussian density in naiveBayes().
continuous_features = [
    'age',
    'chol',
    'oldpeak',
    'thalach',
    'trestbps',
]

# Discrete columns: scored with per-class relative frequencies in naiveBayes().
categorical_features = [
    'ca',
    'cp',
    'restecg',
    'slope',
    'thal',
    'sex',
    'fbs',
    'exang',
]

## NAIVE BAYES

In [6]:
# Load data and preprocess
# `data` is the notebook-wide table that the later cells split and classify.
data = readdata()

# Standardize every continuous column. scaleFeature() overwrites the column
# inside `data`, and the mean/std it uses are taken over the whole table,
# i.e. before the train/test split performed in the next cell.
for col in continuous_features:
    scaleFeature(data, col)

In [7]:
# separate into training and testing sets
# 80% of the rows are sampled for training; random_state fixes the seed so
# the same split is drawn on every run.
train = data.sample(frac=0.8, random_state=200)

# Every row that was not sampled forms the test set. The extra 'y' column
# starts at 0 and is later overwritten with the predicted class.
test = data.drop(index=train.index).assign(y=0)

print('data shape ' + str(data.shape))
print('train shape ' + str(train.shape))
print('test shape ' + str(test.shape))

data shape (303, 14)
train shape (242, 14)
test shape (61, 15)


In [8]:
# NB algorithm

def calcCatProb(df, column, value):
    """Relative frequency of `value` within ``df[column]``.

    Args:
        df: DataFrame to count in (naiveBayes() passes the training rows of
            a single class).
        column: Name of the column to inspect.
        value: Value whose share of the rows is wanted.

    Returns:
        float: number of rows where ``df[column] == value`` divided by the
        number of rows. 0.0 when `value` never occurs, and also 0.0 when
        `df` has no rows at all (previously a ZeroDivisionError).

    Note:
        No smoothing is applied, so a value that never occurs in `df` gets
        probability 0 and zeroes any product it is multiplied into.
    """
    matches = df[column] == value
    total = len(matches)
    if total == 0:
        # Nothing to count in: report "never observed" instead of dividing by zero.
        return 0.0
    return int(matches.sum()) / total

def calcGaussProb(df, column, value):
    """Gaussian density of `value` under the distribution of ``df[column]``.

    The mean and standard deviation are taken from ``df[column]`` with
    ``Series.mean()`` and ``Series.std()``; the density is
    ``exp(-(value - mean)**2 / (2*var)) / sqrt(2*pi*var)``.

    Args:
        df: DataFrame supplying the sample the Gaussian is fitted to.
        column: Name of the column to fit.
        value: Point at which the density is evaluated.

    Returns:
        The density at `value`. No guard is applied for a zero variance.
    """
    sample = df[column]
    mu = sample.mean()
    sigma = sample.std()
    variance = sigma ** 2

    exponent = -(value - mu) ** 2 / (2 * variance)
    normalizer = (2 * math.pi * variance) ** .5

    return math.exp(exponent) / normalizer

def updatePred(df, row, totProb0, totProb1):
    """Store the predicted class for one row in the 'y' column of `df`.

    Args:
        df: DataFrame holding the predictions; must already have a 'y'
            column. It is modified in place.
        row: Positional (0-based) index of the row to update.
        totProb0: Score accumulated for class 0.
        totProb1: Score accumulated for class 1.

    Returns:
        None.

    Class 1 is chosen only when ``totProb1 > totProb0``; ties (and any
    comparison that evaluates false, e.g. with NaN) fall back to class 0.
    """
    # Look the column up on the frame being written to. The earlier version
    # read it from the global `test`, which only worked when `df` happened to
    # be that same frame with 'y' in the same position.
    yidx = df.columns.get_loc('y')

    df.iat[row, yidx] = 1 if totProb1 > totProb0 else 0

In [9]:
def naiveBayes(X, y):
    """Classify every row of `y` with a naive Bayes model estimated from `X`.

    For each row the class score starts at the class prior and is multiplied
    by one likelihood per feature: calcCatProb() for the columns listed in
    `categorical_features`, calcGaussProb() for those in
    `continuous_features`. The class with the larger score is stored via
    updatePred().

    Args:
        X: Training frame. Must contain a 'target' column (rows are split on
            ``target == 0`` and ``target == 1``) and every feature column.
        y: Frame of rows to classify. Must contain the same feature columns
            and an existing 'y' column, which is overwritten in place.

    Returns:
        None. Predictions end up in ``y['y']``.
    """
    # Segment the training data by class; each class's share of the rows is
    # its prior.
    outcome0 = X[X['target'] == 0]
    outcome1 = X[X['target'] == 1]
    n_train = len(X.index)
    prior0 = len(outcome0.index) / n_train
    prior1 = len(outcome1.index) / n_train

    # Feature groups paired with their likelihood estimator. The order
    # (categorical first, then continuous) matches the original
    # multiplication order.
    likelihoods = (
        (categorical_features, calcCatProb),
        (continuous_features, calcGaussProb),
    )

    for row in range(len(y.index)):
        # Materialize the row once; previously y.iloc[row] was rebuilt for
        # every single feature lookup.
        sample = y.iloc[row]

        # Start from the priors and fold in one likelihood per feature.
        totProb0 = prior0
        totProb1 = prior1
        for columns, likelihood in likelihoods:
            for column in columns:
                value = sample[column]
                totProb0 = totProb0 * likelihood(outcome0, column, value)
                totProb1 = totProb1 * likelihood(outcome1, column, value)

        # Record the more likely class for this row.
        updatePred(y, row, totProb0, totProb1)

In [10]:
naiveBayes(train, test)

In [11]:
# Print results of Naive Bayes classifier

def printConfusionMatrix(tp, fp, tn, fn):
    """Print a 2x2 confusion matrix as an ASCII table.

    Columns are the actual class (1, then 0) and rows are the predicted
    class (1, then 0); the letters down the left edge spell "Pred.".

    Args:
        tp: Count shown in the predicted-1 / actual-1 cell.
        fp: Count shown in the predicted-1 / actual-0 cell.
        tn: Count shown in the predicted-0 / actual-0 cell.
        fn: Count shown in the predicted-0 / actual-1 cell.
    """
    predicted_1 = ("1", 'TP='+str(tp), 'FP='+str(fp))
    predicted_0 = ("0", 'FN='+str(fn), 'TN='+str(tn))

    lines = [
        "\n%15sActual" % "",
        "%6s %7s %7s" % ("", "1", "0"),
        "P%6s +--------+--------+" % "",
        "r%6s | %-6s | %-6s |" % predicted_1,
        "e%6s +--------+--------+" % "",
        "d%6s | %-6s | %-6s |" % predicted_0,
        ".%6s +--------+--------+\n" % "",
    ]
    # One print of the joined lines emits the same text as one print per line.
    print("\n".join(lines))

def getConfusionMatrix(df):
    """Count the four confusion-matrix cells from a frame of predictions.

    Args:
        df: DataFrame with a 'target' column (actual class) and a 'y' column
            (predicted class); only the values 0 and 1 are counted.

    Returns:
        Tuple ``(tp, fp, tn, fn)`` of integer counts.
    """
    actual_pos = df['target'] == 1
    actual_neg = df['target'] == 0
    pred_pos = df['y'] == 1
    pred_neg = df['y'] == 0

    def count(mask):
        # Number of rows where the combined condition holds.
        return int(mask.sum())

    return (
        count(actual_pos & pred_pos),
        count(actual_neg & pred_pos),
        count(actual_neg & pred_neg),
        count(actual_pos & pred_neg),
    )

def getAccuracy(tp, fp, tn, fn):
    """Accuracy: correctly classified rows divided by all rows.

    Args:
        tp, fp, tn, fn: Confusion-matrix counts.

    Returns:
        ``(tp + tn) / (tp + tn + fp + fn)``.
    """
    correct = tp + tn
    total = correct + fp + fn
    return correct / total

def getPrecision(tp, fp, tn, fn):
    """Precision: share of positive predictions that are actually positive.

    Args:
        tp: True-positive count.
        fp: False-positive count.
        tn: Unused; accepted so all metric helpers share one signature.
        fn: Unused; accepted so all metric helpers share one signature.

    Returns:
        ``tp / (tp + fp)``, or 0.0 when nothing was predicted positive
        (previously a ZeroDivisionError).
    """
    predicted_pos = tp + fp
    if predicted_pos == 0:
        # Precision is undefined without positive predictions; report 0.0.
        return 0.0
    return tp / predicted_pos

def getRecall(tp, fp, tn, fn):
    """Recall: share of actual positives that were predicted positive.

    Args:
        tp: True-positive count.
        fp: Unused; accepted so all metric helpers share one signature.
        tn: Unused; accepted so all metric helpers share one signature.
        fn: False-negative count.

    Returns:
        ``tp / (tp + fn)``, or 0.0 when there are no actual positives
        (previously a ZeroDivisionError).
    """
    actual_pos = tp + fn
    if actual_pos == 0:
        # Recall is undefined without actual positives; report 0.0.
        return 0.0
    return tp / actual_pos

def getFmeasure(tp, fp, tn, fn):
    """F-measure: harmonic mean of precision and recall.

    Args:
        tp, fp, tn, fn: Confusion-matrix counts (assumed non-negative).

    Returns:
        ``2 * recall * precision / (recall + precision)``, or 0.0 when there
        are no true positives (previously a ZeroDivisionError).
    """
    # With tp == 0 both precision and recall are zero or undefined, so the
    # harmonic mean would divide by zero. Returning early also means the
    # helpers below are only called when tp + fp > 0 and tp + fn > 0.
    if tp == 0:
        return 0.0

    recall = getRecall(tp, fp, tn, fn)
    prec = getPrecision(tp, fp, tn, fn)
    return 2*(recall*prec)/(recall+prec)

# Compare the stored predictions (test['y']) with the labels (test['target']).
tp, fp, tn, fn = getConfusionMatrix(test)
printConfusionMatrix(tp, fp, tn, fn)

# Labels are padded to the same width so the numbers line up in one column.
print('Accuracy:  %8.5f' % getAccuracy(tp, fp, tn, fn))
print('Precision: %8.5f' % getPrecision(tp, fp, tn, fn))  # label was misspelled "Precison"
print('Recall:    %8.5f' % getRecall(tp, fp, tn, fn))
print('F-Measure: %8.5f' % getFmeasure(tp, fp, tn, fn))


               Actual
             1       0
P       +--------+--------+
r     1 | TP=20  | FP=6   |
e       +--------+--------+
d     0 | FN=8   | TN=27  |
.       +--------+--------+

Accuracy:   0.77049
Precison:   0.76923
Recall:     0.71429
F-Measure:  0.74074
