In [1]:
import numpy as np
import pandas as pda
import warnings
from sklearn.model_selection import train_test_split as tts
from math import log2

In [2]:
warnings.filterwarnings(action='ignore')

In [3]:
#reading the csv file with the help of pandas
heartData = pda.read_csv('heart_data.csv')

In [4]:
heartData
# Age
# Sex: Male => 1.0 , Female => 0.0
# cp_type: 1 => typical angina , 2 => atypical angina
#          3 => non- anginal pain, 4 => asymptomatic
# trestbps: resting blood pressure (in mm Hg on admission to the hospital)
# chol: serum cholestoral in mg/dl
# fbs: fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
# restecg: resting electrocardiographic results (0 = normal, 1= having ST-T wave abnormality (T wave)
# thalach: maximum heart rate achieved during thalium stress test
# exang: exercise induced angina (1 = yes; 0 = no)
# oldspeak: ST depression induced by exercise relative to rest
# slope: the slope of the peak exercise ST segment (1 = upsloping, 2 = flat, 3 = downsloping)
# ca: number of major vessels (0-3) colored by flourosopy
# thal: results of thallium stress test (3 = normal; 6 = fixed defect; 7 = reversable defect )
# label: our y (1 = yes , 0 = no)

Unnamed: 0,Age,Sex,cp_type,trestbps,chol,fbs,restecg,thalach,exang,oldspeak,slope,ca,thal,label
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,1
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,1
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [5]:
trainingSet,testingSet = tts(heartData,test_size = 0.3,random_state = 45)

In [6]:
trainingX = trainingSet['slope']
trainingY = trainingSet['label']
trainingX = trainingX.to_numpy()
trainingY = trainingY.to_numpy()

In [7]:
T = np.count_nonzero(trainingY == 1)
F = np.count_nonzero(trainingY == 0)

In [8]:
def entropy(T,F):
    tot = T + F
    pT = T/tot
    pF = F/tot
    
    return -(pT*log2(pT)) - (pF*log2(pF))

In [9]:
def InformationGain(T,F,Y,N):
    tot = T + F
    pT = T/tot
    pF = F/tot
    entrop = entropy(T,F)
    L = Y + N
    R = T - L
    return entrop - (L/tot)*entropy(Y,N) - (R/tot)*entropy(T - Y, F - N)

In [10]:
oneT = 0 
oneF = 0 
twoT = 0
twoF = 0 
threeT = 0 
threeF = 0 

In [11]:
for i in range(len(trainingX)):
    if trainingY[i]==1:
        if trainingX[i] == 1:
            oneT = oneT + 1 # When Label is True and Slope is One
        elif trainingX[i] == 2:
            twoT = twoT + 1 # When Label is True and Slope is Two
        elif trainingX[i] == 3:
            threeT = threeT + 1 # When Label is True and Slope is Three
    else:
        if trainingX[i] == 1:
            oneF = oneF + 1 # When Label is False and Slope is One
        elif trainingX[i] == 2:
            twoF = twoF + 1 # When Label is False and Slope is Two
        elif trainingX[i] == 3:
            threeF = threeF + 1 # When Label is False and Slope is Three

In [12]:
infoGainOne = InformationGain(T,F,oneT,oneF)
infoGainTwo = InformationGain(T,F,twoT,twoF)
infoGainThree = InformationGain(T,F,threeT,threeF)

In [13]:
maxInfoGain = max(infoGainOne,max(infoGainTwo,infoGainThree))

In [14]:
bestSlope = 0
if maxInfoGain == infoGainOne:
    bestSlope = 1
elif maxInfoGain == infoGainTwo:
    bestSlope = 2
else:
    bestSlope = 3

In [15]:
print("Best Slope is: ",bestSlope)
print("Best Information Gain is: ",maxInfoGain)

Best Slope is:  1
Best Information Gain is:  0.6074070159348276


In [16]:
testingX = testingSet['slope']
testingY = testingSet['label']
testingX = testingX.to_numpy()
testingY = testingY.to_numpy()

In [17]:
count = 0
tot = len(testingX)
for i in range(tot):
    if testingX[i] == bestSlope:
        predY = 1
    else:
        predY = 0
    if predY == testingY[i]:
        count = count + 1
        
accu = np.multiply(np.divide(count,tot),100)
print("Accuracy comes out to be: ", accu)

Accuracy comes out to be:  34.065934065934066
