In [1]:
import numpy as np
import pandas as pda
import warnings
from sklearn.model_selection import train_test_split as tts

In [2]:
warnings.filterwarnings(action='ignore')

In [3]:
#reading the csv file with the help of pandas
heartData = pda.read_csv('heart_data.csv')

In [4]:
heartData
# Age
# Sex: Male => 1.0 , Female => 0.0
# cp_type: 1 => typical angina , 2 => atypical angina
#          3 => non- anginal pain, 4 => asymptomatic
# trestbps: resting blood pressure (in mm Hg on admission to the hospital)
# chol: serum cholestoral in mg/dl
# fbs: fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
# restecg: resting electrocardiographic results (0 = normal, 1= having ST-T wave abnormality (T wave)
# thalach: maximum heart rate achieved during thalium stress test
# exang: exercise induced angina (1 = yes; 0 = no)
# oldspeak: ST depression induced by exercise relative to rest
# slope: the slope of the peak exercise ST segment (1 = upsloping, 2 = flat, 3 = downsloping)
# ca: number of major vessels (0-3) colored by flourosopy
# thal: results of thallium stress test (3 = normal; 6 = fixed defect; 7 = reversable defect )
# label: our y (1 = yes , 0 = no)

Unnamed: 0,Age,Sex,cp_type,trestbps,chol,fbs,restecg,thalach,exang,oldspeak,slope,ca,thal,label
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,1
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,1
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [5]:
trainingSet,testingSet = tts(heartData,test_size = 0.3,random_state = 30)

In [6]:
target = trainingSet['label']

In [7]:
feature = trainingSet['Age']

In [8]:
Y = np.sort(target)
X = np.sort(feature)

In [9]:
def giniImpurity(rootAge,X,Y):
    tot = len(X)
    T = 0 # True
    TY = 0 # True Yes
    TN = 0 # True No
    F = 0 # False
    FY = 0 # False Yes
    FN = 0 # False No
    
    for i in range(tot):
        if(X[i]<=rootAge):
            T = T + 1
            if(Y[i] == 1):
                TY = TY + 1
            else:
                TN = TN + 1
        else:
            F = F + 1
            if(Y[i] == 1):
                FY = FY + 1
            else:
                FN = FN + 1
                
    TGNI = 1 - ((TY/T)**2) - ((TN/T)**2) # True Gini Impurity
    FGNI = 1 - ((FY/F)**2) - ((FN/F)**2) # False Gini Impurity
        
    GNI = ((T/tot)*TGNI) + ((F/tot)*FGNI) # Gini Impurity
       
    return GNI       

In [10]:
def optimalNode(X,Y):
    optimalNodeAge = X[1]
    optimalNodeGNI = giniImpurity(optimalNodeAge,X,Y)
    tot = len(X)
    count = 0
    for i in range(tot-1):
        rootAge = X[i]
        count = count + 1
        print("*******Iteration: ",count ,"********\n")
        tempGNI = giniImpurity(rootAge,X,Y)
        print("tempGNI: ", tempGNI , "     " , "Optimal GNI: " ,optimalNodeGNI)
        if tempGNI < optimalNodeGNI:
            optimalNodeGNI = tempGNI
            optimalNodeAge = rootAge
        print("Optimal Age: ",optimalNodeAge)
        print("\n")
            
    return optimalNodeAge,optimalNodeGNI

In [11]:
bestAge,minGNI = optimalNode(X,Y)

*******Iteration:  1 ********

tempGNI:  0.49736206742376826       Optimal GNI:  0.49510332434860743
Optimal Age:  34.0


*******Iteration:  2 ********

tempGNI:  0.49510332434860743       Optimal GNI:  0.49510332434860743
Optimal Age:  34.0


*******Iteration:  3 ********

tempGNI:  0.4881961534955792       Optimal GNI:  0.49510332434860743
Optimal Age:  35.0


*******Iteration:  4 ********

tempGNI:  0.4881961534955792       Optimal GNI:  0.4881961534955792
Optimal Age:  35.0


*******Iteration:  5 ********

tempGNI:  0.4881961534955792       Optimal GNI:  0.4881961534955792
Optimal Age:  35.0


*******Iteration:  6 ********

tempGNI:  0.4858490566037736       Optimal GNI:  0.4881961534955792
Optimal Age:  37.0


*******Iteration:  7 ********

tempGNI:  0.48108583055863857       Optimal GNI:  0.4858490566037736
Optimal Age:  38.0


*******Iteration:  8 ********

tempGNI:  0.48108583055863857       Optimal GNI:  0.48108583055863857
Optimal Age:  38.0


*******Iteration:  9 ********

t

tempGNI:  0.492010472149499       Optimal GNI:  0.009348198970840452
Optimal Age:  56.0


*******Iteration:  210 ********

tempGNI:  0.4945642407906558       Optimal GNI:  0.009348198970840452
Optimal Age:  56.0


*******Iteration:  211 ********

tempGNI:  0.49709380309398193       Optimal GNI:  0.009348198970840452
Optimal Age:  56.0




In [12]:
print("The Best Age comes out to be: ", bestAge)
print("Minimum Gini Impurity comes out to be: ", minGNI)

The Best Age comes out to be:  56.0
Minimum Gini Impurity comes out to be:  0.009348198970840452


In [13]:
testingX = testingSet['Age']
testingY = testingSet['label']
testingX = testingX.to_numpy()
testingY = testingY.to_numpy()

In [14]:
count = 0
tot = len(testingX)
for i in range(tot):
    if testingX[i] > bestAge:
        predY = 1
    else:
        predY = 0
    if predY == testingY[i]:
        count = count + 1
        
accu = np.multiply(np.divide(count,tot),100)
print("Accuracy comes out to be: ", accu)
    

Accuracy comes out to be:  65.93406593406593
