In [94]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.decomposition import PCA

In [95]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [96]:
df[df.RestingBP>(df.RestingBP.mean()+3*df.RestingBP.std())]
df.shape

(918, 12)

In [97]:
df1 = df[df.RestingBP<=(df.RestingBP.mean()+3*df.RestingBP.std())]
print(df1.shape)

(911, 12)


In [98]:
df1[df1.Cholesterol>(df1.Cholesterol.mean()+3*df1.Cholesterol.std())]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
76,32,M,ASY,118,529,0,Normal,130,N,0.0,Flat,1
149,54,M,ASY,130,603,1,Normal,125,Y,1.0,Flat,1
616,67,F,NAP,115,564,0,LVH,160,N,1.6,Flat,0


In [99]:
df2 = df1[df1.Cholesterol<=(df1.Cholesterol.mean()+3*df1.Cholesterol.std())]
print(df2.shape)

(908, 12)


In [100]:
df2[df2.MaxHR>(df2.MaxHR.mean()+3*df2.MaxHR.std())]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease


In [101]:
df2[df2.Oldpeak>(df2.Oldpeak.mean()+3*df2.Oldpeak.std())]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
166,50,M,ASY,140,231,0,ST,140,Y,5.0,Flat,1
702,59,M,TA,178,270,0,LVH,145,N,4.2,Down,0
771,55,M,ASY,140,217,0,Normal,111,Y,5.6,Down,1
791,51,M,ASY,140,298,0,Normal,122,Y,4.2,Flat,1
850,62,F,ASY,160,164,0,LVH,145,N,6.2,Down,1
900,58,M,ASY,114,318,0,ST,140,N,4.4,Down,1


In [102]:
df3 = df2[df2.Oldpeak<=(df2.Oldpeak.mean()+3*df2.Oldpeak.std())]
df3.shape

(902, 12)

In [103]:
df3.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [104]:
print(df3.Sex.unique())
print(df3.ChestPainType.unique())
print(df3.RestingECG.unique())
print(df3.ExerciseAngina.unique())
print(df3.ST_Slope.unique())

['M' 'F']
['ATA' 'NAP' 'ASY' 'TA']
['Normal' 'ST' 'LVH']
['N' 'Y']
['Up' 'Flat' 'Down']


In [105]:
df4 = df3.copy()

df4.Sex.replace({
    'M':1,
    'F':2
}, inplace=True)
df4.head()

df4.replace({
    'ChestPainType' : {
        'ATA':1,
        'NAP':2,
        'ASY':3,
        'TA':4
    },
    'RestingECG':{
        'Normal':1,
        'ST':2,
        'LVH':3
    },
    'ExerciseAngina':{
        'N':0,
        'Y':1
    },
    'ST_Slope':{
        'Up':1,
        'Flat':2,
        'Down':3
    }
}, inplace=True)

In [106]:
df4.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,1,0
1,49,2,2,160,180,0,1,156,0,1.0,2,1
2,37,1,1,130,283,0,2,98,0,0.0,1,0
3,48,2,3,138,214,0,1,108,1,1.5,2,1
4,54,1,2,150,195,0,1,122,0,0.0,1,0


In [107]:
X = df4.drop('HeartDisease', axis='columns')
y = df4.HeartDisease

In [108]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

In [110]:
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

svm_model = SVC(gamma='auto', kernel='rbf', C=10)
svm_model.fit(X_train, y_train)

rf_model = RandomForestClassifier(n_estimators=5)
rf_model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=5)

In [111]:
print(lr_model.score(X_test, y_test))
print(svm_model.score(X_test, y_test))
print(rf_model.score(X_test, y_test))

0.8362831858407079
0.8141592920353983
0.8097345132743363


In [112]:
pca = PCA(0.95)
X_pca = pca.fit_transform(X)
X_pca.shape

(902, 2)

In [113]:
pca.explained_variance_ratio_

array([0.92008284, 0.05066138])

In [114]:
pca.n_components_

2

In [115]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y)

In [116]:
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

svm_model = SVC(gamma='auto', kernel='rbf', C=10)
svm_model.fit(X_train, y_train)

rf_model = RandomForestClassifier(n_estimators=5)
rf_model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=5)

In [117]:
print(lr_model.score(X_test, y_test))
print(svm_model.score(X_test, y_test))
print(rf_model.score(X_test, y_test))

0.6946902654867256
0.584070796460177
0.6592920353982301
