In [76]:
import pandas as pd
import numpy as np

### load csv file into dataframe

In [49]:
df = pd.read_csv("heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [50]:
df.shape

(918, 12)

In [7]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [75]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


### Remove Outlier using Z score (-3<=Z<=3)

In [66]:
from scipy import stats
max(stats.zscore(df.Oldpeak,axis = 0))

4.983762314391507

In [59]:
z1 = df.RestingBP.mean()+3*df.RestingBP.std()
z2 = df.RestingBP.mean()-3*df.RestingBP.std()
df1 = df[df.RestingBP<=z1]
df2 = df1[df1.RestingBP>z2]
df2.shape

(910, 12)

In [67]:
z1 = df.Cholesterol.mean()+3*df.Cholesterol.std()
df3 = df2[df2.Cholesterol<=z1]
df3.shape

(907, 12)

In [85]:
z1 = df.Oldpeak.mean()+3*df.Oldpeak.std()
df4 = df3[df3.Oldpeak<=z1]
df4.shape

(901, 12)

### Label Encoding

In [80]:
df4.RestingECG.unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [81]:
df4.ST_Slope.unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [83]:
df4.ExerciseAngina.unique()

array(['N', 'Y'], dtype=object)

In [96]:
df5 =df4.copy()
df5.RestingECG.replace({'Normal': 1,'ST': 2,'LVH': 3},inplace = True)
df5.ST_Slope.replace({'Down': 1,'Flat': 2,'Up': 3},inplace = True)
df5.ExerciseAngina.replace({'N':0,'Y':1},inplace = True)
df5.sample(3)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
795,42,M,NAP,120,240,1,1,194,0,0.8,1,0
576,62,M,ASY,139,170,0,2,120,1,3.0,2,1
715,44,F,NAP,108,141,0,1,175,0,0.6,2,0


### One Hot Encoding

In [88]:
df6 = pd.get_dummies(df5,drop_first = True)
df6.sample(3)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
135,49,115,265,0,1,175,0,0.0,2,1,1,0,1,0
717,49,130,269,0,1,163,0,0.0,3,0,0,0,0,0
176,54,140,216,0,1,105,0,1.5,2,1,1,0,0,0


In [92]:
x = df6.drop('HeartDisease',axis = 1)

In [97]:
y = df6.HeartDisease

### Scaling

In [112]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
x_scaled

array([[-1.42801299,  0.46701542,  0.85146478, ...,  2.06615727,
        -0.53414109, -0.22928169],
       [-0.47502498,  1.63471598, -0.16358104, ..., -0.48399026,
         1.87216452, -0.22928169],
       [-1.74567565, -0.11683486,  0.7955907 , ...,  2.06615727,
        -0.53414109, -0.22928169],
       ...,
       [ 0.37207548, -0.11683486, -0.61988604, ..., -0.48399026,
        -0.53414109, -0.22928169],
       [ 0.37207548, -0.11683486,  0.35791039, ...,  2.06615727,
        -0.53414109, -0.22928169],
       [-1.6397881 ,  0.35024537, -0.21014277, ..., -0.48399026,
         1.87216452, -0.22928169]])

### Compare different model score

In [95]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(x_scaled,y,test_size = 0.2,random_state = 30)

In [100]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.8342541436464088

In [109]:
from sklearn.svm import SVC 
model = SVC(C = 5,gamma = 0.1,kernel = "rbf")
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.856353591160221

In [113]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.850828729281768

### PCA

In [123]:
from sklearn.decomposition import PCA
pca = PCA(0.99)
x_pca = pca.fit_transform(x)
x_pca

array([[ 93.60706725, -29.61311634,  10.93513613],
       [-15.86293805, -14.84681149,  31.13296305],
       [ 83.05802439,  38.97363557, -14.84072921],
       ...,
       [-67.80264733,  17.56740624,  -4.26969618],
       [ 40.51240598, -33.49723545,   5.39745545],
       [-20.12673467, -37.66366952,  12.21393393]])

In [124]:
pca.explained_variance_ratio_

array([0.92123925, 0.05064839, 0.02247744])

In [125]:
pca.n_components_

3

In [126]:
X_train_pca,X_test_pca,y_train,y_test = train_test_split(x_pca,y,test_size = 0.2,random_state = 30)

model = LogisticRegression(max_iter = 1000)
model.fit(X_train_pca,y_train)

In [131]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train_pca,y_train)
model.score(X_test_pca,y_test)

0.6961325966850829

In [133]:
from sklearn.svm import SVC 
model = SVC(C = 5,gamma = 0.1,kernel = "rbf")
model.fit(X_train_pca,y_train)
model.score(X_test_pca,y_test)

0.6298342541436464

In [134]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train_pca,y_train)
model.score(X_test_pca,y_test)

0.6408839779005525

In [136]:
pca = PCA(n_components = 2)
x_pca = pca.fit_transform(x)
x_pca

array([[ 93.60706725, -29.61311634],
       [-15.86293805, -14.84681149],
       [ 83.05802439,  38.97363557],
       ...,
       [-67.80264733,  17.56740624],
       [ 40.51240598, -33.49723545],
       [-20.12673467, -37.66366952]])

In [137]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train_pca,y_train)
model.score(X_test_pca,y_test)

0.6961325966850829