In [73]:
import pandas as pd
from path import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [74]:
# Load CSV dataset and start cleaning the data 
hf_data = pd.read_csv("heart.csv")

In [75]:
hf_data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [76]:
hf_data.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [77]:
hf_subset = hf_data[['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina','HeartDisease']]
hf_subset

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0
1,49,F,NAP,160,180,0,Normal,156,N,1
2,37,M,ATA,130,283,0,ST,98,N,0
3,48,F,ASY,138,214,0,Normal,108,Y,1
4,54,M,NAP,150,195,0,Normal,122,N,0
...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1
914,68,M,ASY,144,193,1,Normal,141,N,1
915,57,M,ASY,130,131,0,Normal,115,Y,1
916,57,F,ATA,130,236,0,LVH,174,N,1


In [78]:
hf_subset.columns = [
    'Age', 
    'Sex', 
    'Chest Pain Type', 
    'Resting Blood Pressure', 
    'Cholesterol', 
    'Fasting Blood Sugar',
    'Resting ECG', 
    'Max Heart Rate', 
    'Exercise Angina',
    'Heart Disease'   
]
hf_subset

Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting ECG,Max Heart Rate,Exercise Angina,Heart Disease
0,40,M,ATA,140,289,0,Normal,172,N,0
1,49,F,NAP,160,180,0,Normal,156,N,1
2,37,M,ATA,130,283,0,ST,98,N,0
3,48,F,ASY,138,214,0,Normal,108,Y,1
4,54,M,NAP,150,195,0,Normal,122,N,0
...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1
914,68,M,ASY,144,193,1,Normal,141,N,1
915,57,M,ASY,130,131,0,Normal,115,Y,1
916,57,F,ATA,130,236,0,LVH,174,N,1


In [79]:
hf_subset.to_html("hfstats.html")

In [80]:
hf_subset.describe()

Unnamed: 0,Age,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Max Heart Rate,Heart Disease
count,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,0.497414
min,28.0,0.0,0.0,0.0,60.0,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.0
max,77.0,200.0,603.0,1.0,202.0,1.0


In [81]:
hf_subset[hf_subset["Heart Disease"]>=1]

Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting ECG,Max Heart Rate,Exercise Angina,Heart Disease
1,49,F,NAP,160,180,0,Normal,156,N,1
3,48,F,ASY,138,214,0,Normal,108,Y,1
8,37,M,ASY,140,207,0,Normal,130,Y,1
11,58,M,ATA,136,164,0,ST,99,Y,1
13,49,M,ASY,140,234,0,Normal,140,Y,1
...,...,...,...,...,...,...,...,...,...,...
912,57,F,ASY,140,241,0,Normal,123,Y,1
913,45,M,TA,110,264,0,Normal,132,N,1
914,68,M,ASY,144,193,1,Normal,141,N,1
915,57,M,ASY,130,131,0,Normal,115,Y,1


In [82]:
pd.DataFrame(hf_subset["Age"].value_counts())

Unnamed: 0,Age
54,51
58,42
55,41
57,38
56,38
52,36
62,35
59,35
51,35
53,33


In [83]:
pd.DataFrame(hf_subset["Chest Pain Type"].value_counts())

Unnamed: 0,Chest Pain Type
ASY,496
NAP,203
ATA,173
TA,46


In [84]:
pd.DataFrame(hf_subset["Resting ECG"].value_counts())

Unnamed: 0,Resting ECG
Normal,552
LVH,188
ST,178


In [85]:
pd.DataFrame(hf_subset["Heart Disease"].value_counts())

Unnamed: 0,Heart Disease
1,508
0,410


In [86]:
pd.DataFrame(hf_subset["Sex"].value_counts())

Unnamed: 0,Sex
M,725
F,193


In [87]:
# Stage 2 hypertension 
hf_subset[hf_subset["Resting Blood Pressure"]>=140]


Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting ECG,Max Heart Rate,Exercise Angina,Heart Disease
0,40,M,ATA,140,289,0,Normal,172,N,0
1,49,F,NAP,160,180,0,Normal,156,N,1
4,54,M,NAP,150,195,0,Normal,122,N,0
8,37,M,ASY,140,207,0,Normal,130,Y,1
13,49,M,ASY,140,234,0,Normal,140,Y,1
...,...,...,...,...,...,...,...,...,...,...
905,67,M,NAP,152,212,0,LVH,150,N,1
908,63,M,ASY,140,187,0,LVH,144,Y,1
911,59,M,ASY,164,176,1,LVH,90,N,1
912,57,F,ASY,140,241,0,Normal,123,Y,1


In [88]:
# Hypertension crisis
hf_subset[hf_subset["Resting Blood Pressure"]>=180]

Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting ECG,Max Heart Rate,Exercise Angina,Heart Disease
109,39,M,ATA,190,241,0,Normal,106,N,0
123,58,F,ATA,180,393,0,Normal,110,Y,1
189,53,M,ASY,180,285,0,ST,120,Y,1
190,46,M,ASY,180,280,0,ST,120,N,0
241,54,M,ASY,200,198,0,Normal,142,Y,1
274,45,F,ATA,180,295,0,Normal,180,N,0
275,59,M,NAP,180,213,0,Normal,100,N,0
278,57,F,ASY,180,347,0,ST,126,Y,0
365,64,F,ASY,200,0,0,Normal,140,Y,1
372,63,M,ASY,185,0,0,Normal,98,Y,1


In [89]:
hf_subset[hf_subset["Cholesterol"]>=240]

Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting ECG,Max Heart Rate,Exercise Angina,Heart Disease
0,40,M,ATA,140,289,0,Normal,172,N,0
2,37,M,ATA,130,283,0,ST,98,N,0
5,39,M,NAP,120,339,0,Normal,170,N,0
9,48,F,ATA,120,284,0,Normal,120,N,0
15,54,F,ATA,120,273,0,Normal,150,N,0
...,...,...,...,...,...,...,...,...,...,...
900,58,M,ASY,114,318,0,ST,140,N,1
904,56,M,ATA,120,240,0,Normal,169,N,0
906,55,F,ATA,132,342,0,Normal,166,N,0
912,57,F,ASY,140,241,0,Normal,123,Y,1


In [90]:
df = pd.read_csv("heart.csv")
df.head(20)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284,0,Normal,120,N,0.0,Up,0


In [91]:
cp_type = set(df.ChestPainType.values)
print(cp_type)

#
#

{'ASY', 'TA', 'NAP', 'ATA'}


In [92]:
df['HeartDisease'].value_counts()

1    508
0    410
Name: HeartDisease, dtype: int64

In [93]:
# Selecting Heart Disease data equals to 1 
df = df.loc[df['HeartDisease'] == 1]
df['HeartDisease'].value_counts()

1    508
Name: HeartDisease, dtype: int64

In [94]:
# Delete the Heart Disease column 
df = df.drop('HeartDisease', axis=1)
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat
11,58,M,ATA,136,164,0,ST,99,Y,2.0,Flat
13,49,M,ASY,140,234,0,Normal,140,Y,1.0,Flat


In [95]:
# # Look for the null values and remove from the data 
# df = df.dropna(axis=0, how='any')
# df.shape

In [96]:
df.RestingBP.sort_values()

449      0
227     92
328     95
333     95
294     95
      ... 
759    192
241    200
399    200
732    200
365    200
Name: RestingBP, Length: 508, dtype: int64

In [98]:
df = df[df['RestingBP'] >= 180]
len(df)

13

In [101]:
# Create variable and columns with string values
X = pd.get_dummies(data=df, columns=['ChestPainType', 'RestingBP'])
print(X.shape)
X.head()

(13, 17)


Unnamed: 0,Age,Sex,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,RestingBP_180,RestingBP_185,RestingBP_190,RestingBP_192,RestingBP_200
123,58,F,393,0,Normal,110,Y,1.0,Flat,0,1,0,1,0,0,0,0
189,53,M,285,0,ST,120,Y,1.5,Flat,1,0,0,1,0,0,0,0
241,54,M,198,0,Normal,142,Y,2.0,Flat,1,0,0,0,0,0,0,1
365,64,F,0,0,Normal,140,Y,1.0,Flat,1,0,0,0,0,0,0,1
372,63,M,0,0,Normal,98,Y,0.0,Up,1,0,0,0,1,0,0,0


In [None]:
# # Use standardscaler to standardize the data 
# scaler = StandardScaler()
# xscaled = scaler.fit_transform(X)