In [2]:
import pandas as pd

# Display every column of wide frames. Use the full option name: pandas
# resolves option patterns by regex, so variant spellings only work by
# accident and can break if similarly named options are added.
pd.set_option("display.max_columns", None)

# Load the train/validation split and show its shape plus the first rows.
train_val_split = pd.read_csv("heart_failure/train_val_split.csv")
print(train_val_split.shape)
train_val_split.head()

(734, 12)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
2,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
3,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
4,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0


We see we have 12 columns: 11 features and the target. The feature descriptions from Kaggle are:

* Age: age of the patient [years]
* Sex: sex of the patient [M: Male, F: Female]
* ChestPainType: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
* RestingBP: resting blood pressure [mm Hg]
* Cholesterol: serum cholesterol [mg/dl]
* FastingBS: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
* RestingECG: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
* MaxHR: maximum heart rate achieved [Numeric value between 60 and 202]
* ExerciseAngina: exercise-induced angina [Y: Yes, N: No]
* Oldpeak: oldpeak = ST [Numeric value measured in depression]
* ST_Slope: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]

And we want to classify:
* HeartDisease: output class [1: heart disease, 0: Normal]

In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
numeric_summary = train_val_split.describe()
numeric_summary

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,734.0,734.0,734.0,734.0,734.0,734.0,734.0
mean,53.523161,132.06267,197.588556,0.23297,136.167575,0.873978,0.542234
std,9.42111,18.61849,108.979439,0.423012,25.334552,1.082222,0.498553
min,29.0,0.0,0.0,0.0,60.0,-2.0,0.0
25%,47.0,120.0,172.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,222.0,0.0,138.0,0.5,1.0
75%,60.0,140.0,267.0,0.0,155.0,1.5,1.0
max,77.0,200.0,529.0,1.0,195.0,6.2,1.0


In [4]:
# Summary of the object-dtype (categorical) columns: count, number of
# distinct values, most frequent value and its frequency.
categorical_summary = train_val_split.describe(include="object")
categorical_summary

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope
count,734,734,734,734,734
unique,2,4,3,2,3
top,M,ASY,Normal,N,Flat
freq,573,381,440,439,364


In [5]:
# Class balance check: number of positive (1) and negative (0) samples.
# The two classes are of comparable size, so the dataset is fairly balanced.
target_labels = train_val_split["HeartDisease"]
target_labels.value_counts()

HeartDisease
1    398
0    336
Name: count, dtype: int64

In [6]:
# Select rows that exactly repeat an earlier row.
# An empty result means the data contains no duplicated rows.
duplicate_mask = train_val_split.duplicated()
train_val_split.loc[duplicate_mask]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease


In [7]:
# Count missing (NaN) entries per column; all zeros means no value is
# explicitly missing. This detects NaN only, not placeholder values.
missing_mask = train_val_split.isna()
missing_mask.sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [10]:
# However, quite a few data points have a cholesterol of 0, which is
# clinically impossible, so these zeros are effectively missing values.
# This cell only counts them; the plan is to replace them with a median value.
# Use the vectorised Series.sum() instead of the builtin sum(), which would
# iterate over the Series element by element in Python.
(train_val_split["Cholesterol"] == 0).sum()

141