In [1]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
import os
from pathlib import Path

In [2]:
	# Read the raw CSV; PostCode is forced to a string dtype instead of
	# being inferred as a number.
	# NOTE(review): the file carries a saved index that shows up as an
	# "Unnamed: 0" column — confirm whether it should be kept downstream.
	df = pd.read_csv(
		"data/data.csv",
		dtype={"PostCode": "string"},
	)
	df.head()

Unnamed: 0,CaseNumber,LastName,PostCode,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,431-06-4243,Richard,99847,40.0,M,ATA,140,289,0,0,172,N,0.0,Up,0
1,415-39-7809,Sheppard,64192,49.0,F,NAP,160,180,0,0,156,N,1.0,Flat,1
2,517-18-4618,Howard,29132,37.0,M,ATA,130,283,0,1,98,N,0.0,Up,0
3,634-33-8726,Taylor,12930,48.0,F,ASY,138,214,0,0,108,Y,1.5,Flat,1
4,151-40-1619,Mcgrath,79393,54.0,M,NAP,150,195,0,0,122,N,0.0,Up,0


In [3]:

# Discard rows that have no Age, then store the remaining ages as integers
# (the column is loaded as float, e.g. 40.0).
df = df.dropna(subset=["Age"]).astype({"Age": int})

In [4]:
# Keep only the first row seen for every CaseNumber.
df = df.drop_duplicates(subset=["CaseNumber"], keep="first")
df.head()

Unnamed: 0,CaseNumber,LastName,PostCode,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,431-06-4243,Richard,99847,40,M,ATA,140,289,0,0,172,N,0.0,Up,0
1,415-39-7809,Sheppard,64192,49,F,NAP,160,180,0,0,156,N,1.0,Flat,1
2,517-18-4618,Howard,29132,37,M,ATA,130,283,0,1,98,N,0.0,Up,0
3,634-33-8726,Taylor,12930,48,F,ASY,138,214,0,0,108,Y,1.5,Flat,1
4,151-40-1619,Mcgrath,79393,54,M,NAP,150,195,0,0,122,N,0.0,Up,0


In [5]:
# Parameters for splitting the Sex column into one indicator column per value
sep = "_"
catagory = "Sex"
# Missing values are excluded here: get_dummies creates no column for NaN
# (dummy_na defaults to False), so a NaN entry would make the drop in the
# loop below fail with a KeyError on a non-existent "Sex_nan" column.
catagory_values = df[catagory].dropna().unique()
# Raw code -> name of the final indicator column
catagory_map = {
	"M": "Male",
	"F": "Female"
}

# Split values of column Sex into separate 0/1 columns called Sex_$value
df = pd.get_dummies(df, columns=[catagory], dtype=int, prefix=[catagory], prefix_sep=sep)

# Replace every Sex_$value column by a column named catagory_map[$value].
# Values without an entry in catagory_map get no replacement column: their
# Sex_$value column is simply dropped.
for catagory_value in catagory_values:
	column_name = f"{catagory}{sep}{catagory_value}"
	if catagory_value in catagory_map:
		new_column_name = f"{catagory_map[catagory_value]}"
		df[new_column_name] = df[column_name]
	df = df.drop(columns=[column_name])

# Display the dataframe
df.head()

Unnamed: 0,CaseNumber,LastName,PostCode,Age,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Male,Female
0,431-06-4243,Richard,99847,40,ATA,140,289,0,0,172,N,0.0,Up,0,1,0
1,415-39-7809,Sheppard,64192,49,NAP,160,180,0,0,156,N,1.0,Flat,1,0,1
2,517-18-4618,Howard,29132,37,ATA,130,283,0,1,98,N,0.0,Up,0,1,0
3,634-33-8726,Taylor,12930,48,ASY,138,214,0,0,108,Y,1.5,Flat,1,0,1
4,151-40-1619,Mcgrath,79393,54,NAP,150,195,0,0,122,N,0.0,Up,0,1,0


In [6]:
# One-hot encode ChestPainType into ChestPainType_<value> columns holding 0/1 integers.
df = pd.get_dummies(
	df,
	columns=["ChestPainType"],
	prefix=["ChestPainType"],
	prefix_sep="_",
	dtype=int,
)
df

Unnamed: 0,CaseNumber,LastName,PostCode,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Male,Female,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,431-06-4243,Richard,99847,40,140,289,0,0,172,N,0.0,Up,0,1,0,0,1,0,0
1,415-39-7809,Sheppard,64192,49,160,180,0,0,156,N,1.0,Flat,1,0,1,0,0,1,0
2,517-18-4618,Howard,29132,37,130,283,0,1,98,N,0.0,Up,0,1,0,0,1,0,0
3,634-33-8726,Taylor,12930,48,138,214,0,0,108,Y,1.5,Flat,1,0,1,1,0,0,0
4,151-40-1619,Mcgrath,79393,54,150,195,0,0,122,N,0.0,Up,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,623-13-2778,Galloway,94384,45,110,264,0,0,132,N,1.2,Flat,1,1,0,0,0,0,1
914,785-78-9640,Cochran,21419,68,144,193,1,0,141,N,3.4,Flat,1,1,0,1,0,0,0
915,795-34-3503,Franklin,57897,57,130,131,0,0,115,Y,1.2,Flat,1,1,0,1,0,0,0
916,076-03-0800,Herrera,65198,57,130,236,0,2,174,N,0.0,Flat,1,0,1,0,1,0,0


In [7]:
# Remove any "-" character from RestingBP and store the result as integers.
# Done with vectorized string operations instead of a per-row Python lambda.
df["RestingBP"] = (
	df["RestingBP"]
	.astype(str)
	.str.replace("-", "", regex=False)
	.astype("int64")
)

# Keep only rows with a RestingBP above zero. The explicit copy makes the
# filtered result an independent frame, so adding columns to it later does not
# raise pandas' SettingWithCopyWarning.
df = df[df["RestingBP"] > 0].copy()
df.head()

Unnamed: 0,CaseNumber,LastName,PostCode,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Male,Female,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,431-06-4243,Richard,99847,40,140,289,0,0,172,N,0.0,Up,0,1,0,0,1,0,0
1,415-39-7809,Sheppard,64192,49,160,180,0,0,156,N,1.0,Flat,1,0,1,0,0,1,0
2,517-18-4618,Howard,29132,37,130,283,0,1,98,N,0.0,Up,0,1,0,0,1,0,0
3,634-33-8726,Taylor,12930,48,138,214,0,0,108,Y,1.5,Flat,1,0,1,1,0,0,0
4,151-40-1619,Mcgrath,79393,54,150,195,0,0,122,N,0.0,Up,0,1,0,0,0,1,0


In [8]:
# Add a binary column: 1 where Cholesterol is greater than 0, otherwise 0.
# assign() builds a new frame rather than writing into a possibly-filtered one,
# so no SettingWithCopyWarning is raised and the global
# pd.options.mode.chained_assignment setting never has to be switched off
# (and cannot be left disabled if this cell fails halfway).
df = df.assign(HasCholesterol=(df["Cholesterol"] > 0).astype(int))

# Display the dataframe
df.head()


Unnamed: 0,CaseNumber,LastName,PostCode,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Male,Female,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,HasCholesterol
0,431-06-4243,Richard,99847,40,140,289,0,0,172,N,0.0,Up,0,1,0,0,1,0,0,1
1,415-39-7809,Sheppard,64192,49,160,180,0,0,156,N,1.0,Flat,1,0,1,0,0,1,0,1
2,517-18-4618,Howard,29132,37,130,283,0,1,98,N,0.0,Up,0,1,0,0,1,0,0,1
3,634-33-8726,Taylor,12930,48,138,214,0,0,108,Y,1.5,Flat,1,0,1,1,0,0,0,1
4,151-40-1619,Mcgrath,79393,54,150,195,0,0,122,N,0.0,Up,0,1,0,0,0,1,0,1


In [9]:
# Parameters for splitting the RestingECG column into one indicator column per code
column_sep = "_"
catagory = "RestingECG"
# Label for each numeric code; the list position is the code (0, 1, 2)
catagory_values = ["Normal", "ST", "LVH"]

# Split values of column RestingECG into separate 0/1 columns called RestingECG_$i
df = pd.get_dummies(df, columns=[catagory], dtype=int, prefix=[catagory], prefix_sep=column_sep)

# Rename every RestingECG_$i column to RestingECG_$catagory_values[i].
# errors="raise" keeps the KeyError when one of the expected code columns is absent.
ecg_renames = {
	f"{catagory}{column_sep}{code}": f"{catagory}{column_sep}{label}"
	for code, label in enumerate(catagory_values)
}
df = df.rename(columns=ecg_renames, errors="raise")

# Display the dataframe
df.head()

Unnamed: 0,CaseNumber,LastName,PostCode,Age,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,...,Male,Female,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,HasCholesterol,RestingECG_Normal,RestingECG_ST,RestingECG_LVH
0,431-06-4243,Richard,99847,40,140,289,0,172,N,0.0,...,1,0,0,1,0,0,1,1,0,0
1,415-39-7809,Sheppard,64192,49,160,180,0,156,N,1.0,...,0,1,0,0,1,0,1,1,0,0
2,517-18-4618,Howard,29132,37,130,283,0,98,N,0.0,...,1,0,0,1,0,0,1,0,1,0
3,634-33-8726,Taylor,12930,48,138,214,0,108,Y,1.5,...,0,1,1,0,0,0,1,1,0,0
4,151-40-1619,Mcgrath,79393,54,150,195,0,122,N,0.0,...,1,0,0,0,1,0,1,1,0,0


In [10]:
# List the distinct raw values of ExerciseAngina before encoding them.
pd.unique(df["ExerciseAngina"])

array(['N', 'Y'], dtype=object)

In [11]:
# Encode ExerciseAngina as an integer flag: "N" becomes 0, every other value becomes 1.
is_not_n = df["ExerciseAngina"] != "N"
df["ExerciseAngina"] = is_not_n.astype("int64")
df.head()

Unnamed: 0,CaseNumber,LastName,PostCode,Age,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,...,Male,Female,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,HasCholesterol,RestingECG_Normal,RestingECG_ST,RestingECG_LVH
0,431-06-4243,Richard,99847,40,140,289,0,172,0,0.0,...,1,0,0,1,0,0,1,1,0,0
1,415-39-7809,Sheppard,64192,49,160,180,0,156,0,1.0,...,0,1,0,0,1,0,1,1,0,0
2,517-18-4618,Howard,29132,37,130,283,0,98,0,0.0,...,1,0,0,1,0,0,1,0,1,0
3,634-33-8726,Taylor,12930,48,138,214,0,108,1,1.5,...,0,1,1,0,0,0,1,1,0,0
4,151-40-1619,Mcgrath,79393,54,150,195,0,122,0,0.0,...,1,0,0,0,1,0,1,1,0,0


In [12]:
# One-hot encode ST_Slope into ST_Slope_<value> columns holding 0/1 integers.
df = pd.get_dummies(
	df,
	columns=["ST_Slope"],
	prefix=["ST_Slope"],
	prefix_sep="_",
	dtype=int,
)
df.head()

Unnamed: 0,CaseNumber,LastName,PostCode,Age,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,...,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,HasCholesterol,RestingECG_Normal,RestingECG_ST,RestingECG_LVH,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,431-06-4243,Richard,99847,40,140,289,0,172,0,0.0,...,1,0,0,1,1,0,0,0,0,1
1,415-39-7809,Sheppard,64192,49,160,180,0,156,0,1.0,...,0,1,0,1,1,0,0,0,1,0
2,517-18-4618,Howard,29132,37,130,283,0,98,0,0.0,...,1,0,0,1,0,1,0,0,0,1
3,634-33-8726,Taylor,12930,48,138,214,0,108,1,1.5,...,0,0,0,1,1,0,0,0,1,0
4,151-40-1619,Mcgrath,79393,54,150,195,0,122,0,0.0,...,0,1,0,1,1,0,0,0,0,1


In [13]:
# HeartDisease is stored as text and mixes digits with words; turn "yes"/"no"
# into "1"/"0" and convert the column to integers so it can be summed later.
print(f"Before cleaning:{df['HeartDisease'].unique()}")
df["HeartDisease"] = (
	df["HeartDisease"]
	.str.replace("yes", "1", regex=False)
	.str.replace("no", "0", regex=False)
	.astype(int)
)
print(f"After cleaning:{df['HeartDisease'].unique()}")
df.head()

Before cleaning:['0' '1' 'yes']
After cleaning:[0 1]


Unnamed: 0,CaseNumber,LastName,PostCode,Age,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,...,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,HasCholesterol,RestingECG_Normal,RestingECG_ST,RestingECG_LVH,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,431-06-4243,Richard,99847,40,140,289,0,172,0,0.0,...,1,0,0,1,1,0,0,0,0,1
1,415-39-7809,Sheppard,64192,49,160,180,0,156,0,1.0,...,0,1,0,1,1,0,0,0,1,0
2,517-18-4618,Howard,29132,37,130,283,0,98,0,0.0,...,1,0,0,1,0,1,0,0,0,1
3,634-33-8726,Taylor,12930,48,138,214,0,108,1,1.5,...,0,0,0,1,1,0,0,0,1,0
4,151-40-1619,Mcgrath,79393,54,150,195,0,122,0,0.0,...,0,1,0,1,1,0,0,0,0,1


In [14]:
# Create the output directory if it is missing. exist_ok=True replaces the
# separate "exists() == False" check, so there is no gap between checking and
# creating the directory.
Path("out").mkdir(exist_ok=True)

# Write the cleaned data without the pandas index, using ";" as field
# separator and "," as decimal mark.
# NOTE(review): CaseNumber, LastName and PostCode are still part of the frame
# at this point — confirm these identifiers are meant to be exported.
df.to_csv("out/Clean.csv", index=False, sep=";", decimal=",")