## 0. Imports 

In [16]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

## 1. Data preprocessing

### 1.1. Combining files

In [33]:
## Stack the Kaggle test and train splits (test rows first) into one frame
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ("data/kaggletest.csv", "data/kaggletrain.csv")
)
# ignore_index=True discards the per-file row labels and renumbers 0..n-1
combined = pd.concat((df1, df2), ignore_index=True)
# Persist the merged data without writing the row index as a column
combined.to_csv("data/kaggle.csv", index=False)

## Sanity check: no rows were lost or duplicated by the concat
len(combined) == len(df1) + len(df2)

True

### 1.2. Cleaning up column names

In [34]:
# Map the dataset's terse column codes to human-readable labels; columns that
# are not listed here keep their original names.
# NOTE(review): 'rest blood presure' is misspelled ("pressure"). It is kept
# as-is because the label is a runtime column name other cells may rely on.
readable_labels = {
    'cp': 'chest pain type',
    'trestbps': 'rest blood presure',
    'chol': 'serum cholesterol',
    'fbs': 'fasting blood sugar',
    'restecg': 'rest ecg',
    'thalach': 'max heart rate',
    'exang': 'exercise induced angina',
    'thal': 'thalassemia',
    'ca': 'major vessels',
}
combined = combined.rename(columns=readable_labels)

# Preview the first rows to confirm the new labels
combined.head()


Unnamed: 0,age,sex,chest pain type,rest blood presure,serum cholesterol,fasting blood sugar,rest ecg,max heart rate,exercise induced angina,oldpeak,slope,major vessels,thalassemia,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


### 1.3. One-hot encoding categorical variables

In [36]:
# One-hot encode the categorical columns and swap the resulting indicator
# columns in for the originals.
onehotcols = ['chest pain type', 'rest ecg', 'thalassemia', 'major vessels']

# sparse_output=False returns a dense array that can be wrapped in a DataFrame
encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(combined[onehotcols])

# Reuse combined's index: pd.concat(axis=1) aligns rows on index labels, so a
# fresh default RangeIndex would misalign rows (and introduce NaNs) whenever
# `combined` does not carry the default 0..n-1 index, e.g. after filtering.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(),
    index=combined.index,
)

# Drop the raw categorical columns and append their encoded counterparts
df = pd.concat([combined.drop(columns=onehotcols), encoded_df], axis=1)

# Show the final column layout
list(df.columns)

['age',
 'sex',
 'rest blood presure',
 'serum cholesterol',
 'fasting blood sugar',
 'max heart rate',
 'exercise induced angina',
 'oldpeak',
 'slope',
 'target',
 'chest pain type_0',
 'chest pain type_1',
 'chest pain type_2',
 'chest pain type_3',
 'rest ecg_0',
 'rest ecg_1',
 'rest ecg_2',
 'thalassemia_0',
 'thalassemia_1',
 'thalassemia_2',
 'thalassemia_3',
 'major vessels_0',
 'major vessels_1',
 'major vessels_2',
 'major vessels_3',
 'major vessels_4']

## 2. EDA

### 2.1. Intro plots