# :two_hearts: 2 - Preprocessing

## Imports

In [35]:
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [36]:
import warnings
# Install a blanket "ignore" filter so no warning is shown by the cells that follow
warnings.simplefilter("ignore")

## Predefined variables

In [37]:
# Input data directory, the raw CSV inside it, and the output directory for saved pipeline artifacts
DATA_DIR_PATH = "../data/heart-failure/"
CSV_PATH = DATA_DIR_PATH + "heart.csv"
PIPELINE_PATH = "../pipelines/"

# Color settings (presumably for plots; not referenced by the cells visible in this notebook)
COLOR, PALETTE = "crimson", "flare"

## Preprocessing

In [38]:
# Read the raw dataset from disk and preview its first five rows
heart_data = pd.read_csv(filepath_or_buffer=CSV_PATH)
heart_data.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


### Feature & target splitting

In [39]:
# Target is the HeartDisease column; every other column is a feature
y = heart_data["HeartDisease"]
X = heart_data.drop(columns="HeartDisease")

### Feature types

In [40]:
# Split the feature columns by dtype: int64/float64 count as numerical, object/bool as categorical
numerical_features = list(X.select_dtypes(include=["int64", "float64"]).columns)
categorical_features = list(X.select_dtypes(include=["object", "bool"]).columns)

print(f"Numerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")

Numerical features: ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
Categorical features: ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']


### Data cleaning

***BASED ON CONCLUSIONS FROM 01-exploration.ipynb***

Features to be cleaned:
- RestingBP: this contains some values that are physically impossible; imputation with median is required
- Cholesterol: this contains a lot of zeroes, which are physically impossible; imputation with median is required

Features that are normal and don't need to be cleaned:
- Age
- FastingBS
- MaxHR
- Oldpeak (this does have some extreme values but they are medically and physically plausible so they won't be cleaned)
- Sex
- ChestPainType
- RestingECG
- ExerciseAngina
- ST_Slope

In [41]:
# Count each resting blood pressure reading below 90 (the limit under which it is considered low)
print(X["RestingBP"].loc[lambda bp: bp < 90].value_counts())

RestingBP
80    1
0     1
Name: count, dtype: int64


In [42]:
# Count each cholesterol reading below 100 (there is no lower limit; values this low would be favorable)
print(X["Cholesterol"].loc[lambda chol: chol < 100].value_counts())

Cholesterol
0     172
85      1
Name: count, dtype: int64


In [43]:
# Count each cholesterol reading above 240 (the limit above which it is considered high risk)
print(X["Cholesterol"].loc[lambda chol: chol > 240].value_counts())

Cholesterol
254    11
246     8
260     8
263     8
243     7
       ..
262     1
409     1
321     1
353     1
278     1
Name: count, Length: 123, dtype: int64


In [44]:
# Count each cholesterol reading above 500 (already a very high value)
print(X["Cholesterol"].loc[lambda chol: chol > 500].value_counts())

Cholesterol
518    1
529    1
603    1
564    1
Name: count, dtype: int64


In [45]:
# Mark the implausible zero readings in RestingBP and Cholesterol as missing (NaN);
# the imputer in the pipeline fills them in later
X = X.replace({
    "RestingBP": {0: np.nan},
    "Cholesterol": {0: np.nan},
})

In [46]:
# Re-attach the target to the cleaned features and write the result to disk for future use
clean_data = pd.concat(objs=[X, y], axis="columns")
clean_data.to_csv(f"{DATA_DIR_PATH}heart_clean.csv", index=False)

### Preprocessing pipelines

In [47]:
# Numerical columns: fill missing values with the column median
numerical_transformer = Pipeline([("imputer", SimpleImputer(strategy="median"))])

In [48]:
# Categorical columns: fill missing values with the most frequent category, then one-hot encode
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

### Column transformation for all features in the dataset

In [49]:
# Route the numerical columns through the numerical pipeline and the categorical columns through the categorical one
preprocessor_all = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

### Feature subsets

In [50]:
# Named groups of feature columns; FS-1 to FS-3 are hand-picked subsets, each extending the previous one
feature_subsets = {
    "all": [*numerical_features, *categorical_features],
    "numerical": numerical_features,
    "categorical": categorical_features,
    "FS-1": [
        "MaxHR", "Oldpeak", "ST_Slope", "ChestPainType",
    ],
    "FS-2": [
        "MaxHR", "Oldpeak", "ST_Slope", "ChestPainType",
        "RestingECG", "Age",
    ],
    "FS-3": [
        "MaxHR", "Oldpeak", "ST_Slope", "ChestPainType",
        "RestingECG", "Age",
        "Cholesterol", "ExerciseAngina",
    ],
}

### Saving pipelines and artifacts for further use

In [51]:
# Make sure the output directory exists before writing: on a fresh checkout it may be
# missing, and the dumps below would then fail with FileNotFoundError
os.makedirs(PIPELINE_PATH, exist_ok=True)

# Fitted-later building blocks: the full preprocessor and the two per-type pipelines
joblib.dump(preprocessor_all, PIPELINE_PATH + "preprocessor_all.joblib") # optional because here all the features are hardcoded
joblib.dump(numerical_transformer, PIPELINE_PATH + "numerical_transformer.joblib")
joblib.dump(categorical_transformer, PIPELINE_PATH + "categorical_transformer.joblib")

# Column-name artifacts: the named feature subsets and the per-type feature lists
joblib.dump(feature_subsets, PIPELINE_PATH + "feature_subsets.joblib")
joblib.dump(numerical_features, PIPELINE_PATH + "numerical_features.joblib")
joblib.dump(categorical_features, PIPELINE_PATH + "categorical_features.joblib")

['../pipelines/categorical_features.joblib']