# PROJECT 4.2 - HEART DECEASE
## `DATA PREPROCESSING`

This module is to create the original dataset and preprocessing (prp_dataset) after standardized.

### Import Library

In [1]:
import os
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from transformers.image_utils import valid_coco_detection_annotations

# import warnings
# warnings.filterwarnings("ignore")
os.environ["PYTHONHASHSEED"] = "42"
np.random.seed(42)
random.seed(42)

datapath = "../dataset/cleveland.csv"

## 1.LOADING DATASET

In [2]:
COLS = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach",
           "exang", "oldpeak", "slope", "ca", "thal", "target"]

df = pd.read_csv(datapath, header=None, names=COLS)
print(df.info())
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
 13  target    303 non-null    int64  
dtypes: float64(3), int64(11)
memory usage: 33.3 KB
None


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0
5,56,1,2,120,236,0,0,178,0,0.8,1,0.0,3.0,0
6,62,0,4,140,268,0,2,160,0,3.6,3,2.0,3.0,3
7,57,0,4,120,354,0,0,163,1,0.6,1,0.0,3.0,0
8,63,1,4,130,254,0,2,147,0,1.4,2,1.0,7.0,2
9,53,1,4,140,203,1,2,155,1,3.1,3,0.0,7.0,1


Setting columns for dataset loaded from Internet.

👉 Normally this data type will be defined by the medical experts and they can provide us the information of this data.

In [3]:
numeric_cols = ["age","trestbps","chol","thalach","oldpeak"]
categorical_cols = ["sex","cp","fbs","restecg","exang","slope","ca","thal"]
feature_cols = [_ for _ in df.columns if "target" not in _]
print(f"❕ Feature names list: {feature_cols}")

❕ Feature names list: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']


## 2. COARSE CLEAN DATA

In [4]:
# Convert all value from cols of numerical cols to value
# Any undefined value shall be changed to NAN
numeric_converted_cols = ["age","trestbps","chol","thalach","oldpeak", "ca", "thal"]

for c in numeric_converted_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

print(df.isna().sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64


Features `thal` and `ca` have `null` values. We need to verify these value for data cleaning

In [5]:
print(df[df["thal"].isnull()])

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
87    53    0   3       128   216    0        2      115      0      0.0   
266   52    1   4       128   204    1        0      156      1      1.0   

     slope   ca  thal  target  
87       1  0.0   NaN       0  
266      2  0.0   NaN       2  


In [6]:
print(df[df["ca"].isnull()])

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
166   52    1   3       138   223    0        0      169      0      0.0   
192   43    1   4       132   247    1        2      143      1      0.1   
287   58    1   2       125   220    0        0      144      0      0.4   
302   38    1   3       138   175    0        0      173      0      0.0   

     slope  ca  thal  target  
166      1 NaN   3.0       0  
192      2 NaN   7.0       1  
287      2 NaN   7.0       0  
302      1 NaN   3.0       0  


## 4. EXPORT TO TRAIN/ TEST/ VALIDATION SET

In [5]:
feature_df = df.iloc[:, :-1]
target_df = df.iloc[:, -1]

X_train, X_buff, y_train, y_buff = train_test_split(feature_df, target_df, stratify=target_df, test_size=.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_buff, y_buff, stratify=y_buff, test_size=.5, random_state=42)
print(f"✅ Train data size: {X_train.shape}  Test data size: {X_test.shape}  Validation data size: {X_val.shape}")
print(f"✅ Train target size: {y_train.shape}  Test target size: {y_test.shape}  Validation target size: {y_val.shape}")

✅ Train data size: (242, 13)  Test data size: (31, 13)  Validation data size: (30, 13)
✅ Train target size: (242,)  Test target size: (31,)  Validation target size: (30,)


### Export to original dataset

In [8]:
dir_path = "../dataset/"
train_dataset = pd.concat([X_train, y_train], axis=1)
val_dataset = pd.concat([X_val, y_val], axis=1)
test_dataset = pd.concat([X_test, y_test], axis=1)

train_dataset.to_csv(dir_path + "train.csv", index=False)
val_dataset.to_csv(dir_path + "val.csv", index=False)
test_dataset.to_csv(dir_path + "test.csv", index=False)
print("✅ Save original dataset to train, validation, test csv files successfully")

✅ Save original dataset to train, validation, test csv files successfully


### Preprocessing PIPELINES data

`
Chú ý trong quá tình khi thực hiện processing data, toàn bộ index sẽ bị lost với dữ liệu feature. Do đó khi ghép lại với target cần phải xử lý lại bằng cách reset_index
`

In [9]:
# Use MinMaxScaler for category features and fill "nan" by most frequent value
category_ppl = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent", fill_value=-np.inf)),
    ("scaler", MinMaxScaler())
])
# Use SimpleImputer and StandardScaler to standardize numerical features then fill "nan" by median value
numerical_ppl = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median", fill_value=-np.inf)),
    ("scaler", StandardScaler())
])

preprocessing = ColumnTransformer([
    ("numerical", numerical_ppl, numeric_cols),
    ("categorical", category_ppl, categorical_cols)
])

processing_ppl = Pipeline([
    ("preprocessing", preprocessing)
])

X_train_ppl = processing_ppl.fit_transform(X_train)
X_val_ppl = processing_ppl.transform(X_val)
X_test_ppl = processing_ppl.transform(X_test)
print(X_train_ppl.shape)
print(X_val_ppl.shape)
print(X_test_ppl.shape)
X_train_ppl[:10]

(242, 13)
(30, 13)
(31, 13)


array([[ 0.59494536,  1.0209107 ,  0.16901357,  0.30768014,  1.32013204,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.5       ,  0.66666667,  1.        ],
       [-1.7113188 ,  0.46333817,  1.32711737,  1.41175179, -0.90079598,
         1.        ,  0.66666667,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.06398726,  0.01728014,  0.72049157,  0.39600587, -0.90079598,
         0.        ,  0.33333333,  1.        ,  1.        ,  1.        ,
         0.        ,  0.33333333,  0.        ],
       [ 0.15565695, -0.09423437, -0.51114262,  0.57265733, -0.90079598,
         1.        ,  0.33333333,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [ 1.36370008, -0.37302063,  0.09548317,  0.57265733, -0.72995536,
         1.        ,  1.        ,  1.        ,  0.        ,  0.        ,
         0.5       ,  0.66666667,  1.        ],
       [ 0.04583484, -0.094234

In [10]:
# This function is to verify again the feature column names after processing the pipelines
# yếu tố này rất quan trọng vì nếu apply lại nguyên feature cũ các cột feature name sẽ không còn chính xác nữa
preprocess_feature_name = []
for name, transformer, columns in preprocessing.transformers_:
    if hasattr(transformer, "get_feature_names_out"):
        preprocess_feature_name.extend(transformer.get_feature_names_out(columns))
    else:
        preprocess_feature_name.append(columns)

print(preprocessing.transformers_)
print(feature_cols)
print(preprocess_feature_name)

[('numerical', Pipeline(steps=[('imputer', SimpleImputer(fill_value=-inf, strategy='median')),
                ('scaler', StandardScaler())]), ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']), ('categorical', Pipeline(steps=[('imputer',
                 SimpleImputer(fill_value=-inf, strategy='most_frequent')),
                ('scaler', MinMaxScaler())]), ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal'])]
['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']


In [11]:
# Create fine-tuned train dataset
train_feature_df = pd.DataFrame(X_train_ppl, columns=preprocess_feature_name, index=X_train.index)

prp_train_dataset = pd.concat([train_feature_df, y_train], axis=1)
prp_train_dataset.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex,cp,fbs,restecg,exang,slope,ca,thal,target
97,0.594945,1.020911,0.169014,0.30768,1.320132,0.0,1.0,0.0,1.0,0.0,0.5,0.666667,1.0,3
82,-1.711319,0.463338,1.327117,1.411752,-0.900796,1.0,0.666667,0.0,1.0,0.0,0.0,0.0,0.0,0
167,-0.063987,0.01728,0.720492,0.396006,-0.900796,0.0,0.333333,1.0,1.0,1.0,0.0,0.333333,0.0,0
288,0.155657,-0.094234,-0.511143,0.572657,-0.900796,1.0,0.333333,0.0,1.0,0.0,0.0,0.0,1.0,0
71,1.3637,-0.373021,0.095483,0.572657,-0.729955,1.0,1.0,1.0,0.0,0.0,0.5,0.666667,1.0,3


In [12]:
# Create fine-tuned validation set
val_feature_df = pd.DataFrame(X_val_ppl, columns=preprocess_feature_name, index=X_val.index)

prp_val_dataset = pd.concat([val_feature_df, y_val], axis=1)
prp_val_dataset.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex,cp,fbs,restecg,exang,slope,ca,thal,target
197,-1.052386,0.351824,-0.235404,0.086866,-0.729955,0.0,1.0,0.0,1.0,1.0,0.5,0.0,0.0,0
137,0.81459,-0.651807,0.591813,-2.077115,0.295088,1.0,0.333333,0.0,1.0,0.0,0.5,0.333333,1.0,3
67,-0.063987,1.020911,-0.308934,0.660983,0.465929,1.0,0.666667,0.0,1.0,0.0,0.0,0.0,1.0,0
94,0.924412,0.184552,0.058718,0.970123,-0.900796,0.0,0.666667,0.0,1.0,0.0,0.0,0.0,0.0,0
33,0.485123,0.184552,-0.272169,0.484332,-0.473694,1.0,1.0,0.0,0.0,0.0,0.5,0.0,1.0,0


In [13]:
# Create fine-tuned test set
test_feature_df = pd.DataFrame(X_test_ppl, columns=preprocess_feature_name, index=X_test.index)

prp_test_dataset = pd.concat([test_feature_df, y_test], axis=1)
prp_test_dataset.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex,cp,fbs,restecg,exang,slope,ca,thal,target
157,0.375301,-0.373021,0.941083,0.92596,-0.900796,1.0,1.0,0.0,1.0,0.0,0.0,0.666667,1.0,1
101,-2.260429,-0.763321,-1.228064,1.058449,-0.900796,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
92,0.81459,-0.094234,-0.327317,-0.178111,0.63677,1.0,0.666667,0.0,0.0,0.0,0.5,1.0,1.0,0
89,-0.393454,-0.094234,0.132248,-0.045623,-0.473694,0.0,0.666667,0.0,1.0,0.0,0.0,0.0,0.0,0
186,-1.381852,-0.651807,-0.161873,1.941706,-0.217434,1.0,0.666667,1.0,0.0,0.0,1.0,0.0,1.0,0


### Export to csv for Pre processing dataset

In [14]:
dir_path = "../dataset/"
prp_train_dataset.to_csv(dir_path + "prp_train.csv", index=False)
prp_val_dataset.to_csv(dir_path + "prp_val.csv", index=False)
prp_test_dataset.to_csv(dir_path + "prp_test.csv", index=False)
print("✅ Save preprocessing dataset to train, validation, test csv files successfully")

✅ Save preprocessing dataset to train, validation, test csv files successfully
