In [21]:
# This notebook demonstrates a complete ETL pipeline for the UCI Heart Disease dataset.
# We will load, clean, preprocess, and transform the data using Pandas and Scikit-learn,
# then export the processed dataset and pipeline for future modeling tasks.
# 2. Import required libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib

In [23]:

# 3. Load the dataset
# Location of the processed Cleveland data file; the raw file carries no header row.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "heart-disease/processed.cleveland.data"
)

In [25]:
# Column labels, taken from the dataset documentation (the file itself has none)
column_names = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target",
]

# Parse the remote CSV into a DataFrame; '?' entries are read as NaN
df = pd.read_csv(
    url,
    header=None,
    names=column_names,
    na_values=["?"],
)

# Quick look at the first rows
print(df.head())

    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca  thal  target  
0    3.0  0.0   6.0       0  
1    2.0  3.0   3.0       2  
2    2.0  2.0   7.0       1  
3    3.0  0.0   3.0       0  
4    1.0  0.0   3.0       0  


In [27]:

# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare; wrapping it in print() appends a stray "None" line.
df.info()

# Per-column count of missing (NaN) entries. The heading and the Series are
# printed separately so the first row is not shifted by print()'s separator space.
print("\nMissing values by column:")
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
 13  target    303 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 33.3 KB
None

Missing values by column:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64

In [39]:
# %% [markdown]
"""
## Target Variable Overview

Target variable indicates presence of heart disease (0 = no disease, 1-4 = disease).

We will convert this into a binary classification target: 0 = no disease, 1 = disease present.
"""

# %%
# Transform target into binary outcome: 0 stays 0, any value above 0 becomes 1.
# A vectorised comparison replaces the per-row Python lambda; the result is the
# same 0/1 int64 column, and re-running the cell leaves it unchanged.
df['target'] = df['target'].gt(0).astype('int64')
print(df['target'].value_counts())

target
0    164
1    139
Name: count, dtype: int64


In [31]:
# %% [markdown]
"""
## Handling Missing Values & Data Types

- 'ca' and 'thal' columns have missing values.
- Several columns are categorical (sex, cp, fbs, restecg, exang, slope, ca, thal).
- Prepare column lists for numeric and categorical separately.
"""

# %%
# Column groups consumed by the preprocessing pipeline:
# numeric columns are imputed and scaled, categorical ones imputed and one-hot encoded.
numeric_features = [
    "age",
    "trestbps",
    "chol",
    "thalach",
    "oldpeak",
]
categorical_features = [
    "sex",
    "cp",
    "fbs",
    "restecg",
    "exang",
    "slope",
    "ca",
    "thal",
]


In [33]:
# %% [markdown]
"""
## Build Preprocessing Pipelines

- Numeric pipeline: impute missing with median, then scale.
- Categorical pipeline: impute missing with mode, one-hot encode.
"""

# %%
# Numeric branch: fill gaps with the column median, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot encode;
# categories not seen during fit are ignored at transform time
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


In [35]:
# %% [markdown]
"""
## Apply Preprocessing Pipeline to Features

Separate target and features, fit and transform features using pipeline.
"""

# %%
# Split the frame into the label column and the feature matrix
y = df['target']
X = df.drop(columns=['target'])

# Fit the preprocessing steps on X and apply them in the same call
X_processed = preprocessor.fit_transform(X)

print("Processed feature shape:", X_processed.shape)


Processed feature shape: (303, 28)


In [37]:
# %% [markdown]
"""
## Export Processed Dataset and Preprocessing Pipeline

- Save transformed dataset to CSV.
- Save pipeline as a pickle file for reuse in modeling.
"""

# %%
# Save processed data to CSV (as dense array)
# The transformer may hand back a sparse matrix; densify it only in that case.
X_dense = X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed

# Label every output column with the name generated by the fitted transformer.
# Without this the CSV header is just 0..N-1 and the scaled / one-hot columns
# cannot be traced back to their source features.
feature_names = preprocessor.get_feature_names_out()
processed_df = pd.DataFrame(X_dense, columns=feature_names)
processed_df['target'] = y.values  # Append target back
processed_df.to_csv('heart_disease_processed.csv', index=False)

# Save pipeline
joblib.dump(preprocessor, 'heart_disease_preprocessor.pkl')

['heart_disease_preprocessor.pkl']