In [1]:
from pathlib import Path

import numpy as np
import pandas as pd


# Load Raw Data

In [2]:
# Read the raw CSV into `df`; the relative path is resolved against the
# kernel's working directory. Then preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../data/raw/heart.csv")
df.head(n=5)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


# Check for Missing Values

The null counts below are zero for every column, so no imputation or row removal is applied.

In [3]:
# Count missing entries per column (`isna` performs the same check as `isnull`).
df.isna().sum()


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

# Separate Target

In [4]:
# Split the frame into the label column (`target`) and every remaining column.
# `drop` returns a new DataFrame, so `df` itself keeps its `target` column.
y = df["target"]
X = df.drop(columns=["target"])


# Age-Based Risk Features

In [5]:
# Squared age, stored next to the raw `age` column.
X["age_squared"] = X["age"].pow(2)


# Cholesterol Risk Ratio

In [6]:
# `chol` divided element-wise by `age`.
# NOTE(review): a zero age would yield inf here; the summary statistics for
# this dataset show a minimum age of 29, so the ratio stays finite.
X["chol_age_ratio"] = X["chol"].div(X["age"])


# Blood Pressure Stress Feature

In [7]:
# `trestbps` divided element-wise by `age`.
X["bp_age_ratio"] = X["trestbps"].div(X["age"])


# Cardiac Stress Score

In [8]:
# Composite score: oldpeak + exang + 1 / (thalach + 1), summed in that order.
# The "+ 1" in the denominator keeps the inverse defined even if thalach is 0.
# NOTE(review): for the thalach range shown in the summary statistics (71-202)
# the inverse term is only about 0.005-0.014, so the score is dominated by
# oldpeak and exang -- confirm this weighting is intended.
X["cardiac_stress_score"] = (
    X["oldpeak"]
    .add(X["exang"])
    .add(1 / (X["thalach"] + 1))
)


# Vessel Severity Feature

In [9]:
# Element-wise product of `ca` and `thal`; a row where either value is 0
# therefore gets a severity of 0.
X["vessel_severity"] = X["ca"].mul(X["thal"])


# Check New Features

In [10]:
# Preview the first five rows to confirm the engineered columns were appended.
X.head(n=5)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,age_squared,chol_age_ratio,bp_age_ratio,cardiac_stress_score,vessel_severity
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,2704,4.076923,2.403846,1.005917,6
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,2809,3.830189,2.641509,4.10641,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,4900,2.485714,2.071429,3.607937,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,3721,3.327869,2.42623,0.006173,3
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,3844,4.741935,2.225806,1.909346,6


In [11]:
# Summary statistics for every feature column; the quartiles listed here are
# the ones `describe` reports by default, spelled out for clarity.
X.describe(percentiles=[0.25, 0.5, 0.75])


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,age_squared,chol_age_ratio,bp_age_ratio,cardiac_stress_score,vessel_severity
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,3045.302439,4.618424,2.477227,1.414943,1.847805
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,972.159492,1.095417,0.480763,1.396753,2.656114
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,841.0,2.098592,1.492537,0.004926,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,2304.0,3.836066,2.166667,0.007634,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,3136.0,4.462687,2.4,1.008264,0.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,3721.0,5.296296,2.727273,2.307194,3.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,5929.0,8.41791,4.482759,6.608929,12.0


# Feature Correlation After Engineering

In [12]:
# Attach the label to a copy of the feature table (`X` itself is not modified),
# then rank every column by its correlation with `target`, strongest positive
# first. `target` correlates perfectly with itself, so it tops the list.
engineered_df = X.assign(target=y)

(
    engineered_df
    .corr()
    .loc[:, "target"]
    .sort_values(ascending=False)
)


target                  1.000000
cp                      0.434854
thalach                 0.422895
slope                   0.345512
bp_age_ratio            0.134978
restecg                 0.134468
chol_age_ratio          0.084453
fbs                    -0.041164
chol                   -0.099966
trestbps               -0.138772
age_squared            -0.217695
age                    -0.229324
sex                    -0.279501
thal                   -0.337838
ca                     -0.382085
vessel_severity        -0.393307
exang                  -0.438029
oldpeak                -0.438441
cardiac_stress_score   -0.517464
Name: target, dtype: float64

# Save Processed Dataset

In [13]:
# Assemble the final table: all original and engineered features plus the label.
processed_df = X.copy()
processed_df["target"] = y

# DataFrame.to_csv does not create missing parent directories, so on a fresh
# checkout without data/processed the write would raise OSError. Create the
# directory first; exist_ok=True makes re-running this cell harmless.
processed_path = Path("../data/processed/heart_cleaned.csv")
processed_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the file, so reloading the CSV does
# not produce an extra unnamed column.
processed_df.to_csv(processed_path, index=False)


## Feature Engineering Summary

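- Created age-based, ratio-based, and interaction risk features (`age_squared`, `chol_age_ratio`, `bp_age_ratio`, `vessel_severity`).
- Designed a custom cardiac stress score combining exercise-induced factors (`oldpeak`, `exang`, and the inverse of `thalach`).
- Engineered features aim to reflect medical risk patterns rather than raw measurements.
- The processed dataset (`../data/processed/heart_cleaned.csv`) will be used for model training and risk scoring.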
