In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# 1. Load Data
Load the cleaned dataset from the previous step. 
This dataset already contains the imputed values and the 'target' binary column.

In [None]:
# Load the cleaned dataset written by the previous step.
df = pd.read_csv("processed_data/heart_disease_cleaned.csv")

# Ensure 'target' column exists, if not recreate it from 'num':
# any 'num' value above 0 maps to 1, everything else maps to 0.
if 'target' not in df.columns:
    if 'num' not in df.columns:
        # Neither column is available, so the label cannot be built.
        # Raise with an actionable message instead of a bare KeyError: 'num'.
        raise KeyError(
            "Expected a 'target' or 'num' column in "
            "processed_data/heart_disease_cleaned.csv"
        )
    # Vectorized comparison instead of a per-row apply/lambda.
    df['target'] = (df['num'] > 0).astype(int)

# Drop the original 'num' column as we are doing binary classification
if 'num' in df.columns:
    df = df.drop(columns='num')

# Check data
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()

# 2. Encoding Categorical Variables
Use One-Hot Encoding for categorical variables with more than two levels:
- cp (Chest Pain Type)
- restecg (Resting ECG)
- slope (Slope of peak exercise ST segment)
- thal (Thalassemia)

In [None]:
# Categorical features that are expanded into indicator (dummy) columns
categorical_cols = ['cp', 'restecg', 'slope', 'thal']

# One-hot encode with pandas. drop_first=True removes the first level of each
# feature, which avoids the dummy variable trap (multicollinearity).
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)

print("Encoded Shape:", df_encoded.shape)
df_encoded.head()

# 3. Splitting the Data
Perform an 80/20 Train-Test split. Use a random_state for reproducibility.

In [None]:
# Define Features (X) and Target (y)
X = df_encoded.drop(columns='target')
y = df_encoded['target']

# Split the data: 80% train / 20% test.
# random_state=42 is a common convention for reproducibility.
# stratify=y keeps the proportion of each 'target' class the same in the
# training and testing subsets, so neither split ends up with a skewed
# class balance by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training Shape: {X_train.shape}")
print(f"Testing Shape: {X_test.shape}")

# 4. Feature Scaling
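Apply StandardScaler to the continuous numerical features only; binary, ordinal, and one-hot encoded columns are left unscaled.
Crucial step: we fit the scaler ONLY on the training data, then use that fitted scaler to transform both the training and the testing data.
This prevents 'Data Leakage' (peeking at the test set): the test set's statistics never influence the preprocessing.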

In [None]:
# Define numerical columns that need scaling
# Note: 'sex', 'fbs', 'exang', 'ca' are essentially categorical/ordinal or binary.
# We will scale the continuous variables: Age, Trestbps, Chol, Thalach, Oldpeak
numeric_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# X_train / X_test are subsets derived from X. Assigning columns on such
# derived frames can trigger pandas' SettingWithCopyWarning, so take explicit
# copies first and write the scaled values into those.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()

# Fit on Training Data ONLY, so no statistic of the test set is used
scaler.fit(X_train[numeric_cols])

# Transform Training and Testing Data with the scaler fitted above
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

print("Scaling done.")
X_train.head()

# 5. Save Processed Data
Save the processed training and testing sets separately to preserve the split and scaling.

In [None]:
# Put the labels back next to their features (optional, but convenient):
# assign() returns a new frame with 'target' added, aligned on the row index,
# and leaves X_train / X_test themselves untouched.
train_data = X_train.assign(target=y_train)
test_data = X_test.assign(target=y_test)

# Write each split to its own CSV, without the index column
train_data.to_csv("processed_data/train_processed.csv", index=False)
test_data.to_csv("processed_data/test_processed.csv", index=False)

print("Processed files saved:")
print("- processed_data/train_processed.csv")
print("- processed_data/test_processed.csv")