In [2]:
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# ---------------------------------------------------------------------------
# 1. Load the processed dataset and separate the label from the predictors.
# ---------------------------------------------------------------------------
data = pd.read_csv('data/processed_heart_data.csv')

X = data.drop(columns='target')
y = data['target']

# ---------------------------------------------------------------------------
# 2. Pairwise interaction terms (products of distinct columns only: no
#    squares, no constant column), wrapped in a labelled DataFrame.
#    X_poly_df is only inspected below; it is not written to the CSV or the
#    pickle produced later in this cell.
# ---------------------------------------------------------------------------
poly = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)
X_poly = poly.fit_transform(X)
feature_names = poly.get_feature_names_out(X.columns)
X_poly_df = pd.DataFrame(data=X_poly, columns=feature_names)

# Quick sanity check of the expansion: column counts before/after and a peek
# at the first rows.
print(f"Original features: {X.shape[1]}")
print(f"After polynomial features: {X_poly_df.shape[1]}")
print(X_poly_df.head())

# ---------------------------------------------------------------------------
# 3. Hand-crafted ratio features, appended to the full table (label included).
#    NOTE(review): the `+ 1` offset is meant to keep the denominator away from
#    zero, but the printed head shows negative `chol` / `thalach` values
#    (these columns look centred/scaled), so `col + 1` can still reach or
#    cross zero -- confirm whether the ratios should use unscaled values.
# ---------------------------------------------------------------------------
data = data.assign(
    age_chol_ratio=data['age'] / (data['chol'] + 1),
    trestbps_thalach_ratio=data['trestbps'] / (data['thalach'] + 1),
)

# Persist the table with the two extra columns (row index is not written).
data.to_csv('data/enhanced_heart_data.csv', index=False)

# ---------------------------------------------------------------------------
# 4. Re-split using the enhanced predictors: 80/20 hold-out with a fixed seed
#    so the partition is reproducible across runs.
# ---------------------------------------------------------------------------
X_enhanced = data.drop(columns='target')
y = data['target']

X_train_enh, X_test_enh, y_train_enh, y_test_enh = train_test_split(
    X_enhanced,
    y,
    test_size=0.2,
    random_state=42,
)

# Store the four pieces as one tuple, in (X_train, X_test, y_train, y_test)
# order, for later cells/notebooks to unpickle.
with open('data/enhanced_train_test_split.pkl', 'wb') as split_file:
    pickle.dump(
        (X_train_enh, X_test_enh, y_train_enh, y_test_enh),
        split_file,
    )

Original features: 13
After polynomial features: 91
        age  sex   cp  trestbps      chol  fbs  restecg   thalach  exang  \
0  0.948726  1.0  1.0  0.757525 -0.264900  1.0      2.0  0.017197    0.0   
1  1.392002  1.0  4.0  1.611220  0.760415  0.0      2.0 -1.821905    1.0   
2  1.392002  1.0  4.0 -0.665300 -0.342283  0.0      2.0 -0.902354    1.0   
3 -1.932564  1.0  3.0 -0.096170  0.063974  0.0      0.0  1.637359    0.0   
4 -1.489288  0.0  2.0 -0.096170 -0.825922  0.0      2.0  0.980537    0.0   

    oldpeak  ...  exang oldpeak  exang slope  exang ca  exang thal  \
0  1.087338  ...       0.000000          0.0 -0.000000         0.0   
1  0.397182  ...       0.397182          2.0  2.504881         3.0   
2  1.346147  ...       1.346147          2.0  1.432877         7.0   
3  2.122573  ...       0.000000          0.0 -0.000000         0.0   
4  0.310912  ...       0.000000          0.0 -0.000000         0.0   

   oldpeak slope  oldpeak ca  oldpeak thal  slope ca  slope thal    ca