In [7]:
import pandas as pd
import numpy as np
import json
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [8]:
# Load the raw survey export (path is relative to the notebook's working directory).
df = pd.read_csv('heart_2020_uncleaned.csv')
print("Initial shape:", df.shape)

# --- data cleaning ---
# Numeric columns: replace missing values with the column mean.
# NOTE(review): the means are computed over every row, i.e. before the
# train/test split performed in a later cell, so held-out rows influence the
# imputed values. Consider moving imputation into the model pipeline.
df = df.fillna(df.mean(numeric_only=True))

# Text columns: normalise whitespace/case FIRST, then impute with the mode.
# Computing the mode on the raw strings would count variants such as
# "yes", " Yes" and "YES" as different values and could pick the wrong fill.
# NOTE(review): if the label column is text it is imputed here as well —
# TODO confirm that fabricating labels for rows with a missing target is intended.
text_cols = df.select_dtypes(include='object').columns
for col in text_cols:
    normalised = df[col].str.strip().str.lower().str.capitalize()
    df[col] = normalised.fillna(normalised.mode()[0])

# Exact duplicates are removed only after normalisation, so rows that differed
# solely by casing/whitespace collapse into one. The index is not reset.
df = df.drop_duplicates()

print("Cleaned shape:", df.shape)
# Last expression of the cell: per-column count of remaining missing values.
df.isnull().sum()

Initial shape: (319795, 18)
Cleaned shape: (301752, 18)


HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [9]:
# define features and target
target_col = 'HeartDisease'
X = df.drop(columns=[target_col])
y = df[target_col]

# split (num vs. cat)
# 'number' selects every numeric dtype rather than only int64/float64, so a
# column stored as e.g. int32 or float32 is not silently left out of both
# lists (and then dropped by a ColumnTransformer that is only given these lists).
num_feats = X.select_dtypes(include='number').columns.tolist()
cat_feats = X.select_dtypes(include='object').columns.tolist()

print("Numerical Features:", num_feats)
print("Categorical Features:", cat_feats)


Numerical Features: ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']
Categorical Features: ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']


In [10]:
# Preprocessing + model pipeline.
# Numeric columns are standardised; categorical columns are one-hot encoded,
# with handle_unknown='ignore' so categories not seen during fit do not raise.
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each group of columns (lists built in the previous cell) to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_feats),
        ('cat', cat_transformer, cat_feats),
    ]
)

# Chain preprocessing and a seeded 10-tree random forest into one estimator,
# so fit/score/predict apply the same transformations every time.
pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(random_state=42, n_estimators=10)),
])

In [11]:
# train-test split + model training
# Hold out 20% of the rows. stratify=y keeps the label proportions the same in
# both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fits the preprocessing steps and the classifier on the training rows only.
pipe.fit(X_train, y_train)

# Pipeline.score uses the final estimator's default metric (mean accuracy for
# a classifier).
score = pipe.score(X_test, y_test)
print(f"Test Accuracy: {score:.2f}")

# Reference point for the number above: the accuracy obtained by always
# predicting the most frequent label in the test set. If the classes are
# unbalanced, an accuracy close to this value means the model adds little.
baseline = y_test.value_counts(normalize=True).max()
print(f"Majority-class baseline: {baseline:.2f}")

Test Accuracy: 0.90


In [12]:
# Persist the fitted pipeline (preprocessing + classifier) as a single file.
joblib.dump(pipe, 'model_pipeline.pkl')

# Write the numeric/categorical column lists next to the model as JSON.
meta = {'num_feats': num_feats, 'cat_feats': cat_feats}
with open('feature_metadata.json', 'w') as fh:
    fh.write(json.dumps(meta))

print("✅ Model and metadata saved!")

✅ Model and metadata saved!
