# :two_hearts: 3 - Modeling

## Imports

In [125]:
import joblib
import pandas as pd

from pathlib import Path
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

## Predefined variables

In [126]:
# Input dataset that is read into `heart_data` further down.
CSV_PATH = "../data/heart-failure/heart_clean.csv"
# Directory prefix of the persisted preprocessing artifacts. The trailing slash
# is required: artifact file names are appended with plain string formatting.
PIPELINE_PATH = "../pipelines/"

## Create output directories

In [127]:
# Make sure every output location exists before anything is written to it.
# Only the nested splits directory asks for missing parents to be created.
for directory, create_parents in (
    ("../models", False),
    ("../results", False),
    ("../data/splits", True),
):
    Path(directory).mkdir(parents=create_parents, exist_ok=True)

## Data and artifacts loading

In [128]:
# Read the dataset from CSV_PATH and preview its first rows as the cell output.
heart_data = pd.read_csv(CSV_PATH)
heart_data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140.0,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160.0,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130.0,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138.0,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150.0,195.0,0,Normal,122,N,0.0,Up,0


In [129]:
# Separate the label column from the feature matrix.
y = heart_data["HeartDisease"]
X = heart_data.drop(columns="HeartDisease")

In [130]:
# Load the preprocessing artifacts stored under PIPELINE_PATH (presumably
# written by an earlier preprocessing notebook — TODO confirm).
# NOTE(review): joblib.load unpickles arbitrary objects, so only load files
# that come from a trusted source.
# Mapping of subset name -> feature names; iterated with .items() in the training loop.
feature_subsets = joblib.load(f"{PIPELINE_PATH}feature_subsets.joblib")
# Transformers plugged into the 'num' and 'cat' branches of the ColumnTransformer.
numerical_pipeline = joblib.load(f"{PIPELINE_PATH}numerical_transformer.joblib")
categorical_pipeline = joblib.load(f"{PIPELINE_PATH}categorical_transformer.joblib")
# Feature-name collections used by build_preprocessor to route each feature to a branch.
numerical_features = joblib.load(f"{PIPELINE_PATH}numerical_features.joblib")
categorical_features = joblib.load(f"{PIPELINE_PATH}categorical_features.joblib")

In [131]:
def build_preprocessor(subset):
    """Build the ColumnTransformer for one feature subset.

    Each name in ``subset`` is routed to the numerical and/or categorical
    branch according to its membership in ``numerical_features`` and
    ``categorical_features``; the order of ``subset`` is preserved within
    each branch. Names that belong to neither collection are not passed to
    any branch, and every column not selected is dropped.

    Parameters
    ----------
    subset : iterable of str
        Feature names that make up the subset.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer with a 'num' and a 'cat' branch.
    """
    def _select(known_features):
        # Keep the subset's own ordering while filtering by membership.
        return [name for name in subset if name in known_features]

    branches = [
        ('num', numerical_pipeline, _select(numerical_features)),
        ('cat', categorical_pipeline, _select(categorical_features)),
    ]
    return ColumnTransformer(transformers=branches, remainder='drop')

## Train-test split

In [132]:
# Hold out 25% of the rows for testing. The split is stratified on the label
# and uses a fixed seed so it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [133]:
# Persist the held-out features and labels (without the index column) under
# ../data/splits.
X_test.to_csv("../data/splits/X_test.csv", index=False)
y_test.to_csv("../data/splits/y_test.csv", index=False)

## Models & hyperparameter grids

In [134]:
# Every key carries the `classifier__` prefix so the value is applied to the
# pipeline step named 'classifier'.
# XGBoost grid: 3 x 3 x 3 x 3 = 81 candidate combinations.
param_grid_xgb = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.03, 0.05, 0.1],
    'classifier__max_depth': [3, 5, 7],
    'classifier__subsample': [0.7, 0.8, 0.9]
}

# Random forest grid: 3 x 3 x 3 = 27 candidate combinations.
param_grid_rf = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_leaf': [1, 5, 10]
}

In [135]:
# Registry consumed by the training loop: display name -> the base estimator
# and the hyperparameter grid searched for it. The display name also becomes
# part of the saved model's file name.
models = {
    "XGBoost": {
        # Log-loss evaluation metric; fixed seed for repeatable runs.
        "model": XGBClassifier(eval_metric='logloss', random_state=42),
        "param_grid": param_grid_xgb
    },
    "RandomForest": {
        # Fixed seed for repeatable runs; 'balanced' class weighting.
        "model": RandomForestClassifier(random_state=42, class_weight='balanced'),
        "param_grid": param_grid_rf
    }
}

### Training loop

In [136]:
# One summary record per (feature subset, model) pair.
results = []

# Numbers of cross-validation folds tried for every grid search.
cv_options = [5, 10]

for subset_name, subset in feature_subsets.items():
    print(f"Processing feature subset: {subset_name}")

    # The preprocessor only depends on the subset, so build it once per subset.
    preprocessor = build_preprocessor(subset)

    for model_name, model_cfg in models.items():
        print(f"\tTraining {model_name}...")

        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', model_cfg["model"])
        ])

        # Track the best search across all fold counts. Starting from -inf
        # keeps the comparison valid without assuming a non-negative metric.
        best_overall_score = float("-inf")
        best_overall_model = None
        best_overall_params = None
        best_cv = None

        for cv in cv_options:
            print(f"\t\tCV = {cv}")

            grid = GridSearchCV(
                pipeline,
                model_cfg["param_grid"],
                cv=cv,
                scoring='roc_auc',
                n_jobs=-1
            )

            grid.fit(X_train, y_train)

            # A NaN best_score_ (every candidate failed to fit) compares as
            # False here, so such a search can never be selected.
            if grid.best_score_ > best_overall_score:
                best_overall_score = grid.best_score_
                best_overall_model = grid.best_estimator_
                best_overall_params = grid.best_params_
                best_cv = cv

        # Fail loudly instead of persisting `None` as "<name>_cvNone.joblib"
        # and recording a meaningless row when no search produced a model.
        if best_overall_model is None:
            raise RuntimeError(
                f"No valid grid-search result for {model_name} on feature "
                f"subset '{subset_name}' (cv options tried: {cv_options})."
            )

        # Persist the refitted winner; the file name records which fold count won.
        model_filename = f"../models/{model_name}_{subset_name}_cv{best_cv}.joblib"
        joblib.dump(best_overall_model, model_filename)

        results.append({
            "model": model_name,
            "feature_subset": subset_name,
            "best_cv": best_cv,
            "cv_roc_auc": best_overall_score,
            "best_params": best_overall_params,
            "model_path": model_filename
        })

Processing feature subset: all
	Training XGBoost...
		CV = 5
		CV = 10
	Training RandomForest...
		CV = 5
		CV = 10
Processing feature subset: numerical
	Training XGBoost...
		CV = 5
		CV = 10
	Training RandomForest...
		CV = 5
		CV = 10
Processing feature subset: categorical
	Training XGBoost...
		CV = 5
		CV = 10
	Training RandomForest...
		CV = 5
		CV = 10
Processing feature subset: FS-1
	Training XGBoost...
		CV = 5
		CV = 10
	Training RandomForest...
		CV = 5
		CV = 10
Processing feature subset: FS-2
	Training XGBoost...
		CV = 5
		CV = 10
	Training RandomForest...
		CV = 5
		CV = 10
Processing feature subset: FS-3
	Training XGBoost...
		CV = 5
		CV = 10
	Training RandomForest...
		CV = 5
		CV = 10


In [137]:
# Rank the runs by cross-validated ROC AUC (best first), save the overview
# table, and show it as the cell output.
results_df = pd.DataFrame(results).sort_values("cv_roc_auc", ascending=False)
results_df.to_csv("../results/model_overview.csv", index=False)

results_df

Unnamed: 0,model,feature_subset,best_cv,cv_roc_auc,best_params,model_path
0,XGBoost,all,10,0.929082,"{'classifier__learning_rate': 0.03, 'classifie...",../models/XGBoost_all_cv10.joblib
1,RandomForest,all,10,0.927438,"{'classifier__max_depth': None, 'classifier__m...",../models/RandomForest_all_cv10.joblib
10,XGBoost,FS-3,10,0.917774,"{'classifier__learning_rate': 0.03, 'classifie...",../models/XGBoost_FS-3_cv10.joblib
7,RandomForest,FS-1,5,0.916203,"{'classifier__max_depth': 10, 'classifier__min...",../models/RandomForest_FS-1_cv5.joblib
6,XGBoost,FS-1,10,0.915968,"{'classifier__learning_rate': 0.03, 'classifie...",../models/XGBoost_FS-1_cv10.joblib
11,RandomForest,FS-3,5,0.913712,"{'classifier__max_depth': 5, 'classifier__min_...",../models/RandomForest_FS-3_cv5.joblib
8,XGBoost,FS-2,10,0.913466,"{'classifier__learning_rate': 0.03, 'classifie...",../models/XGBoost_FS-2_cv10.joblib
9,RandomForest,FS-2,10,0.91288,"{'classifier__max_depth': None, 'classifier__m...",../models/RandomForest_FS-2_cv10.joblib
4,XGBoost,categorical,10,0.909979,"{'classifier__learning_rate': 0.03, 'classifie...",../models/XGBoost_categorical_cv10.joblib
5,RandomForest,categorical,10,0.909072,"{'classifier__max_depth': 5, 'classifier__min_...",../models/RandomForest_categorical_cv10.joblib
