In [3]:
import pickle
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from scipy.stats import uniform, randint
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split



In [4]:
# Read the selected-feature table from disk and preview its first rows.
DATA_PATH = "data/heart_disease_selected_features.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,thal_reversable defect,oldpeak,thal_normal,cp_asymptomatic,chol,slope_upsloping,trestbps,cp_non-anginal pain,exang_no,sex_female,thalach,age,ca,num
0,False,2.3,False,False,233,False,145,False,True,False,150,63,0.0,0
1,False,1.5,True,True,286,False,160,False,False,False,108,67,3.0,1
2,True,2.6,False,True,229,False,120,False,False,False,129,67,2.0,1
3,False,3.5,True,False,250,False,130,True,True,False,187,37,0.0,0
4,False,1.4,True,False,204,True,130,False,True,True,172,41,0.0,0


In [5]:
# Separate the `num` target column from the predictor columns.
y = df["num"]
X = df.drop(columns=["num"])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same transformation
# to both partitions so the test rows never influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
# Pickle file of each previously saved estimator, keyed by its display name.
model_files = {
    "Logistic Regression": "models/Logistic Regression_model.pkl",
    "Decision Tree": "models/Decision Tree_model.pkl",
    "Random Forest": "models/Random Forest_model.pkl",
    "SVM": "models/SVM_model.pkl",
}


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, "rb") as handle:
        return pickle.load(handle)


loaded_models = {label: _load_pickle(path) for label, path in model_files.items()}

print("✅ Loaded models:", list(loaded_models))

✅ Loaded models: ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM']


In [7]:
# Hyperparameter search space per estimator.
# Lists are sampled as discrete choices; scipy distributions are sampled directly.
# uniform(loc, scale) covers [loc, loc + scale]; randint(low, high) excludes `high`.
logistic_space = {
    "C": uniform(0.01, 10),
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"],
}

tree_space = {
    "max_depth": randint(3, 15),
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 4),
}

forest_space = {
    "n_estimators": randint(100, 300),
    "max_depth": randint(3, 15),
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 4),
}

svm_space = {
    "C": uniform(0.1, 10),
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto"],
}

search_spaces = {
    "Logistic Regression": logistic_space,
    "Decision Tree": tree_space,
    "Random Forest": forest_space,
    "SVM": svm_space,
}

# Pair every loaded estimator with its search space under the same display name.
models_params = {
    label: {"model": loaded_models[label], "params": space}
    for label, space in search_spaces.items()
}

In [9]:
def _positive_class_scores(estimator, features):
    """Return per-row scores for `features` to feed into `roc_auc_score`.

    Uses the second column of `predict_proba` when the estimator exposes that
    method, and falls back to `decision_function` otherwise.
    """
    if hasattr(estimator, "predict_proba"):
        return estimator.predict_proba(features)[:, 1]
    return estimator.decision_function(features)


best_models = {}
results = []

for name, mp in models_params.items():
    base_model, search_space = mp["model"], mp["params"]
    print(f"\nRandomizedSearchCV for {name}...")

    # 20 sampled candidates, each scored by 5-fold cross-validated ROC-AUC.
    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=search_space,
        n_iter=20,
        cv=5,
        scoring="roc_auc",
        n_jobs=1,
        verbose=1,
        random_state=42,
    )
    search.fit(X_train, y_train)

    tuned_model = search.best_estimator_
    best_models[name] = tuned_model

    # RandomizedSearchCV fits clones of the estimator it receives, so `base_model`
    # is still the loaded, untuned estimator at this point.
    roc_auc_orig = roc_auc_score(y_test, _positive_class_scores(base_model, X_test))

    # Score the refitted best candidate on the same held-out rows.
    roc_auc_best = roc_auc_score(y_test, _positive_class_scores(tuned_model, X_test))

    results.append([name, roc_auc_orig, roc_auc_best, search.best_params_])
    print(f"Original ROC-AUC: {roc_auc_orig:.4f}, Tuned ROC-AUC: {roc_auc_best:.4f}")


RandomizedSearchCV for Logistic Regression...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Original ROC-AUC: 0.9479, Tuned ROC-AUC: 0.9572

RandomizedSearchCV for Decision Tree...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Original ROC-AUC: 0.8056, Tuned ROC-AUC: 0.8524

RandomizedSearchCV for Random Forest...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Original ROC-AUC: 0.9398, Tuned ROC-AUC: 0.9583

RandomizedSearchCV for SVM...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Original ROC-AUC: 0.9688, Tuned ROC-AUC: 0.9363


In [11]:
# Tabulate the per-model scores, with the highest tuned ROC-AUC in the first row.
column_names = ["Model", "ROC-AUC Original", "ROC-AUC Tuned", "Best Hyperparameters"]
results_df = (
    pd.DataFrame(results, columns=column_names)
    .sort_values("ROC-AUC Tuned", ascending=False)
)
print("\nComparison of Original vs Tuned Models:")
results_df


Comparison of Original vs Tuned Models:


Unnamed: 0,Model,ROC-AUC Original,ROC-AUC Tuned,Best Hyperparameters
2,Random Forest,0.939815,0.958333,"{'max_depth': 5, 'min_samples_leaf': 3, 'min_s..."
0,Logistic Regression,0.947917,0.957176,"{'C': 0.017787658410143285, 'penalty': 'l2', '..."
3,SVM,0.96875,0.936343,"{'C': 3.1424224295953773, 'gamma': 'auto', 'ke..."
1,Decision Tree,0.805556,0.852431,"{'max_depth': 14, 'min_samples_leaf': 2, 'min_..."


In [12]:
# `results_df` is sorted by tuned ROC-AUC in descending order, so its first row
# names the winning model.
# NOTE(review): that ranking is based on test-set ROC-AUC, so the winner's reported
# score is optimistic; consider ranking on the cross-validated score instead.
best_model_name = results_df["Model"].iloc[0]
best_model_final = best_models[best_model_name]

# The estimator was fitted on standardized features, so whoever loads this file
# must apply the same fitted scaler to the inputs before predicting.
with open("models/final_model.pkl", "wb") as model_file:
    pickle.dump(best_model_final, model_file)


In [13]:
# Persist the fitted scaler so the same standardization can be applied at inference time.
with open("models/scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)