# In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# --- Load the data and encode features ------------------------------------
# get_dummies expands the non-numeric (object/category) columns into
# indicator columns; drop_first=True keeps k-1 indicators per k-level column.
df = pd.read_csv("../heart.csv")
df_encoded = pd.get_dummies(df, drop_first=True)

# "HeartDisease" is the prediction target; every other column is a feature.
y = df_encoded["HeartDisease"]
X = df_encoded.drop(columns="HeartDisease")

# --- Train / test split ---------------------------------------------------
# Hold out 20% of the rows for the final evaluation; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Hyperparameter search ------------------------------------------------
# Candidate values sampled by the randomized search below.
param_grid = {
    "n_estimators": [200, 300, 400],
    "max_depth": [10, 20, 30, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False],
}

rf = RandomForestClassifier(random_state=42)

# Try 20 sampled parameter combinations with 5-fold cross-validation,
# using all available CPU cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(X_train, y_train)

# The estimator with the best cross-validated score found by the search.
best_rf = rf_random.best_estimator_

# --- Persist the model and evaluate the persisted copy --------------------
model_path = "../heart_disease_model.pkl"

with open(model_path, "wb") as model_file:
    pickle.dump(best_rf, model_file)

# Reload from disk so the reported accuracy reflects the saved artifact,
# not just the in-memory estimator.
with open(model_path, "rb") as model_file:
    loaded_model = pickle.load(model_file)

preds = loaded_model.predict(X_test)
accuracy = accuracy_score(y_test, preds)
print("Final Tuned Model Accuracy:", accuracy)


# Output: Final Tuned Model Accuracy: 0.8695652173913043
