In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold


In [14]:
import pandas as pd

df = pd.read_csv("outputs/cleaned_heart.csv")
target_col = "target"   # or the exact name in your dataset

X = df.drop(columns=[target_col])
y = df[target_col].astype(int)


In [17]:
df.dtypes


age                                int64
sex                               object
chest_pain_type                   object
resting_blood_pressure             int64
cholestoral                        int64
fasting_blood_sugar               object
rest_ecg                          object
Max_heart_rate                     int64
exercise_induced_angina           object
oldpeak                          float64
slope                             object
vessels_colored_by_flourosopy     object
thalassemia                       object
target                             int64
dtype: object

In [18]:
categorical_cols = df.select_dtypes(include=['object']).columns
print("Categorical:", categorical_cols.tolist())

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


Categorical: ['sex', 'chest_pain_type', 'fasting_blood_sugar', 'rest_ecg', 'exercise_induced_angina', 'slope', 'vessels_colored_by_flourosopy', 'thalassemia']


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

target_col = "target"  # or the exact label name
X = df.drop(columns=[target_col])
y = df[target_col].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)


In [20]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [21]:
param_grid = {
    'n_estimators': [100, 200, 300],   # number of trees
    'max_depth': [None, 5, 10, 15],    # depth of trees
    'min_samples_split': [2, 5, 10]    # min samples to split
}


In [22]:
rf = RandomForestClassifier(random_state=42)

grid = GridSearchCV(
    rf,
    param_grid,
    scoring='recall',   # focus on recall (catching positives)
    cv=cv,
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best CV recall:", grid.best_score_)


Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best CV recall: 0.9857142857142858


In [23]:
best_rf = grid.best_estimator_
from sklearn.metrics import classification_report

y_pred = best_rf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       100
           1       1.00      1.00      1.00       105

    accuracy                           1.00       205
   macro avg       1.00      1.00      1.00       205
weighted avg       1.00      1.00      1.00       205



In [24]:
best_rf = grid.best_estimator_
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report

y_pred = best_rf.predict(X_test)
print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, best_rf.predict_proba(X_test)[:,1]))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       100
           1       1.00      1.00      1.00       105

    accuracy                           1.00       205
   macro avg       1.00      1.00      1.00       205
weighted avg       1.00      1.00      1.00       205

AUC: 1.0


In [25]:
from sklearn.metrics import precision_recall_curve

probs = best_rf.predict_proba(X_test)[:,1]
prec, rec, thresh = precision_recall_curve(y_test, probs)

# choose threshold where recall ~0.9 (example)
import numpy as np
idx = np.where(rec >= 0.90)[0][-1]
best_thresh = thresh[idx]
print("Chosen threshold:", best_thresh)

y_pred_adj = (probs >= best_thresh).astype(int)
print(classification_report(y_test, y_pred_adj))


Chosen threshold: 0.83
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       100
           1       1.00      0.91      0.96       105

    accuracy                           0.96       205
   macro avg       0.96      0.96      0.96       205
weighted avg       0.96      0.96      0.96       205



In [26]:
import joblib
joblib.dump(best_rf, "outputs/rf_model.joblib")
joblib.dump(scaler, "outputs/scaler.joblib")
joblib.dump(X.columns.tolist(), "outputs/columns.joblib")

['outputs/columns.joblib']

In [31]:
import pandas as pd

df = pd.read_csv("outputs/cleaned_heart.csv")
print(train_df.columns.tolist())


NameError: name 'train_df' is not defined

In [30]:
print(df.columns.tolist())


['age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholestoral', 'fasting_blood_sugar', 'rest_ecg', 'Max_heart_rate', 'exercise_induced_angina', 'oldpeak', 'slope', 'vessels_colored_by_flourosopy', 'thalassemia', 'target']
