<a href="https://colab.research.google.com/github/jason-0512/jason-0512-BMCS2203-ARTIFICIAL-INTELLIGENCE---Heart-Failure-Death-Prediction-Prototype/blob/main/RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install pandas numpy scikit-learn matplotlib seaborn joblib

Import libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_score, recall_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

Step 1: Load dataset

In [None]:
# Load the heart-failure clinical records. The path is relative, so the CSV
# must be reachable from the notebook's working directory.
data = pd.read_csv('heart_failure_clinical_records.csv')

# Quick health check before modelling: size, gaps, target balance, value ranges.
print("Dataset Shape:", data.shape)

missing_per_column = data.isna().sum()
print("\nMissing Values:\n", missing_per_column)

# normalize=True reports each DEATH_EVENT value as a fraction of all rows.
death_event_share = data['DEATH_EVENT'].value_counts(normalize=True)
print("\nClass Balance:\n", death_event_share)

print("\nSummary Statistics:\n", data.describe())

Step 2: Data Preprocessing
Handle missing values and define features and target variables. Exclude the 'time' column from features.

In [5]:
# Impute any gaps with the per-column median.
# - numeric_only=True: without it, DataFrame.median() raises on non-numeric
#   columns in pandas >= 2.0; non-numeric columns are simply left untouched.
# - Re-binding `data` (instead of inplace=True) avoids mutating the loaded
#   frame in place.
if data.isna().any().any():
    data = data.fillna(data.median(numeric_only=True))

# 'DEATH_EVENT' is the prediction target; 'time' is deliberately kept out of
# the feature matrix (see the step description above).
X = data.drop(columns=['DEATH_EVENT', 'time'])
y = data['DEATH_EVENT']

# Column labels are kept so feature importances can be reported by name later.
feature_names = X.columns

Step 3: Stratified 10-Fold Cross-Validation
Train a Random Forest model in each fold, compute metrics, feature importance, and ROC curve data.

In [6]:
# Shuffled, stratified 10-fold split; the fixed seed makes the folds reproducible.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold collectors: every iteration appends exactly one entry to each list.
accuracies, roc_aucs = [], []
conf_matrices, class_reports = [], []
feature_importances = []
fpr_list, tpr_list = [], []

for train_index, test_index in skf.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # A fresh forest is fitted for every fold, using the training rows only.
    rf_model = RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        class_weight='balanced',
        random_state=42,
    )
    rf_model.fit(X_train, y_train)

    # Hard labels feed the threshold-based metrics; the second predict_proba
    # column is used as the score for the ROC / AUC computations.
    y_pred = rf_model.predict(X_test)
    y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

    accuracies.append(accuracy_score(y_test, y_pred))
    roc_aucs.append(roc_auc_score(y_test, y_pred_proba))
    conf_matrices.append(confusion_matrix(y_test, y_pred))
    class_reports.append(
        classification_report(
            y_test,
            y_pred,
            target_names=['Survived', 'Died'],
            output_dict=True,  # dict form so the reports can be averaged later
        )
    )
    feature_importances.append(rf_model.feature_importances_)
    fpr_list.append(fpr)
    tpr_list.append(tpr)


Step 4: Train Final Random Forest on Full Dataset
Train a Random Forest with the chosen hyperparameters on the entire dataset. This is the model that is evaluated in Step 6 and saved to disk at the end of the notebook.

In [None]:
# Final estimator: same hyperparameters as the per-fold models used in
# cross-validation, but fitted on every available row.
final_model = RandomForestClassifier(
    max_depth=5,
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
final_model.fit(X, y)

Step 5: Compute Average Metrics Across Folds
Compute average accuracy, ROC-AUC, classification report, and confusion matrix from cross-validation.

In [None]:
# Mean and spread of the per-fold scores (np.std uses its default ddof=0).
average_accuracy, std_accuracy = np.mean(accuracies), np.std(accuracies)
average_roc_auc, std_roc_auc = np.mean(roc_aucs), np.std(roc_aucs)

print(f"Average Accuracy: {average_accuracy:.2f} (±{std_accuracy:.2f})")
print(f"Average ROC-AUC: {average_roc_auc:.2f} (±{std_roc_auc:.2f})")

# Unweighted mean over folds of every per-class entry in the classification
# reports. Each fold contributes value / n_reports, accumulated in fold order.
classes = ('Survived', 'Died')
metrics = ('precision', 'recall', 'f1-score', 'support')
n_reports = len(class_reports)
avg_report = {cls: dict.fromkeys(metrics, 0) for cls in classes}
for report in class_reports:
    for cls, per_class in avg_report.items():
        for metric in metrics:
            per_class[metric] += report[cls][metric] / n_reports

print("\nAverage Classification Report:")
print("              precision    recall  f1-score   support")
for cls, row in avg_report.items():
    print(f"{cls:<12} {row['precision']:.2f}      {row['recall']:.2f}      {row['f1-score']:.2f}      {row['support']:.1f}")

# Element-wise mean of the per-fold confusion matrices; rounded to whole
# counts for the printout only (average_cm itself keeps the fractions).
average_cm = np.mean(conf_matrices, axis=0)
print("\nAverage Confusion Matrix:")
print(np.round(average_cm).astype(int))

# Mean importance of each feature across the fold models, largest first.
mean_importances = np.mean(feature_importances, axis=0)
average_feature_importance = pd.DataFrame(
    {'Feature': feature_names, 'Importance': mean_importances}
).sort_values(by='Importance', ascending=False)
print("\nAverage Feature Importance:")
print(average_feature_importance.round(6))

Step 6: Final Evaluation on Full Dataset
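Evaluate the trained Random Forest on the full dataset and display metrics. Because the model was fitted on these same rows, the scores are in-sample and therefore optimistic; the cross-validated averages from Step 5 are the better estimate of performance on unseen patients.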

In [None]:
# NOTE: final_model was fitted on this same X / y, so everything below is an
# in-sample score, not an estimate of performance on unseen data.
y_pred = final_model.predict(X)
y_pred_proba = final_model.predict_proba(X)[:, 1]

# Label-based metrics share the (y_true, y_pred) call shape; ROC-AUC needs the
# probability scores instead of the hard labels.
accuracy, precision, recall, f1 = (
    scorer(y, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y, y_pred_proba)

print("\nFinal Evaluation on Full Dataset:")
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
    ("ROC-AUC", roc_auc),
):
    print(f"{label}: {value:.3f}")

Step 7: Visualization

In [None]:
# --- Mean ROC curve across the cross-validation folds -----------------------
# Every fold's curve lives on its own FPR grid, so each one is resampled onto
# a shared grid before the point-wise average is taken.
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)
for fpr, tpr in zip(fpr_list, tpr_list):
    fold_tpr = np.interp(mean_fpr, fpr, tpr)
    # roc_curve may return several points with FPR == 0; np.interp then yields
    # a non-zero TPR at x = 0. Pin the start so each curve begins at (0, 0).
    fold_tpr[0] = 0.0
    mean_tpr += fold_tpr
mean_tpr /= len(fpr_list)
mean_tpr[-1] = 1.0  # a ROC curve always ends at (1, 1)

fig, ax = plt.subplots(figsize=(6, 4))
# The legend reports the mean of the per-fold AUCs computed earlier.
ax.plot(mean_fpr, mean_tpr, label=f'Mean ROC (AUC = {average_roc_auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Average ROC Curve',
)
ax.legend(loc='lower right')
plt.show()

# --- Mean confusion matrix ---------------------------------------------------
# Cells are fold averages, hence the fractional '.2f' annotation format.
class_labels = ['Survived', 'Died']
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    average_cm,
    annot=True,
    fmt='.2f',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set(title='Average Confusion Matrix', xlabel='Predicted', ylabel='Actual')
plt.show()

# --- Mean feature importance -------------------------------------------------
# average_feature_importance is already sorted, so bars appear largest first.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=average_feature_importance, ax=ax)
ax.set_title('Average Feature Importance')
plt.show()

In [11]:
# Persist the model fitted on the full dataset. The file is pickle-based, so
# it should only ever be loaded back from a trusted location.
joblib.dump(value=final_model, filename='random_forest_model.pkl')

['random_forest_model.pkl']