# Model 3 — Random Forest Classifier

This notebook trains a Random Forest classifier with GridSearchCV, evaluates performance (accuracy, precision, recall, F1, ROC-AUC), plots confusion matrix and ROC curve, visualizes feature importance, saves the best model to `models/random_forest_best.pkl`, and appends metrics to a model comparison file.

In [None]:
# 1) Imports
import os
import time
import joblib
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             confusion_matrix, classification_report, roc_curve)

# Configure plots
sns.set(style='whitegrid')

# Create output directories (relative to the notebook's working directory).
# Forward slashes are used on purpose: they work on Windows as well, whereas a
# backslash literal such as '..\models' contains an invalid '\m' escape and, on
# POSIX systems, names a directory literally called '..\models' in the current
# folder instead of the repo-level models folder.
for out_dir in ('../outputs/models', '../outputs/metrics', '../models'):
    os.makedirs(out_dir, exist_ok=True)

print('✅ Imports and output directories ready')

In [None]:
# 2) Load preprocessed data
# The preprocessing notebook saved pickles to ../outputs/preprocessed/*.pkl
from pathlib import Path

data_dir = Path('../outputs/preprocessed')

# Names of the dataset splits this notebook needs; each one is expected at
# <data_dir>/<name>.pkl.
required = ['X_train', 'X_val', 'X_test', 'y_train', 'y_val', 'y_test']
candidates = {name: data_dir / f'{name}.pkl' for name in required}

loaded = {}
for name, path in candidates.items():
    if path.exists():
        # NOTE: joblib.load unpickles the file, so only load artifacts that were
        # produced by this project's own preprocessing step.
        loaded[name] = joblib.load(path)
        print(f'Loaded {name} from {path}')
    else:
        # No alternative location is searched; report the miss and keep going
        # so that every missing file is listed in a single run.
        print(f'Warning: {path} not found.')

# Quick check that required objects loaded
missing = [name for name in required if name not in loaded]
if missing:
    print('Some preprocessed datasets are missing:', missing)
    print('Please run the preprocessing notebook (02_data_preprocessing.ipynb) or move the pickles to ../outputs/preprocessed/')
else:
    # Bind the splits as notebook globals only when the full set is available;
    # the training cell checks for X_train in globals() before running.
    X_train = loaded['X_train']
    X_val = loaded['X_val']
    X_test = loaded['X_test']
    y_train = loaded['y_train']
    y_val = loaded['y_val']
    y_test = loaded['y_test']
    print(f'✅ Data shapes — X_train: {X_train.shape}, X_val: {X_val.shape}, X_test: {X_test.shape}')

In [None]:
# 3) Evaluation helper
def evaluate_model(model, X, y, labels=None, prefix='val'):
    """Evaluate a fitted classifier on (X, y) and save diagnostic plots.

    Computes accuracy, precision, recall, F1 and (when scores are available)
    ROC-AUC, then writes a confusion-matrix heatmap and a ROC curve as PNG
    files under ``../outputs/models/``.

    Args:
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba`` or ``decision_function``.
        X: Feature matrix passed to the model.
        y: True target values. Column 1 of ``predict_proba`` is used as the
            score, so a binary target is assumed.
        labels: Currently unused; kept so existing callers keep working.
        prefix: Tag used in plot titles and output file names
            (e.g. ``'validation'`` or ``'test'``).

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc`` (``None`` when it cannot be computed),
        ``classification_report`` (a nested dict), ``confusion_matrix_path``
        and ``roc_curve_path`` (``None`` when no ROC curve was drawn).
    """
    y_pred = model.predict(X)

    # Continuous scores for ROC analysis: prefer predict_proba, fall back to
    # decision_function. This is best effort; a failure only disables ROC output.
    y_proba = None
    try:
        if hasattr(model, 'predict_proba'):
            y_proba = model.predict_proba(X)[:, 1]
        elif hasattr(model, 'decision_function'):
            y_proba = model.decision_function(X)
    except Exception as exc:
        print(f'Warning: could not obtain prediction scores for {prefix}: {exc}')
        y_proba = None

    metrics = {
        'accuracy': float(accuracy_score(y, y_pred)),
        'precision': float(precision_score(y, y_pred, zero_division=0)),
        'recall': float(recall_score(y, y_pred, zero_division=0)),
        'f1': float(f1_score(y, y_pred, zero_division=0)),
    }

    roc_auc = None
    if y_proba is not None:
        try:
            roc_auc = float(roc_auc_score(y, y_proba))
        except Exception as exc:
            # Kept non-fatal so the remaining metrics are still reported.
            print(f'Warning: could not compute ROC-AUC for {prefix}: {exc}')
    metrics['roc_auc'] = roc_auc

    # Confusion matrix plot
    cm = confusion_matrix(y, y_pred)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix — {prefix}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    cm_path = f'../outputs/models/confusion_matrix_{prefix}.png'
    plt.savefig(cm_path, dpi=200, bbox_inches='tight')
    plt.close()

    # ROC curve
    roc_path = None
    if y_proba is not None:
        fpr, tpr, _ = roc_curve(y, y_proba)
        # Compare against None explicitly so a legitimate AUC of 0.0 is still
        # shown in the legend.
        curve_label = f'AUC = {roc_auc:.3f}' if roc_auc is not None else 'ROC'
        plt.figure(figsize=(6, 5))
        plt.plot(fpr, tpr, label=curve_label)
        plt.plot([0, 1], [0, 1], 'k--', alpha=0.6)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve — {prefix}')
        plt.legend(loc='lower right')
        roc_path = f'../outputs/models/roc_curve_{prefix}.png'
        plt.savefig(roc_path, dpi=200, bbox_inches='tight')
        plt.close()

    # Classification report as a nested dict (output_dict=True), so it can be
    # written to JSON alongside the scalar metrics.
    metrics['classification_report'] = classification_report(y, y_pred, zero_division=0, output_dict=True)
    metrics['confusion_matrix_path'] = cm_path
    metrics['roc_curve_path'] = roc_path

    return metrics

print('✅ Evaluation helper defined')

In [None]:
# 4) Parameter grid
# Search space handed to GridSearchCV: every combination of the values below
# is tried (3 * 4 * 3 * 2 = 72 candidate settings).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    max_features=['sqrt', 'log2'],
)

print('✅ Parameter grid ready')

In [None]:
# 5-8) GridSearchCV and training on training set, evaluation on validation set
def _print_scalar_metrics(title, metrics):
    """Print the scalar entries of a metrics dict, skipping the report and file paths."""
    skipped = {'classification_report', 'confusion_matrix_path', 'roc_curve_path'}
    print(title)
    for key, value in metrics.items():
        if key not in skipped:
            print(f'  {key}: {value}')


def _json_default(obj):
    """Convert NumPy scalars and arrays to plain Python objects for json.dump."""
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')


if 'X_train' in globals():
    rf = RandomForestClassifier(random_state=42, n_jobs=-1, oob_score=True)
    grid = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

    print('Starting GridSearchCV (this may take some time) ...')
    start = time.time()
    grid.fit(X_train, y_train)
    end = time.time()
    print(f'GridSearchCV done. Time elapsed: {(end-start)/60:.2f} minutes')

    print('Best parameters:')
    print(grid.best_params_)
    print(f'Best CV score: {grid.best_score_:.4f}')

    best_model = grid.best_estimator_

    # Evaluate on validation set
    val_metrics = evaluate_model(best_model, X_val, y_val, prefix='validation')
    _print_scalar_metrics('Validation metrics:', val_metrics)

    # Evaluate on test set as well (final evaluation)
    test_metrics = evaluate_model(best_model, X_test, y_test, prefix='test')
    _print_scalar_metrics('Test metrics:', test_metrics)

    # 9) Feature importance (top 20)
    # fi_path stays None unless the plot is actually written, so the results
    # file never points at a plot that does not exist.
    fi_path = None
    try:
        importances = pd.Series(best_model.feature_importances_, index=X_train.columns)
        top20 = importances.sort_values(ascending=False).head(20)
        plt.figure(figsize=(10,8))
        sns.barplot(x=top20.values, y=top20.index, palette='viridis')
        plt.title('Top 20 Feature Importances')
        plt.xlabel('Importance')
        fi_target = '../outputs/models/feature_importances_top20.png'
        plt.tight_layout()
        plt.savefig(fi_target, dpi=200, bbox_inches='tight')
        plt.close()
        fi_path = fi_target
        print(f'Feature importance plot saved to {fi_path}')
    except Exception as e:
        # Best effort: e.g. X_train may be an array without .columns.
        print('Could not compute feature importances:', e)

    # OOB score if available
    oob = getattr(best_model, 'oob_score_', None)
    print(f'OOB score: {oob}')

    # 10) Save the trained model
    # Saved to the repo-level models folder; the directory is created here so
    # the dump cannot fail on a missing folder after a long grid search.
    model_path = '../models/random_forest_best.pkl'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(best_model, model_path)
    print(f'Best model saved to {model_path}')

    # 11) Append metrics to global results JSON for model comparison
    results_path = '../outputs/models/model_comparison.json'
    all_results = {}
    if os.path.exists(results_path):
        try:
            with open(results_path, 'r', encoding='utf-8') as f:
                existing = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError is a ValueError. Say so explicitly, because
            # the file is rewritten below and earlier entries will be lost.
            print(f'Warning: could not read {results_path} ({e}); starting a new results file')
        else:
            if isinstance(existing, dict):
                all_results = existing
            else:
                print(f'Warning: {results_path} does not contain a JSON object; starting a new results file')

    model_name = 'RandomForest_GridSearch'
    all_results[model_name] = {
        'best_params': grid.best_params_,
        'best_cv_score': float(grid.best_score_),
        'validation_metrics': val_metrics,
        'test_metrics': test_metrics,
        'oob_score': oob,
        'saved_model_path': model_path,
        'feature_importance_plot': fi_path,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
    }

    with open(results_path, 'w', encoding='utf-8') as f:
        # default= only handles NumPy values that may appear in the metrics;
        # anything else still raises TypeError.
        json.dump(all_results, f, indent=2, default=_json_default)

    print(f'Metrics appended to {results_path}')
else:
    print('Preprocessed data not loaded; cannot run training. Please run preprocessing first.')

## Notes / Explanations

### Why Random Forest?
Random Forests are robust, handle mixed data types, resist overfitting through ensembling, provide feature importance measures, and generally perform well out-of-the-box for tabular health data.

### Hyperparameters tuned
- n_estimators: Number of trees in the forest. More trees can improve performance but increase training time.
- max_depth: Maximum depth of each tree. Controls model complexity; `None` means nodes are expanded until all leaves are pure or contain fewer than `min_samples_split` samples.
- min_samples_split: Minimum number of samples required to split an internal node. Larger values prevent small splits and can reduce overfitting.
- max_features: Number of features to consider when looking for the best split. 'sqrt' and 'log2' are common choices that reduce correlation between trees.

### Insights from Feature Importance
The top features (shown in the saved plot) indicate which clinical, cognitive, or lifestyle variables the model relies on most. These insights can guide feature selection and clinical interpretation.

---
*Next steps:* review the generated `model_comparison.json`, inspect the saved plots in `../outputs/models/`, and, if desired, run a lighter grid (fewer combinations) for faster experimentation or use RandomizedSearchCV for larger hyperparameter spaces.