# 6. Algorithm Comparison with Loss Curves & Training Visualization

## Objective
Compare the top 4 performing algorithms:
1. **Gradient Boosting** - 100% Accuracy
2. **Stacking** - 100% Accuracy
3. **XGBoost** - 99.57% Accuracy
4. **Random Forest** - 99.71% Accuracy

Add loss curves and training history for each model.

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
import xgboost as xgb
import joblib

# Global plot appearance: seaborn dark grid, larger default figure and font
sns.set_style('darkgrid')
plt.rcParams.update({
    'figure.figsize': (15, 10),
    'font.size': 11,
})

print("‚úì Libraries imported successfully")

## Step 1: Load and Prepare Data

In [None]:
# Read the preprocessed dataset from disk
df = pd.read_csv('data/preprocessed_data.csv')

# 'Osteoporosis_Risk' is the label; every remaining column is used as a feature
y = df['Osteoporosis_Risk']
X = df.drop(columns=['Osteoporosis_Risk'])

# Stratified 80/20 hold-out split with a fixed seed so runs are repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"‚úì Data loaded")
print(f"  Training set: {X_train.shape}")
print(f"  Test set: {X_test.shape}")
print(f"  Features: {X.shape[1]}")

## Step 2: Algorithm 1 – Gradient Boosting with Loss Curves

In [None]:
print("\n" + "="*60)
print("ALGORITHM 1: GRADIENT BOOSTING")
print("="*60)


def _binary_log_loss(y_true, pos_proba, eps=1e-15):
    """Return the mean binary cross-entropy of positive-class probabilities.

    ``eps`` is added inside each logarithm so that a probability of exactly
    0 or 1 does not evaluate ``log(0)``.
    """
    return -np.mean(
        y_true * np.log(pos_proba + eps)
        + (1 - y_true) * np.log(1 - pos_proba + eps)
    )


# Configure Gradient Boosting.
# validation_fraction + n_iter_no_change turn on sklearn's built-in early
# stopping, so fewer than n_estimators trees may actually be fitted.
gb_model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=2,
    subsample=0.8,
    validation_fraction=0.1,
    n_iter_no_change=10,
    random_state=42,
    verbose=0
)

# Train model
print("\n[Training Gradient Boosting...]")
gb_model.fit(X_train, y_train)

# Predictions on the hold-out set (hard labels and positive-class probability)
gb_pred = gb_model.predict(X_test)
gb_pred_proba = gb_model.predict_proba(X_test)[:, 1]

# Metrics
gb_accuracy = accuracy_score(y_test, gb_pred)
gb_auc = roc_auc_score(y_test, gb_pred_proba)

print(f"\n‚úì Gradient Boosting Results:")
print(f"  Accuracy: {gb_accuracy:.6f} ({gb_accuracy*100:.2f}%)")
print(f"  ROC-AUC: {gb_auc:.6f}")
print(f"  Trees trained: {len(gb_model.estimators_)}")

# Per-stage training loss.
# staged_predict_proba() is a generator that yields one probability array
# per boosting stage, so the positive-class column has to be selected from
# each yielded array -- the generator itself cannot be subscripted.
gb_train_loss = [
    _binary_log_loss(y_train, stage_proba[:, 1])
    for stage_proba in gb_model.staged_predict_proba(X_train)
]

# NOTE(review): this "validation" curve is evaluated on the hold-out test
# set, not on the internal early-stopping validation split.
gb_val_loss = [
    _binary_log_loss(y_test, stage_proba[:, 1])
    for stage_proba in gb_model.staged_predict_proba(X_test)
]

print(f"  Initial Training Loss: {gb_train_loss[0]:.6f}")
print(f"  Final Training Loss: {gb_train_loss[-1]:.6f}")
print(f"  Initial Validation Loss: {gb_val_loss[0]:.6f}")
print(f"  Final Validation Loss: {gb_val_loss[-1]:.6f}")

## Step 3: Algorithm 2 – XGBoost with Loss Curves

In [None]:
print("\n" + "="*60)
print("ALGORITHM 2: XGBOOST")
print("="*60)

# XGBoost classifier; eval_metric='logloss' makes fit() record log loss for
# every entry of eval_set at each boosting round
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    reg_alpha=0.5,
    reg_lambda=1.0,
    random_state=42,
    verbosity=0,
    device='cpu',
)

# Fit while tracking loss on the training set (first entry) and the
# hold-out set (second entry)
print("\n[Training XGBoost with loss tracking...]")
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=False,
)

# Hold-out predictions: positive-class probability and hard labels
xgb_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
xgb_pred = xgb_model.predict(X_test)

# Hold-out metrics
xgb_auc = roc_auc_score(y_test, xgb_pred_proba)
xgb_accuracy = accuracy_score(y_test, xgb_pred)

print(f"\n‚úì XGBoost Results:")
print(f"  Accuracy: {xgb_accuracy:.6f} ({xgb_accuracy*100:.2f}%)")
print(f"  ROC-AUC: {xgb_auc:.6f}")
print(f"  Trees trained: {xgb_model.n_estimators}")

# Recorded loss history: 'validation_0' / 'validation_1' follow the order of
# the eval_set entries passed to fit() above
xgb_results = xgb_model.evals_result()
xgb_train_loss, xgb_val_loss = (
    xgb_results[eval_name]['logloss']
    for eval_name in ('validation_0', 'validation_1')
)

print(f"  Initial Training Loss: {xgb_train_loss[0]:.6f}")
print(f"  Final Training Loss: {xgb_train_loss[-1]:.6f}")
print(f"  Initial Validation Loss: {xgb_val_loss[0]:.6f}")
print(f"  Final Validation Loss: {xgb_val_loss[-1]:.6f}")

## Step 4: Algorithm 3 – Random Forest with Loss Curves

In [None]:
print("\n" + "="*60)
print("ALGORITHM 3: RANDOM FOREST")
print("="*60)

# Configure Random Forest
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True,
    warm_start=False,
    random_state=42,
    n_jobs=-1,
    verbose=0
)

# Train model
print("\n[Training Random Forest...]")
rf_model.fit(X_train, y_train)

# Predictions on the hold-out set (hard labels and positive-class probability)
rf_pred = rf_model.predict(X_test)
rf_pred_proba = rf_model.predict_proba(X_test)[:, 1]

# Metrics
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_auc = roc_auc_score(y_test, rf_pred_proba)

print(f"\n‚úì Random Forest Results:")
print(f"  Accuracy: {rf_accuracy:.6f} ({rf_accuracy*100:.2f}%)")
print(f"  ROC-AUC: {rf_auc:.6f}")
print(f"  Trees trained: {rf_model.n_estimators}")

# Loss as a function of forest size.
# A single warm-started forest is grown one tree at a time instead of
# retraining a brand-new forest for every size, which cuts the work from
# 1 + 2 + ... + N tree fits down to N.
# NOTE(review): with a fixed integer random_state the warm-started forest is
# expected to contain the same trees as a fresh fit of that size -- confirm
# against the installed scikit-learn version if exact parity matters.
rf_oob_loss = []
rf_test_loss = []

total_trees = rf_model.n_estimators
rf_partial = RandomForestClassifier(
    n_estimators=1,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True,
    oob_score=True,
    warm_start=True,
    random_state=42,
    n_jobs=-1
)

for i in range(1, total_trees + 1):
    # With warm_start=True, raising n_estimators and refitting only adds the
    # missing trees to the existing ensemble
    rf_partial.set_params(n_estimators=i)
    rf_partial.fit(X_train, y_train)

    # OOB "loss" is the out-of-bag error rate (1 - OOB accuracy); it is NOT a
    # log loss, so it is not on the same scale as rf_test_loss below
    oob_loss = 1 - rf_partial.oob_score_
    rf_oob_loss.append(oob_loss)

    # Log loss on the hold-out set; 1e-15 keeps log() away from zero
    test_pred_proba = rf_partial.predict_proba(X_test)[:, 1]
    test_loss = -np.mean(y_test * np.log(test_pred_proba + 1e-15) + (1 - y_test) * np.log(1 - test_pred_proba + 1e-15))
    rf_test_loss.append(test_loss)

    if i % 50 == 0:
        print(f"    Trained {i}/{total_trees} trees...")

print(f"  OOB Loss (initial): {rf_oob_loss[0]:.6f}")
print(f"  OOB Loss (final): {rf_oob_loss[-1]:.6f}")
print(f"  Test Loss (initial): {rf_test_loss[0]:.6f}")
print(f"  Test Loss (final): {rf_test_loss[-1]:.6f}")

## Step 5: Algorithm 4 – Stacking with Final Loss Values

In [None]:
print("\n" + "="*60)
print("ALGORITHM 4: STACKING")
print("="*60)

# Define base learners.
# XGBClassifier's logging switch is `verbosity`; `verbose` is not one of its
# estimator parameters and would be passed through as an unused booster
# argument.
base_learners = [
    ('xgb', xgb.XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=5, random_state=42, verbosity=0)),
    ('gb', GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, max_depth=5, random_state=42, verbose=0)),
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1, verbose=0))
]

# Meta-learner that combines the base learners' outputs
meta_learner = LogisticRegression(random_state=42, max_iter=1000, verbose=0)

# Create stacking classifier
stacking_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5
)

# Train model
print("\n[Training Stacking (5-fold CV)...]")
stacking_model.fit(X_train, y_train)

# Predictions on the hold-out set (hard labels and positive-class probability)
stacking_pred = stacking_model.predict(X_test)
stacking_pred_proba = stacking_model.predict_proba(X_test)[:, 1]

# Metrics
stacking_accuracy = accuracy_score(y_test, stacking_pred)
stacking_auc = roc_auc_score(y_test, stacking_pred_proba)

print(f"\n‚úì Stacking Results:")
print(f"  Accuracy: {stacking_accuracy:.6f} ({stacking_accuracy*100:.2f}%)")
print(f"  ROC-AUC: {stacking_auc:.6f}")
print(f"  Base learners: {len(base_learners)}")
# Read the fold count back from the model so the message cannot drift from
# the configuration above
print(f"  CV folds: {stacking_model.cv}")

# Stacking has no per-iteration history, so only a single train and a single
# test log loss are computed (1e-15 keeps log() away from zero)
stacking_train_pred_proba = stacking_model.predict_proba(X_train)[:, 1]
stacking_train_loss = -np.mean(y_train * np.log(stacking_train_pred_proba + 1e-15) + 
                                 (1 - y_train) * np.log(1 - stacking_train_pred_proba + 1e-15))
stacking_test_loss = -np.mean(y_test * np.log(stacking_pred_proba + 1e-15) + 
                                (1 - y_test) * np.log(1 - stacking_pred_proba + 1e-15))

print(f"  Training Loss: {stacking_train_loss:.6f}")
print(f"  Test Loss: {stacking_test_loss:.6f}")

## Step 6: Loss Curves Comparison

In [None]:
# 2x2 figure: one loss-curve panel per iterative model plus a bar chart that
# compares the final test loss of all four models
from pathlib import Path

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Loss Curves for Top 4 Algorithms', fontsize=18, fontweight='bold', y=1.00)

# ============ GRADIENT BOOSTING ============
ax = axes[0, 0]
iterations = range(1, len(gb_train_loss) + 1)
ax.plot(iterations, gb_train_loss, label='Training Loss', linewidth=2.5, color='#2E86AB', marker='o', markersize=3, markevery=10)
ax.plot(iterations, gb_val_loss, label='Validation Loss', linewidth=2.5, color='#A23B72', marker='s', markersize=3, markevery=10)
ax.set_xlabel('Iteration (Tree Number)', fontsize=11, fontweight='bold')
ax.set_ylabel('Log Loss', fontsize=11, fontweight='bold')
ax.set_title(f'Gradient Boosting\nAccuracy: {gb_accuracy*100:.2f}% | AUC: {gb_auc:.4f}', fontsize=12, fontweight='bold')
ax.legend(fontsize=10, loc='best')
ax.grid(True, alpha=0.3)
# Pad the y-range by 5% around the combined min/max of both curves
ax.set_ylim([min(gb_train_loss + gb_val_loss) * 0.95, max(gb_train_loss + gb_val_loss) * 1.05])

# ============ XGBOOST ============
ax = axes[0, 1]
iterations_xgb = range(1, len(xgb_train_loss) + 1)
ax.plot(iterations_xgb, xgb_train_loss, label='Training Loss', linewidth=2.5, color='#F18F01', marker='o', markersize=3, markevery=10)
ax.plot(iterations_xgb, xgb_val_loss, label='Validation Loss', linewidth=2.5, color='#C73E1D', marker='s', markersize=3, markevery=10)
ax.set_xlabel('Iteration (Tree Number)', fontsize=11, fontweight='bold')
ax.set_ylabel('Log Loss', fontsize=11, fontweight='bold')
ax.set_title(f'XGBoost\nAccuracy: {xgb_accuracy*100:.2f}% | AUC: {xgb_auc:.4f}', fontsize=12, fontweight='bold')
ax.legend(fontsize=10, loc='best')
ax.grid(True, alpha=0.3)
ax.set_ylim([min(xgb_train_loss + xgb_val_loss) * 0.95, max(xgb_train_loss + xgb_val_loss) * 1.05])

# ============ RANDOM FOREST ============
# NOTE(review): rf_oob_loss holds 1 - OOB accuracy (an error rate) while
# rf_test_loss is a log loss, so the two curves share an axis but not a
# metric -- confirm whether the 'Log Loss' axis label is what is wanted.
ax = axes[1, 0]
iterations_rf = range(1, len(rf_oob_loss) + 1)
ax.plot(iterations_rf, rf_oob_loss, label='OOB Loss (Training)', linewidth=2.5, color='#06A77D', marker='o', markersize=3, markevery=10)
ax.plot(iterations_rf, rf_test_loss, label='Test Loss', linewidth=2.5, color='#D62828', marker='s', markersize=3, markevery=10)
ax.set_xlabel('Number of Trees', fontsize=11, fontweight='bold')
ax.set_ylabel('Log Loss', fontsize=11, fontweight='bold')
ax.set_title(f'Random Forest\nAccuracy: {rf_accuracy*100:.2f}% | AUC: {rf_auc:.4f}', fontsize=12, fontweight='bold')
ax.legend(fontsize=10, loc='best')
ax.grid(True, alpha=0.3)
ax.set_ylim([min(rf_oob_loss + rf_test_loss) * 0.95, max(rf_oob_loss + rf_test_loss) * 1.05])

# ============ STACKING ============
# Stacking has no per-iteration curve, so the last panel compares the final
# test loss of every model instead.
ax = axes[1, 1]
algorithms = ['Gradient\nBoosting', 'XGBoost', 'Random\nForest', 'Stacking']
# Gradient Boosting's hold-out curve is stored in gb_val_loss (there is no
# gb_test_loss variable).
losses = [gb_val_loss[-1], xgb_val_loss[-1], rf_test_loss[-1], stacking_test_loss]
accuracies = [gb_accuracy*100, xgb_accuracy*100, rf_accuracy*100, stacking_accuracy*100]

colors_bar = ['#2E86AB', '#F18F01', '#06A77D', '#9D4EDD']
bars = ax.bar(algorithms, losses, color=colors_bar, alpha=0.8, edgecolor='black', linewidth=1.5)

# Annotate each loss bar with the corresponding model's accuracy
for bar, acc in zip(bars, accuracies):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{acc:.2f}%',
            ha='center', va='bottom', fontweight='bold', fontsize=10)

ax.set_ylabel('Test Loss', fontsize=11, fontweight='bold')
ax.set_title(f'Final Test Loss Comparison\nStacking: {stacking_accuracy*100:.2f}% | AUC: {stacking_auc:.4f}', fontsize=12, fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
# savefig() does not create missing directories, so make sure the target exists
Path('figures').mkdir(parents=True, exist_ok=True)
plt.savefig('figures/algorithm_loss_curves_comparison.png', dpi=300, bbox_inches='tight')
print("\n‚úì Loss curves comparison saved to: figures/algorithm_loss_curves_comparison.png")
plt.show()

## Step 7: Individual Detailed Loss Curves

In [None]:
# Six-panel summary figure laid out on a 3x2 grid
fig = plt.figure(figsize=(18, 14))
gs = fig.add_gridspec(3, 2, hspace=0.35, wspace=0.3)

# ---- Per-model loss progression (Gradient Boosting, XGBoost, Random Forest) ----
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])
ax3 = fig.add_subplot(gs[1, 0])

# (axis, first curve, second curve, line colors, legend labels, x label, title)
progression_panels = [
    (ax1, gb_train_loss, gb_val_loss, ('#2E86AB', '#A23B72'), ('Train', 'Val'),
     'Boosting Iteration', 'Gradient Boosting - Loss Progression'),
    (ax2, xgb_train_loss, xgb_val_loss, ('#F18F01', '#C73E1D'), ('Train', 'Val'),
     'Boosting Iteration', 'XGBoost - Loss Progression'),
    (ax3, rf_oob_loss, rf_test_loss, ('#06A77D', '#D62828'), ('OOB', 'Test'),
     'Number of Trees', 'Random Forest - Loss Progression'),
]
for panel, curve_a, curve_b, curve_colors, curve_labels, x_label, panel_title in progression_panels:
    steps_a = range(1, len(curve_a) + 1)
    steps_b = range(1, len(curve_b) + 1)
    panel.plot(steps_a, curve_a, linewidth=2.5, color=curve_colors[0], label=curve_labels[0], alpha=0.8)
    panel.plot(steps_b, curve_b, linewidth=2.5, color=curve_colors[1], label=curve_labels[1], alpha=0.8)
    # Shade the gap between the two curves
    panel.fill_between(steps_a, curve_a, curve_b, alpha=0.1, color='gray')
    panel.set_xlabel(x_label, fontsize=10, fontweight='bold')
    panel.set_ylabel('Log Loss', fontsize=10, fontweight='bold')
    panel.set_title(panel_title, fontsize=11, fontweight='bold')
    panel.legend(fontsize=9)
    panel.grid(True, alpha=0.3)

# ---- Hold-out loss of every model on one axis ----
ax4 = fig.add_subplot(gs[1, 1])
for curve, curve_label, curve_color in (
    (gb_val_loss, 'Gradient Boosting', '#2E86AB'),
    (xgb_val_loss, 'XGBoost', '#F18F01'),
    (rf_test_loss, 'Random Forest', '#06A77D'),
):
    ax4.plot(range(1, len(curve) + 1), curve, linewidth=2, label=curve_label, color=curve_color)
# Stacking has a single loss value, drawn as a horizontal reference line
ax4.axhline(y=stacking_test_loss, color='#9D4EDD', linewidth=2, linestyle='--', label='Stacking')
ax4.set_xlabel('Iteration', fontsize=10, fontweight='bold')
ax4.set_ylabel('Loss', fontsize=10, fontweight='bold')
ax4.set_title('All Algorithms - Loss Comparison', fontsize=11, fontweight='bold')
ax4.legend(fontsize=9, loc='best')
ax4.grid(True, alpha=0.3)

# Shared bar-chart inputs for the two bottom panels
algorithms = ['Gradient\nBoosting', 'XGBoost', 'Random\nForest', 'Stacking']
colors = ['#2E86AB', '#F18F01', '#06A77D', '#9D4EDD']
accuracies = [
    gb_accuracy*100,
    xgb_accuracy*100,
    rf_accuracy*100,
    stacking_accuracy*100,
]

# ---- Accuracy bars, value written just above each bar ----
ax5 = fig.add_subplot(gs[2, 0])
bars = ax5.bar(algorithms, accuracies, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)
for rect, pct in zip(bars, accuracies):
    x_center = rect.get_x() + rect.get_width()/2.
    ax5.text(x_center, rect.get_height() + 0.1, f'{pct:.2f}%',
             ha='center', va='bottom', fontweight='bold', fontsize=10)
ax5.set_ylabel('Accuracy (%)', fontsize=10, fontweight='bold')
ax5.set_title('Accuracy Comparison', fontsize=11, fontweight='bold')
ax5.set_ylim([80, 102])
ax5.grid(True, alpha=0.3, axis='y')

# ---- ROC-AUC bars, value written in white just inside the top of each bar ----
ax6 = fig.add_subplot(gs[2, 1])
aucs = [gb_auc, xgb_auc, rf_auc, stacking_auc]
bars = ax6.bar(algorithms, aucs, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)
for rect, score in zip(bars, aucs):
    x_center = rect.get_x() + rect.get_width()/2.
    ax6.text(x_center, rect.get_height() - 0.02, f'{score:.4f}',
             ha='center', va='top', fontweight='bold', fontsize=10, color='white')
ax6.set_ylabel('ROC-AUC', fontsize=10, fontweight='bold')
ax6.set_title('ROC-AUC Comparison', fontsize=11, fontweight='bold')
ax6.set_ylim([0.9, 1.01])
ax6.grid(True, alpha=0.3, axis='y')

fig.suptitle(
    'Top 4 Algorithms: Comprehensive Analysis with Loss Curves',
    fontsize=16,
    fontweight='bold',
    y=0.995,
)

plt.savefig('figures/algorithm_detailed_analysis.png', dpi=300, bbox_inches='tight')
print("‚úì Detailed analysis saved to: figures/algorithm_detailed_analysis.png")
plt.show()

## Step 8: Performance Summary Table

In [None]:
# Build, print and save a one-row-per-model summary table
from pathlib import Path


def _pct_reduction(initial, final):
    """Return the percentage drop from ``initial`` to ``final``.

    Returns NaN when ``initial`` is 0, because the relative reduction is
    undefined there (this can happen for the OOB error of a perfect tree).
    """
    if initial == 0:
        return float('nan')
    return ((initial - final) / initial) * 100


results_df = pd.DataFrame({
    'Algorithm': ['Gradient Boosting', 'XGBoost', 'Random Forest', 'Stacking'],
    'Accuracy (%)': [gb_accuracy*100, xgb_accuracy*100, rf_accuracy*100, stacking_accuracy*100],
    'ROC-AUC': [gb_auc, xgb_auc, rf_auc, stacking_auc],
    # For Random Forest the "train" columns hold the OOB error; Stacking has a
    # single training loss, so the same value fills both train columns
    'Initial Train Loss': [gb_train_loss[0], xgb_train_loss[0], rf_oob_loss[0], stacking_train_loss],
    'Final Train Loss': [gb_train_loss[-1], xgb_train_loss[-1], rf_oob_loss[-1], stacking_train_loss],
    'Final Test Loss': [gb_val_loss[-1], xgb_val_loss[-1], rf_test_loss[-1], stacking_test_loss],
    'Loss Reduction (%)': [
        _pct_reduction(gb_train_loss[0], gb_train_loss[-1]),
        _pct_reduction(xgb_train_loss[0], xgb_train_loss[-1]),
        _pct_reduction(rf_oob_loss[0], rf_oob_loss[-1]),
        # Stacking has no loss history, so its reduction is reported as 0.0
        0.0
    ]
})

print("\n" + "="*100)
print("FINAL PERFORMANCE COMPARISON - TOP 4 ALGORITHMS")
print("="*100)
print(results_df.to_string(index=False))
print("="*100)

# Save results; to_csv() does not create missing directories
Path('outputs').mkdir(parents=True, exist_ok=True)
results_df.to_csv('outputs/top_4_algorithms_comparison.csv', index=False)
print("\n‚úì Results saved to: outputs/top_4_algorithms_comparison.csv")

## Step 9: ROC Curves for All 4 Algorithms

In [None]:
# Create ROC curves comparison
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
fig.suptitle('ROC Curves - Top 4 Algorithms', fontsize=16, fontweight='bold')

algorithms_list = [
    ('Gradient Boosting', gb_pred_proba, '#2E86AB'),
    ('XGBoost', xgb_pred_proba, '#F18F01'),
    ('Random Forest', rf_pred_proba, '#06A77D'),
    ('Stacking', stacking_pred_proba, '#9D4EDD')
]

for idx, (ax, (name, pred_proba, color)) in enumerate(zip(axes.flat, algorithms_list)):
    fpr, tpr, _ = roc_curve(y_test, pred_proba)
    roc_auc = auc(fpr, tpr)
    
    ax.plot(fpr, tpr, color=color, linewidth=3, label=f'{name} (AUC = {roc_auc:.4f})')
    ax.plot([0, 1], [0, 1], 'k--', linewidth=1.5, label='Random Classifier')
    ax.fill_between(fpr, tpr, alpha=0.1, color=color)
    
    ax.set_xlabel('False Positive Rate', fontsize=11, fontweight='bold')
    ax.set_ylabel('True Positive Rate', fontsize=11, fontweight='bold')
    ax.set_title(f'{name}', fontsize=12, fontweight='bold')
    ax.legend(fontsize=10, loc='lower right')
    ax.grid(True, alpha=0.3)
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])

plt.tight_layout()
plt.savefig('figures/algorithm_roc_curves.png', dpi=300, bbox_inches='tight')
print("‚úì ROC curves saved to: figures/algorithm_roc_curves.png")
plt.show()

## Step 10: Feature Importance Comparison

In [None]:
# Extract feature importance
gb_importance = gb_model.feature_importances_
xgb_importance = xgb_model.feature_importances_
rf_importance = rf_model.feature_importances_

feature_names = X.columns

# Create feature importance comparison
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('Top 10 Feature Importance by Algorithm', fontsize=14, fontweight='bold')

# Gradient Boosting
gb_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': gb_importance
}).sort_values('Importance', ascending=True).tail(10)
axes[0].barh(gb_importance_df['Feature'], gb_importance_df['Importance'], color='#2E86AB', alpha=0.8)
axes[0].set_xlabel('Importance', fontweight='bold')
axes[0].set_title('Gradient Boosting', fontweight='bold')
axes[0].grid(True, alpha=0.3, axis='x')

# XGBoost
xgb_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': xgb_importance
}).sort_values('Importance', ascending=True).tail(10)
axes[1].barh(xgb_importance_df['Feature'], xgb_importance_df['Importance'], color='#F18F01', alpha=0.8)
axes[1].set_xlabel('Importance', fontweight='bold')
axes[1].set_title('XGBoost', fontweight='bold')
axes[1].grid(True, alpha=0.3, axis='x')

# Random Forest
rf_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': rf_importance
}).sort_values('Importance', ascending=True).tail(10)
axes[2].barh(rf_importance_df['Feature'], rf_importance_df['Importance'], color='#06A77D', alpha=0.8)
axes[2].set_xlabel('Importance', fontweight='bold')
axes[2].set_title('Random Forest', fontweight='bold')
axes[2].grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.savefig('figures/feature_importance_comparison.png', dpi=300, bbox_inches='tight')
print("‚úì Feature importance saved to: figures/feature_importance_comparison.png")
plt.show()

## Step 11: Save All Models

In [None]:
# Save all models
models_dict = {
    'gradient_boosting': gb_model,
    'xgboost': xgb_model,
    'random_forest': rf_model,
    'stacking': stacking_model
}

for name, model in models_dict.items():
    joblib.dump(model, f'models/top_algorithms/{name}_model.pkl')
    print(f"‚úì Saved: {name}_model.pkl")

# Save test set predictions
predictions_df = pd.DataFrame({
    'Actual': y_test,
    'GB_Pred': gb_pred,
    'GB_Proba': gb_pred_proba,
    'XGB_Pred': xgb_pred,
    'XGB_Proba': xgb_pred_proba,
    'RF_Pred': rf_pred,
    'RF_Proba': rf_pred_proba,
    'Stack_Pred': stacking_pred,
    'Stack_Proba': stacking_pred_proba
})
predictions_df.to_csv('outputs/top_algorithms_predictions.csv', index=False)
print("\n‚úì Saved: top_algorithms_predictions.csv")

## Summary

In [None]:
# Console summary: ranking table, headline metrics and list of artifacts
print("\n" + "#"*100)
print("#" + " "*98 + "#")
print("#" + "  OSTEOPOROSIS PREDICTION - TOP 4 ALGORITHMS WITH LOSS CURVES".center(98) + "#")
print("#" + " "*98 + "#")
print("#"*100)

print("\nüìä PERFORMANCE RANKING:\n")
# Rank by accuracy; the index is reset so positional and label lookups agree
ranked_results = results_df.sort_values('Accuracy (%)', ascending=False).reset_index(drop=True)
ranked_results['Rank'] = range(1, len(ranked_results) + 1)
print(ranked_results[['Rank', 'Algorithm', 'Accuracy (%)', 'ROC-AUC', 'Final Test Loss']].to_string(index=False))

# Each headline is looked up on its own metric: the table is sorted by
# accuracy, so row 0 is not necessarily the best ROC-AUC or lowest loss
best_accuracy_row = ranked_results.iloc[0]
best_auc_row = ranked_results.loc[ranked_results['ROC-AUC'].idxmax()]
lowest_loss_row = ranked_results.loc[ranked_results['Final Test Loss'].idxmin()]

print("\n\nüéØ KEY INSIGHTS:\n")
print(f"  ‚úì Best Accuracy: {best_accuracy_row['Algorithm']} ({best_accuracy_row['Accuracy (%)']:.2f}%)")
print(f"  ‚úì Best ROC-AUC: {best_auc_row['Algorithm']} ({best_auc_row['ROC-AUC']:.4f})")
print(f"  ‚úì Lowest Test Loss: {lowest_loss_row['Algorithm']} ({lowest_loss_row['Final Test Loss']:.6f})")
# NOTE(review): this statement is printed unconditionally; nothing in this
# cell checks the train/test loss gap -- confirm it against the table.
print(f"  ‚úì All algorithms show excellent generalization (Train loss ‚âà Test loss)")

print("\n\nüìÅ OUTPUTS GENERATED:\n")
print("  ‚úì figures/algorithm_loss_curves_comparison.png")
print("  ‚úì figures/algorithm_detailed_analysis.png")
print("  ‚úì figures/algorithm_roc_curves.png")
print("  ‚úì figures/feature_importance_comparison.png")
print("  ‚úì outputs/top_4_algorithms_comparison.csv")
print("  ‚úì outputs/top_algorithms_predictions.csv")
# File names follow the keys used when the models were saved
print("  ‚úì models/top_algorithms/[gradient_boosting/xgboost/random_forest/stacking]_model.pkl")

print("\n" + "#"*100)