## 1. Setup và Load Data

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix, 
    classification_report,
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
    f1_score,
    precision_score,
    recall_score,
    accuracy_score
)
import warnings
warnings.filterwarnings('ignore')

# Pandas display: show every column, cap printed rows at 100, 4-decimal floats.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', '{:.4f}'.format),
):
    pd.set_option(option_name, option_value)

# Plot defaults applied to the figures created below.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6), 'font.size': 10})

print("‚úÖ Libraries imported successfully")

In [None]:
# Load the test split: processed features plus the 'conversion' target column.
X_test = pd.read_csv('../data/X_test_processed.csv')
y_test = pd.read_csv('../data/y_test.csv')['conversion']

# Summarise the split before printing so each line below is a plain lookup.
n_test_rows, n_test_features = X_test.shape
test_class_counts = y_test.value_counts()
test_positive_share = y_test.mean()

print(f"Test set size: {n_test_rows:,} samples")
print(f"Features: {n_test_features}")
print("\nClass distribution in test set:")
print(test_class_counts)
print(f"\nPositive class ratio: {test_positive_share:.2%}")

In [None]:
# Load the training target to compare its class balance with the test split.
y_train = pd.read_csv('../data/y_train.csv')['conversion']

# Positive-class share of each split, computed from the data so the note below
# cannot drift out of sync with the files on disk.
# Relies on y_test having been loaded by the preceding cell.
train_positive_ratio = y_train.mean()
test_positive_ratio = y_test.mean()

print(f"Training set size: {len(y_train):,} samples")
print(f"\nClass distribution in training set (after SMOTE):")
print(y_train.value_counts())
print(f"\nPositive class ratio: {train_positive_ratio:.2%}")
print(
    f"\n‚ö†Ô∏è Note: Training data was balanced with SMOTE "
    f"({(1 - train_positive_ratio) * 100:.0f}-{train_positive_ratio * 100:.0f}), "
    f"test data remains imbalanced (~{test_positive_ratio:.1%})"
)

In [None]:
# Load the three pickled classifiers.
# NOTE(review): unpickling executes code stored in the file, so these paths must
# only ever point at artifacts produced by this project.
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


rf_model = _load_pickle('../models/random_forest_model.pkl')
gb_model = _load_pickle('../models/gradient_boosting_model.pkl')
xgb_model = _load_pickle('../models/xgboost_model.pkl')

print("‚úÖ All 3 models loaded successfully")
for display_name, estimator in (
    ('Random Forest', rf_model),
    ('Gradient Boosting', gb_model),
    ('XGBoost', xgb_model),
):
    print(f"\n{display_name}: {estimator}")

In [None]:
# Read the saved feature-importance table and preview its first ten rows.
# NOTE(review): head(10) is only the "top 10" if the CSV is already sorted by
# importance in descending order -- confirm against the script that wrote it.
feature_importance = pd.read_csv('../results/metrics/feature_importance.csv')
top_ten_preview = feature_importance.head(10)
print("Top 10 Most Important Features:", top_ten_preview, sep="\n")

## 2. Predictions và Probability Scores

In [None]:
# Score the test set with every model, keeping both the hard labels returned by
# predict() and the class-1 column of predict_proba().
models = {
    'Random Forest': rf_model,
    'Gradient Boosting': gb_model,
    'XGBoost': xgb_model
}

predictions = {}
probabilities = {}

for model_name, estimator in models.items():
    hard_labels = estimator.predict(X_test)
    positive_scores = estimator.predict_proba(X_test)[:, 1]  # column 1 = class 1
    predictions[model_name] = hard_labels
    probabilities[model_name] = positive_scores
    print(f"‚úÖ {model_name}: Predictions generated")

banner = "=" * 60
print("\n" + banner)
print("Prediction Statistics:")
print(banner)

# How many test rows each model assigns to class 0 and class 1.
n_samples = len(y_test)
for model_name, hard_labels in predictions.items():
    labels_seen, label_counts = np.unique(hard_labels, return_counts=True)
    count_by_label = dict(zip(labels_seen, label_counts))
    print(f"\n{model_name}:")
    for label in (0, 1):
        n_predicted = count_by_label.get(label, 0)  # 0 when the class is never predicted
        print(f"  Class {label}: {n_predicted:>6,} ({n_predicted/n_samples*100:>5.2f}%)")

## 3. Performance Metrics - Tổng Quan

In [None]:
# Build one metrics row per model from the labels and scores computed above,
# then rank the models by ROC-AUC.
def _default_threshold_metrics(model_name):
    """Return the test-set metric row for one model using its predict() labels."""
    labels = predictions[model_name]
    scores = probabilities[model_name]
    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, labels),
        'ROC-AUC': roc_auc_score(y_test, scores),
        'F1-Score': f1_score(y_test, labels),
        'Precision': precision_score(y_test, labels, zero_division=0),
        'Recall': recall_score(y_test, labels),
    }


results = [_default_threshold_metrics(model_name) for model_name in models]
results_df = pd.DataFrame(results).sort_values('ROC-AUC', ascending=False)

rule = "=" * 80
print(rule)
print("üìä PERFORMANCE COMPARISON - ALL MODELS (Threshold = 0.5)")
print(rule)
print(results_df.to_string(index=False))
print("\n‚ö†Ô∏è Note: Low F1/Precision/Recall due to default threshold 0.5 on imbalanced test set")

## 4. Confusion Matrix - Chi Tiết Từng Model

In [None]:
# Side-by-side confusion matrices (counts plus row-wise percentages), one panel
# per model in `models`.
class_labels = [0, 1]
tick_labels = ['No Purchase (0)', 'Purchase (1)']

# Size the grid from the number of models; squeeze=False keeps `axes`
# two-dimensional even when only one model is plotted.
fig, axes = plt.subplots(1, len(models), figsize=(6 * len(models), 5), squeeze=False)

for panel, name in zip(axes[0], models):
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # tick labels, even if a class is missing from the labels or predictions.
    cm = confusion_matrix(y_test, predictions[name], labels=class_labels)

    # Share of each actual class (row). A row with no samples stays at 0
    # instead of producing NaN from a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_pct = np.divide(
        cm.astype(float), row_totals,
        out=np.zeros(cm.shape, dtype=float), where=row_totals > 0,
    ) * 100

    # Build the annotations as nested lists so NumPy sizes the string dtype to
    # the longest label; a pre-allocated fixed-width string array silently
    # truncates anything longer than its item size.
    annot = np.array([
        [f'{count:,}\n({pct:.1f}%)' for count, pct in zip(count_row, pct_row)]
        for count_row, pct_row in zip(cm, cm_pct)
    ])

    sns.heatmap(cm, annot=annot, fmt='', cmap='Blues',
                xticklabels=tick_labels,
                yticklabels=tick_labels,
                ax=panel, cbar=True)

    panel.set_title(f'{name}\nConfusion Matrix', fontsize=12, fontweight='bold')
    panel.set_ylabel('Actual', fontsize=10)
    panel.set_xlabel('Predicted', fontsize=10)

plt.tight_layout()
plt.savefig('../results/figures/confusion_matrices_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Confusion matrices saved to results/figures/confusion_matrices_comparison.png")

In [None]:
# Print the four confusion-matrix cells and the per-class rates for every model.
print("="*80)
print("üîç CONFUSION MATRIX ANALYSIS - DETAILED BREAKDOWN")
print("="*80)

for name in models:
    # labels=[0, 1] guarantees a 2x2 matrix, so the four-way unpack below cannot
    # fail when a class is absent from both the labels and the predictions.
    cm = confusion_matrix(y_test, predictions[name], labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    print(f"\n{'='*60}")
    print(f"{name}")
    print(f"{'='*60}")
    print(f"True Negatives (TN):  {tn:>6,}  |  Correctly predicted No Purchase")
    print(f"False Positives (FP): {fp:>6,}  |  Predicted Purchase, Actually No Purchase")
    print(f"False Negatives (FN): {fn:>6,}  |  Predicted No Purchase, Actually Purchase")
    print(f"True Positives (TP):  {tp:>6,}  |  Correctly predicted Purchase")
    print(f"\nTotal samples: {tn+fp+fn+tp:,}")

    # Row totals of the matrix: how many samples actually belong to each class.
    total_actual_neg = tn + fp
    total_actual_pos = fn + tp

    # Rates are skipped for a class with no samples to avoid dividing by zero.
    if total_actual_neg > 0:
        specificity = tn / total_actual_neg
        fpr = fp / total_actual_neg
        print(f"\nClass 0 (No Purchase) Performance:")
        print(f"  Specificity (True Negative Rate): {specificity:.2%}")
        print(f"  False Positive Rate:              {fpr:.2%}")

    if total_actual_pos > 0:
        sensitivity = tp / total_actual_pos
        fnr = fn / total_actual_pos
        print(f"\nClass 1 (Purchase) Performance:")
        print(f"  Sensitivity (Recall/TPR):         {sensitivity:.2%}")
        print(f"  False Negative Rate:              {fnr:.2%}")

## 5. Classification Report - Per Class Performance

In [None]:
# Print scikit-learn's per-class report (4 decimals) for each model's predictions.
print("="*80)
print("üìã CLASSIFICATION REPORT - PER CLASS METRICS")
print("="*80)

section_rule = "=" * 60
report_class_names = ['No Purchase (0)', 'Purchase (1)']

for model_name, hard_labels in predictions.items():
    print(f"\n{section_rule}")
    print(model_name)
    print(section_rule)
    report_text = classification_report(
        y_test,
        hard_labels,
        target_names=report_class_names,
        digits=4,
    )
    print(report_text)

## 6. ROC Curve và Precision-Recall Curve

In [None]:
# ROC curves (left) and precision-recall curves (right) for every model.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

# One pass per model: draw its curve on both panels with the same colour.
for idx, name in enumerate(models):
    scores = probabilities[name]
    line_color = colors[idx]

    roc_fpr, roc_tpr, _ = roc_curve(y_test, scores)
    auc = roc_auc_score(y_test, scores)
    ax1.plot(roc_fpr, roc_tpr, label=f'{name} (AUC = {auc:.4f})',
             linewidth=2.5, color=line_color)

    pr_precision, pr_recall, _ = precision_recall_curve(y_test, scores)
    ax2.plot(pr_recall, pr_precision, label=f'{name}',
             linewidth=2.5, color=line_color)

# Reference lines: the diagonal on the ROC panel, and the positive-class share
# of the test set on the precision-recall panel.
ax1.plot([0, 1], [0, 1], 'k--', linewidth=1.5, label='Random Classifier')
baseline = y_test.mean()
ax2.axhline(y=baseline, color='k', linestyle='--', linewidth=1.5,
            label=f'Baseline ({baseline:.2%})')

# Shared cosmetics, driven by one spec per panel.
panel_specs = (
    (ax1, 'False Positive Rate', 'True Positive Rate', 'ROC Curves Comparison', 'lower right'),
    (ax2, 'Recall', 'Precision', 'Precision-Recall Curves', 'best'),
)
for panel, x_label, y_label, panel_title, legend_loc in panel_specs:
    panel.set_xlabel(x_label, fontsize=12, fontweight='bold')
    panel.set_ylabel(y_label, fontsize=12, fontweight='bold')
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.legend(loc=legend_loc, fontsize=10)
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('../results/figures/roc_pr_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ ROC and PR curves saved to results/figures/roc_pr_curves.png")

## 7. Threshold Optimization Analysis

In [None]:
# Load the saved threshold scan and show the rows for a few thresholds of interest.
threshold_results = pd.read_csv('../results/metrics/threshold_scan_results.csv')

# Thresholds produced by a floating-point scan (e.g. 0.30000000000000004) do not
# compare equal to the literals below, so exact isin() matching can silently
# drop rows. Compare with a small absolute tolerance instead.
selected_thresholds = np.array([0.1, 0.2, 0.26, 0.3, 0.5, 0.7, 0.9])
scanned_thresholds = threshold_results['Threshold'].to_numpy(dtype=float)
is_selected = np.isclose(
    scanned_thresholds[:, None], selected_thresholds, rtol=0.0, atol=1e-9
).any(axis=1)

print("Threshold Scan Results (selected thresholds):")
print(threshold_results[is_selected])

In [None]:
# Re-label the XGBoost scores at several decision thresholds and tabulate the
# resulting test metrics.
print("="*80)
print("üéØ THRESHOLD OPTIMIZATION - XGBoost (Best Model)")
print("="*80)

xgb_scores = probabilities['XGBoost']


def _xgb_metrics_at(cutoff):
    """Return the metric row obtained when scores >= cutoff are labelled class 1."""
    labels_at_cutoff = (xgb_scores >= cutoff).astype(int)
    return {
        'Threshold': cutoff,
        'Accuracy': accuracy_score(y_test, labels_at_cutoff),
        'Precision': precision_score(y_test, labels_at_cutoff, zero_division=0),
        'Recall': recall_score(y_test, labels_at_cutoff),
        'F1-Score': f1_score(y_test, labels_at_cutoff),
        'Predicted_Pos': labels_at_cutoff.sum(),
        'Predicted_Pos_Pct': labels_at_cutoff.mean() * 100
    }


thresholds_to_test = [0.1, 0.2, 0.26, 0.3, 0.4, 0.5, 0.6, 0.7]
threshold_analysis = [_xgb_metrics_at(cutoff) for cutoff in thresholds_to_test]

threshold_df = pd.DataFrame(threshold_analysis)
print(threshold_df.to_string(index=False))

# Pick the row with the highest F1-score.
# NOTE(review): the threshold is selected on the same test set that its metrics
# are reported on, so the numbers at the chosen threshold are optimistic --
# consider selecting it on a separate validation split.
optimal_idx = threshold_df['F1-Score'].idxmax()
best_row = threshold_df.loc[optimal_idx]
optimal_threshold = best_row['Threshold']
print(f"\n‚úÖ Optimal Threshold for F1-Score: {optimal_threshold}")
print(f"   Metrics at optimal threshold:")
print(best_row.to_string())

In [None]:
# Four-panel view of how the decision threshold changes the metrics in threshold_df.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax_f1, ax_tradeoff), (ax_accuracy, ax_share) = axes

thresholds = threshold_df['Threshold']
curve_style = {'linewidth': 2.5, 'markersize': 8}
# Vertical marker at the F1-optimal threshold, reused on the first two panels.
optimal_marker = {
    'x': optimal_threshold, 'color': 'green', 'linestyle': '--',
    'linewidth': 2, 'label': f'Optimal: {optimal_threshold}',
}


def _finish_panel(panel, y_label, panel_title):
    """Apply the shared x-label, y-label, title, legend and grid to one panel."""
    panel.set_xlabel('Threshold', fontsize=11, fontweight='bold')
    panel.set_ylabel(y_label, fontsize=11, fontweight='bold')
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.legend()
    panel.grid(alpha=0.3)


# Top-left: F1-score against threshold.
ax_f1.plot(thresholds, threshold_df['F1-Score'],
           marker='o', color='#FF6B6B', **curve_style)
ax_f1.axvline(**optimal_marker)
_finish_panel(ax_f1, 'F1-Score', 'F1-Score vs Threshold')

# Top-right: precision and recall on the same axes.
ax_tradeoff.plot(thresholds, threshold_df['Precision'],
                 marker='s', color='#4ECDC4', label='Precision', **curve_style)
ax_tradeoff.plot(thresholds, threshold_df['Recall'],
                 marker='^', color='#45B7D1', label='Recall', **curve_style)
ax_tradeoff.axvline(**optimal_marker)
_finish_panel(ax_tradeoff, 'Score', 'Precision-Recall Trade-off')

# Bottom-left: accuracy, with the default 0.5 threshold marked.
ax_accuracy.plot(thresholds, threshold_df['Accuracy'],
                 marker='D', color='#95E1D3', **curve_style)
ax_accuracy.axvline(x=0.5, color='red', linestyle='--',
                    linewidth=2, label='Default: 0.5')
_finish_panel(ax_accuracy, 'Accuracy', 'Accuracy vs Threshold')

# Bottom-right: share of rows predicted positive vs. the actual positive share.
actual_positive_pct = y_test.mean()*100
ax_share.plot(thresholds, threshold_df['Predicted_Pos_Pct'],
              marker='o', color='#F38181', **curve_style)
ax_share.axhline(y=actual_positive_pct, color='orange', linestyle='--',
                 linewidth=2, label=f'Actual: {actual_positive_pct:.1f}%')
_finish_panel(ax_share, '% Predicted as Positive', 'Predicted Positives vs Threshold')

plt.tight_layout()
plt.savefig('../results/figures/threshold_optimization.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Threshold optimization plots saved to results/figures/threshold_optimization.png")

## 8. Sample Predictions - Error Analysis

In [None]:
# Attach XGBoost's label, score and outcome category to a copy of the test features.
prediction_analysis = X_test.copy()
prediction_analysis['actual'] = y_test.values
prediction_analysis['predicted'] = predictions['XGBoost']
prediction_analysis['probability'] = probabilities['XGBoost']

# (actual, predicted) -> outcome name. Rows matching none of these keep 'Unknown'.
outcome_rules = (
    (0, 0, 'True Negative (TN)'),
    (0, 1, 'False Positive (FP)'),
    (1, 0, 'False Negative (FN)'),
    (1, 1, 'True Positive (TP)'),
)
prediction_analysis['result'] = 'Unknown'
for actual_label, predicted_label, outcome in outcome_rules:
    matches = (
        (prediction_analysis['actual'] == actual_label)
        & (prediction_analysis['predicted'] == predicted_label)
    )
    prediction_analysis.loc[matches, 'result'] = outcome

rule = "=" * 80
print(rule)
print("üìå SAMPLE PREDICTIONS - XGBoost Model")
print(rule)
print("\nPrediction Result Distribution:")
print(prediction_analysis['result'].value_counts())

# Persist the full per-row table (features + actual/predicted/probability/result).
prediction_analysis.to_csv('../results/metrics/xgboost_predictions_detailed.csv', index=False)
print("\n‚úÖ Full predictions saved to results/metrics/xgboost_predictions_detailed.csv")

In [None]:
# Print up to five example rows for each outcome category.
print("\n" + "="*80)
print("üîç SAMPLE PREDICTIONS BY CATEGORY (Top 5 each)")
print("="*80)

categories = ['True Negative (TN)', 'False Positive (FP)', 'False Negative (FN)', 'True Positive (TP)']
key_features = ['recency', 'history', 'is_referral', 'used_discount', 'used_bogo']
section_rule = "=" * 60

for category in categories:
    print(f"\n{section_rule}")
    print(category)
    print(section_rule)

    samples = prediction_analysis[prediction_analysis['result'] == category]
    if samples.empty:
        print("No samples in this category")
        continue

    # "... Positive" categories show their highest-probability rows,
    # "... Negative" categories their lowest-probability rows.
    pick_extreme = samples.nlargest if 'Positive' in category else samples.nsmallest
    samples_sorted = pick_extreme(5, 'probability')

    # Only show the key features that actually exist in the processed data.
    candidate_cols = ['actual', 'predicted', 'probability'] + key_features
    display_cols = [col for col in candidate_cols if col in samples_sorted.columns]

    print(samples_sorted[display_cols].to_string(index=False))

In [None]:
# Histogram of the predicted probability within each outcome category (2x2 grid).
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
categories = ['True Negative (TN)', 'False Positive (FP)', 'False Negative (FN)', 'True Positive (TP)']
colors = ['#95E1D3', '#FF6B6B', '#FFA07A', '#4ECDC4']

# axes.flat walks the grid row by row, so panels follow the order of `categories`.
for panel, category, color in zip(axes.flat, categories, colors):
    in_category = prediction_analysis['result'] == category
    category_scores = prediction_analysis.loc[in_category, 'probability']

    panel.hist(category_scores, bins=50, color=color, alpha=0.7, edgecolor='black')
    panel.axvline(x=0.5, color='red', linestyle='--', linewidth=2, label='Threshold: 0.5')
    panel.set_xlabel('Probability Score', fontsize=11, fontweight='bold')
    panel.set_ylabel('Count', fontsize=11, fontweight='bold')
    panel.set_title(f'{category}\n(n={len(category_scores):,})', fontsize=12, fontweight='bold')
    panel.legend()
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('../results/figures/probability_distribution_by_result.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Probability distribution plots saved")

## 9. Feature Importance Visualization

In [None]:
# Horizontal bar chart of the first 15 rows of the feature-importance table.
# NOTE(review): the title attributes these importances to XGBoost, but the CSV
# itself does not say which model produced them -- confirm.
fig, ax = plt.subplots(figsize=(12, 8))

top_features = feature_importance.head(15)
n_bars = len(top_features)
bar_positions = range(n_bars)
importances = top_features['Importance']
colors_gradient = plt.cm.viridis(np.linspace(0.3, 0.9, n_bars))

bars = ax.barh(bar_positions, importances, color=colors_gradient)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['Feature'])
ax.invert_yaxis()  # first row of the table at the top of the chart
ax.set_xlabel('Importance Score', fontsize=12, fontweight='bold')
ax.set_title('Top 15 Most Important Features - XGBoost Model', fontsize=14, fontweight='bold')
ax.grid(axis='x', alpha=0.3)

# Write each bar's value just to the right of the bar end.
for position, importance in zip(bar_positions, importances):
    ax.text(importance + 0.001, position, f"{importance:.4f}",
            va='center', fontsize=9)

plt.tight_layout()
plt.savefig('../results/figures/feature_importance_top15.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Feature importance plot saved")

## 10. Overfitting/Underfitting Analysis

In [None]:
# Compare training vs test performance
# Note: Training metrics from GridSearchCV cross-validation
# NOTE(review): other cells describe the training data as SMOTE-balanced and the
# test set as imbalanced, so the accuracy gap is not a like-for-like comparison
# -- treat the ROC-AUC gap as the more reliable signal.
print("="*80)
print("‚ö†Ô∏è OVERFITTING/UNDERFITTING ANALYSIS")
print("="*80)

# Load training metrics from model comparison
model_comparison = pd.read_csv('../results/metrics/final_model_comparison.csv')
print("\nTraining Performance (from GridSearchCV 5-fold CV):")
print(model_comparison[['Model', 'ROC-AUC', 'Accuracy', 'F1-Score']].to_string(index=False))

print("\n" + "="*60)
print("Test Performance (current evaluation):")
print("="*60)
print(results_df[['Model', 'ROC-AUC', 'Accuracy', 'F1-Score']].to_string(index=False))

print("\n" + "="*60)
print("ANALYSIS:")
print("="*60)

GOOD_GAP = 0.05        # both train-test gaps below this -> good generalization
OVERFIT_GAP = 0.1      # either gap above this -> overfitting
MIN_USEFUL_AUC = 0.6   # test ROC-AUC below this -> underfitting


def _generalization_status(auc_gap, acc_gap, test_auc_value):
    """Return the status line for one model.

    Overfitting and underfitting are checked before "good generalization":
    a small train-test gap alone does not make a model good, because an
    underfitting model scores poorly on both sets and therefore also has a
    small gap.
    """
    if auc_gap > OVERFIT_GAP or acc_gap > OVERFIT_GAP:
        return "‚ö†Ô∏è Status: OVERFITTING DETECTED (train >> test)"
    if test_auc_value < MIN_USEFUL_AUC:
        return "‚ö†Ô∏è Status: UNDERFITTING (low performance overall)"
    if abs(auc_gap) < GOOD_GAP and abs(acc_gap) < GOOD_GAP:
        return "‚úÖ Status: GOOD GENERALIZATION (minimal overfitting)"
    return "‚ÑπÔ∏è Status: ACCEPTABLE (minor variance)"


for model_name in ['XGBoost', 'Gradient Boosting', 'Random Forest']:
    train_rows = model_comparison[model_comparison['Model'] == model_name]
    test_rows = results_df[results_df['Model'] == model_name]

    # Report a missing model explicitly instead of failing with an IndexError.
    if train_rows.empty or test_rows.empty:
        print(f"\n{model_name}:")
        print("  Skipped: no matching row in the training and/or test metrics")
        continue

    train_row = train_rows.iloc[0]
    test_row = test_rows.iloc[0]

    train_auc = train_row['ROC-AUC']
    test_auc = test_row['ROC-AUC']

    train_acc = train_row['Accuracy']
    test_acc = test_row['Accuracy']

    # Positive gap = training score higher than test score.
    auc_diff = train_auc - test_auc
    acc_diff = train_acc - test_acc

    print(f"\n{model_name}:")
    print(f"  ROC-AUC: Train={train_auc:.4f} | Test={test_auc:.4f} | Diff={auc_diff:+.4f}")
    print(f"  Accuracy: Train={train_acc:.4f} | Test={test_acc:.4f} | Diff={acc_diff:+.4f}")
    print(f"  {_generalization_status(auc_diff, acc_diff, test_auc)}")

In [None]:
# Grouped bars comparing the saved training (CV) metrics with the test metrics.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

model_names = ['XGBoost', 'Gradient Boosting', 'Random Forest']
x_pos = np.arange(len(model_names))
width = 0.35


def _metric_by_model(frame, metric):
    """Return the first `metric` value of each model in `model_names`, in order."""
    return [frame[frame['Model'] == m][metric].values[0] for m in model_names]


# (axis, metric column, y-label, title, training colour, test colour)
panel_specs = (
    (ax1, 'ROC-AUC', 'ROC-AUC Score', 'ROC-AUC: Training vs Test', '#4ECDC4', '#FF6B6B'),
    (ax2, 'Accuracy', 'Accuracy', 'Accuracy: Training vs Test', '#95E1D3', '#F38181'),
)

for panel, metric, y_label, panel_title, train_color, test_color in panel_specs:
    train_values = _metric_by_model(model_comparison, metric)
    test_values = _metric_by_model(results_df, metric)

    # Training bar to the left of each tick, test bar to the right.
    panel.bar(x_pos - width/2, train_values, width, label='Training (CV)', color=train_color, alpha=0.8)
    panel.bar(x_pos + width/2, test_values, width, label='Test', color=test_color, alpha=0.8)
    panel.set_xlabel('Model', fontsize=12, fontweight='bold')
    panel.set_ylabel(y_label, fontsize=12, fontweight='bold')
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.set_xticks(x_pos)
    panel.set_xticklabels(model_names, rotation=15, ha='right')
    panel.legend()
    panel.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../results/figures/train_vs_test_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Train vs test comparison plot saved")

## 11. Summary Statistics

In [None]:
# Print a six-part text summary built from the objects computed in earlier cells.
rule = "=" * 80
print(rule)
print("üìä COMPREHENSIVE EVALUATION SUMMARY")
print(rule)

print("\n1. DATASET INFORMATION:")
print(f"   Training samples: {len(y_train):,} (SMOTE balanced: 50-50)")
print(f"   Test samples: {len(y_test):,} (Imbalanced: {y_test.mean():.2%} positive)")
print(f"   Features: {X_test.shape[1]}")

# XGBoost's row from the default-threshold comparison table.
print("\n2. BEST MODEL: XGBoost")
best_model_metrics = results_df[results_df['Model'] == 'XGBoost'].iloc[0]
for shown_as, column in (
    ('ROC-AUC', 'ROC-AUC'),
    ('Accuracy', 'Accuracy'),
    ('F1-Score (default threshold)', 'F1-Score'),
    ('Precision', 'Precision'),
    ('Recall', 'Recall'),
):
    print(f"   {shown_as}: {best_model_metrics[column]:.4f}")

# The threshold-table row whose threshold equals the F1-optimal one.
print("\n3. OPTIMAL THRESHOLD PERFORMANCE:")
opt_metrics = threshold_df[threshold_df['Threshold'] == optimal_threshold].iloc[0]
print(f"   Threshold: {opt_metrics['Threshold']}")
for column in ('F1-Score', 'Precision', 'Recall'):
    print(f"   {column}: {opt_metrics[column]:.4f}")

print("\n4. CONFUSION MATRIX (XGBoost, threshold=0.5):")
xgb_labels = predictions['XGBoost']
cm = confusion_matrix(y_test, xgb_labels)
tn, fp, fn, tp = cm.ravel()
for cell_name, cell_count in (
    ('True Negatives', tn),
    ('False Positives', fp),
    ('False Negatives', fn),
    ('True Positives', tp),
):
    print(f"   {cell_name}: {cell_count:,}")

# The rank shown is the row's index label + 1.
print("\n5. TOP 5 MOST IMPORTANT FEATURES:")
top_five = feature_importance.head(5)
for idx, feature, importance in zip(top_five.index, top_five['Feature'], top_five['Importance']):
    print(f"   {idx+1}. {feature}: {importance:.4f}")

print("\n6. CLASS IMBALANCE IMPACT:")
predicted_negative = xgb_labels == 0
predicted_positive = xgb_labels == 1
print("   Model predicts mostly class 0 at default threshold")
print(f"   Class 0 predictions: {predicted_negative.sum():,} ({predicted_negative.mean()*100:.1f}%)")
print(f"   Class 1 predictions: {predicted_positive.sum():,} ({predicted_positive.mean()*100:.1f}%)")
print(f"   Actual class 1 in test: {y_test.sum():,} ({y_test.mean()*100:.1f}%)")

print("\n" + rule)
print("‚úÖ EVALUATION COMPLETED SUCCESSFULLY")
print(rule)

In [None]:
# Collect the headline numbers into one dictionary and write it out as JSON.
import json

dataset_summary = {
    'train_samples': len(y_train),
    'test_samples': len(y_test),
    'features': X_test.shape[1],
    'test_positive_ratio': y_test.mean()
}

# The confusion-matrix cells are NumPy integers; convert them to plain ints
# so the json module can serialise them.
confusion_counts = {
    'TN': int(tn),
    'FP': int(fp),
    'FN': int(fn),
    'TP': int(tp)
}

top_feature_records = feature_importance.head(5)[['Feature', 'Importance']].to_dict('records')

summary_stats = {
    'Dataset': dataset_summary,
    'Best_Model': 'XGBoost',
    'Performance_Default_Threshold': best_model_metrics.to_dict(),
    'Performance_Optimal_Threshold': opt_metrics.to_dict(),
    'Confusion_Matrix': confusion_counts,
    'Top_5_Features': top_feature_records
}

with open('../results/metrics/evaluation_summary.json', 'w') as f:
    json.dump(summary_stats, f, indent=2)

print("‚úÖ Summary statistics saved to results/metrics/evaluation_summary.json")

## 12. Kết Luận và Gợi Ý Cải Thiện

### 🎯 Key Findings:

1. **Model Performance**: XGBoost là model tốt nhất với ROC-AUC = 0.6344
2. **Class Imbalance Impact**: F1-score thấp (0.001) ở threshold mặc định do test set imbalanced
3. **Threshold Optimization**: Threshold = 0.26 tối ưu hơn (F1 = 0.28)
4. **Feature Importance**: `is_referral`, `recency`, `offer_No Offer` là top 3 features
5. **Generalization**: Model generalize tốt (không bị overfitting nghiêm trọng)

### 🛠️ Recommendations:

1. **Threshold Selection**: Sử dụng threshold = 0.26 thay vì 0.5
2. **Class Weights**: Thử `class_weight='balanced'` trong training
3. **Feature Engineering**: Tạo interaction features giữa `is_referral` và `offer`
4. **Ensemble**: Combine 3 models bằng voting hoặc stacking
5. **Business Cost**: Incorporate business cost vào threshold selection