# NIBSS Fraud Detection: Feature Importance Analysis

This notebook provides comprehensive feature importance analysis for the optimized fraud detection models including:
- Permutation importance analysis
- SHAP (SHapley Additive exPlanations) values for interpretability
- Channel-specific feature analysis
- Temporal feature importance
- Feature interaction analysis

The analysis focuses on understanding which features contribute most to fraud detection performance and provides actionable insights for the Nigerian banking system.

## Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import warnings
warnings.filterwarnings('ignore')

from sklearn.inspection import permutation_importance
from sklearn.metrics import roc_auc_score
import matplotlib.patches as mpatches

# Seed NumPy's global RNG once so that sampling below is repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Load the persisted splits, tuned models and column metadata.
# NOTE(review): joblib.load unpickles arbitrary objects -- only point these
# paths at files produced by a trusted run of the earlier steps.
data_splits = joblib.load('../data/processed/data_splits.pkl')
optimization_results = joblib.load('../models/optimization_results.pkl')
best_models = optimization_results['best_models']
data_info = joblib.load('../data/processed/data_info.pkl')

# Evaluation splits used by the rest of the notebook.
X_test, y_test = data_splits['X_test'], data_splits['y_test']
X_val, y_val = data_splits['X_val'], data_splits['y_val']

print("Models and data loaded successfully!")
for split_label, split_features in (("Test", X_test), ("Validation", X_val)):
    print(f"{split_label} samples: {len(split_features)}")

## Permutation Feature Importance Analysis

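Permutation importance measures the decrease in model performance when a feature's values are randomly shuffled, providing a model-agnostic approach to feature importance. Because the full pipeline (preprocessing included) is scored on the raw validation data, each importance value refers to one raw input column rather than to an individual encoded feature.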

In [None]:
# Helper that rebuilds the feature names produced by the preprocessing step
def get_feature_names(pipeline, original_features):
    """Return the post-preprocessing feature names for ``pipeline``.

    Names are concatenated in a fixed group order: numerical columns,
    low-cardinality categorical columns (expanded by the fitted ``cat_low``
    transformer), high-cardinality categorical columns, then time features.
    The column lists are read from the module-level ``data_info`` dict.

    Args:
        pipeline: Fitted pipeline with a ``preprocessor`` step whose
            ``named_transformers_`` mapping contains a ``cat_low`` entry.
        original_features: Accepted for call compatibility; not used.

    Returns:
        list: Feature names in the group order described above.

    NOTE(review): the group order is assumed to mirror the preprocessor's
    output column order -- confirm against the ColumnTransformer definition.
    """
    cat_low_transformer = pipeline.named_steps['preprocessor'].named_transformers_['cat_low']
    cat_low_columns = data_info['categorical_low']

    if hasattr(cat_low_transformer, 'get_feature_names_out'):
        # Expanded one-hot names, one per encoded output column.
        cat_low_names = list(cat_low_transformer.get_feature_names_out(cat_low_columns))
    else:
        # Fallback for older sklearn versions: one placeholder per input column.
        cat_low_names = [f"{col}_encoded" for col in cat_low_columns]

    return [
        *data_info['numerical_features'],
        *cat_low_names,
        *data_info['categorical_high'],
        *data_info['time_features'],
    ]

In [None]:
# Calculate permutation importance for each model.
#
# permutation_importance() receives the *whole* pipeline together with the raw
# X_val, so it shuffles one raw input column at a time and returns exactly one
# importance per column of X_val. The matching labels are therefore X_val's
# column names -- not the expanded post-preprocessing names returned by
# get_feature_names(), which differ in both length and order.
perm_importance_results = {}

for model_name, pipeline in best_models.items():
    print(f"\nCalculating permutation importance for {model_name}...")

    # One label per permuted (raw) input column, in X_val's column order.
    feature_names = list(X_val.columns)

    # Calculate permutation importance on validation set
    perm_importance = permutation_importance(
        pipeline, X_val, y_val,
        n_repeats=10,
        random_state=RANDOM_SEED,
        scoring='roc_auc',
        n_jobs=-1
    )

    # Guard against labels and scores drifting apart; every downstream table
    # and plot relies on this positional alignment.
    if len(perm_importance.importances_mean) != len(feature_names):
        raise ValueError(
            f"{model_name}: got {len(perm_importance.importances_mean)} importances "
            f"for {len(feature_names)} input columns"
        )

    # Store results
    perm_importance_results[model_name] = {
        'importances_mean': perm_importance.importances_mean,
        'importances_std': perm_importance.importances_std,
        'feature_names': feature_names
    }

    print(f"Completed {model_name}")

print("\nPermutation importance analysis completed!")

## Table 4.9: Top 10 Features by Permutation Importance

Summary table showing the most important features across all models with AUC decrease values.

In [None]:
# Build Table 4.9 from the permutation importance results.
#
# Every row describes ONE feature: features are ranked once by their mean
# importance across the three models, and each model column then reports that
# model's own importance for the same feature. (Ranking each model separately
# while sharing a single 'Feature' column would pair a feature name with
# scores that belong to other features.)
top_n = 10
table_models = ['logistic_regression', 'random_forest', 'xgboost']
delta_auc_columns = {
    model_name: f'{model_name.replace("_", " ").title()}\nΔ AUC'
    for model_name in table_models
}

# Per-model lookup: feature name -> (mean importance, std).
# zip() stops at the shortest input, so mismatched lengths cannot raise here.
importance_lookup = {}
for model_name in table_models:
    results = perm_importance_results[model_name]
    importance_lookup[model_name] = {
        str(name): (mean, std)
        for name, mean, std in zip(results['feature_names'],
                                   results['importances_mean'],
                                   results['importances_std'])
    }

# Rank features by mean importance across models; a feature missing from a
# model is simply left out of that feature's average (NaN is skipped).
mean_importance = pd.DataFrame({
    model_name: {name: stats[0] for name, stats in lookup.items()}
    for model_name, lookup in importance_lookup.items()
})
ranked_features = (
    mean_importance.mean(axis=1)
    .sort_values(ascending=False, kind='stable')
    .index[:top_n]
)

table_data = []
for rank, feature in enumerate(ranked_features, start=1):
    # Clean feature name
    display_feature = feature.replace('cat_low_', '') if feature.startswith('cat_low_') else feature
    row = {'Rank': rank, 'Feature': display_feature}

    for model_name in table_models:
        stats = importance_lookup[model_name].get(feature)
        if stats is None:
            row[delta_auc_columns[model_name]] = "N/A"
        else:
            row[delta_auc_columns[model_name]] = f"{stats[0]:.3f} ± {stats[1]:.3f}"

    table_data.append(row)

# Create table with proper column ordering; passing `columns` keeps the layout
# stable even when there are no rows.
column_order = ['Rank', 'Feature', 'Logistic Regression\nΔ AUC', 'Random Forest\nΔ AUC', 'Xgboost\nΔ AUC']
table_4_9 = pd.DataFrame(table_data, columns=column_order)

print("\nTable 4.9: Top 10 Features by Permutation Importance (AUC Decrease)")
print("="*90)
print(table_4_9.to_string(index=False))

# Save table
table_4_9.to_csv('../data/processed/table_4_9.csv', index=False)
print("\nTable 4.9 saved to ../data/processed/table_4_9.csv")

## SHAP Analysis Preparation

Prepare data for SHAP analysis by transforming features and creating explainers for tree-based models.

In [None]:
# SHAP explains the fitted classifier, so it needs the *transformed* feature
# matrix rather than the raw frame. Only a sample is transformed to keep memory
# and runtime manageable.
sample_size = min(1000, len(X_test))
X_test_sample = X_test.sample(n=sample_size, random_state=RANDOM_SEED)
y_test_sample = y_test.loc[X_test_sample.index]

# Transform features for each model
transformed_data = {}
for model_name, pipeline in best_models.items():
    steps = pipeline.named_steps
    preprocessor = steps['preprocessor']

    # Transform the sample
    X_transformed = preprocessor.transform(X_test_sample)

    # The preprocessor may return a scipy sparse matrix (e.g. when one-hot
    # columns dominate). Later cells slice single columns for scatter plots,
    # which needs a dense array; at <= 1000 rows densifying is cheap.
    if hasattr(X_transformed, 'toarray'):
        X_transformed = X_transformed.toarray()
    X_transformed = np.asarray(X_transformed)

    # The final estimator is registered as 'classifier' (or 'model'),
    # regardless of whether a resampling step such as 'smote' is present.
    classifier = steps.get('classifier', steps.get('model'))
    if classifier is None:
        raise KeyError(
            f"{model_name}: pipeline has no 'classifier' or 'model' step "
            f"(found: {list(steps)})"
        )

    transformed_data[model_name] = {
        'X_transformed': X_transformed,
        'classifier': classifier,
        'feature_names': get_feature_names(pipeline, X_test_sample.columns.tolist())
    }

print(f"Transformed sample size: {sample_size}")
print("Data prepared for SHAP analysis!")

## SHAP Analysis for XGBoost Model

Detailed SHAP analysis for the XGBoost model, which typically provides the most accurate and interpretable SHAP values for tree-based models.

In [None]:
# XGBoost is the model examined in detail; pull its prepared inputs in one go.
model_name = 'xgboost'
xgb_bundle = transformed_data[model_name]
X_transformed, classifier, feature_names = (
    xgb_bundle['X_transformed'],
    xgb_bundle['classifier'],
    xgb_bundle['feature_names'],
)

print("Computing SHAP values for XGBoost model...")

# Build the tree explainer and evaluate it on the transformed sample.
explainer = shap.TreeExplainer(classifier)
shap_values = explainer.shap_values(X_transformed)

print("SHAP values computed successfully!")
# The two numbers below should agree: one SHAP column per feature name.
print(f"SHAP values shape: {shap_values.shape}")
print(f"Feature names count: {len(feature_names)}")

## Figure 4.9: SHAP Feature Importance for XGBoost Model

Bar plot showing the most important features based on mean absolute SHAP values, providing insights into which features have the greatest impact on model predictions.

In [None]:
# Test metrics and thresholds are only needed for the annotation box.
evaluation_results = joblib.load('../models/evaluation_results.pkl')
test_metrics = evaluation_results['test_metrics']
optimal_thresholds = evaluation_results['optimal_thresholds']

fig, ax = plt.subplots(figsize=(12, 8))

# Global importance = mean |SHAP| per feature; keep the 20 largest.
shap_importance = np.abs(shap_values).mean(axis=0)
sorted_idx = np.argsort(shap_importance)[::-1][:20]
top_features = [feature_names[i] for i in sorted_idx]
top_importance = shap_importance[sorted_idx]

# Hand-written labels for features whose plain title-casing reads poorly.
special_labels = {
    'amount_vs_mean_ratio': 'Amount vs Mean Ratio',
    'velocity_score': 'Velocity Score',
    'amount_sum_24h': 'Amount Sum (24H)',
    'tx_count_24h': 'Transaction Count (24H)',
    'amount_mean_7d': 'Amount Mean (7D)',
    'amount_std_7d': 'Amount Std (7D)',
    'composite_risk': 'Composite Risk Score',
    'merchant_risk_score': 'Merchant Risk Score',
    'online_channel_ratio': 'Online Channel Ratio',
}

display_names = []
for feat in top_features:
    # Encoder prefixes are stripped before title-casing.
    prefix = next((p for p in ('cat_low_', 'cat_high_') if feat.startswith(p)), None)
    if prefix is not None:
        label = feat.replace(prefix, '').replace('_', ' ').title()
    else:
        label = special_labels.get(feat, feat.replace('_', ' ').title())
    display_names.append(label)

# Blue gradient, one shade per bar.
colors = plt.cm.Blues(np.linspace(0.4, 0.9, len(top_importance)))

y_pos = np.arange(len(display_names))
bars = ax.barh(y_pos, top_importance, color=colors, edgecolor='navy', linewidth=0.5)

# Numeric label just past the end of each bar.
for bar, value in zip(bars, top_importance):
    ax.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
            f'{value:.4f}', va='center', fontsize=9, color='black')

ax.set_xlabel('Mean |SHAP value| (Average impact on model output)', fontsize=12, fontweight='bold')
ax.set_ylabel('Features', fontsize=12, fontweight='bold')
ax.set_title('Figure 4.9: SHAP Feature Importance for XGBoost Model\n(Nigerian Banking Fraud Detection)',
             fontsize=14, fontweight='bold', pad=20)

# Most important feature at the top.
ax.set_yticks(y_pos)
ax.set_yticklabels(display_names, fontsize=11)
ax.invert_yaxis()

ax.grid(True, axis='x', alpha=0.3, linestyle='--', linewidth=0.5)

# Leave headroom on the right for the value labels.
ax.set_xlim(0, max(top_importance) * 1.15)

# Model summary box in the upper-right corner (axes coordinates).
textstr = f'Model: XGBoost\nAUC-ROC: {test_metrics["xgboost"]["AUC"]["mean"]:.4f}\nOptimal Threshold: {optimal_thresholds["xgboost"]:.3f}'
props = dict(boxstyle='round', facecolor='lightblue', alpha=0.5)
ax.text(0.98, 0.97, textstr, transform=ax.transAxes, fontsize=10,
        verticalalignment='top', horizontalalignment='right', bbox=props)

plt.tight_layout()
plt.savefig('../docs/images/figure_4_9_shap_xgboost.png',
            dpi=300, bbox_inches='tight', facecolor='white')
plt.show()

## SHAP Analysis for Random Forest Model

Calculate SHAP values for Random Forest to compare feature importance patterns across different tree-based algorithms.

In [None]:
# Calculate SHAP values for Random Forest
print("Computing SHAP values for Random Forest model...")

# Get Random Forest components
rf_transformed = transformed_data['random_forest']['X_transformed']
rf_classifier = transformed_data['random_forest']['classifier']
rf_feature_names = transformed_data['random_forest']['feature_names']

# Create SHAP explainer for Random Forest
rf_explainer = shap.TreeExplainer(rf_classifier)

# Calculate SHAP values (this may take a while for Random Forest)
rf_shap_values = rf_explainer.shap_values(rf_transformed)

# Reduce to a 2-D (n_samples, n_features) array for the positive class.
# The return format depends on the installed SHAP version:
#   * older releases: a list with one 2-D array per class
#   * newer releases: a single 3-D array (n_samples, n_features, n_classes)
# Handling only the list form would leave both classes mixed together
# downstream, so both layouts are normalised here.
if isinstance(rf_shap_values, list):
    rf_shap_values = rf_shap_values[1]
rf_shap_values = np.asarray(rf_shap_values)
if rf_shap_values.ndim == 3:
    positive_class = 1 if rf_shap_values.shape[2] > 1 else 0
    rf_shap_values = rf_shap_values[:, :, positive_class]

print("SHAP values computed successfully for Random Forest!")
print(f"RF SHAP values shape: {rf_shap_values.shape}")

## Figure 4.9a: SHAP Feature Importance for Random Forest Model

Comparison plot showing Random Forest SHAP feature importance with a green color theme.

In [None]:
plt.figure(figsize=(12, 8))

# Work on a 2-D (n_samples, n_features) view of the positive-class SHAP values.
# Depending on the SHAP version the raw output can be a per-class list or a
# 3-D array (n_samples, n_features, n_classes). Flattening such an array would
# interleave class-0 and class-1 importances and attach them to the wrong
# feature names, so the positive class is selected explicitly instead.
rf_values_2d = rf_shap_values[1] if isinstance(rf_shap_values, list) else rf_shap_values
rf_values_2d = np.asarray(rf_values_2d)
if rf_values_2d.ndim == 3:
    positive_class = 1 if rf_values_2d.shape[2] > 1 else 0
    rf_values_2d = rf_values_2d[:, :, positive_class]

# Calculate mean absolute SHAP values for Random Forest
rf_shap_importance = np.abs(rf_values_2d).mean(axis=0)

print(f"rf_shap_importance shape: {rf_shap_importance.shape}")
print(f"rf_feature_names length: {len(rf_feature_names)}")

# Names and importances must line up one-to-one. If they do not, the plot is
# still produced on the common prefix, but the mismatch is reported loudly
# because the labels can no longer be trusted.
if len(rf_feature_names) != len(rf_shap_importance):
    print("WARNING: feature-name count does not match SHAP column count; "
          "truncating to the shorter length -- labels may be misaligned.")
    min_length = min(len(rf_feature_names), len(rf_shap_importance))
    rf_feature_names = rf_feature_names[:min_length]
    rf_shap_importance = rf_shap_importance[:min_length]

# Sort indices (descending importance) and keep the top 20
rf_sorted_idx = np.argsort(rf_shap_importance)[::-1]
top_n = min(20, len(rf_feature_names))
rf_top_idx = rf_sorted_idx[:top_n]
rf_top_features = [rf_feature_names[idx] for idx in rf_top_idx]
rf_top_importance = [rf_shap_importance[idx] for idx in rf_top_idx]

# Clean feature names for display (same rules as the XGBoost figure):
# strip encoder prefixes, use hand-written labels where title-casing reads
# poorly, otherwise title-case the snake_case name.
rf_special_labels = {
    'amount_vs_mean_ratio': 'Amount vs Mean Ratio',
    'velocity_score': 'Velocity Score',
    'amount_sum_24h': 'Amount Sum (24H)',
    'tx_count_24h': 'Transaction Count (24H)',
    'amount_mean_7d': 'Amount Mean (7D)',
    'amount_std_7d': 'Amount Std (7D)',
    'composite_risk': 'Composite Risk Score',
    'merchant_risk_score': 'Merchant Risk Score',
    'online_channel_ratio': 'Online Channel Ratio',
}
rf_display_names = []
for feat in rf_top_features:
    if feat.startswith('cat_low_'):
        clean_name = feat.replace('cat_low_', '').replace('_', ' ').title()
    elif feat.startswith('cat_high_'):
        clean_name = feat.replace('cat_high_', '').replace('_', ' ').title()
    else:
        clean_name = rf_special_labels.get(feat, feat.replace('_', ' ').title())
    rf_display_names.append(clean_name)

# Create color gradient - use green theme for Random Forest
colors = plt.cm.Greens(np.linspace(0.4, 0.9, len(rf_top_importance)))

# Create horizontal bar plot
y_pos = np.arange(len(rf_display_names))
bars = plt.barh(y_pos, rf_top_importance, color=colors, edgecolor='darkgreen', linewidth=0.5)

# Add value labels on bars
for bar, value in zip(bars, rf_top_importance):
    plt.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
             f'{value:.4f}', va='center', fontsize=9, color='black')

# Formatting
plt.xlabel('Mean |SHAP value| (Average impact on model output)', fontsize=12, fontweight='bold')
plt.ylabel('Features', fontsize=12, fontweight='bold')
plt.title('Figure 4.9a: SHAP Feature Importance for Random Forest Model\n(Nigerian Banking Fraud Detection)',
          fontsize=14, fontweight='bold', pad=20)

# Set y-axis (most important feature on top)
plt.yticks(y_pos, rf_display_names, fontsize=11)
plt.gca().invert_yaxis()

# Add grid
plt.grid(True, axis='x', alpha=0.3, linestyle='--', linewidth=0.5)

# Set x-axis limits, leaving room for the value labels
plt.xlim(0, max(rf_top_importance) * 1.15)

# Add a text box with model info
textstr = f'Model: Random Forest\nAUC-ROC: {test_metrics["random_forest"]["AUC"]["mean"]:.4f}'
props = dict(boxstyle='round', facecolor='lightgreen', alpha=0.5)
plt.text(0.98, 0.97, textstr, transform=plt.gca().transAxes, fontsize=10,
         verticalalignment='top', horizontalalignment='right', bbox=props)

plt.tight_layout()
plt.savefig('../docs/images/figure_4_9a_shap_random_forest.png',
            dpi=300, bbox_inches='tight', facecolor='white')
plt.show()

## Comparative SHAP Analysis: XGBoost vs Random Forest

Side-by-side comparison of feature importance rankings between XGBoost and Random Forest models.

In [None]:
# Create side-by-side comparison of top features
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

# Compare up to 15 features, but never more than either model actually has:
# a fixed range of 15 would make barh() fail with a shape mismatch whenever
# fewer features are available.
n_compare = min(15, len(top_importance), len(rf_top_importance))

# XGBoost plot
y_pos_xgb = np.arange(n_compare)
ax1.barh(y_pos_xgb, top_importance[:n_compare], color='#45B7D1', alpha=0.8, edgecolor='navy', linewidth=0.5)
ax1.set_yticks(y_pos_xgb)
ax1.set_yticklabels(display_names[:n_compare], fontsize=10)
ax1.invert_yaxis()
ax1.set_xlabel('Mean |SHAP value|', fontsize=11, fontweight='bold')
ax1.set_title('XGBoost Model', fontsize=13, fontweight='bold')
ax1.grid(True, axis='x', alpha=0.3, linestyle='--')

# Random Forest plot
y_pos_rf = np.arange(n_compare)
ax2.barh(y_pos_rf, rf_top_importance[:n_compare], color='#4ECDC4', alpha=0.8, edgecolor='darkgreen', linewidth=0.5)
ax2.set_yticks(y_pos_rf)
ax2.set_yticklabels(rf_display_names[:n_compare], fontsize=10)
ax2.invert_yaxis()
ax2.set_xlabel('Mean |SHAP value|', fontsize=11, fontweight='bold')
ax2.set_title('Random Forest Model', fontsize=13, fontweight='bold')
ax2.grid(True, axis='x', alpha=0.3, linestyle='--')

# Overall title reports the number of features actually drawn
fig.suptitle(f'SHAP Feature Importance Comparison: XGBoost vs Random Forest\n(Top {n_compare} Features)',
             fontsize=15, fontweight='bold')

plt.tight_layout()
plt.savefig('../docs/images/figure_shap_comparison.png',
            dpi=300, bbox_inches='tight', facecolor='white')
plt.show()

# Print feature importance comparison (rank-by-rank, "N/A" past the end)
print("\nTop 5 Features Comparison:")
print("="*60)
print(f"{'Rank':<6} {'XGBoost':<30} {'Random Forest':<30}")
print("-"*60)
for i in range(5):
    xgb_feat = display_names[i] if i < len(display_names) else "N/A"
    rf_feat = rf_display_names[i] if i < len(rf_display_names) else "N/A"
    print(f"{i+1:<6} {xgb_feat:<30} {rf_feat:<30}")

## SHAP Summary Plot

Detailed SHAP summary plot showing the relationship between feature values and their impact on model output.

In [None]:
# SHAP summary plot for the XGBoost model: one row per feature (up to 20).
plt.figure(figsize=(12, 10))

# show=False hands control back so the title and label can be set before saving.
summary_options = dict(feature_names=feature_names, max_display=20, show=False)
shap.summary_plot(shap_values, X_transformed, **summary_options)

plt.title('SHAP Summary Plot: Feature Impact on Fraud Detection', fontsize=14)
plt.xlabel('SHAP value (impact on model output)', fontsize=12)
plt.tight_layout()
plt.savefig('../docs/images/shap_summary_detailed.png', dpi=300, bbox_inches='tight')
plt.show()

## Channel-Specific SHAP Analysis

Analysis of how feature importance varies across different banking channels (ATM, Mobile, POS, Web).

In [None]:
# Analyze SHAP values by channel.
# NOTE(review): rows of shap_values are assumed to be in the same order as the
# rows of X_test_sample (i.e. the preprocessor keeps row order) -- confirm.
channel_column = X_test_sample['channel']
channels = channel_column.unique()
channel_shap_analysis = {}

for channel in channels:
    # Positional row indices for this channel
    channel_indices = np.flatnonzero((channel_column == channel).to_numpy())

    if len(channel_indices) > 0:
        # Mean absolute SHAP value per feature within this channel
        channel_shap_analysis[channel] = np.abs(shap_values[channel_indices]).mean(axis=0)

# Create comparison plot
fig, ax = plt.subplots(figsize=(12, 6))

# Select top features (overall) for comparison
top_n_features = 10
overall_importance = np.abs(shap_values).mean(axis=0)
top_indices = np.argsort(overall_importance)[::-1][:top_n_features]

# One bar group per selected feature. Sized from top_indices so that models
# with fewer than top_n_features features still plot correctly.
x = np.arange(len(top_indices))

# Layout adapts to the number of channels: bars never spill into the next
# group, and no channel is silently dropped for lack of a colour.
plotted_channels = [channel for channel in sorted(channels) if channel in channel_shap_analysis]
n_channels = len(plotted_channels)
width = min(0.25, 1.0 / max(n_channels, 1))

base_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
if n_channels <= len(base_colors):
    colors = base_colors
else:
    # More channels than hand-picked colours: fall back to a qualitative map.
    colors = [plt.cm.tab10(k % 10) for k in range(n_channels)]

# Plot bars for each channel
for i, channel in enumerate(plotted_channels):
    values = [channel_shap_analysis[channel][idx] for idx in top_indices]
    ax.bar(x + i*width, values, width, label=channel, color=colors[i], alpha=0.8)

# Customize plot
feature_labels = [feature_names[idx].replace('_', ' ').title() for idx in top_indices]
ax.set_xlabel('Features', fontsize=12)
ax.set_ylabel('Mean |SHAP value|', fontsize=12)
ax.set_title('Channel-Specific Feature Importance', fontsize=14)
# Centre each tick under its bar group (equals x + 1.5*width for 4 channels).
ax.set_xticks(x + width * max(n_channels - 1, 0) / 2)
ax.set_xticklabels(feature_labels, rotation=45, ha='right')
ax.legend()
ax.grid(True, axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../docs/images/channel_specific_shap.png', dpi=300, bbox_inches='tight')
plt.show()

print("Channel-specific SHAP analysis completed!")

## Feature Value vs SHAP Value Relationships

Scatter plots showing how feature values relate to their SHAP values for the top 6 most important features.

In [None]:
# Dependence-style scatter plots: transformed feature value (x) against its
# SHAP value (y) for the six features with the largest mean |SHAP|.
mean_abs_shap = np.abs(shap_values).mean(axis=0)
top_features_idx = np.argsort(mean_abs_shap)[::-1][:6]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for ax, feature_idx in zip(axes, top_features_idx):
    feature_label = feature_names[feature_idx]
    feature_values = X_transformed[:, feature_idx]
    feature_shap = shap_values[:, feature_idx]

    # Points are coloured by their own SHAP value.
    scatter = ax.scatter(feature_values, feature_shap,
                         c=feature_shap, cmap='coolwarm',
                         alpha=0.6, s=30)

    ax.set_xlabel(f'{feature_label}', fontsize=10)
    ax.set_ylabel('SHAP value', fontsize=10)
    ax.set_title(f'SHAP vs {feature_label}', fontsize=11)
    ax.grid(True, alpha=0.3)

    # Per-panel colour scale
    cbar = plt.colorbar(scatter, ax=ax)
    cbar.set_label('SHAP value', fontsize=8)

plt.suptitle('Feature Value vs SHAP Value Relationships', fontsize=14)
plt.tight_layout()
plt.savefig('../docs/images/shap_feature_relationships.png', dpi=300, bbox_inches='tight')
plt.show()

## Temporal Feature Importance Analysis

Analysis of how temporal features (time-based features) contribute to fraud detection across different models.

In [None]:
# Analyze temporal features' importance
temporal_features = ['hour', 'day_of_week', 'month', 'hour_sin', 'hour_cos',
                    'day_sin', 'day_cos', 'month_sin', 'month_cos', 'is_weekend', 'is_peak_hour']

# Average the permutation importance of the temporal features for each model.
#
# Features are matched BY NAME against each model's own permutation-importance
# labels. Reusing positional indices derived from another feature list (e.g.
# the XGBoost SHAP feature names) would pick unrelated entries, because the
# permutation arrays follow their own 'feature_names' ordering.
# Matching is by substring, so encoded variants of a temporal column
# (e.g. one-hot expansions) are included as well.
temporal_importance = {}
for model_name in best_models.keys():
    results = perm_importance_results[model_name]

    # zip() pairs each label with its score and stops at the shorter sequence.
    temp_imp = [
        importance
        for name, importance in zip(results['feature_names'], results['importances_mean'])
        if any(temp in str(name) for temp in temporal_features)
    ]

    temporal_importance[model_name] = np.mean(temp_imp) if temp_imp else 0

# Create comparison plot
plt.figure(figsize=(10, 6))

models = list(temporal_importance.keys())
values = list(temporal_importance.values())
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

bars = plt.bar(range(len(models)), values, color=colors, alpha=0.8)

# Add value labels on bars
for bar, value in zip(bars, values):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.0001,
             f'{value:.4f}', ha='center', va='bottom')

plt.xlabel('Model', fontsize=12)
plt.ylabel('Average Importance of Temporal Features', fontsize=12)
plt.title('Temporal Feature Importance Across Models', fontsize=14)
plt.xticks(range(len(models)), [m.replace('_', ' ').title() for m in models])
plt.grid(True, axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../docs/images/temporal_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nTemporal Feature Importance Summary:")
print("="*50)
for model_name, importance in temporal_importance.items():
    print(f"{model_name.replace('_', ' ').title()}: {importance:.4f}")

## Save Feature Importance Results

Compile and save all feature importance analysis results for subsequent use.

In [None]:
# Gather every artefact of the analysis into one dictionary, section by section.
shap_section = {
    'xgboost': {
        'values': shap_values,
        'expected_value': explainer.expected_value,
        'feature_names': feature_names
    },
    'random_forest': {
        'values': rf_shap_values,
        'expected_value': rf_explainer.expected_value,
        'feature_names': rf_feature_names
    }
}

top_features_section = {
    'xgboost_top_features': top_features,
    'xgboost_importance': top_importance,
    'rf_top_features': rf_top_features,
    'rf_importance': rf_top_importance
}

feature_importance_results = {
    'permutation_importance': perm_importance_results,
    'shap_values': shap_section,
    'channel_analysis': channel_shap_analysis,
    'temporal_importance': temporal_importance,
    'tables': {
        'table_4_9': table_4_9
    },
    'top_features_analysis': top_features_section
}

# Persist the bundle for later notebooks.
joblib.dump(feature_importance_results, '../models/feature_importance_results.pkl')

print("Feature importance analysis completed and saved!")

# Headline numbers derived from the objects computed above.
top_table_feature = table_4_9.iloc[0]['Feature']
mean_temporal_importance = np.mean(list(temporal_importance.values()))
n_positive_features = np.sum(shap_importance > 0)
n_shared_top5 = len(set(display_names[:5]) & set(rf_display_names[:5]))

print("\nKey findings:")
print(f"1. Top feature across all models: {top_table_feature}")
print(f"2. Average temporal feature importance: {mean_temporal_importance:.4f}")
print(f"3. Number of features with positive importance: {n_positive_features}")
print(f"4. XGBoost vs Random Forest top feature agreement: {n_shared_top5} out of 5")

# Every file written by this notebook, in creation-agnostic listing order.
created_files = [
    '../models/feature_importance_results.pkl',
    '../data/processed/table_4_9.csv',
    '../docs/images/figure_4_9_shap_xgboost.png',
    '../docs/images/figure_4_9a_shap_random_forest.png',
    '../docs/images/figure_shap_comparison.png',
    '../docs/images/shap_summary_detailed.png',
    '../docs/images/channel_specific_shap.png',
    '../docs/images/shap_feature_relationships.png',
    '../docs/images/temporal_feature_importance.png',
]
print("\nFiles created:")
for created_path in created_files:
    print(f"- {created_path}")

print("\nReady for cost-sensitive analysis in next notebook!")