# Circular Bias Detection: Visualization & Model Card

This notebook demonstrates:
1. Visualizing permutation test distributions
2. Analyzing p-value confidence intervals
3. Generating model audit cards
4. Performance optimization strategies
5. Advanced options: the retrain null method and probability-based metrics

In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier

from cbd.api import detect_bias

# Notebook-wide plotting defaults: seaborn's white grid theme and a wide default figure
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Generate Test Data

In [None]:
# Synthetic classification data; the fixed random_state keeps every re-run identical
dataset_spec = dict(
    n_samples=500,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    random_state=42,
)
X, y = make_classification(**dataset_spec)

# Quick sanity check of what was generated
print(f"Dataset shape: {X.shape}")
print(f"Class distribution: {np.bincount(y)}")

## 2. Train Models

In [None]:
# Regularized linear model: stands in for the "well-generalized" case
model_good = LogisticRegression(random_state=42, max_iter=1000, C=1.0).fit(X, y)

# Unbounded-depth tree: high capacity, stands in for the "overfitted" case
model_overfit = DecisionTreeClassifier(random_state=42, max_depth=None).fit(X, y)

# Accuracy is measured on the same data the models were fitted on
for label, fitted in (("Good", model_good), ("Overfit", model_overfit)):
    train_accuracy = accuracy_score(y, fitted.predict(X))
    print(f"{label} model accuracy: {train_accuracy:.3f}")

## 3. Run Bias Detection with Full Results

In [None]:
# Both audits share the exact same settings so their results are directly comparable
detect_options = dict(
    metric=accuracy_score,
    n_permutations=1000,
    random_state=42,
    return_permutations=True,  # keep the permuted metrics for the plots below
    n_jobs=-1,  # Use all CPUs
    backend='threads',
)

result_good = detect_bias(model_good, X, y, **detect_options)
result_overfit = detect_bias(model_overfit, X, y, **detect_options)

# Report each audit in the same layout
for heading, audit in (("Good Model", result_good), ("Overfit Model", result_overfit)):
    print(f"\n=== {heading} ===")
    print(f"Observed metric: {audit['observed_metric']:.4f}")
    print(f"P-value: {audit['p_value']:.4f}")
    # The confidence interval is only printed when the result provides one
    if 'p_value_ci' in audit:
        ci = audit['p_value_ci']
        print(f"P-value 95% CI: [{ci[0]:.4f}, {ci[1]:.4f}]")
    print(f"Conclusion: {audit['conclusion']}")

## 4. Visualize Permutation Distributions

In [None]:
def plot_permutation_distribution(result, title, ax=None):
    """Draw the histogram of permuted metrics and mark the observed metric on it.

    Parameters
    ----------
    result : mapping
        Must provide 'permuted_metrics', 'observed_metric' and 'p_value'.
    title : str
        First line of the axes title; the p-value is appended on a second line.
    ax : matplotlib axes, optional
        Axes to draw on. A new 10x6 figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        # Only the axes are needed; the figure stays reachable through pyplot
        ax = plt.subplots(figsize=(10, 6))[1]

    null_scores = np.asarray(result['permuted_metrics'])
    observed_score = result['observed_metric']
    critical_value = np.percentile(null_scores, 95)

    # Null distribution
    ax.hist(null_scores, bins=50, alpha=0.7, color='skyblue', edgecolor='black')

    # Reference lines: the observed metric, then the 95th percentile of the permuted metrics
    observed_label = f'Observed: {observed_score:.4f}'
    ax.axvline(observed_score, color='red', linestyle='--', linewidth=2, label=observed_label)
    percentile_label = f'95th percentile: {critical_value:.4f}'
    ax.axvline(critical_value, color='orange', linestyle=':', linewidth=2, label=percentile_label)

    # Labels, title and legend
    ax.set_xlabel('Metric Value', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    heading = f"{title}\nP-value: {result['p_value']:.4f}"
    ax.set_title(heading, fontsize=14, fontweight='bold')
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

    return ax

# One panel per model, side by side on a shared figure
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

panels = (
    (result_good, 'Well-Generalized Model'),
    (result_overfit, 'Overfitted Model'),
)
for panel_ax, (audit, heading) in zip(axes, panels):
    plot_permutation_distribution(audit, heading, panel_ax)

# Tighten the layout, save the figure to disk, then render it inline
fig.tight_layout()
fig.savefig('permutation_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Statistical Summary

In [None]:
def compute_distribution_stats(result):
    """Summarize the permutation distribution alongside the observed metric.

    Parameters
    ----------
    result : mapping
        Must provide 'permuted_metrics' (a non-empty sequence of numbers),
        'observed_metric' and 'p_value'.

    Returns
    -------
    dict
        Keys, in display order: 'Mean', 'Std', 'Median', 'Min', 'Max',
        '95th Percentile', 'Observed', 'Z-score', 'P-value'.
        'Z-score' is NaN when the permuted metrics have zero spread, because
        the standardized distance is undefined in that case.

    Raises
    ------
    ValueError
        If 'permuted_metrics' is empty.
    """
    permuted = np.asarray(result['permuted_metrics'])
    if permuted.size == 0:
        # Fail with a clear message instead of an opaque numpy reduction error
        raise ValueError("result['permuted_metrics'] is empty; nothing to summarize")

    observed = result['observed_metric']

    # Computed once and reused for both the summary entries and the z-score
    null_mean = np.mean(permuted)
    null_std = np.std(permuted)

    # Guard the division: a degenerate (constant) null distribution has std == 0,
    # which would otherwise emit a RuntimeWarning and yield inf/nan.
    if null_std > 0:
        z_score = (observed - null_mean) / null_std
    else:
        z_score = float('nan')

    return {
        'Mean': null_mean,
        'Std': null_std,
        'Median': np.median(permuted),
        'Min': np.min(permuted),
        'Max': np.max(permuted),
        '95th Percentile': np.percentile(permuted, 95),
        'Observed': observed,
        'Z-score': z_score,
        'P-value': result['p_value'],
    }

# Same tabular layout for both models: left-aligned name, value to four decimals
for heading, audit in (("Good Model", result_good), ("Overfit Model", result_overfit)):
    print(f"\n=== {heading} Statistics ===")
    for stat_name, stat_value in compute_distribution_stats(audit).items():
        print(f"{stat_name:20s}: {stat_value:.4f}")

## 6. Model Card Generation

In [None]:
def generate_model_card(model, result, model_name, dataset_info, alpha=0.05):
    """Generate a Markdown model audit card with bias detection results.

    Parameters
    ----------
    model : object
        The audited model; only its class name is reported.
    result : mapping
        Bias detection result. Must provide 'observed_metric', 'p_value',
        'n_permutations', 'null_method', 'backend', 'conclusion', 'n_samples',
        'subsampled' and 'n_jobs'. 'p_value_ci' (a pair of numbers) is optional;
        when present, 'confidence_level' (a fraction such as 0.95) is used to
        label it if available.
    model_name : str
        Title of the card.
    dataset_info : mapping
        Must provide 'n_samples', 'n_features', 'task' and 'metric_name'.
    alpha : float, optional
        Significance threshold: a p-value less than or equal to ``alpha``
        produces the WARNING interpretation, otherwise PASSED. Defaults to 0.05.

    Returns
    -------
    str
        The card as Markdown text.
    """
    # NOTE(review): the "Training Date" field is filled with today's date, i.e. the
    # date the card is generated, not necessarily when the model was trained.
    card = f"""
# Model Card: {model_name}

## Model Details
- **Model Type**: {type(model).__name__}
- **Training Date**: {np.datetime64('today')}
- **Framework**: scikit-learn

## Dataset Information
- **Number of Samples**: {dataset_info['n_samples']}
- **Number of Features**: {dataset_info['n_features']}
- **Task**: {dataset_info['task']}

## Performance Metrics
- **Observed Metric**: {result['observed_metric']:.4f}
- **Metric Type**: {dataset_info['metric_name']}

## Circular Bias Detection Results
- **P-value**: {result['p_value']:.4f}
"""

    if 'p_value_ci' in result:
        ci = result['p_value_ci']
        # The level is optional: without it the interval is reported unlabeled
        # rather than failing with a KeyError or guessing a percentage.
        level = result['confidence_level'] if 'confidence_level' in result else None
        if level is None:
            ci_label = "P-value CI"
        else:
            # ':g' rounds instead of truncating: int(0.58 * 100) is 57 because of
            # floating-point error, and int() would also turn 99.9 into 99.
            ci_label = f"P-value {level * 100:g}% CI"
        card += f"- **{ci_label}**: [{ci[0]:.4f}, {ci[1]:.4f}]\n"

    card += f"""
- **Number of Permutations**: {result['n_permutations']}
- **Null Method**: {result['null_method']}
- **Parallel Backend**: {result['backend']}
- **Conclusion**: {result['conclusion']}

## Interpretation
"""

    if result['p_value'] <= alpha:
        card += """
⚠️ **WARNING**: This model shows signs of potential circular bias.
The observed performance is suspiciously high compared to the null distribution.

**Recommendations**:
1. Verify data leakage between training and evaluation
2. Check for feature engineering using target information
3. Ensure proper train/test split
4. Consider using cross-validation
5. Review feature selection process
"""
    else:
        card += """
✅ **PASSED**: No strong evidence of circular bias detected.
The model's performance is consistent with the null distribution.

**Note**: This does not guarantee absence of all forms of bias.
Continue to monitor model performance and fairness metrics.
"""

    card += f"""

## Computational Details
- **Samples Used**: {result['n_samples']}
- **Subsampled**: {result['subsampled']}
- **Parallel Workers**: {result['n_jobs']}

## Limitations
- This test detects circular bias through permutation testing
- Results are specific to the provided dataset
- Does not detect all forms of data leakage
- Should be combined with other validation methods

## Contact
For questions about this model card, please contact the model owner.

---
*Generated using Circular Bias Detector (CBD)*
"""

    return card

# Dataset facts that appear on every card
dataset_info = {
    'n_samples': X.shape[0],
    'n_features': X.shape[1],
    'task': 'Binary Classification',
    'metric_name': 'Accuracy'
}

# Generate cards
card_good = generate_model_card(model_good, result_good, "Logistic Regression", dataset_info)
card_overfit = generate_model_card(model_overfit, result_overfit, "Decision Tree", dataset_info)

# Save cards.
# The card text contains non-ASCII characters (emoji), so the encoding is pinned to
# UTF-8; the platform default encoding can raise UnicodeEncodeError on such text.
with open('model_card_logistic.md', 'w', encoding='utf-8') as f:
    f.write(card_good)

with open('model_card_tree.md', 'w', encoding='utf-8') as f:
    f.write(card_overfit)

print("Model cards saved!")
print("\n" + "="*60)
print(card_good)

## 7. Performance Optimization Demo

In [None]:
# Create larger dataset
X_large, y_large = make_classification(
    n_samples=5000,
    n_features=50,
    n_informative=25,
    random_state=42
)

model_large = LogisticRegression(random_state=42, max_iter=1000)
model_large.fit(X_large, y_large)


def _timed_detect_bias(**options):
    """Run detect_bias on the large dataset and return (result, elapsed_seconds).

    Every run shares the same model, data, metric, permutation count and seed;
    ``options`` carries only the settings being compared (n_jobs, backend,
    subsample_size).
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; wall-clock time.time() can jump if the system clock is adjusted.
    started = time.perf_counter()
    outcome = detect_bias(
        model_large, X_large, y_large,
        metric=accuracy_score,
        n_permutations=100,
        random_state=42,
        **options,
    )
    return outcome, time.perf_counter() - started


print("Performance Comparison:\n")

# Sequential
result_seq, time_seq = _timed_detect_bias(n_jobs=1)
print(f"Sequential (n_jobs=1): {time_seq:.2f}s")

# Parallel threads
result_par, time_par = _timed_detect_bias(n_jobs=-1, backend='threads')
print(f"Parallel threads (n_jobs=-1): {time_par:.2f}s")
print(f"Speedup: {time_seq/time_par:.2f}x")

# With subsampling
result_sub, time_sub = _timed_detect_bias(
    n_jobs=-1,
    backend='threads',
    subsample_size=1000,  # Use only 1000 samples
)
print(f"\nWith subsampling (n=1000): {time_sub:.2f}s")
print(f"Speedup vs sequential: {time_seq/time_sub:.2f}x")
print(f"P-value (full): {result_par['p_value']:.4f}")
print(f"P-value (subsampled): {result_sub['p_value']:.4f}")

## 8. Advanced: Retrain Null Method

In [None]:
# Conservative variant: the model is retrained on each permutation
print("Running conservative retrain test (this may take a while)...\n")

retrain_options = dict(
    metric=accuracy_score,
    n_permutations=50,  # Fewer permutations due to computational cost
    null_method='retrain',
    n_jobs=-1,
    backend='threads',
    random_state=42,
    return_permutations=True,
)
result_retrain = detect_bias(model_good, X, y, **retrain_options)

# Compare against the earlier audit of the same model
print(f"Permute method p-value: {result_good['p_value']:.4f}")
print(f"Retrain method p-value: {result_retrain['p_value']:.4f}")
print("\nRetrain is more conservative and computationally expensive.")
print("Use it when you need the most rigorous test.")

## 9. Probability-Based Metrics (AUC, Log Loss)

In [None]:
# Test with AUC metric
def auc_metric(y_true, y_proba):
    """Compute ROC AUC from either a score vector or a 2-D probability array.

    When ``y_proba`` is 2-D, only column 1 is scored (presumably the
    positive-class probability from ``predict_proba`` -- confirm against the
    caller); any other input is passed through unchanged.
    """
    scores = y_proba[:, 1] if y_proba.ndim == 2 else y_proba
    return roc_auc_score(y_true, scores)

auc_options = dict(
    metric=auc_metric,
    n_permutations=500,
    allow_proba=True,  # Use predict_proba
    n_jobs=-1,
    random_state=42,
    return_permutations=True,
)
result_auc = detect_bias(model_good, X, y, **auc_options)

print("AUC-based Detection:")
print(f"Observed AUC: {result_auc['observed_metric']:.4f}")
print(f"P-value: {result_auc['p_value']:.4f}")
print(f"Conclusion: {result_auc['conclusion']}")

# Visualize the AUC null distribution and save the figure to disk
fig, ax = plt.subplots(figsize=(10, 6))
plot_permutation_distribution(result_auc, 'AUC Metric with Probabilities', ax)
fig.savefig('auc_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## Summary

This notebook demonstrated:

1. ✅ **Visualization**: Histogram plots showing permutation distributions
2. ✅ **Statistical Analysis**: Computing distribution statistics and z-scores
3. ✅ **Model Cards**: Automated generation of audit documentation
4. ✅ **Performance**: Parallel execution and subsampling strategies
5. ✅ **Advanced Methods**: Retrain null method and probability metrics

### Best Practices

- Use `n_permutations >= 1000` for reliable p-values
- Enable parallelization (`n_jobs=-1`) for large datasets
- Use `subsample_size` for datasets > 10,000 samples
- Consider `null_method='retrain'` for conservative testing
- Set `allow_proba=True` for AUC, log loss, and other probability metrics
- Always visualize the permutation distribution
- Generate model cards for audit trails