# Threshold Sweeping and Evaluation

This notebook demonstrates how to:
1. Sweep a range of threshold values to find the one that maximizes the F1 score
2. Evaluate detection performance using precision, recall, and F1 scores
3. Use ground truth labels to assess detector performance
4. Visualize threshold selection trade-offs

The `sweep_thresholds` function is particularly useful for:
- Finding optimal threshold values
- Understanding precision-recall trade-offs
- Comparing different detectors


In [None]:
import numpy as np
import pandas as pd
from plotsmith import plot_timeseries
import matplotlib.pyplot as plt

from anomsmith import sweep_thresholds, detect_anomalies, ThresholdRule
from anomsmith.primitives.scorers.robust_zscore import RobustZScoreScorer
from anomsmith.primitives.scorers.statistical import ZScoreScorer, IQRScorer

# Seed NumPy's global random number generator so random draws are reproducible.
np.random.seed(42)


## Creating Data with Ground Truth Labels

For evaluation, we need data with known anomalies (ground truth labels).


In [None]:
def create_labeled_data(n: int = 200, contamination: float = 0.1, seed: int = 42):
    """Create a daily time series with injected anomalies and ground truth labels.

    The base signal is a linear trend from 0 to 2 plus Gaussian noise scaled
    by 0.5. ``int(n * contamination)`` distinct positions are then shifted up
    or down by a magnitude drawn uniformly from [4, 8).

    Args:
        n: Number of points in the series.
        contamination: Fraction of points to turn into anomalies, in [0, 1].
        seed: Seed for the random draws; the same seed yields the same data.

    Returns:
        A tuple ``(y_series, labels, anomaly_indices)``:
        ``y_series`` is the series of values on a daily index starting
        2020-01-01, ``labels`` is a float series on the same index holding 1
        at anomalous positions and 0 elsewhere, and ``anomaly_indices`` is the
        array of integer positions that were perturbed.

    Raises:
        ValueError: If ``contamination`` is outside [0, 1].
    """
    if not 0.0 <= contamination <= 1.0:
        raise ValueError(f"contamination must be in [0, 1], got {contamination}")

    # A private legacy RandomState yields exactly the draws that
    # np.random.seed(seed) followed by np.random.* calls would produce, but
    # without resetting the global RNG state for the rest of the notebook.
    rng = np.random.RandomState(seed)

    # Base series
    trend = np.linspace(0, 2, n)
    noise = rng.randn(n) * 0.5
    y = trend + noise

    # Inject anomalies (draw order: positions, then signs, then magnitudes)
    n_anomalies = int(n * contamination)
    anomaly_indices = rng.choice(n, n_anomalies, replace=False)
    signs = rng.choice([-1, 1], n_anomalies)
    magnitudes = rng.uniform(4, 8, n_anomalies)
    y[anomaly_indices] += signs * magnitudes

    # Build the index once and share it between the values and the labels
    index = pd.date_range("2020-01-01", periods=n, freq="D")
    y_series = pd.Series(y, index=index)

    # Create ground truth labels
    labels = pd.Series(np.zeros(n), index=index)
    labels.iloc[anomaly_indices] = 1

    return y_series, labels, anomaly_indices

# Generate the demo series together with its ground truth labels
y, labels, true_anomaly_indices = create_labeled_data(n=200, contamination=0.1)
print(f"Created time series with {len(y)} points")
# labels holds floats, so cast the count to int to avoid printing e.g. "20.0"
print(f"True anomalies: {int(labels.sum())}")
print(f"Anomaly rate: {labels.mean():.2%}")


In [None]:
# Plot the series and overlay the injected (ground truth) anomalies
anomaly_dates = y.index[true_anomaly_indices]
anomaly_values = y.values[true_anomaly_indices]
anomaly_legend = f'True Anomalies ({len(true_anomaly_indices)})'

fig, ax = plt.subplots(figsize=(14, 6))

# Underlying series as a semi-transparent blue line
ax.plot(y.index, y.values, 'b-', linewidth=1.5, alpha=0.7, label='Time Series')

# Known anomalies as red crosses drawn above the line
ax.scatter(
    anomaly_dates,
    anomaly_values,
    color='red',
    s=100,
    marker='x',
    linewidths=2,
    label=anomaly_legend,
    zorder=5,
)

ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Value', fontsize=12)
ax.set_title('Time Series with Ground Truth Labels', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## Sweeping Thresholds

Let's sweep a range of threshold values and evaluate performance.


In [None]:
# Build the robust z-score scorer and fit it on the raw values
scorer = RobustZScoreScorer(epsilon=1e-8)
scorer.fit(y.values)

# 50 evenly spaced candidate thresholds from 0.5 to 0.99
# (described as quantiles in the original note - confirm against sweep_thresholds)
threshold_values = np.linspace(start=0.5, stop=0.99, num=50)

# Evaluate every candidate threshold against the ground truth labels
sweep_results = sweep_thresholds(y, scorer, threshold_values, labels=labels)

# Show both ends of the sweep table
for heading, rows in (
    ("Threshold Sweep Results (first 10 rows):", sweep_results.head(10)),
    ("\nThreshold Sweep Results (last 10 rows):", sweep_results.tail(10)),
):
    print(heading)
    print(rows)


In [None]:
# Pick the sweep row with the highest F1 score and pull out its metrics
optimal_idx = sweep_results['f1'].idxmax()
optimal_threshold, optimal_f1, optimal_precision, optimal_recall = (
    sweep_results.loc[optimal_idx, column]
    for column in ('threshold', 'f1', 'precision', 'recall')
)

# Report the selected operating point, one metric per line
print(
    f"Optimal Threshold: {optimal_threshold:.4f}",
    f"Optimal F1 Score: {optimal_f1:.4f}",
    f"Optimal Precision: {optimal_precision:.4f}",
    f"Optimal Recall: {optimal_recall:.4f}",
    sep="\n",
)


In [None]:
# Four-panel view of the sweep: PR curve, F1, precision/recall, and all metrics
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
ax1, ax2, ax3, ax4 = axes.ravel()  # row-major: top-left, top-right, bottom-left, bottom-right

threshold_col = sweep_results['threshold']

# Top-left: precision against recall, with the max-F1 point starred
ax1.plot(sweep_results['recall'], sweep_results['precision'], 'b-', linewidth=2, alpha=0.7)
ax1.scatter(
    optimal_recall,
    optimal_precision,
    color='red',
    s=200,
    marker='*',
    zorder=5,
    label=f'Optimal (F1={optimal_f1:.3f})',
)

# Top-right: F1 against threshold, with the optimal threshold marked
ax2.plot(threshold_col, sweep_results['f1'], 'g-', linewidth=2, alpha=0.7)
ax2.axvline(
    optimal_threshold,
    color='r',
    linestyle='--',
    linewidth=2,
    label=f'Optimal ({optimal_threshold:.3f})',
)

# Bottom row: metric curves against threshold; the right panel adds F1.
# Each entry is (column, matplotlib format string, legend label).
precision_recall_curves = [
    ('precision', 'b-', 'Precision'),
    ('recall', 'orange', 'Recall'),
]
all_metric_curves = precision_recall_curves + [('f1', 'g-', 'F1')]

for panel, curves in ((ax3, precision_recall_curves), (ax4, all_metric_curves)):
    for column, fmt, curve_label in curves:
        panel.plot(threshold_col, sweep_results[column], fmt,
                   linewidth=2, alpha=0.7, label=curve_label)
    # Drawn after the curves so the marker sits on top of them
    panel.axvline(optimal_threshold, color='r', linestyle='--', linewidth=2)

# Shared decoration: (axes, x label, y label, title)
panel_text = (
    (ax1, 'Recall', 'Precision', 'Precision-Recall Curve'),
    (ax2, 'Threshold', 'F1 Score', 'F1 Score vs Threshold'),
    (ax3, 'Threshold', 'Score', 'Precision and Recall vs Threshold'),
    (ax4, 'Threshold', 'Score', 'All Metrics vs Threshold'),
)
for panel, x_text, y_text, title_text in panel_text:
    panel.set_xlabel(x_text, fontsize=12)
    panel.set_ylabel(y_text, fontsize=12)
    panel.set_title(title_text, fontsize=14, fontweight='bold')
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## Comparing Different Scorers

Let's compare how different scorers perform with threshold sweeping.


In [None]:
# Candidate scorers to compare, keyed by display name
scorers = {
    'RobustZScore': RobustZScoreScorer(epsilon=1e-8),
    'ZScore': ZScoreScorer(),
    'IQR': IQRScorer()
}

# Fit each scorer on the series, then sweep the same thresholds for it.
# The loop variables deliberately avoid the names `scorer` and `optimal_idx`
# so this cell does not overwrite the single-scorer results from earlier cells.
sweep_results_all = {}
for scorer_name, candidate_scorer in scorers.items():
    candidate_scorer.fit(y.values)
    sweep_results_all[scorer_name] = sweep_thresholds(
        y, candidate_scorer, threshold_values, labels=labels
    )

# For each scorer, keep the metrics of the row with the highest F1
summary_columns = ['threshold', 'f1', 'precision', 'recall']
optimal_results = {}
for scorer_name, scorer_sweep in sweep_results_all.items():
    best_idx = scorer_sweep['f1'].idxmax()
    optimal_results[scorer_name] = {
        column: scorer_sweep.loc[best_idx, column] for column in summary_columns
    }

# One row per scorer, one column per metric
comparison_df = pd.DataFrame(optimal_results).T
print("Optimal Performance Comparison:")
print(comparison_df.round(4))


In [None]:
# One panel per metric, one curve per scorer, with each scorer's max-F1 point starred
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

colors = {'RobustZScore': 'blue', 'ZScore': 'green', 'IQR': 'orange'}

for ax, metric in zip(axes, ['precision', 'recall', 'f1']):
    for name, results in sweep_results_all.items():
        ax.plot(results['threshold'], results[metric],
                color=colors[name], linewidth=2, alpha=0.7, label=name)
        # Mark the optimal point. The max-F1 row per scorer is already stored
        # in `optimal_results`, so reuse it rather than recomputing idxmax for
        # every metric and overwriting the notebook-level `optimal_idx`.
        best = optimal_results[name]
        ax.scatter(best['threshold'], best[metric],
                   color=colors[name], s=100, marker='*', zorder=5)

    ax.set_xlabel('Threshold', fontsize=12)
    ax.set_ylabel(metric.capitalize(), fontsize=12)
    ax.set_title(f'{metric.capitalize()} vs Threshold', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## Summary

In this notebook, we've explored:
1. **sweep_thresholds**: Evaluating multiple threshold values
2. **Precision, Recall, F1**: Understanding evaluation metrics
3. **Optimal threshold selection**: Finding the best threshold for your use case
4. **Comparing scorers**: Evaluating different algorithms

Key takeaways:
- Threshold selection is crucial for good performance
- Precision-recall trade-offs help understand detector behavior
- Different scorers may have different optimal thresholds
- The F1 score (the harmonic mean of precision and recall) balances the two, which is why it is used here to pick the optimal threshold
