# Backtesting and Model Validation

This notebook demonstrates how to use `backtest_detector` to validate anomaly detectors using expanding window splits.

Backtesting is crucial for:
- Validating detector performance on historical data
- Understanding how detectors perform over time
- Detecting performance degradation
- Comparing different detectors fairly


In [None]:
import numpy as np
import pandas as pd
from plotsmith import plot_timeseries
import matplotlib.pyplot as plt

from anomsmith import backtest_detector, detect_anomalies, ThresholdRule
from anomsmith.primitives.scorers.robust_zscore import RobustZScoreScorer
from anomsmith.primitives.detectors.ml import IsolationForestDetector

# Seed NumPy's global (legacy) RNG up front so any use of np.random is repeatable.
# Note: create_backtest_data() re-seeds the same global RNG with its own `seed` argument.
np.random.seed(42)


## Creating Time Series Data for Backtesting

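We'll create a 500-point daily time series (a linear trend, a 100-day seasonal cycle, and Gaussian noise) and inject large positive or negative spikes into 8% of the points. The series is long enough to be split into several expanding-window folds, and we keep the ground-truth labels so that each fold can be scored.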


In [None]:
def create_backtest_data(n: int = 500, contamination: float = 0.08, seed: int = 42):
    """Generate a synthetic daily series with injected spikes plus its ground truth.

    The signal is ``0.005 * t`` (trend) + ``1.5 * sin(2*pi*t / 100)`` (seasonality)
    + Gaussian noise scaled by 0.5. ``int(n * contamination)`` distinct positions
    are then shifted up or down by a magnitude drawn uniformly from [4, 7).

    Parameters
    ----------
    n : int
        Number of daily observations, starting at 2020-01-01.
    contamination : float
        Fraction of points that receive an injected spike.
    seed : int
        Value passed to ``np.random.seed``; this re-seeds NumPy's *global* RNG.

    Returns
    -------
    tuple of (pd.Series, pd.Series)
        ``(y_series, labels)`` on the same daily DatetimeIndex. ``labels`` is a
        float series holding 1 at injected positions and 0 elsewhere.
    """
    np.random.seed(seed)

    dates = pd.date_range("2020-01-01", periods=n, freq="D")
    steps = np.arange(n)

    # Draw order matters for reproducibility: noise first, then spike positions,
    # then spike signs, then spike magnitudes.
    values = 0.005 * steps + 1.5 * np.sin(2 * np.pi * steps / 100) + np.random.randn(n) * 0.5

    n_spikes = int(n * contamination)
    spike_positions = np.random.choice(n, n_spikes, replace=False)
    spike_signs = np.random.choice([-1, 1], n_spikes)
    spike_sizes = np.random.uniform(4, 7, n_spikes)
    values[spike_positions] += spike_signs * spike_sizes

    # Ground truth: 1.0 where a spike was injected, 0.0 everywhere else
    flags = np.zeros(n)
    flags[spike_positions] = 1

    return pd.Series(values, index=dates), pd.Series(flags, index=dates)

# Build the dataset shared by all later cells: 500 daily points with 8% of them
# turned into injected anomalies (seed left at its default of 42).
y, labels = create_backtest_data(n=500, contamination=0.08)
print(f"Created time series with {len(y)} points")
# labels is a float 0/1 Series, so its sum is the anomaly count (printed as a float)
# and its mean is the anomaly fraction.
print(f"True anomalies: {labels.sum()}")
print(f"Anomaly rate: {labels.mean():.2%}")


In [None]:
# Plot the full series and overlay the ground-truth anomalies
fig, ax = plt.subplots(figsize=(14, 6))

# Timestamps whose ground-truth label equals 1
anomaly_indices = labels.loc[labels.eq(1)].index

ax.plot(
    y.index, y.values, 'b-',
    linewidth=1.5, alpha=0.7, label='Time Series',
)
ax.scatter(
    anomaly_indices,
    y.loc[anomaly_indices],
    color='red', s=100, marker='x', linewidths=2, zorder=5,
    label=f'True Anomalies ({len(anomaly_indices)})',
)

# Titles, labels and cosmetics
ax.set_title('Time Series for Backtesting', fontsize=14, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Value', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend()

plt.tight_layout()
plt.show()


## Running Backtests

Let's run backtests with different detectors and compare their performance across folds.


In [None]:
# Define threshold rule
# NOTE(review): both `value` and `quantile` are set to 0.9 — confirm which one
# ThresholdRule actually uses when method="quantile". Presumably this flags the
# top 10% of scores, slightly more than the 8% injected into the data — TODO confirm.
threshold_rule = ThresholdRule(method="quantile", value=0.9, quantile=0.9)

# Backtest with RobustZScoreScorer
# epsilon is presumably a small constant guarding against division by zero — TODO confirm
scorer = RobustZScoreScorer(epsilon=1e-8)
# Same y, labels, threshold rule, n_splits and min_train_size as the
# IsolationForestDetector run, so the two result tables can be compared directly.
backtest_results_scorer = backtest_detector(
    y, scorer, threshold_rule, 
    labels=labels, 
    n_splits=5, 
    min_train_size=50
)

print("Backtest Results (RobustZScoreScorer):")
print(backtest_results_scorer)
# Unweighted means of the per-fold metric columns
print(f"\nAverage F1: {backtest_results_scorer['f1'].mean():.4f}")
print(f"Average Precision: {backtest_results_scorer['precision'].mean():.4f}")
print(f"Average Recall: {backtest_results_scorer['recall'].mean():.4f}")


In [None]:
# Backtest with IsolationForestDetector
# NOTE(review): contamination=0.1 here vs. contamination=0.08 used to generate the
# data — confirm whether the mismatch is intentional.
# random_state=42 presumably makes the forest deterministic — TODO confirm.
detector = IsolationForestDetector(contamination=0.1, random_state=42)
# Reuses threshold_rule from the RobustZScoreScorer cell, which must be run first.
backtest_results_detector = backtest_detector(
    y, detector, threshold_rule,
    labels=labels,
    n_splits=5,
    min_train_size=50
)

print("Backtest Results (IsolationForestDetector):")
print(backtest_results_detector)
# Unweighted means of the per-fold metric columns
print(f"\nAverage F1: {backtest_results_detector['f1'].mean():.4f}")
print(f"Average Precision: {backtest_results_detector['precision'].mean():.4f}")
print(f"Average Recall: {backtest_results_detector['recall'].mean():.4f}")


In [None]:
# Compare both detectors fold by fold, one panel per metric (2x2 grid)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

metrics = ['precision', 'recall', 'f1', 'avg_run_length']
titles = ['Precision', 'Recall', 'F1 Score', 'Average Run Length']

# (results table, matplotlib format string, legend label) for each detector
detector_specs = [
    (backtest_results_scorer, 'b-o', 'RobustZScore'),
    (backtest_results_detector, 'g-s', 'IsolationForest'),
]

# axes.flat walks the grid row by row, matching the order of `metrics`
for ax, metric, title in zip(axes.flat, metrics, titles):
    for results, fmt, name in detector_specs:
        ax.plot(
            results['fold'], results[metric], fmt,
            linewidth=2, markersize=8, label=name, alpha=0.7,
        )
    ax.set_title(f'{title} Across Folds', fontsize=14, fontweight='bold')
    ax.set_xlabel('Fold', fontsize=12)
    ax.set_ylabel(title, fontsize=12)
    # One tick per fold, taken from the RobustZScore results
    ax.set_xticks(backtest_results_scorer['fold'])
    ax.grid(True, alpha=0.3)
    ax.legend()

plt.tight_layout()
plt.show()


## Summary

In this notebook, we've explored:
1. **backtest_detector**: Running backtests with expanding windows
2. **Performance across folds**: Understanding how detectors perform over time
3. **Comparing detectors**: Fair comparison using the same backtest setup

Key takeaways:
- Backtesting validates detector performance on historical data
- Expanding windows simulate real-world deployment scenarios
- Performance metrics across folds help identify stability issues
- Average run length helps understand anomaly segment characteristics
