# Anomaly Detection

This notebook demonstrates anomaly detection methods in TimeSmith.

## What You'll Learn

- Creating data with anomalies
- IQR-based outlier removal with the built-in `OutlierRemover`
- Z-Score based detection
- Hampel filter for robust detection
- Visualizing detected anomalies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from timesmith.core import OutlierRemover

# The Z-score and Hampel detectors may fail to import; record the outcome in a
# flag so later cells can skip those sections instead of raising NameError.
try:
    from timesmith.core.outliers import ZScoreOutlierRemover, HampelOutlierRemover
except ImportError:
    HAS_ADVANCED_OUTLIERS = False
    print("Advanced outlier detectors not available (requires optional dependencies)")
else:
    HAS_ADVANCED_OUTLIERS = True

# Seed NumPy's global RNG so the synthetic series is the same on every run.
np.random.seed(42)
print("Anomaly detection tools loaded!")

## 1. Create Data with Anomalies

Let's create a time series and inject some anomalies.

In [None]:
# Build a synthetic daily series: a cumulative sum of Gaussian noise, shifted by 100.
dates = pd.date_range('2020-01-01', periods=200, freq='D')
y = pd.Series(np.random.randn(200).cumsum() + 100, index=dates)

# Inject anomalies as fixed multiples of the standard deviation of the clean series.
# The baseline statistics are captured once, before any injection, so that an
# already-injected anomaly does not shift the mean/std used to size the later ones.
baseline_mean = y.mean()
baseline_std = y.std()
anomaly_offsets = {50: 5, 100: -4, 150: 6}  # integer position -> offset in std units
for position, n_std in anomaly_offsets.items():
    y.iloc[position] = baseline_mean + n_std * baseline_std

# Single list of positions reused by the printout and the scatter overlay.
anomaly_positions = list(anomaly_offsets)

print(f"Data points: {len(y)}")
print(f"True anomalies at indices: {', '.join(map(str, anomaly_positions))}")

# Visualize the series with the injected points highlighted.
plt.figure(figsize=(14, 6))
plt.plot(y.index, y.values, linewidth=2, label='Time Series', color='steelblue')
plt.scatter(y.index[anomaly_positions], y.values[anomaly_positions],
           color='red', s=100, zorder=5, label='True Anomalies')
plt.title('Time Series with Anomalies', fontsize=14, fontweight='bold')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 2. IQR-Based Outlier Removal

Use the built-in `OutlierRemover` with the IQR (interquartile range) method.

In [None]:
# Fit the IQR rule on the series, then apply it to obtain the cleaned series.
outlier_remover = OutlierRemover(method='iqr', factor=1.5)
outlier_remover.fit(y)
y_clean = outlier_remover.transform(y)

# Timestamps present in `y` but missing from `y_clean` are reported as anomalies.
# NOTE(review): this presumes transform() drops flagged rows rather than masking
# or clipping them — confirm against the OutlierRemover implementation.
kept_mask = y.index.isin(y_clean.index)
anomalies_iqr = y.index[~kept_mask]
n_iqr_anomalies = len(anomalies_iqr)

print("IQR-based detection:")
print(f"  Anomalies detected: {n_iqr_anomalies}")
print(f"  Anomaly indices: {anomalies_iqr.tolist()}")

# Plot the full series and overlay whatever the IQR rule removed.
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(y.index, y.values, linewidth=2, label='Time Series', color='steelblue', alpha=0.7)
if n_iqr_anomalies > 0:
    ax.scatter(anomalies_iqr, y[anomalies_iqr],
               color='red', s=100, zorder=5, label='Detected Anomalies')
ax.set_title('IQR-Based Anomaly Detection', fontsize=14, fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 3. Z-Score Detector (if available)

Z-score based detection flags points whose distance from the mean, measured in standard deviations, exceeds a chosen threshold.

In [None]:
if HAS_ADVANCED_OUTLIERS:
    # Single source of truth for the cut-off: it configures the detector and
    # positions the threshold lines, so the plot cannot drift from the detector.
    z_threshold = 3.0

    # Z-score based detection
    detector = ZScoreOutlierRemover(threshold=z_threshold)
    detector.fit(y)
    scores = detector.score(y)
    # `anomalies` is summed and used to index `y` below, i.e. it is treated as a mask.
    anomalies = detector.predict(y)

    print("Z-Score detection:")
    print(f"  Anomalies detected: {anomalies.sum()}")
    print(f"  Anomaly indices: {y.index[anomalies].tolist()}")

    # Top panel: the series with flagged points; bottom panel: the scores themselves.
    fig, axes = plt.subplots(2, 1, figsize=(14, 10))
    axes[0].plot(y.index, y.values, linewidth=2, label='Time Series', color='steelblue')
    if anomalies.sum() > 0:
        axes[0].scatter(y.index[anomalies], y.values[anomalies],
                       color='red', s=100, zorder=5, label='Anomalies')
    axes[0].set_title('Z-Score Anomaly Detection', fontweight='bold')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    axes[1].plot(y.index, scores, linewidth=2, color='orange')
    # Draw the cut-off on both sides of zero; only one line is labelled so the
    # legend shows a single "Threshold" entry.
    axes[1].axhline(y=z_threshold, color='red', linestyle='--', label='Threshold')
    axes[1].axhline(y=-z_threshold, color='red', linestyle='--')
    axes[1].set_title('Z-Scores', fontweight='bold')
    axes[1].set_xlabel('Date')
    axes[1].set_ylabel('Z-Score')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
else:
    print("Z-Score detector not available. Install optional dependencies to use it.")

## 4. Hampel Detector (if available)

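The Hampel filter is a robust alternative: it compares each point with the median of a sliding window and flags it when the deviation exceeds a threshold multiple of the window's median absolute deviation (MAD).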

In [None]:
if not HAS_ADVANCED_OUTLIERS:
    print("Hampel detector not available. Install optional dependencies to use it.")
else:
    # Configure, fit, and run the Hampel detector on the series.
    hampel = HampelOutlierRemover(window=5, threshold=3.0)
    hampel.fit(y)
    hampel_anomalies = hampel.predict(y)
    # Count once; reused for the printout and to decide whether to draw the overlay.
    n_hampel_anomalies = hampel_anomalies.sum()

    print("Hampel detection:")
    print(f"  Anomalies detected: {n_hampel_anomalies}")
    print(f"  Anomaly indices: {y.index[hampel_anomalies].tolist()}")

    # Plot the series and highlight the points the detector flagged.
    fig, ax = plt.subplots(figsize=(14, 6))
    ax.plot(y.index, y.values, linewidth=2, label='Time Series', color='steelblue', alpha=0.7)
    if n_hampel_anomalies > 0:
        ax.scatter(y.index[hampel_anomalies], y.values[hampel_anomalies],
                   color='red', s=100, zorder=5, label='Hampel Anomalies')
    ax.set_title('Hampel Filter Anomaly Detection', fontsize=14, fontweight='bold')
    ax.set_xlabel('Date')
    ax.set_ylabel('Value')
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

## Summary

You've learned:
- How to detect anomalies using IQR method
- How to use Z-Score detection (if available)
- How to use Hampel filter for robust detection (if available)
- How to visualize detected anomalies

**Key Points:**
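- The IQR method is always available and works well in most cases
- Z-Score detection assumes approximately normally distributed data, and the mean and standard deviation it relies on are themselves inflated by outliers
- The Hampel filter relies on the median and MAD instead, so the outliers it is looking for distort its statistics far less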
- Always visualize results to verify anomaly detection makes sense