# PCA-Based Anomaly Detection

This notebook demonstrates the `PCADetector`, which uses Principal Component Analysis (PCA) to model the boundaries of healthy (normal) operation and to flag observations that fall outside them as anomalies.

PCA-based detection is particularly useful for:
- Multivariate time series
- High-dimensional data
- Capturing the main patterns in normal data

The detector supports three scoring methods:
1. **Reconstruction error**: Measures how well data can be reconstructed from principal components
2. **Mahalanobis distance**: Distance in the principal component space
3. **Both**: Average of reconstruction error and Mahalanobis distance


In [None]:
# Numerics, tabular data and plotting helpers used throughout the notebook.
import numpy as np
import pandas as pd
from plotsmith import plot_timeseries
import matplotlib.pyplot as plt

# anomsmith API exercised below: detect_anomalies, ThresholdRule, PCADetector.
from anomsmith import detect_anomalies, ThresholdRule
from anomsmith.primitives.detectors.pca import PCADetector

# Seed NumPy's global RNG up front so results repeat on a fresh kernel.
np.random.seed(42)


## Creating Multivariate Test Data

For PCA, we'll create data with multiple correlated features.


In [None]:
def create_multivariate_data(n: int = 200, n_features: int = 5, contamination: float = 0.1, seed: int = 42):
    """Create a synthetic multivariate time series with injected anomalies.

    Independent Gaussian noise is mixed through a random correlation matrix
    so the features co-vary, a linear trend plus a 50-step sinusoid is added
    to the first feature, and a random subset of rows is perturbed with
    large Gaussian noise across all features.

    Note: this reseeds NumPy's *global* RNG with ``seed``.

    Args:
        n: Number of time steps (rows).
        n_features: Total number of features, including the target feature.
        contamination: Fraction of rows to turn into anomalies;
            ``int(n * contamination)`` rows are perturbed.
        seed: Seed passed to ``np.random.seed``.

    Returns:
        Tuple ``(y, X, anomaly_indices)`` where ``y`` is the ``feature_0``
        Series, ``X`` is a DataFrame of the remaining features (both indexed
        by daily dates starting 2020-01-01), and ``anomaly_indices`` is an
        array of the positional row indices that were perturbed (unsorted).
    """
    np.random.seed(seed)

    # Independent standard-normal noise, one column per feature.
    base = np.random.randn(n, n_features)

    # Random symmetric positive semi-definite matrix, rescaled to a proper
    # correlation matrix. The standard deviations are taken ONCE from the
    # covariance diagonal; normalizing rows and then recomputing the diagonal
    # for the columns would leave a non-unit diagonal and break symmetry.
    mixing = np.random.rand(n_features, n_features)
    covariance = mixing @ mixing.T
    std = np.sqrt(np.diag(covariance))
    correlation_matrix = covariance / np.outer(std, std)

    data = base @ correlation_matrix.T

    # Add trend and seasonality to the first feature only.
    t = np.arange(n)
    data[:, 0] += 0.01 * t + 2 * np.sin(2 * np.pi * t / 50)

    # Pick distinct rows to corrupt.
    n_anomalies = int(n * contamination)
    anomaly_indices = np.random.choice(n, n_anomalies, replace=False)

    for idx in anomaly_indices:
        # Each anomaly shifts every feature of the row by noise with std 3.
        data[idx] += np.random.randn(n_features) * 3

    # Wrap in a DataFrame with a daily DatetimeIndex for easier handling.
    index = pd.date_range("2020-01-01", periods=n, freq="D")
    df = pd.DataFrame(data, index=index, columns=[f'feature_{i}' for i in range(n_features)])

    # The first feature is returned as y, the remaining ones as X.
    y = df['feature_0']
    X = df.drop('feature_0', axis=1)

    return y, X, anomaly_indices

# Build the demo dataset: target series, companion features, and the
# positional indices of the injected anomalies.
y, X, true_anomaly_indices = create_multivariate_data(
    n=200,
    n_features=5,
    contamination=0.1,
)
n_points, n_companion_features = len(y), X.shape[1]
print(f"Created multivariate data with {n_points} points and {n_companion_features} features")
print(f"True anomalies: {len(true_anomaly_indices)}")


In [None]:
# Visualize the multivariate data.
# `plt` comes from the notebook's import cell; it is not re-imported here.

# Target series, with the injected anomalies marked as red crosses.
fig1, ax1 = plot_timeseries(
    y,
    title='Main Feature with Anomalies',
    xlabel='Date',
    ylabel='Value'
)
# true_anomaly_indices are positional, so index both the dates and the values with them.
ax1.scatter(y.index[true_anomaly_indices], y.values[true_anomaly_indices], 
           color='red', s=100, marker='x', linewidths=2, 
           label=f'True Anomalies ({len(true_anomaly_indices)})', zorder=5)
ax1.legend()
plt.show()

# Companion features: the first is drawn by plot_timeseries, the rest are
# overlaid on the same axes.
# NOTE(review): the first column gets a legend entry only if plot_timeseries
# labels the series it draws — confirm against plotsmith.
fig2, ax2 = plot_timeseries(
    X.iloc[:, 0],
    title='Additional Features',
    xlabel='Date',
    ylabel='Value'
)
for col in X.columns[1:]:
    ax2.plot(X.index, X[col].values, alpha=0.6, label=col)
ax2.legend()
plt.show()


## PCA Detection with Different Scoring Methods

Let's compare the three scoring methods: reconstruction error, Mahalanobis distance, and both.


In [None]:
# One PCADetector per scoring method; every other hyperparameter is shared.
shared_pca_kwargs = dict(n_components=0.95, contamination=0.1, random_state=42)

pca_reconstruction = PCADetector(score_method='reconstruction', **shared_pca_kwargs)
pca_mahalanobis = PCADetector(score_method='mahalanobis', **shared_pca_kwargs)
pca_both = PCADetector(score_method='both', **shared_pca_kwargs)

# Fit each detector in turn (y as target, X as features).
for detector in (pca_reconstruction, pca_mahalanobis, pca_both):
    detector.fit(y.values, X.values)

# A single threshold rule is reused for all three detectors.
threshold_rule = ThresholdRule(method="quantile", value=0.9, quantile=0.9)

# Run detection with each fitted detector, in the same order as above.
result_recon, result_mahal, result_both = (
    detect_anomalies(y, fitted, threshold_rule)
    for fitted in (pca_reconstruction, pca_mahalanobis, pca_both)
)


def _flag_and_score_stats(result):
    """Return [flag count, flag rate, mean score, score std] for one result."""
    flags, scores = result['flag'], result['score']
    return [flags.sum(), flags.mean(), scores.mean(), scores.std()]


# Side-by-side summary table: one column per scoring method.
comparison = pd.DataFrame(
    {
        'Reconstruction': _flag_and_score_stats(result_recon),
        'Mahalanobis': _flag_and_score_stats(result_mahal),
        'Both': _flag_and_score_stats(result_both),
    },
    index=['Anomalies Detected', 'Anomaly Rate', 'Mean Score', 'Std Score'],
)

print("PCA Scoring Method Comparison:")
print(comparison.round(4))


In [None]:
# Visualize detection results: one figure per scoring method, overlaying the
# injected (true) anomalies and the points each detector flagged.
methods = [
    ('Reconstruction Error', result_recon, 'blue'),
    ('Mahalanobis Distance', result_mahal, 'green'),
    ('Both (Average)', result_both, 'orange')
]

# The per-method color is carried in `methods` but not used by the plots
# (markers are fixed gray/red), so it is unpacked under an underscore name
# to make that explicit.
for name, result, _color in methods:
    # Boolean mask of the rows flagged as anomalous.
    anomaly_mask = result['flag'] == 1
    fig, ax = plot_timeseries(
        y,
        title=f'PCA Detection: {name}',
        xlabel='Date',
        ylabel='Value'
    )
    # True anomalies (positional indices) as faint gray circles underneath.
    ax.scatter(y.index[true_anomaly_indices], y.values[true_anomaly_indices], 
              color='gray', s=80, marker='o', alpha=0.5, 
              label='True Anomalies', zorder=3)
    # Detected anomalies as red crosses drawn on top.
    ax.scatter(y.index[anomaly_mask], y.values[anomaly_mask], 
              color='red', s=100, marker='x', linewidths=2, 
              label=f'Detected ({anomaly_mask.sum()})', zorder=5)
    ax.legend()
    plt.show()


## Understanding PCA Components

Let's examine how many components PCA is using and the explained variance.


In [None]:
# Inspect the PCA model held by one of the fitted detectors.
# `plt` and `np` come from the notebook's import cell; nothing is re-imported here.
pca_model = pca_reconstruction.pca_
explained_ratio = pca_model.explained_variance_ratio_
print(f"Number of components: {pca_model.n_components_}")
print(f"Explained variance ratio: {explained_ratio}")
print(f"Total explained variance: {explained_ratio.sum():.4f}")

# Bar chart of the explained variance ratio, one bar per retained component.
component_numbers = np.arange(1, len(explained_ratio) + 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(component_numbers, explained_ratio, alpha=0.7)
# Components are discrete: pin ticks to whole numbers so the axis never
# shows fractional components such as 1.5.
ax.set_xticks(component_numbers)
ax.set_xlabel('Principal Component', fontsize=12)
ax.set_ylabel('Explained Variance Ratio', fontsize=12)
ax.set_title('PCA Explained Variance by Component', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()


## Summary

In this notebook, we've explored:
1. **PCADetector** with reconstruction error scoring
2. **PCADetector** with Mahalanobis distance scoring
3. **PCADetector** with combined scoring

Key takeaways:
- PCA is well suited to multivariate anomaly detection when normal behavior is captured by a few dominant directions of variation
- Reconstruction error captures how well data fits the normal pattern
- Mahalanobis distance measures distance in the principal component space
- The choice of scoring method depends on your specific use case
