# Anomaly Detection System - Exploratory Analysis

This notebook demonstrates the anomaly detection pipeline and provides visualizations.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

sys.path.append('..')
from src.utils.config import config
from src.utils.db_utils import db_manager

# Plot styling: seaborn grid theme plus a wider default figure size for all charts below
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Reaching this line means every import above resolved, including the project-local ones
print("Imports successful!")

## 1. Load Raw Data

In [None]:
# Load the most recently modified raw parquet file into `df`
raw_path = Path(config.raw_data_path)
parquet_files = list(raw_path.glob('*.parquet'))

# Fail with an actionable message rather than max()'s bare "empty sequence"
# ValueError when the raw data directory holds no parquet files.
if not parquet_files:
    raise FileNotFoundError(
        f"No .parquet files found in {raw_path.resolve()}. "
        "Generate or ingest raw data before running this notebook."
    )

# "Latest" is decided by filesystem modification time, not by file name
latest_file = max(parquet_files, key=lambda p: p.stat().st_mtime)

df = pd.read_parquet(latest_file)
print(f"Loaded {len(df)} records from {latest_file.name}")
df.head()

## 2. Data Overview

In [None]:
# High-level summary: dataset size, time span covered, and label balance
print("Dataset Info:")
print(f"Shape: {df.shape}")

timestamps = df['timestamp']
print(f"Time range: {timestamps.min()} to {timestamps.max()}")

print("\nAnomaly distribution:")
anomaly_flags = df['is_anomaly']
print(anomaly_flags.value_counts())

# The mean of the flag column is the fraction of rows labelled anomalous
anomaly_pct = anomaly_flags.mean() * 100
print(f"Anomaly rate: {anomaly_pct:.2f}%")

In [None]:
# Side-by-side bar charts: how records split across log types and log levels
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (column to count, human-readable label, bar colour) for each panel, left to right
panels = [
    ('log_type', 'Log Type', 'steelblue'),
    ('log_level', 'Log Level', 'coral'),
]

for ax, (column, label, bar_color) in zip(axes, panels):
    df[column].value_counts().plot(kind='bar', ax=ax, color=bar_color)
    ax.set_title(f'{label} Distribution')
    ax.set_xlabel(label)
    ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

## 3. Temporal Patterns

In [None]:
# Temporal view: volume by hour of day, and daily volume alongside anomaly counts.
# NOTE: this adds 'hour' and 'date' columns to `df` in place; re-running the cell
# simply recomputes the same columns.
timestamp_parts = df['timestamp'].dt
df['hour'] = timestamp_parts.hour
df['date'] = timestamp_parts.date

fig, axes = plt.subplots(2, 1, figsize=(15, 10))
hourly_ax, daily_ax = axes

# Top panel: number of log records per hour of day
hourly_counts = df.groupby('hour').size()
hourly_ax.plot(hourly_counts.index, hourly_counts.values, marker='o', linewidth=2)
hourly_ax.set(
    title='Log Volume by Hour of Day',
    xlabel='Hour',
    ylabel='Number of Logs',
)
hourly_ax.grid(True, alpha=0.3)

# Bottom panel: per-day record count (non-null log_id) and summed anomaly flags
daily_stats = (
    df.groupby('date')
    .agg(total_logs=('log_id', 'count'), anomalies=('is_anomaly', 'sum'))
    .reset_index()
)

dates = daily_stats['date']
daily_ax.plot(dates, daily_stats['total_logs'], label='Total Logs', linewidth=2)
daily_ax.plot(dates, daily_stats['anomalies'], label='Anomalies', linewidth=2, color='red')
daily_ax.set(
    title='Daily Log Volume and Anomalies',
    xlabel='Date',
    ylabel='Count',
)
daily_ax.legend()
daily_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Feature Analysis

In [None]:
# Response time of API request logs, compared between normal and anomalous records
api_logs = df[df['log_type'] == 'API_REQUEST'].copy()

if 'response_time_ms' not in api_logs.columns:
    # Say why nothing is drawn instead of finishing the cell silently
    print("Column 'response_time_ms' not found; skipping response time analysis.")
elif api_logs['response_time_ms'].dropna().empty:
    # Nothing to plot: no API_REQUEST rows, or none with a recorded response time
    print("No API_REQUEST logs with a response time; skipping response time analysis.")
else:
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Overlaid histograms, one per label. The flag is compared with == rather
    # than negated with `~` so the split does not depend on a boolean dtype.
    normal_times = api_logs.loc[api_logs['is_anomaly'] == False, 'response_time_ms']
    anomaly_times = api_logs.loc[api_logs['is_anomaly'] == True, 'response_time_ms']
    normal_times.hist(bins=50, ax=axes[0], alpha=0.7, label='Normal', color='blue')
    anomaly_times.hist(bins=50, ax=axes[0], alpha=0.7, label='Anomaly', color='red')
    axes[0].set_title('Response Time Distribution')
    axes[0].set_xlabel('Response Time (ms)')
    axes[0].set_ylabel('Frequency')
    axes[0].legend()

    # Box plot grouped by label
    api_logs.boxplot(column='response_time_ms', by='is_anomaly', ax=axes[1])
    # boxplot(by=...) stamps a figure-level "Boxplot grouped by is_anomaly"
    # suptitle, which would sit above the histogram panel too; clear it and
    # rely on the per-axes titles instead.
    fig.suptitle('')
    axes[1].set_title('Response Time: Normal vs Anomaly')
    axes[1].set_xlabel('Is Anomaly')
    axes[1].set_ylabel('Response Time (ms)')

    plt.tight_layout()
    plt.show()

## 5. Load Processed Features

In [None]:
# Load the most recently modified feature-engineered dataset into `features_df`
processed_path = Path(config.processed_data_path)
feature_files = list(processed_path.glob('features_*'))

if feature_files:
    # "Latest" is decided by filesystem modification time
    latest_features = max(feature_files, key=lambda p: p.stat().st_mtime)
    features_df = pd.read_parquet(latest_features)
    print(f"Loaded {len(features_df)} records with {len(features_df.columns)} features")
    # Jupyter only auto-renders the last top-level expression of a cell; a bare
    # `features_df.head()` inside this `if` body would show nothing. display()
    # (available by default in IPython kernels) renders the preview explicitly.
    display(features_df.head())
else:
    print("No feature files found. Run feature_engineering.py first.")

## 6. Model Results

In [None]:
# Pull detections from the database: 168-hour lookback (7 days), score >= 0.5
anomalies = db_manager.get_recent_anomalies(hours=168, min_score=0.5)

if not anomalies:
    print("No anomalies found in database. Run batch_inference.py first.")
else:
    anomalies_df = pd.DataFrame(anomalies)
    print(f"Found {len(anomalies_df)} anomalies in the database")

    # Two panels: score histogram (left) and detection count per model (right)
    fig, (score_ax, model_ax) = plt.subplots(1, 2, figsize=(12, 5))

    anomalies_df['anomaly_score'].hist(bins=30, color='darkred', alpha=0.7, ax=score_ax)
    score_ax.set_title('Anomaly Score Distribution')
    score_ax.set_xlabel('Anomaly Score')
    score_ax.set_ylabel('Frequency')

    model_counts = anomalies_df['model_name'].value_counts()
    model_counts.plot(kind='bar', color='steelblue', ax=model_ax)
    model_ax.set_title('Anomalies by Model')
    model_ax.set_xlabel('Model')
    model_ax.set_ylabel('Count')

    plt.tight_layout()
    plt.show()

    # Highest-scoring detections, restricted to the columns worth reading
    print("\nTop 10 Anomalies:")
    summary_columns = ['timestamp', 'log_type', 'anomaly_score', 'message']
    top_anomalies = anomalies_df.nlargest(10, 'anomaly_score')
    print(top_anomalies[summary_columns])

## 7. Model Performance Metrics

In [None]:
# Fetch stored evaluation metrics, newest evaluations first
query = """
    SELECT model_name, metric_name, metric_value, evaluation_date
    FROM model_performance
    ORDER BY evaluation_date DESC, model_name
"""

performance = db_manager.execute_query(query)

if not performance:
    print("No performance metrics found.")
else:
    perf_df = pd.DataFrame(performance)

    # One row per model, one column per metric. 'first' keeps the first value
    # encountered for each (model, metric) pair in the query's row order.
    perf_pivot = perf_df.pivot_table(
        values='metric_value',
        index='model_name',
        columns='metric_name',
        aggfunc='first',
    )

    print("Model Performance Comparison:")
    print(perf_pivot)

    # Grouped bar chart: one cluster per model, one bar per metric
    ax = perf_pivot.plot(kind='bar', figsize=(10, 6))
    ax.set_title('Model Performance Metrics')
    ax.set_xlabel('Model')
    ax.set_ylabel('Score')
    ax.legend(title='Metric')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 8. Isolation Forest Input Features

In [None]:
import joblib

# Locate the most recently modified Isolation Forest artifact and list the
# features it was trained on.
models_path = Path(config.models_path)
if_models = list(models_path.glob('isolation_forest_*_model.pkl'))

if if_models:
    model_file = max(if_models, key=lambda p: p.stat().st_mtime)

    # Strip the '_model.pkl' suffix from the file name only. A str.replace()
    # over the whole path would also rewrite any directory component that
    # happens to contain '_model.pkl'.
    artifact_stem = model_file.name[:-len('_model.pkl')]
    latest_model = str(model_file.with_name(artifact_stem))
    features_file = model_file.with_name(f"{artifact_stem}_features.pkl")

    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code. Only load artifacts produced by this project's own training runs.
    model = joblib.load(model_file)

    if features_file.exists():
        feature_names = joblib.load(features_file)

        # Note: Isolation Forest doesn't have feature_importances_
        # We can compute approximate importance using permutation or SHAP
        print(f"Model uses {len(feature_names)} features:")
        for i, feat in enumerate(feature_names, 1):
            print(f"{i}. {feat}")
    else:
        # Report the missing companion file instead of raising mid-cell
        print(f"Feature list not found for model: {features_file.name}")
else:
    print("No Isolation Forest model found.")