# TSIOT Anomaly Detection Example

This notebook demonstrates anomaly detection techniques using TSIOT-generated synthetic time series data.

## Techniques Covered:
1. Statistical anomaly detection (Z-score, IQR, moving average)
2. Isolation Forest
3. Local Outlier Factor (LOF)
4. LSTM autoencoder (reconstruction-error-based detection)

In [None]:
# Install required packages
!pip install requests pandas numpy matplotlib seaborn scikit-learn tensorflow plotly

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import tensorflow as tf
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed
from tensorflow.keras.models import Model
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8')
plt.rcParams['figure.figsize'] = (15, 10)

## 1. Generate Synthetic Data with Anomalies

In [None]:
# TSIOT API configuration
# Both settings can be overridden through environment variables so that a real
# API key never has to be typed into (and saved with) the notebook. The
# fallbacks are the original placeholder values.
TSIOT_BASE_URL = os.environ.get("TSIOT_BASE_URL", "http://localhost:8080")
API_KEY = os.environ.get("TSIOT_API_KEY", "your-api-key-here")

# Headers sent with every TSIOT request: JSON payload plus bearer-token auth.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

def generate_normal_series(timeout=30):
    """Request a synthetic time series from the TSIOT generate endpoint.

    Posts a fixed generation request (type ``"lstm"``, length 1000, with trend,
    seasonality and noise parameters) to ``{TSIOT_BASE_URL}/api/v1/generate``.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The decoded JSON response body when the server answers with HTTP 200,
        otherwise ``None`` (connection failure, timeout, non-200 status, or a
        body that is not valid JSON). The reason is printed.
    """
    data = {
        "type": "lstm",
        "length": 1000,
        "parameters": {
            "trend": 0.05,
            "seasonality": 24,
            "noise": 0.1
        }
    }

    try:
        response = requests.post(
            f"{TSIOT_BASE_URL}/api/v1/generate",
            json=data,
            headers=headers,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Network-level failure: report it and let the caller handle None.
        print(f"⚠️ TSIOT request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"⚠️ TSIOT returned HTTP {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # requests raises a ValueError subclass when the body is not valid JSON.
        print(f"⚠️ TSIOT response was not valid JSON: {exc}")
        return None

def inject_anomalies(values, anomaly_rate=0.05, random_state=None):
    """Inject synthetic anomalies into a copy of a time series.

    ``int(len(values) * anomaly_rate)`` distinct positions are drawn at random.
    Each position receives one randomly chosen anomaly type:

    * ``'spike'`` - the value is multiplied by a factor in [3, 5)
    * ``'dip'``   - the value is multiplied by a factor in [0.1, 0.3)
    * ``'shift'`` - a constant offset of up to two standard deviations of the
      input is added to as many as 20 consecutive points starting at the
      position

    Args:
        values: 1-D sequence of numbers (list, NumPy array, ...). It is never
            modified; integer input is converted to float so that the
            fractional scaling and offsets can be applied.
        anomaly_rate: Fraction of points used as anomaly positions.
        random_state: Optional integer seed. When given, a private
            ``np.random.RandomState`` is used so the result is reproducible.
            When ``None`` (default) the global ``np.random`` state is used,
            exactly as before.

    Returns:
        Tuple ``(anomalous_values, anomaly_indices)``: the modified float copy
        and the array of drawn positions. For ``'shift'`` anomalies only the
        starting position appears in ``anomaly_indices`` even though up to 20
        points are altered.
    """
    # np.random (module) and RandomState expose the same choice/uniform API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Float copy: in-place '*=' with a float factor fails on integer arrays,
    # and slice '+=' fails on plain lists.
    anomalous_values = np.array(values, dtype=float)
    n_points = len(anomalous_values)
    n_anomalies = int(n_points * anomaly_rate)
    anomaly_indices = rng.choice(n_points, n_anomalies, replace=False)

    for idx in anomaly_indices:
        # Create different types of anomalies
        anomaly_type = rng.choice(['spike', 'dip', 'shift'])

        if anomaly_type == 'spike':
            anomalous_values[idx] *= rng.uniform(3, 5)
        elif anomaly_type == 'dip':
            anomalous_values[idx] *= rng.uniform(0.1, 0.3)
        elif anomaly_type == 'shift':
            # Clamp the shifted run so it never extends past the series end.
            shift_length = min(20, n_points - idx)
            shift_value = rng.uniform(-2, 2) * np.std(values)
            anomalous_values[idx:idx+shift_length] += shift_value

    return anomalous_values, anomaly_indices

# Generate data
# Seed NumPy's global RNG so the injected anomalies (and every metric derived
# from them) are the same on each Restart & Run All.
np.random.seed(42)

normal_data = generate_normal_series()
if normal_data:
    # Assumes the response carries the series under a 'values' key -- TODO confirm
    # against the TSIOT API. Cast to float so anomaly scaling works on integers.
    normal_values = np.array(normal_data['values'], dtype=float)
    anomalous_values, true_anomaly_indices = inject_anomalies(normal_values)

    # Create DataFrame
    # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated in recent pandas.
    timestamps = pd.date_range(start='2023-01-01', periods=len(normal_values), freq='h')
    df = pd.DataFrame({
        'timestamp': timestamps,
        'normal': normal_values,
        'anomalous': anomalous_values,
        'is_anomaly': False
    })
    # The frame still has its default 0..n-1 index here, so the positional
    # anomaly indices can be used directly as labels.
    df.loc[true_anomaly_indices, 'is_anomaly'] = True
    df = df.set_index('timestamp')

    print(f"✅ Generated {len(df)} data points with {len(true_anomaly_indices)} anomalies")
else:
    # Every later cell needs `df`; fix the API connection and re-run this cell.
    print("❌ Failed to generate data")

In [None]:
# Visualize normal vs anomalous data
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
clean_ax, injected_ax = axes

# Top panel: the series as generated, before any anomalies were injected
clean_ax.plot(df.index, df['normal'], color='blue', alpha=0.7, label='Normal Data')
clean_ax.set_title('Normal Time Series')
clean_ax.set_ylabel('Value')

# Bottom panel: the series after injection, with labelled anomaly points in red
injected_ax.plot(df.index, df['anomalous'], color='blue', alpha=0.7, label='Time Series')
anomaly_points = df[df['is_anomaly']]
injected_ax.scatter(
    anomaly_points.index,
    anomaly_points['anomalous'],
    color='red',
    s=50,
    alpha=0.8,
    label=f'Anomalies ({len(anomaly_points)})',
)
injected_ax.set_title('Time Series with Injected Anomalies')
injected_ax.set_ylabel('Value')
injected_ax.set_xlabel('Time')

# Shared cosmetics for both panels
for panel in (clean_ax, injected_ax):
    panel.grid(True, alpha=0.3)
    panel.legend()

plt.tight_layout()
plt.show()

## 2. Statistical Anomaly Detection

In [None]:
def detect_anomalies_zscore(values, threshold=3):
    """Flag points whose absolute Z-score exceeds ``threshold``.

    The Z-score is computed against the mean and standard deviation of the
    whole input. Returns a boolean mask with the same shape as ``values``.
    """
    center = np.mean(values)
    spread = np.std(values)
    standardized = (values - center) / spread
    return np.abs(standardized) > threshold

def detect_anomalies_iqr(values, factor=1.5):
    """Flag points lying outside the interquartile-range fences.

    The fences sit ``factor`` times the IQR below the 25th percentile and
    above the 75th percentile. Returns a boolean mask matching ``values``.
    """
    first_quartile = np.percentile(values, 25)
    third_quartile = np.percentile(values, 75)
    margin = factor * (third_quartile - first_quartile)

    too_low = values < first_quartile - margin
    too_high = values > third_quartile + margin
    return too_low | too_high

def detect_anomalies_moving_avg(values, window=20, threshold=2):
    """Flag points that deviate strongly from a centered rolling mean.

    Each point's distance from the rolling mean is divided by the rolling
    standard deviation over the same centered window. Points where that
    ratio exceeds ``threshold`` are flagged. Positions without a full window
    produce NaN and are therefore not flagged.
    """
    rolling_window = pd.Series(values).rolling(window=window, center=True)
    local_mean = rolling_window.mean()
    local_std = rolling_window.std()

    scaled_deviation = np.abs(values - local_mean) / local_std
    return scaled_deviation > threshold

# Apply statistical methods
# Each detector receives the anomaly-injected series and yields a boolean mask
# that is stored as its own column.
statistical_detectors = {
    'anomaly_zscore': detect_anomalies_zscore,
    'anomaly_iqr': detect_anomalies_iqr,
    'anomaly_moving_avg': detect_anomalies_moving_avg,
}
for column_name, detector in statistical_detectors.items():
    df[column_name] = detector(df['anomalous'])

# Calculate performance metrics
def calculate_metrics(true_anomalies, predicted_anomalies):
    """Compute precision, recall and F1 from two boolean masks.

    Both arguments must support element-wise ``&`` and ``~`` (for example
    boolean NumPy arrays or pandas Series). A metric whose denominator is
    zero is reported as 0.

    Returns:
        Tuple ``(precision, recall, f1)``.
    """
    hits = np.sum(true_anomalies & predicted_anomalies)
    false_alarms = np.sum(~true_anomalies & predicted_anomalies)
    misses = np.sum(true_anomalies & ~predicted_anomalies)

    flagged = hits + false_alarms
    actual = hits + misses

    precision = hits / flagged if flagged > 0 else 0
    recall = hits / actual if actual > 0 else 0

    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1

# Evaluate statistical methods
# `results` maps each method key to its metric dict and is reused when the
# final comparison table is assembled.
methods = ['zscore', 'iqr', 'moving_avg']
results = {}

for method in methods:
    predicted_mask = df[f'anomaly_{method}']
    precision, recall, f1 = calculate_metrics(df['is_anomaly'], predicted_mask)
    results[method] = dict(precision=precision, recall=recall, f1=f1)
    print(f"{method.upper()}: Precision={precision:.3f}, Recall={recall:.3f}, F1={f1:.3f}")

## 3. Machine Learning Anomaly Detection

In [None]:
# Prepare features for ML models
def create_features(values, window=10):
    """Build a per-point feature matrix from a trailing window.

    For every index ``i`` from ``window`` to ``len(values) - 1`` the preceding
    ``window`` points (``values[i-window:i]``, excluding ``i`` itself) are
    summarised into eight features:

    0. window mean
    1. window standard deviation
    2. window minimum
    3. window maximum
    4. window 25th percentile
    5. window 75th percentile
    6. ``values[i]`` minus the window mean
    7. first difference ``values[i] - values[i-1]``

    Args:
        values: 1-D numeric sequence; converted to a float NumPy array.
        window: Number of preceding points per window (must be >= 1).

    Returns:
        Array of shape ``(max(len(values) - window, 0), 8)``. Row ``k``
        describes point ``k + window`` of the input. When the series is not
        longer than the window the result is an empty ``(0, 8)`` array, so
        the column count is always well defined.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    values = np.asarray(values, dtype=float)
    n_features = 8
    features = []

    for i in range(window, len(values)):
        # Statistical features of the points strictly before position i
        window_data = values[i-window:i]
        window_mean = np.mean(window_data)
        feat = [
            window_mean,
            np.std(window_data),
            np.min(window_data),
            np.max(window_data),
            np.percentile(window_data, 25),
            np.percentile(window_data, 75),
            values[i] - window_mean,  # deviation from window mean
            values[i] - values[i-1],  # first difference (i >= window >= 1)
        ]
        features.append(feat)

    if not features:
        return np.empty((0, n_features))
    return np.array(features)

# Create features
window_size = 20
anomalous_array = df['anomalous'].values
X = create_features(anomalous_array, window_size)

# The first `window_size` points have no complete trailing window and produce
# no feature row, so the labels are trimmed to match.
y_true = df['is_anomaly'].values[window_size:]

# Standardize features
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

print(f"Feature matrix shape: {X_scaled.shape}")
print(f"True anomalies in features: {np.sum(y_true)}")

In [None]:
# Isolation Forest
# fit_predict labels outliers as -1; the comparison turns that into a boolean mask.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_predictions = iso_forest.fit_predict(X_scaled)
iso_anomalies = iso_predictions == -1
iso_precision, iso_recall, iso_f1 = calculate_metrics(y_true, iso_anomalies)

# Local Outlier Factor
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
lof_predictions = lof.fit_predict(X_scaled)
lof_anomalies = lof_predictions == -1
lof_precision, lof_recall, lof_f1 = calculate_metrics(y_true, lof_anomalies)

# Report ML methods
print(f"Isolation Forest: Precision={iso_precision:.3f}, Recall={iso_recall:.3f}, F1={iso_f1:.3f}")
print(f"LOF: Precision={lof_precision:.3f}, Recall={lof_recall:.3f}, F1={lof_f1:.3f}")

# Add results to a copy of the dataframe, trimmed to the rows that have features
df_ml = df.iloc[window_size:].assign(
    anomaly_isolation_forest=iso_anomalies,
    anomaly_lof=lof_anomalies,
)

## 4. LSTM Autoencoder Anomaly Detection

In [None]:
# Prepare data for LSTM autoencoder
def create_sequences(data, seq_length):
    """Slice ``data`` into overlapping windows of ``seq_length`` with stride 1.

    Returns an array holding ``len(data) - seq_length + 1`` windows, where
    window ``k`` is ``data[k:k + seq_length]``.
    """
    n_windows = len(data) - seq_length + 1
    windows = [data[start:start + seq_length] for start in range(n_windows)]
    return np.array(windows)

# Train on the clean series only, so the autoencoder learns what "normal" looks like
normal_data_only = df['normal'].values
seq_length = 50

# Normalize both series with statistics taken from the clean series alone
data_mean = np.mean(normal_data_only)
data_std = np.std(normal_data_only)


def _standardize(series):
    """Scale a series with the clean-series mean and standard deviation."""
    return (series - data_mean) / data_std


normalized_normal = _standardize(normal_data_only)
normalized_anomalous = _standardize(df['anomalous'].values)

# Create sequences: clean windows for training, anomaly-injected windows for testing
X_train = create_sequences(normalized_normal, seq_length)
X_test = create_sequences(normalized_anomalous, seq_length)

print(f"Training sequences: {X_train.shape}")
print(f"Test sequences: {X_test.shape}")

In [None]:
# Build LSTM autoencoder
def build_lstm_autoencoder(seq_length, n_features=1):
    """Create and compile an LSTM sequence-to-sequence autoencoder.

    The encoder stacks two LSTM layers (64 then 32 units) and keeps only the
    final state. The decoder repeats that state ``seq_length`` times, runs it
    through two LSTM layers (32 then 64 units) and projects every time step
    back to ``n_features`` values with a dense layer.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer and MSE loss.
    """
    inputs = Input(shape=(seq_length, n_features))

    # Encoder: compress the whole window into one 32-dimensional vector
    hidden = LSTM(64, activation='relu', return_sequences=True)(inputs)
    bottleneck = LSTM(32, activation='relu', return_sequences=False)(hidden)

    # Decoder: expand the vector back into a sequence of the original length
    sequence = RepeatVector(seq_length)(bottleneck)
    for units in (32, 64):
        sequence = LSTM(units, activation='relu', return_sequences=True)(sequence)
    outputs = TimeDistributed(Dense(n_features))(sequence)

    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mse')
    return model

# Build and train model
# Seed Python, NumPy and TensorFlow before the model is built, so that weight
# initialisation and training are repeatable across runs. Exact equality can
# still depend on hardware and TensorFlow version.
tf.keras.utils.set_random_seed(42)

autoencoder = build_lstm_autoencoder(seq_length)
print("Training LSTM autoencoder...")

# Reshape for LSTM (add feature dimension): (samples, timesteps) -> (samples, timesteps, 1)
X_train_reshaped = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_reshaped = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Train autoencoder to reproduce its own input (input and target are identical)
history = autoencoder.fit(
    X_train_reshaped, X_train_reshaped,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=0
)

print("✅ LSTM autoencoder training completed")

In [None]:
# Use autoencoder for anomaly detection
def _window_mse(original, reconstructed):
    """Mean squared reconstruction error per window (averaged over time and features)."""
    return np.mean(np.square(original - reconstructed), axis=(1, 2))


# Reconstruction error of every test window
X_pred = autoencoder.predict(X_test_reshaped)
reconstruction_errors = _window_mse(X_test_reshaped, X_pred)

# Threshold: 95th percentile of the errors observed on the clean training windows
train_pred = autoencoder.predict(X_train_reshaped)
train_errors = _window_mse(X_train_reshaped, train_pred)
threshold = np.percentile(train_errors, 95)

# Identify anomalies
lstm_anomalies = reconstruction_errors > threshold

# Map back to original indices: window k is compared with the label of its
# last point, i.e. position k + seq_length - 1 of the series.
first_label = seq_length - 1
y_true_lstm = df['is_anomaly'].values[first_label:first_label + len(lstm_anomalies)]

# Evaluate LSTM autoencoder
lstm_precision, lstm_recall, lstm_f1 = calculate_metrics(y_true_lstm, lstm_anomalies)
print(f"LSTM Autoencoder: Precision={lstm_precision:.3f}, Recall={lstm_recall:.3f}, F1={lstm_f1:.3f}")
print(f"Threshold: {threshold:.4f}")
print(f"Anomalies detected: {np.sum(lstm_anomalies)}")

## 5. Results Comparison and Visualization

In [None]:
# Compile all results
# Statistical scores come from `results`; the ML / deep-learning scores are
# assembled from the variables produced by their evaluation cells.
all_results = {
    'Z-Score': results['zscore'],
    'IQR': results['iqr'],
    'Moving Avg': results['moving_avg'],
}
all_results['Isolation Forest'] = dict(precision=iso_precision, recall=iso_recall, f1=iso_f1)
all_results['LOF'] = dict(precision=lof_precision, recall=lof_recall, f1=lof_f1)
all_results['LSTM Autoencoder'] = dict(precision=lstm_precision, recall=lstm_recall, f1=lstm_f1)

# Create comparison DataFrame: one row per method, one column per metric
comparison_df = pd.DataFrame(all_results).T
print("🏆 Method Comparison:")
print(comparison_df.round(3))

In [None]:
# Visualization of results
fig, axes = plt.subplots(2, 2, figsize=(20, 12))
(perf_ax, f1_ax), (error_ax, series_ax) = axes

# --- Top left: grouped bars with precision / recall / F1 per method ---
methods = list(all_results)
metrics = ['precision', 'recall', 'f1']
colors = ['skyblue', 'lightgreen', 'salmon']

x = np.arange(len(methods))
width = 0.25

for offset, (metric, bar_color) in enumerate(zip(metrics, colors)):
    values = [all_results[name][metric] for name in methods]
    perf_ax.bar(x + offset*width, values, width, label=metric.capitalize(), color=bar_color)

perf_ax.set_xlabel('Methods')
perf_ax.set_ylabel('Score')
perf_ax.set_title('Anomaly Detection Performance Comparison')
perf_ax.set_xticks(x + width)  # centre the tick under the middle bar of each group
perf_ax.set_xticklabels(methods, rotation=45)
perf_ax.legend()
perf_ax.grid(True, alpha=0.3)

# --- Top right: F1-score only ---
f1_scores = [all_results[name]['f1'] for name in methods]
f1_ax.bar(methods, f1_scores, color='lightcoral')
f1_ax.set_title('F1-Score Comparison')
f1_ax.set_ylabel('F1-Score')
f1_ax.tick_params(axis='x', rotation=45)
f1_ax.grid(True, alpha=0.3)

# --- Bottom left: LSTM reconstruction error per test window ---
flagged_windows = np.where(lstm_anomalies)[0]
error_ax.plot(reconstruction_errors, alpha=0.7, label='Reconstruction Error')
error_ax.axhline(y=threshold, color='red', linestyle='--', label=f'Threshold ({threshold:.4f})')
error_ax.scatter(flagged_windows, reconstruction_errors[lstm_anomalies],
                 color='red', s=30, alpha=0.8, label='Detected Anomalies')
error_ax.set_title('LSTM Autoencoder - Reconstruction Errors')
error_ax.set_xlabel('Time Steps')
error_ax.set_ylabel('Reconstruction Error')
error_ax.legend()
error_ax.grid(True, alpha=0.3)

# --- Bottom right: a slice of the series with true vs Z-score-detected anomalies ---
time_subset = df.iloc[100:300]  # Show a subset for clarity
true_anomalies_subset = time_subset[time_subset['is_anomaly']]
detected_subset = time_subset[time_subset['anomaly_zscore']]

series_ax.plot(time_subset.index, time_subset['anomalous'], alpha=0.7, label='Time Series')
series_ax.scatter(true_anomalies_subset.index, true_anomalies_subset['anomalous'],
                  color='red', s=50, label='True Anomalies', marker='x')
series_ax.scatter(detected_subset.index, detected_subset['anomalous'],
                  color='orange', s=30, alpha=0.7, label='Z-Score Detected', marker='o')
series_ax.set_title('Anomaly Detection Results (Subset)')
series_ax.set_xlabel('Time')
series_ax.set_ylabel('Value')
series_ax.legend()
series_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Summary and Recommendations

In [None]:
# Generate summary report
def _best_by(metric):
    """Return (method name, score) for the highest value of `metric`; first method wins ties."""
    winner = max(all_results, key=lambda name: all_results[name][metric])
    return winner, all_results[winner][metric]


banner = "="*80

print("\n" + banner)
print("📋 ANOMALY DETECTION ANALYSIS SUMMARY")
print(banner)

# Dataset overview
anomaly_flags = df['is_anomaly']
first_day = df.index.min().strftime('%Y-%m-%d')
last_day = df.index.max().strftime('%Y-%m-%d')

print("\n📊 Dataset Statistics:")
print(f"   Total data points: {len(df)}")
print(f"   True anomalies: {np.sum(anomaly_flags)} ({np.mean(anomaly_flags)*100:.1f}%)")
print(f"   Time range: {first_day} to {last_day}")

# Winner per metric
print("\n🏆 Best Performing Methods:")
best_method, best_f1 = _best_by('f1')
print(f"   Best F1-Score: {best_method} ({best_f1:.3f})")

best_precision_method, best_precision = _best_by('precision')
print(f"   Best Precision: {best_precision_method} ({best_precision:.3f})")

best_recall_method, best_recall = _best_by('recall')
print(f"   Best Recall: {best_recall_method} ({best_recall:.3f})")

# Overall verdict, driven by the best F1-score
print("\n💡 Recommendations:")
if best_f1 > 0.7:
    print(f"   ✅ {best_method} shows excellent performance for this dataset")
elif best_f1 > 0.5:
    print(f"   ⚠️ {best_method} shows moderate performance - consider ensemble methods")
else:
    print("   ❌ All methods show poor performance - review data quality and parameters")

# Per-method verdict, using fixed F1 bands (> 0.6 high, > 0.3 moderate)
print("\n🔧 Method-Specific Insights:")
for method, result in all_results.items():
    f1 = result['f1']
    if f1 > 0.6:
        print(f"   ✅ {method}: High performance (F1={f1:.3f})")
    elif f1 > 0.3:
        print(f"   ⚠️ {method}: Moderate performance (F1={f1:.3f}) - tune parameters")
    else:
        print(f"   ❌ {method}: Poor performance (F1={f1:.3f}) - not suitable for this data")

print("\n📈 Improvement Suggestions:")
for suggestion in (
    "Try ensemble methods combining multiple detectors",
    "Adjust contamination rate based on domain knowledge",
    "Use domain-specific features for better detection",
    "Consider temporal patterns and seasonality",
    "Validate results with domain experts",
):
    print(f"   • {suggestion}")

print("\n" + banner)

## Next Steps

This notebook demonstrated various anomaly detection techniques on TSIOT-generated synthetic data. Consider:

1. **Parameter Tuning**: Optimize thresholds and parameters for each method
2. **Ensemble Methods**: Combine multiple detectors for better performance
3. **Real-time Detection**: Implement streaming anomaly detection
4. **Domain Adaptation**: Customize methods for specific use cases
5. **Validation**: Test on real-world data to ensure effectiveness

For more examples, explore other notebooks in this directory or visit the [TSIOT documentation](../../docs/).