# Advanced Anomaly Detection for Equipment Monitoring

This notebook demonstrates comprehensive anomaly detection techniques for semiconductor equipment monitoring, including unsupervised methods and real-time scoring systems.

## Learning Objectives

By the end of this notebook, you will:
- Implement multiple anomaly detection algorithms (Isolation Forest, One-Class SVM, Autoencoders)
- Build ensemble methods for robust anomaly detection
- Create real-time anomaly scoring systems
- Integrate with manufacturing execution systems
- Evaluate performance using semiconductor-specific metrics

In [None]:
import sys
import os
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve

# Configure plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("viridis")
%matplotlib inline

# Import our anomaly detection pipeline
from anomaly_detection_pipeline import AnomalyDetectionPipeline, EquipmentDataGenerator

print("Environment setup complete")
print(f"Working directory: {os.getcwd()}")

## 1. Understanding Equipment Monitoring Data

Semiconductor equipment generates continuous streams of sensor data that need to be monitored for anomalies:

In [None]:
# Build the synthetic equipment monitoring dataset.
generator_settings = {
    'n_samples': 5000,
    'anomaly_fraction': 0.05,  # 5% anomalies
    'equipment_types': ['etch', 'deposition', 'lithography', 'inspection'],
}
data_generator = EquipmentDataGenerator(**generator_settings)

# One table holding the generated equipment parameters
equipment_data = data_generator.generate_equipment_data()
column_names = equipment_data.columns.tolist()
print(f"Generated dataset shape: {equipment_data.shape}")
print(f"Dataset columns: {column_names}")

# Summary statistics for a first look at the value ranges
summary_statistics = equipment_data.describe()
print("\nDataset Overview:")
print(summary_statistics)

In [None]:
# Visualize equipment parameter distributions
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Equipment Parameter Distributions', fontsize=16)

# Select key parameters for visualization (one per subplot, row-major order)
key_params = ['temperature', 'pressure', 'flow_rate', 'power', 'vacuum_level', 'vibration']
has_labels = 'is_anomaly' in equipment_data.columns

for ax, param in zip(axes.flat, key_params):
    if param not in equipment_data.columns:
        # Hide the panel instead of leaving an empty, unlabeled axes behind
        ax.set_visible(False)
        continue

    if has_labels:
        # Overlay normal and anomalous samples so the shift is visible
        normal_data = equipment_data.loc[equipment_data['is_anomaly'] == 0, param]
        anomaly_data = equipment_data.loc[equipment_data['is_anomaly'] == 1, param]

        ax.hist(normal_data, bins=50, alpha=0.7, label='Normal', color='blue')
        ax.hist(anomaly_data, bins=50, alpha=0.7, label='Anomaly', color='red')
        ax.legend()
    else:
        ax.hist(equipment_data[param], bins=50, alpha=0.7, color='skyblue')

    pretty_name = param.replace('_', ' ').title()
    ax.set_title(f'{pretty_name} Distribution')
    ax.set_xlabel(pretty_name)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

print("\nAnomaly Distribution:")
if has_labels:
    anomaly_counts = equipment_data['is_anomaly'].value_counts()
    # .get() keeps the report working when one class is absent from the data;
    # plain indexing raises KeyError in that case.
    normal_count = int(anomaly_counts.get(0, 0))
    anomalous_count = int(anomaly_counts.get(1, 0))
    print(f"Normal samples: {normal_count}")
    print(f"Anomalous samples: {anomalous_count}")
    print(f"Anomaly rate: {anomalous_count / len(equipment_data):.1%}")

## 2. Time Series Analysis of Equipment Data

Equipment anomalies often manifest as temporal patterns. Let's analyze the time series characteristics:

In [None]:
# Generate time series data for detailed analysis
time_series_data = data_generator.generate_time_series_data(
    duration_hours=24,
    sampling_frequency=60  # 1 sample per minute
)

print(f"Time series data shape: {time_series_data.shape}")
print(f"Time range: {time_series_data['timestamp'].min()} to {time_series_data['timestamp'].max()}")

# Plot key parameters over time
fig, axes = plt.subplots(3, 1, figsize=(16, 12))
fig.suptitle('Equipment Parameters Over Time (24 Hours)', fontsize=16)

# (column, line color, y-axis label, panel title) for each stacked panel.
# The temperature label previously contained a mis-decoded degree sign.
panels = [
    ('temperature', 'red', 'Temperature (°C)', 'Process Temperature'),
    ('pressure', 'blue', 'Pressure (Torr)', 'Chamber Pressure'),
    ('flow_rate', 'green', 'Flow Rate (sccm)', 'Gas Flow Rate'),
]

for ax, (column, line_color, y_label, panel_title) in zip(axes, panels):
    ax.plot(time_series_data['timestamp'], time_series_data[column],
            color=line_color, alpha=0.8, linewidth=1)
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.grid(True, alpha=0.3)

axes[-1].set_xlabel('Time')

# Highlight anomalous samples if labels are available
if 'is_anomaly' in time_series_data.columns:
    anomaly_periods = time_series_data[time_series_data['is_anomaly'] == 1]
    for ax in axes:
        for anomaly_time in anomaly_periods['timestamp']:
            ax.axvline(x=anomaly_time, color='red', alpha=0.3, linestyle='--')

# Rotate the tick labels before tight_layout so the layout pass accounts for
# them. plt.xticks acts on the current axes, i.e. the bottom panel here.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print(f"Time series statistics computed")

## 3. Initialize Anomaly Detection Pipeline

Our pipeline supports multiple algorithms and ensemble methods:

In [None]:
# Configure the ensemble detector used throughout the rest of the notebook.
pipeline_settings = dict(
    algorithms=['isolation_forest', 'one_class_svm', 'autoencoder'],
    ensemble_method='voting',
    contamination=0.05,  # Expected anomaly rate
    random_state=42,
)
pipeline = AnomalyDetectionPipeline(**pipeline_settings)

print(f"Pipeline initialized with algorithms: {pipeline.algorithms}")
print(f"Ensemble method: {pipeline.ensemble_method}")
print(f"Expected contamination rate: {pipeline.contamination:.1%}")

# Everything except the timestamp, the label and the equipment identifier
# is used as a model input.
non_feature_columns = {'timestamp', 'is_anomaly', 'equipment_id'}
feature_columns = [name for name in equipment_data.columns
                   if name not in non_feature_columns]
X = equipment_data.loc[:, feature_columns].copy()

print(f"\nFeature matrix shape: {X.shape}")
print(f"Feature columns: {feature_columns}")

## 4. Train Individual Anomaly Detection Models

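Let's split the data into training and test sets and fit the full pipeline, which trains all configured algorithms together. Per-algorithm results are compared in Section 8: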

In [None]:
# Split data for training and testing
# Note: In unsupervised anomaly detection, we typically use all data for training
# But we'll create a test set for evaluation purposes

# Labels are optional. Look them up once so the split and the label handling
# agree; previously the split stratified on 'is_anomaly' unconditionally and
# raised KeyError in exactly the unlabeled case handled below.
labels = equipment_data['is_anomaly'] if 'is_anomaly' in equipment_data.columns else None

# stratify=None is a plain random split; with labels present the anomaly
# rate is kept the same in both partitions.
X_train, X_test = train_test_split(X, test_size=0.3, random_state=42, stratify=labels)

if labels is not None:
    y_train = labels.loc[X_train.index]
    y_test = labels.loc[X_test.index]
else:
    y_train = None
    y_test = None

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

if y_train is not None:
    print(f"Training anomaly rate: {y_train.mean():.1%}")
    print(f"Test anomaly rate: {y_test.mean():.1%}")

In [None]:
# Fit the ensemble on the training partition
print("Training anomaly detection models...")
pipeline.fit(X_train)
print("Training completed!")

# Score the held-out partition: hard labels first, then continuous scores
predictions = pipeline.predict(X_test)
anomaly_scores = pipeline.decision_function(X_test)

flagged_count = predictions.sum()
scored_count = len(predictions)
flagged_rate = predictions.mean()

print(f"\nPredictions shape: {predictions.shape}")
print(f"Anomaly scores shape: {anomaly_scores.shape}")
print(f"Predicted anomalies: {flagged_count} out of {scored_count} samples")
print(f"Predicted anomaly rate: {flagged_rate:.1%}")

## 5. Evaluate Model Performance

Let's evaluate the performance using various metrics suitable for anomaly detection:

In [None]:
# Evaluate performance if we have ground truth labels
if y_test is None:
    print("No ground truth labels available for evaluation")
    print(f"Detected {predictions.sum()} anomalies out of {len(predictions)} samples")
else:
    evaluation_results = pipeline.evaluate(X_test, y_test)

    print("Anomaly Detection Performance:")
    print("=" * 40)
    for metric, value in evaluation_results['metrics'].items():
        if isinstance(value, float):
            print(f"{metric}: {value:.4f}")
        else:
            print(f"{metric}: {value}")

    # Detailed classification report.
    # labels=[0, 1] pins both classes so the two target_names always line up;
    # without it sklearn raises ValueError when only one class occurs in
    # y_test and predictions. zero_division=0 reports 0 for a class with no
    # predicted or true samples instead of emitting a warning.
    print("\nDetailed Classification Report:")
    print(classification_report(y_test, predictions,
                                labels=[0, 1],
                                target_names=['Normal', 'Anomaly'],
                                zero_division=0))

    # ROC-AUC score
    # NOTE(review): roc_auc_score assumes larger scores mean "anomaly", while
    # the plots and alert thresholds elsewhere in this notebook treat scores
    # below 0 as anomalous. Confirm the orientation of
    # pipeline.decision_function; negate the scores here if lower means anomalous.
    if len(np.unique(y_test)) > 1:  # Need both classes for AUC
        auc_score = roc_auc_score(y_test, anomaly_scores)
        print(f"\nROC-AUC Score: {auc_score:.4f}")

## 6. Visualize Anomaly Detection Results

Let's create comprehensive visualizations to understand the detection results:

In [None]:
# Create visualization of anomaly detection results
from sklearn.metrics import confusion_matrix

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Anomaly Detection Analysis', fontsize=16)
score_ax, importance_ax = axes[0]
pr_ax, cm_ax = axes[1]

# 1. Anomaly score distribution
score_ax.hist(anomaly_scores, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
score_ax.axvline(x=0, color='red', linestyle='--', alpha=0.7, label='Decision Threshold')
score_ax.set_xlabel('Anomaly Score')
score_ax.set_ylabel('Frequency')
score_ax.set_title('Distribution of Anomaly Scores')
score_ax.legend()
score_ax.grid(True, alpha=0.3)

# 2. Feature importance for anomaly detection (only if the pipeline offers it)
if hasattr(pipeline, 'get_feature_importance'):
    feature_importance = pipeline.get_feature_importance()
    top_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:10]

    feature_names = [name for name, _ in top_features]
    importance_values = [value for _, value in top_features]

    importance_ax.barh(feature_names, importance_values, alpha=0.7, color='lightcoral')
    importance_ax.set_xlabel('Importance Score')
    importance_ax.set_title('Top 10 Features for Anomaly Detection')
    importance_ax.grid(True, alpha=0.3)
else:
    # Nothing to draw: hide the panel rather than show an empty frame
    importance_ax.set_visible(False)

# 3. Precision-Recall curve (needs labels containing both classes)
# NOTE(review): precision_recall_curve assumes larger scores mean "anomaly",
# while the threshold line above treats 0 as the boundary and the alerting
# code treats negative scores as anomalous. Confirm the score orientation.
if y_test is not None and len(np.unique(y_test)) > 1:
    precision, recall, thresholds = precision_recall_curve(y_test, anomaly_scores)
    pr_ax.plot(recall, precision, linewidth=2, color='green')
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title('Precision-Recall Curve')
    pr_ax.grid(True, alpha=0.3)

    # Add baseline: the precision of flagging every sample as an anomaly
    baseline = y_test.mean()
    pr_ax.axhline(y=baseline, color='red', linestyle='--',
                  alpha=0.7, label=f'Baseline ({baseline:.3f})')
    pr_ax.legend()
else:
    pr_ax.set_visible(False)

# 4. Confusion matrix heatmap (if labels available)
if y_test is not None:
    # labels=[0, 1] forces a 2x2 matrix so it always matches the two tick
    # labels, even if one class never appears in y_test or predictions.
    cm = confusion_matrix(y_test, predictions, labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'Anomaly'],
                yticklabels=['Normal', 'Anomaly'],
                ax=cm_ax)
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')
    cm_ax.set_title('Confusion Matrix')
else:
    cm_ax.set_visible(False)

plt.tight_layout()
plt.show()

print("Visualization completed")

## 7. Real-time Anomaly Scoring System

Let's implement a real-time scoring system for continuous monitoring:

In [None]:
# Simulate real-time data stream
def simulate_real_time_stream(duration_minutes=60, frequency_seconds=10,
                              generator=None, start_time=None):
    """Simulate a real-time equipment data stream.

    Args:
        duration_minutes: Length of the simulated stream in minutes.
        frequency_seconds: Spacing between consecutive samples in seconds.
        generator: Object exposing ``generate_single_sample()``. Defaults to
            the notebook-level ``data_generator``.
        start_time: Timestamp of the first sample. Defaults to
            ``datetime.now()`` at call time.

    Returns:
        A ``(DataFrame, timestamps)`` tuple: one DataFrame row per generated
        sample and a list of matching ``datetime`` objects of equal length.
    """
    if generator is None:
        generator = data_generator
    if start_time is None:
        start_time = datetime.now()

    # int() keeps range() working when duration_minutes is given as a float
    num_points = int((duration_minutes * 60) // frequency_seconds)

    timestamps = [start_time + timedelta(seconds=i * frequency_seconds)
                  for i in range(num_points)]
    data_points = [generator.generate_single_sample() for _ in range(num_points)]

    return pd.DataFrame(data_points), timestamps

# Produce a 30-minute simulated stream
streaming_data, timestamps = simulate_real_time_stream(duration_minutes=30)
print(f"Generated {len(streaming_data)} real-time data points")
print(f"Streaming data shape: {streaming_data.shape}")

# Keep only the columns the pipeline was trained on
streaming_features = streaming_data[feature_columns]

# Score the stream one sample at a time, as a live feed would deliver it
print("\nProcessing real-time stream...")
score_log = []
prediction_log = []
stream_length = len(timestamps)

for position, sample_row in enumerate(streaming_features.values):
    # The pipeline is given a 2-D array holding a single row
    single_sample = sample_row.reshape(1, -1)

    score_log.append(pipeline.decision_function(single_sample)[0])
    prediction_log.append(pipeline.predict(single_sample)[0])

    # Progress message on every 50th sample
    if position % 50 == 0:
        print(f"Processed {position + 1}/{stream_length} samples...")

streaming_scores = np.array(score_log)
streaming_predictions = np.array(prediction_log)

print("\nReal-time processing completed!")
print(f"Detected {streaming_predictions.sum()} anomalies in stream")
print(f"Real-time anomaly rate: {streaming_predictions.mean():.1%}")

In [None]:
# Three stacked panels: scores, one raw parameter, running anomaly count
fig, axes = plt.subplots(3, 1, figsize=(16, 12))
fig.suptitle('Real-time Anomaly Detection Results', fontsize=16)
score_ax, param_ax, count_ax = axes

# Express every timestamp as minutes elapsed since the first sample
stream_start = timestamps[0]
relative_minutes = [(ts - stream_start).total_seconds() / 60 for ts in timestamps]

# 1. Anomaly scores over time, shading the stretches that fall below zero
below_threshold = streaming_scores < 0
score_ax.plot(relative_minutes, streaming_scores, color='blue', alpha=0.8, linewidth=1)
score_ax.axhline(y=0, color='red', linestyle='--', alpha=0.7, label='Anomaly Threshold')
score_ax.fill_between(relative_minutes, streaming_scores, 0,
                      where=below_threshold, alpha=0.3, color='red', label='Anomalies')
score_ax.set_ylabel('Anomaly Score')
score_ax.set_title('Real-time Anomaly Scores')
score_ax.legend()
score_ax.grid(True, alpha=0.3)

# 2. One equipment parameter: temperature when present, else the first column
if 'temperature' in streaming_features.columns:
    param_to_plot = 'temperature'
else:
    param_to_plot = streaming_features.columns[0]
param_series = streaming_features[param_to_plot]
param_label = param_to_plot.replace('_', ' ').title()

param_ax.plot(relative_minutes, param_series, color='green', alpha=0.8, linewidth=1)

# Overlay a marker on every sample the pipeline flagged
anomaly_indices = np.flatnonzero(streaming_predictions == 1)
if anomaly_indices.size:
    anomaly_times = [relative_minutes[idx] for idx in anomaly_indices]
    anomaly_values = param_series.iloc[anomaly_indices].tolist()
    param_ax.scatter(anomaly_times, anomaly_values, color='red', s=50,
                     alpha=0.8, zorder=5, label='Detected Anomalies')
    param_ax.legend()

param_ax.set_ylabel(param_label)
param_ax.set_title(f'{param_label} with Anomaly Detection')
param_ax.grid(True, alpha=0.3)

# 3. Running total of flagged samples
cumulative_anomalies = np.cumsum(streaming_predictions)
count_ax.plot(relative_minutes, cumulative_anomalies, color='orange',
              linewidth=2, marker='o', markersize=2)
count_ax.set_xlabel('Time (minutes)')
count_ax.set_ylabel('Cumulative Anomalies')
count_ax.set_title('Cumulative Anomaly Count Over Time')
count_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Real-time monitoring visualization completed")
print(f"Total monitoring duration: {relative_minutes[-1]:.1f} minutes")
print(f"Average anomaly score: {streaming_scores.mean():.4f}")
print(f"Score standard deviation: {streaming_scores.std():.4f}")

## 8. Ensemble Methods and Model Comparison

Let's compare individual algorithms and ensemble performance:

In [None]:
# Compare the individual algorithms with the ensemble.
# Imported once at the top of the cell: previously the import sat inside the
# per-algorithm loop, so the ensemble section below could hit a NameError
# when the loop body never ran.
from sklearn.metrics import precision_score, recall_score, f1_score

# AUC is only defined when the labels contain both classes
has_both_classes = y_test is not None and len(np.unique(y_test)) > 1

# Guard on the method that is actually called (the old check looked for an
# unrelated 'individual_results' attribute).
if callable(getattr(pipeline, 'get_individual_predictions', None)):
    individual_results = pipeline.get_individual_predictions(X_test)

    print("Individual Algorithm Performance:")
    print("=" * 50)

    algorithm_performance = {}

    for algorithm_name, predictions_dict in individual_results.items():
        alg_predictions = predictions_dict['predictions']
        alg_scores = predictions_dict['scores']

        if y_test is not None:
            precision = precision_score(y_test, alg_predictions, zero_division=0)
            recall = recall_score(y_test, alg_predictions, zero_division=0)
            f1 = f1_score(y_test, alg_predictions, zero_division=0)

            # NaN marks "undefined"; 0.0 would read as a perfectly inverted
            # ranking.
            # NOTE(review): roc_auc_score assumes larger scores mean
            # "anomaly"; confirm the orientation of the per-algorithm scores.
            auc = roc_auc_score(y_test, alg_scores) if has_both_classes else float('nan')

            algorithm_performance[algorithm_name] = {
                'precision': precision,
                'recall': recall,
                'f1_score': f1,
                'roc_auc': auc,
                'detected_anomalies': alg_predictions.sum()
            }

            print(f"\n{algorithm_name.upper()}:")
            print(f"  Precision: {precision:.4f}")
            print(f"  Recall: {recall:.4f}")
            print(f"  F1-Score: {f1:.4f}")
            print(f"  ROC-AUC: {auc:.4f}")
            print(f"  Detected Anomalies: {alg_predictions.sum()}")
        else:
            print(f"\n{algorithm_name.upper()}:")
            print(f"  Detected Anomalies: {alg_predictions.sum()}")
            print(f"  Detection Rate: {alg_predictions.mean():.1%}")

    # Compare with ensemble
    if y_test is not None:
        ensemble_precision = precision_score(y_test, predictions, zero_division=0)
        ensemble_recall = recall_score(y_test, predictions, zero_division=0)
        ensemble_f1 = f1_score(y_test, predictions, zero_division=0)
        ensemble_auc = roc_auc_score(y_test, anomaly_scores) if has_both_classes else float('nan')

        print(f"\nENSEMBLE:")
        print(f"  Precision: {ensemble_precision:.4f}")
        print(f"  Recall: {ensemble_recall:.4f}")
        print(f"  F1-Score: {ensemble_f1:.4f}")
        print(f"  ROC-AUC: {ensemble_auc:.4f}")
        print(f"  Detected Anomalies: {predictions.sum()}")
else:
    print("Individual algorithm results not available")
    print("Using ensemble predictions only")

## 9. Manufacturing Integration and Alerting

Let's implement a production-ready alerting system:

In [None]:
# Manufacturing Integration Example
class ManufacturingAnomalyAlert:
    """Score equipment samples with a fitted pipeline and keep an alert log.

    A score at or below a threshold triggers the corresponding alert level,
    so lower scores are treated as more anomalous.

    Args:
        pipeline: Object exposing ``decision_function(X)`` and ``predict(X)``.
            Both are called with a 2-D, single-row array and the first
            element of the result is used.
        alert_threshold: Scores <= this value produce a 'warning'.
        critical_threshold: Scores <= this value produce a 'critical' alert.
            It is checked first, so it should not exceed ``alert_threshold``;
            otherwise the 'warning' level can never be reached.
    """

    def __init__(self, pipeline, alert_threshold=-0.5,
                 critical_threshold=-1.0):
        self.pipeline = pipeline
        self.alert_threshold = alert_threshold
        self.critical_threshold = critical_threshold
        # One record per processed sample, in processing order
        self.alert_history = []

    def process_sample(self, sample_data, equipment_id, timestamp):
        """Process a single equipment sample and generate alerts.

        Args:
            sample_data: Feature values for one sample; any array-like
                (ndarray, list, pandas Series) is accepted.
            equipment_id: Identifier stored with the record.
            timestamp: Sample time stored with the record; it is compared
                against ``datetime`` values in ``get_recent_alerts``.

        Returns:
            The alert record (dict) that was appended to ``alert_history``.
        """
        features = np.asarray(sample_data)
        # Build the single-row batch once and reuse it for both calls
        batch = features.reshape(1, -1)

        # Plain Python scalars keep the record serializable, in line with
        # 'sample_data' being stored as a list.
        score = float(self.pipeline.decision_function(batch)[0])
        prediction = int(self.pipeline.predict(batch)[0])

        # Most severe level first
        if score <= self.critical_threshold:
            alert_level = 'critical'
        elif score <= self.alert_threshold:
            alert_level = 'warning'
        else:
            alert_level = 'normal'

        alert_record = {
            'timestamp': timestamp,
            'equipment_id': equipment_id,
            'anomaly_score': score,
            'is_anomaly': prediction,
            'alert_level': alert_level,
            'sample_data': features.tolist()
        }

        self.alert_history.append(alert_record)

        return alert_record

    def get_recent_alerts(self, hours=1, now=None):
        """Get non-normal alerts from the last N hours.

        Args:
            hours: Size of the look-back window in hours.
            now: Reference time that ends the window. Defaults to
                ``datetime.now()``; pass an explicit value for reproducible
                results or when replaying historical data.

        Returns:
            List of alert records whose timestamp is at or after
            ``now - hours`` and whose level is not 'normal'. Records with
            timestamps later than ``now`` are included as well.
        """
        if now is None:
            now = datetime.now()
        cutoff_time = now - timedelta(hours=hours)
        return [alert for alert in self.alert_history
                if alert['timestamp'] >= cutoff_time
                and alert['alert_level'] != 'normal']

    def generate_summary_report(self):
        """Generate a summary report of anomaly detection.

        Returns:
            Dict with the sample count, number of predicted anomalies,
            warning and critical alert counts, anomaly rate (0 when no
            samples were processed) and mean anomaly score (NaN when no
            samples were processed).
        """
        history = self.alert_history
        total_samples = len(history)
        anomalies = sum(1 for alert in history if alert['is_anomaly'])
        # Named *_count so the imported 'warnings' module is not shadowed
        warning_count = sum(1 for alert in history if alert['alert_level'] == 'warning')
        critical_count = sum(1 for alert in history if alert['alert_level'] == 'critical')

        # np.mean([]) emits a RuntimeWarning; return NaN explicitly instead
        if history:
            avg_score = float(np.mean([alert['anomaly_score'] for alert in history]))
        else:
            avg_score = float('nan')

        return {
            'total_samples': total_samples,
            'detected_anomalies': anomalies,
            'warning_alerts': warning_count,
            'critical_alerts': critical_count,
            'anomaly_rate': anomalies / total_samples if total_samples > 0 else 0,
            'avg_anomaly_score': avg_score
        }

# Wire the fitted pipeline into the alerting layer with notebook thresholds
alert_thresholds = {'alert_threshold': -0.3, 'critical_threshold': -0.8}
alert_system = ManufacturingAnomalyAlert(pipeline, **alert_thresholds)

print("Manufacturing alert system initialized")
print(f"Alert threshold: {alert_system.alert_threshold}")
print(f"Critical threshold: {alert_system.critical_threshold}")

In [None]:
# Simulate processing streaming data through alert system.
# The emoji and bullet characters in the messages below were stored
# mis-decoded (UTF-8 bytes read as a legacy encoding) and have been restored.
print("Processing streaming data through alert system...")

for i, (timestamp, features) in enumerate(zip(timestamps, streaming_features.values)):
    equipment_id = f"EQ_{(i % 4) + 1:03d}"  # Cycle through four simulated equipment IDs
    alert_record = alert_system.process_sample(features, equipment_id, timestamp)

    # Print real-time alerts (anything that is not 'normal')
    if alert_record['alert_level'] != 'normal':
        print(f"⚠️  ALERT: {alert_record['alert_level'].upper()} - "
              f"Equipment {equipment_id} at {timestamp.strftime('%H:%M:%S')} - "
              f"Score: {alert_record['anomaly_score']:.3f}")

# Generate summary report
summary = alert_system.generate_summary_report()
print("\n" + "="*60)
print("MANUFACTURING ANOMALY DETECTION SUMMARY")
print("="*60)
print(f"📊 Total Samples Processed: {summary['total_samples']}")
print(f"🔍 Detected Anomalies: {summary['detected_anomalies']}")
print(f"⚠️  Warning Alerts: {summary['warning_alerts']}")
print(f"🚨 Critical Alerts: {summary['critical_alerts']}")
print(f"📈 Overall Anomaly Rate: {summary['anomaly_rate']:.1%}")
print(f"📉 Average Anomaly Score: {summary['avg_anomaly_score']:.4f}")

# Get recent alerts
recent_alerts = alert_system.get_recent_alerts(hours=1)
print(f"\n🕐 Recent Alerts (last hour): {len(recent_alerts)}")

if recent_alerts:
    print("\nRecent Alert Details:")
    for alert in recent_alerts[-5:]:  # Show last 5 alerts
        print(f"  • {alert['timestamp'].strftime('%H:%M:%S')} - "
              f"{alert['equipment_id']} - "
              f"{alert['alert_level'].upper()} "
              f"(Score: {alert['anomaly_score']:.3f})")

## 10. Model Persistence and Deployment

Let's save the trained model for production deployment:

In [None]:
# Create models directory
models_dir = Path('models')
models_dir.mkdir(exist_ok=True)

# Save the trained pipeline
model_path = models_dir / 'anomaly_detection_ensemble.joblib'
pipeline.save(model_path)
print(f"Model saved to: {model_path}")

# Test model loading
loaded_pipeline = AnomalyDetectionPipeline.load(model_path)
print(f"Model loaded successfully")
print(f"Loaded pipeline algorithms: {loaded_pipeline.algorithms}")

# Test predictions with loaded model on the first five test rows
test_sample = X_test.iloc[:5]
loaded_predictions = loaded_pipeline.predict(test_sample)
loaded_scores = loaded_pipeline.decision_function(test_sample)

print(f"\nTest predictions with loaded model:")
for i, (pred, score) in enumerate(zip(loaded_predictions, loaded_scores)):
    print(f"Sample {i+1}: Prediction={pred}, Score={score:.4f}")

# Verify consistency: the reloaded model must reproduce the in-memory model
original_predictions = pipeline.predict(test_sample)
original_scores = pipeline.decision_function(test_sample)

predictions_match = np.array_equal(loaded_predictions, original_predictions)
# allclose tolerates floating-point round-off in the scores
scores_match = np.allclose(loaded_scores, original_scores)

print(f"\nModel consistency check:")
print(f"Predictions match: {predictions_match}")
print(f"Scores match: {scores_match}")

# The status symbols below were stored mis-decoded and have been restored
if predictions_match and scores_match:
    print("✅ Model saved and loaded successfully!")
else:
    print("❌ Model consistency check failed!")

## 11. Conclusions and Production Recommendations

### Key Findings:

1. **Ensemble methods improve robustness**: Combining multiple algorithms provides better anomaly detection than individual methods

2. **Real-time processing is feasible**: The pipeline can process equipment data in real-time with acceptable latency

3. **Alert thresholds are critical**: Proper threshold tuning balances sensitivity with false alarm rates

### Manufacturing Benefits:

- **Predictive maintenance**: Early detection of equipment issues before failures
- **Quality assurance**: Identify process deviations that could affect product quality
- **Cost reduction**: Prevent expensive equipment downtime and product recalls
- **Operational efficiency**: Automated monitoring reduces manual inspection needs

### Production Deployment Recommendations:

1. **Data pipeline**: Implement robust data collection and preprocessing
2. **Model monitoring**: Track model performance and drift over time
3. **Alert management**: Integrate with existing maintenance and quality systems
4. **Threshold tuning**: Regular calibration based on operational feedback
5. **Scalability**: Design for multiple equipment types and production lines

In [None]:
# Final comprehensive summary
# Imported here so this cell does not depend on an earlier cell's conditional
# import having run (previously a NameError when it had not).
from sklearn.metrics import precision_score, recall_score, f1_score

# Emoji and bullet characters in the messages below were stored mis-decoded
# and have been restored.
print("\n" + "="*80)
print("FINAL COMPREHENSIVE SUMMARY")
print("="*80)

print(f"\n🔧 SYSTEM CONFIGURATION:")
print(f"  • Algorithms: {', '.join(pipeline.algorithms)}")
print(f"  • Ensemble Method: {pipeline.ensemble_method}")
print(f"  • Features: {len(feature_columns)}")
print(f"  • Training Samples: {len(X_train)}")

if y_test is not None:
    print(f"\n📊 PERFORMANCE METRICS:")
    final_precision = precision_score(y_test, predictions, zero_division=0)
    final_recall = recall_score(y_test, predictions, zero_division=0)
    final_f1 = f1_score(y_test, predictions, zero_division=0)
    print(f"  • Precision: {final_precision:.3f}")
    print(f"  • Recall: {final_recall:.3f}")
    print(f"  • F1-Score: {final_f1:.3f}")
    # NOTE(review): roc_auc_score assumes larger scores mean "anomaly";
    # confirm the orientation of pipeline.decision_function.
    if len(np.unique(y_test)) > 1:
        final_auc = roc_auc_score(y_test, anomaly_scores)
        print(f"  • ROC-AUC: {final_auc:.3f}")

# Guard the rate against an empty or single-sample stream (zero duration)
stream_duration = relative_minutes[-1] if relative_minutes else 0.0
if stream_duration > 0:
    processing_rate = f"{len(streaming_data)/stream_duration:.1f} samples/min"
else:
    processing_rate = "n/a"

print(f"\n⚡ REAL-TIME PROCESSING:")
print(f"  • Samples Processed: {len(streaming_data)}")
print(f"  • Processing Duration: {stream_duration:.1f} minutes")
print(f"  • Average Processing Rate: {processing_rate}")
print(f"  • Real-time Anomalies: {streaming_predictions.sum()}")

print(f"\n🚨 ALERT SYSTEM:")
print(f"  • Total Alerts Generated: {summary['warning_alerts'] + summary['critical_alerts']}")
print(f"  • Warning Level: {summary['warning_alerts']}")
print(f"  • Critical Level: {summary['critical_alerts']}")
print(f"  • Recent Alerts (1h): {len(recent_alerts)}")

# Report the outcome of the save/load check instead of a hard-coded success
consistency_mark = '✅' if (predictions_match and scores_match) else '❌'

print(f"\n💾 MODEL DEPLOYMENT:")
print(f"  • Model Saved: {model_path}")
print(f"  • Model Size: {model_path.stat().st_size / 1024:.1f} KB")
print(f"  • Load/Save Consistency: {consistency_mark}")

print(f"\n✅ PRODUCTION READINESS:")
print(f"  • Real-time processing: Ready")
print(f"  • Alert integration: Ready")
print(f"  • Model persistence: Ready")
print(f"  • Manufacturing integration: Ready")

print(f"\n🎯 NEXT STEPS:")
print(f"  1. Deploy to production environment")
print(f"  2. Integrate with MES/SCADA systems")
print(f"  3. Implement automated retraining")
print(f"  4. Set up monitoring dashboards")
print(f"  5. Train operations staff on alert handling")

print("\n🏭 Advanced anomaly detection system successfully implemented and validated!")