# Using Orion for Custom Data

This notebook is a quick tutorial on using Orion with custom data.

Before you start, switch to a GPU runtime for faster computation: from the top menu, choose `Runtime -> Change runtime type -> T4 GPU`.

## Step 0: install Orion on Colab
Orion is available on PyPI (https://pypi.org/project/orion-ml) and can be installed directly with pip:

In [1]:
# Install the orion-ml package from PyPI into the current runtime.
! pip install orion-ml



In [15]:
# Import Libraries and Configure Paths
import os
import warnings
import pandas as pd
import numpy as np
import glob
from datetime import datetime
from google.colab import drive

print("Configuring Orion paths...")
try:
    import orion
    import mlstars
    from mlblocks import add_pipelines_path, add_primitives_path

    # Get the installation directory for the mlstars and orion packages
    mlstars_path = os.path.dirname(mlstars.__file__)
    orion_path = os.path.dirname(orion.__file__)

    # Add the dynamically found DIRECTORY paths for primitives and pipelines
    add_primitives_path(os.path.join(mlstars_path, 'primitives'))
    add_primitives_path(os.path.join(orion_path, 'primitives'))
    add_pipelines_path(os.path.join(orion_path, 'pipelines'))

    print("Orion paths configured successfully! ✅")
except ImportError as e:
    print(f"Error importing Orion libraries. Please ensure orion-ml is installed correctly: {e}")
except Exception as e:
    print(f"An error occurred during path configuration: {e}")

# Suppress FutureWarning messages for cleaner output
# Else we will get a lot of warnings while runing the algos
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
print("Note: Warnings are being Ignored")

Configuring Orion paths...
Orion paths configured successfully! ✅


In [3]:
# Mount Google Drive
_gdrive_mount_point = '/content/gdrive'

print("Mounting Google Drive...")
try:
    # To refresh an existing mount, call drive.mount with force_remount=True instead.
    drive.mount(_gdrive_mount_point)
    print("Google Drive mounted successfully. ✅")
except Exception as mount_error:
    print(f"Error mounting Google Drive: {mount_error}")

Mounting Google Drive...
Mounted at /content/gdrive
Google Drive mounted successfully. ✅


The results will be stored in the `Anomaly_Results` folder, inside a new session folder named `<timestamp>_session` that contains one subfolder per algorithm.

In [10]:
# Define Configuration and Set Up Directories
from datetime import datetime

# --- Configuration ---
INPUT_DATA_LOCATION = '/content/gdrive/MyDrive/Colab Notebooks/Traffic analysis/Datasets/'
BASE_OUTPUT_LOCATION = '/content/gdrive/MyDrive/Colab Notebooks/Traffic analysis/Anomaly_Results/'

# Every run of this cell gets its own output folder, named after the
# moment the cell executed (YYYYMMDD_HHMMSS) plus a "_session" suffix.
current_time = f"{datetime.now():%Y%m%d_%H%M%S}"
SESSION_FOLDER_NAME = current_time + "_session"
SESSION_OUTPUT_LOCATION = os.path.join(BASE_OUTPUT_LOCATION, SESSION_FOLDER_NAME)

# Create the session directory (and any missing parents), then report
# where input is read from and where results will be written.
try:
    os.makedirs(SESSION_OUTPUT_LOCATION, exist_ok=True)
    print(f"Input data will be read from: {INPUT_DATA_LOCATION}")
    print(f"All results for this session will be saved in: {SESSION_OUTPUT_LOCATION}")
except OSError as dir_error:
    print(f"Error creating session directory: {dir_error}")

Input data will be read from: /content/gdrive/MyDrive/Colab Notebooks/Traffic analysis/Datasets/
All results for this session will be saved in: /content/gdrive/MyDrive/Colab Notebooks/Traffic analysis/Anomaly_Results/20250802_180212_session


In [16]:
# Define Algorithm Pipelines and Hyperparameters
#
# Every pipeline shares the same aggregation step and differs only in the
# model primitive and its epoch count, so the dict is built from one table.
#
# - Interval: 1 hour. The data has 15-minute timestamps, and 1 hour seemed
#   a good trade-off.
# - Epochs: 5 by default, since training runs once per CSV file.
# - Some algorithms need many more iterations (e.g. dense_autoencoder);
#   trial and error was used to find a suitable value.
# - Only verified pipelines are included.

_AGGREGATE_PRIMITIVE = 'mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1'
_AGGREGATE_INTERVAL = 3600

# (pipeline name, model primitive, epochs) in the order they will be run.
_MODEL_SETTINGS = [
    ('aer', 'orion.primitives.aer.AER#1', 5),
    ('tadgan', 'orion.primitives.tadgan.TadGAN#1', 5),
    ('lstm_dynamic_threshold', 'keras.Sequential.LSTMTimeSeriesRegressor#1', 5),
    ('lstm_autoencoder', 'keras.Sequential.LSTMSeq2Seq#1', 5),
    ('dense_autoencoder', 'keras.Sequential.DenseSeq2Seq#1', 20),
    ('vae', 'orion.primitives.vae.VAE#1', 5),
]

pipelines = {
    _pipeline_name: {
        _AGGREGATE_PRIMITIVE: {'interval': _AGGREGATE_INTERVAL},
        _model_primitive: {'epochs': _n_epochs, 'verbose': False},
    }
    for _pipeline_name, _model_primitive, _n_epochs in _MODEL_SETTINGS
}

print(f"Loaded configurations for {len(pipelines)} algorithms.")

Loaded configurations for 6 algorithms.


In [12]:
# Cell 6: Main Processing Loop for All Algorithms and Files
#
# For every configured pipeline, runs Orion on every CSV in
# INPUT_DATA_LOCATION, writes an augmented copy of each dataset plus a
# consolidated per-algorithm report, and collects one summary row per
# algorithm in `all_algorithms_summary`.
from orion import Orion


def _load_orion_frame(file_path):
    """Read one CSV and shape it for Orion.

    The parsed datetimes are kept in ``timestamp_original`` and
    ``timestamp`` is replaced with integer epoch seconds. When the file has
    no ``value`` column, the column at position 1 is renamed to ``value``.

    Raises ``KeyError`` if the file has no ``timestamp`` column, plus
    whatever ``pd.read_csv`` / ``pd.to_datetime`` raise.
    """
    data = pd.read_csv(file_path)
    data['timestamp_original'] = pd.to_datetime(data['timestamp'])
    # Cast through datetime64[s] so the result is epoch *seconds* regardless
    # of the resolution pandas chose for the parsed column. Dividing the raw
    # int64 by 10**9 is only correct for nanosecond-resolution data.
    data['timestamp'] = (
        data['timestamp_original'].values.astype('datetime64[s]').astype(np.int64)
    )
    if 'value' not in data.columns:
        data = data.rename(columns={data.columns[1]: 'value'})
    return data


def _flag_anomalies(data, detected_anomalies):
    """Add ``is_anomaly`` / ``anomaly_severity`` columns to ``data`` in place.

    A row is flagged when its ``timestamp`` lies inside any detected
    interval (both ends inclusive). Rows covered by several intervals keep
    the highest severity.
    """
    data['is_anomaly'] = False
    data['anomaly_severity'] = 0.0

    for _, anomaly in detected_anomalies.iterrows():
        anomaly_mask = (data['timestamp'] >= anomaly['start']) & (data['timestamp'] <= anomaly['end'])
        data.loc[anomaly_mask, 'is_anomaly'] = True
        current_severity = data.loc[anomaly_mask, 'anomaly_severity']
        data.loc[anomaly_mask, 'anomaly_severity'] = np.maximum(current_severity, anomaly['severity'])


# List to hold summary statistics from all algorithms
all_algorithms_summary = []

print(f"\nSearching for CSV files in '{INPUT_DATA_LOCATION}'...")
# glob returns matches in arbitrary order; sort so every run processes the
# files (and writes the reports) in the same order.
csv_files = sorted(glob.glob(os.path.join(INPUT_DATA_LOCATION, '*.csv')))

if not csv_files:
    # Nothing to process: do not create per-algorithm folders or record
    # misleading "0 anomalies" rows. The summary list stays empty.
    print("No CSV files found in the specified directory.")
else:
    print(f"Found {len(csv_files)} CSV files. Starting processing...\n")

    # --- Main Algorithm Loop ---
    for algorithm_name, hyperparameters in pipelines.items():
        print(f"\n{'='*20} Processing Algorithm: {algorithm_name.upper()} {'='*20}")

        # Create a subfolder for the current algorithm's results
        ALGORITHM_OUTPUT_LOCATION = os.path.join(SESSION_OUTPUT_LOCATION, algorithm_name)
        os.makedirs(ALGORITHM_OUTPUT_LOCATION, exist_ok=True)

        # List to hold all detected anomalies for this algorithm
        current_algorithm_anomalies = []

        # --- File Processing Loop ---
        for file_path in csv_files:
            # Best effort per file: a failure is reported and the loop
            # carries on with the next dataset.
            try:
                dataset_name = os.path.basename(file_path)
                print(f"--- Processing: {dataset_name} ---")

                # 1-2. Load the data and format it for Orion
                data = _load_orion_frame(file_path)
                orion_input_data = data[['timestamp', 'value']].copy()

                # 3. Run Orion Anomaly Detection
                print(f"   - Running {algorithm_name} anomaly detection...")
                orion_detector = Orion(pipeline=algorithm_name, hyperparameters=hyperparameters)
                detected_anomalies = orion_detector.fit_detect(orion_input_data)
                print(f"   - Detected {len(detected_anomalies)} anomaly interval(s).")

                # 4. Process and Augment Results
                if not detected_anomalies.empty:
                    detected_anomalies['dataset_name'] = dataset_name
                    detected_anomalies['algorithm'] = algorithm_name
                    current_algorithm_anomalies.append(detected_anomalies)

                _flag_anomalies(data, detected_anomalies)

                # 5. Save Augmented Data
                augmented_filename = f"{os.path.splitext(dataset_name)[0]}_augmented_{algorithm_name}.csv"
                augmented_filepath = os.path.join(ALGORITHM_OUTPUT_LOCATION, augmented_filename)
                data.to_csv(augmented_filepath, index=False)
                print(f"   - Saved augmented data to: {augmented_filepath}")

            except Exception as e:
                print(f"An error occurred while processing {file_path} with {algorithm_name}: {e}")

        # --- Generate and Save Report for the Current Algorithm ---
        if not current_algorithm_anomalies:
            print(f"\nNo anomalies detected by {algorithm_name.upper()} in any dataset.")
            summary = {
                'Algorithm': algorithm_name,
                'Total Anomalies Detected': 0,
                'Min Severity': 'N/A',
                'Max Severity': 'N/A',
                'Average Severity': 'N/A'
            }
        else:
            consolidated_report = pd.concat(current_algorithm_anomalies, ignore_index=True)
            # Add human-readable datetimes next to the epoch-second bounds.
            consolidated_report['start_timestamp'] = pd.to_datetime(consolidated_report['start'], unit='s')
            consolidated_report['end_timestamp'] = pd.to_datetime(consolidated_report['end'], unit='s')

            report_filename = f"{algorithm_name}_consolidated_report.csv"
            report_filepath = os.path.join(ALGORITHM_OUTPUT_LOCATION, report_filename)
            consolidated_report.to_csv(report_filepath, index=False)
            print(f"\nConsolidated report for {algorithm_name.upper()} saved to: {report_filepath}")

            # Collect stats for the final summary report
            summary = {
                'Algorithm': algorithm_name,
                'Total Anomalies Detected': len(consolidated_report),
                'Min Severity': consolidated_report['severity'].min(),
                'Max Severity': consolidated_report['severity'].max(),
                'Average Severity': consolidated_report['severity'].mean()
            }
        all_algorithms_summary.append(summary)

    print("\n\n--- All files and algorithms processed. ---")


Searching for CSV files in '/content/gdrive/MyDrive/Colab Notebooks/Traffic analysis/Datasets/'...
Found 2 CSV files. Starting processing...


--- Processing: cs_inbound_AT03.csv ---
   - Running aer anomaly detection...
   - Detected 2 anomaly interval(s).
   - Saved augmented data to: /content/gdrive/MyDrive/Colab Notebooks/Traffic analysis/Anomaly_Results/20250802_180212_session/aer/cs_inbound_AT03_augmented_aer.csv
--- Processing: cs_inbound_AT01.csv ---
   - Running aer anomaly detection...
   - Detected 1 anomaly interval(s).
   - Saved augmented data to: /content/gdrive/MyDrive/Colab Notebooks/Traffic analysis/Anomaly_Results/20250802_180212_session/aer/cs_inbound_AT01_augmented_aer.csv

Consolidated report for AER saved to: /content/gdrive/MyDrive/Colab Notebooks/Traffic analysis/Anomaly_Results/20250802_180212_session/aer/aer_consolidated_report.csv

--- Processing: cs_inbound_AT03.csv ---
   - Running tadgan anomaly detection...
   - Detected 3 anomaly interval(s).
   - Save

In [13]:
# Generate and Save Final Summary Report

if all_algorithms_summary:
    # One row per algorithm, indexed by the algorithm name.
    summary_df = pd.DataFrame(all_algorithms_summary).set_index('Algorithm')

    # --- Console Output Summary ---
    print("\n\n--- Overall Algorithm Performance Summary ---")
    # float_format takes a callable, so pass the bound str.format method.
    _four_decimals = '{:.4f}'.format
    print(summary_df.to_string(float_format=_four_decimals))

    # --- Save Final Summary CSV Report ---
    summary_filename = SESSION_FOLDER_NAME + "_overall_summary.csv"
    summary_filepath = os.path.join(SESSION_OUTPUT_LOCATION, summary_filename)

    try:
        summary_df.to_csv(summary_filepath)
        print(f"\nFinal summary report saved to: {summary_filepath} ✅")
    except Exception as save_error:
        print(f"\nError saving final summary report: {save_error}")
else:
    print("\nNo summary data was generated.")



--- Overall Algorithm Performance Summary ---
                        Total Anomalies Detected  Min Severity  Max Severity  Average Severity
Algorithm                                                                                     
aer                                            3        0.1828        0.8048            0.3959
tadgan                                         5        0.1176        1.9696            0.6501
lstm_dynamic_threshold                         4        0.6484        0.8491            0.7253
lstm_autoencoder                               4        0.1809        2.2494            0.8008
dense_autoencoder                              4        0.0233        2.0317            0.6654
vae                                            5        0.2007        2.1995            0.8415

Final summary report saved to: /content/gdrive/MyDrive/Colab Notebooks/Traffic analysis/Anomaly_Results/20250802_180212_session/20250802_180212_session_overall_summary.csv ✅
