Environment Setup

In [1]:
import os
import sys
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import json
import pickle
import random
from collections import defaultdict, Counter
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc
from sklearn.model_selection import StratifiedKFold, train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
import drain3
from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig

In [2]:
# Single seed shared by every random number generator used in this notebook.
RANDOM_SEED = 42

# Seed Python's `random`, NumPy's legacy global RNG and PyTorch's CPU RNG, in that order.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(RANDOM_SEED)

# When CUDA is usable, seed the current GPU and then every visible GPU as well.
if torch.cuda.is_available():
    for _cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        _cuda_seed_fn(RANDOM_SEED)

In [3]:
# Root of the project tree. The location can be overridden through the
# LOG_ANOMALY_PROJECT_ROOT environment variable so the notebook also runs on
# machines where the project lives elsewhere; when the variable is not set the
# original location is used unchanged.
PROJECT_ROOT = Path(
    os.environ.get(
        "LOG_ANOMALY_PROJECT_ROOT",
        r"C:\Computer Science\AIMLDL\log-anomaly-detection",
    )
)

# Input locations (read by later cells).
DATASET_PATH = PROJECT_ROOT / "dataset"
LABELED_DATA_PATH = DATASET_PATH / "labeled_data"
NORMALIZED_DATA_PATH = LABELED_DATA_PATH / "normalized"

# Output locations (written by this notebook).
RESULTS_PATH = PROJECT_ROOT / "results" / "cross_source_transfer"
MODELS_PATH = PROJECT_ROOT / "models"

# Only the output directories are created here; the dataset directories are
# left untouched.
for _output_dir in (RESULTS_PATH, MODELS_PATH):
    _output_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Use the GPU whenever CUDA is usable, otherwise fall back to the CPU.
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda") if cuda_ready else torch.device("cpu")
print(f"Using device: {device}")

# Report the name and total memory (in GiB) of GPU 0 when one is present.
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {gpu_name}")
    print(f"GPU Memory: {gpu_total_bytes / 1024**3:.1f} GB")

# Echo the run configuration so it is captured in the notebook output.
print(
    f"Project root: {PROJECT_ROOT}",
    f"Results path: {RESULTS_PATH}",
    f"Models path: {MODELS_PATH}",
    f"Random seed: {RANDOM_SEED}",
    sep="\n",
)

Using device: cuda
GPU: NVIDIA GeForce RTX 4060 Laptop GPU
GPU Memory: 8.0 GB
Project root: C:\Computer Science\AIMLDL\log-anomaly-detection
Results path: C:\Computer Science\AIMLDL\log-anomaly-detection\results\cross_source_transfer
Models path: C:\Computer Science\AIMLDL\log-anomaly-detection\models
Random seed: 42


Spark Configuration

In [5]:
# Point Spark at the local Hadoop installation and make its binaries
# discoverable through PATH.
os.environ['HADOOP_HOME'] = 'C:\\hadoop'
hadoop_bin_dir = f"{os.environ['HADOOP_HOME']}\\bin"
# Prepend only once: re-running this cell must not keep growing PATH.
if hadoop_bin_dir not in os.environ['PATH'].split(';'):
    os.environ['PATH'] = f"{hadoop_bin_dir};{os.environ['PATH']}"

app_name = "CrossSourceAnomalyDetection"

# Local-mode session using every core of this machine.
# NOTE(review): getOrCreate() returns an already-running session if there is
# one, in which case launch-time settings such as driver memory are presumably
# not re-applied; restart the kernel after changing them.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName(app_name)
    # --- memory ---
    .config("spark.driver.memory", "18g")
    .config("spark.driver.maxResultSize", "8g")
    .config("spark.executor.memory", "16g")
    # The original key "spark.executor.memoryFraction" is not a documented
    # Spark setting and had no effect; "spark.memory.fraction" is the setting
    # that controls the execution/storage share of the heap.
    .config("spark.memory.fraction", "0.8")
    # --- adaptive query execution ---
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.sql.adaptive.skewJoin.enabled", "true")
    .config("spark.sql.adaptive.advisoryPartitionSizeInBytes", "128MB")
    # --- pandas interop and notebook display ---
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.repl.eagerEval.enabled", "true")
    .config("spark.sql.repl.eagerEval.maxNumRows", 20)
    # --- serialization (previously configured twice with the same value) ---
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "512m")
    # --- parallelism and timeouts ---
    .config("spark.sql.shuffle.partitions", "16")
    .config("spark.default.parallelism", "16")
    .config("spark.sql.broadcastTimeout", "36000")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")

print(f"Spark {spark.version}")
# defaultParallelism reflects the "spark.default.parallelism" value set above.
print(f"Available cores: {spark.sparkContext.defaultParallelism}")
print(f"Driver memory: {spark.conf.get('spark.driver.memory')}")
print(f"Executor memory: {spark.conf.get('spark.executor.memory')}")

Spark 3.4.1
Available cores: 16
Driver memory: 18g
Executor memory: 16g


In [6]:
# Central experiment configuration read by the rest of the notebook.
PROJECT_CONFIG = dict(
    experiment_name='cross_source_transfer_anomaly_detection',
    # Candidate log sources; one experiment is announced per entry below.
    log_sources=[
        'Android', 'Apache', 'BGL', 'Hadoop', 'HDFS', 'HealthApp',
        'HPC', 'Linux', 'Mac', 'OpenSSH', 'OpenStack', 'Proxifier',
        'Spark', 'Thunderbird', 'Windows', 'Zookeeper'
    ],
    few_shot_sizes=[10, 50],
    random_seeds=[42, 123, 456],
    max_sequence_length=512,
    batch_size=16,
    learning_rate=2e-5,
    bert_model_name='bert-base-uncased',
    template_extraction_method='drain',
)

# The number of configured sources drives both summary lines.
n_configured_sources = len(PROJECT_CONFIG['log_sources'])

print(f"Spark session initialized for {app_name}")
print(f"Project configured for {n_configured_sources} log sources")
print(f"Leave-one-out cross-validation will run {n_configured_sources} experiments")

Spark session initialized for CrossSourceAnomalyDetection
Project configured for 16 log sources
Leave-one-out cross-validation will run 16 experiments


Data Discovery and Analysis

In [7]:
# Dataset stems: every source name followed by the "_2k" suffix.
LOG_SOURCES = [
    f"{source_name}_2k"
    for source_name in (
        'Android', 'Apache', 'BGL', 'Hadoop', 'HDFS',
        'HealthApp', 'HPC', 'Linux', 'Mac', 'OpenSSH',
        'OpenStack', 'Proxifier', 'Spark', 'Thunderbird',
        'Windows', 'Zookeeper',
    )
]

# Numeric label code -> label name; the list position is the code.
LABELS = dict(enumerate([
    "normal",
    "security_anomaly",
    "system_failure",
    "performance_issue",
    "network_anomaly",
    "config_error",
    "hardware_issue",
    "unknown_anomaly",
]))

# Discover the labeled CSVs, plus the normalized ones when that folder exists.
labeled_files = list(LABELED_DATA_PATH.glob("*_labeled.csv"))
if NORMALIZED_DATA_PATH.exists():
    normalized_files = list(NORMALIZED_DATA_PATH.glob("*_normalized.csv"))
else:
    normalized_files = []

print(f"Labeled datasets found: {len(labeled_files)}")
print(f"Normalized datasets found: {len(normalized_files)}")

Labeled datasets found: 6
Normalized datasets found: 6


In [8]:
# Build a registry describing every discovered dataset (one entry per log
# source) and accumulate global record / anomaly totals.
#
# For each labeled CSV the matching "*_normalized.csv" is preferred when it
# exists. The registry entry describes the file that was actually loaded.
dataset_registry = {}
total_records = 0
total_anomalies = 0

for file_path in sorted(labeled_files):
    filename = file_path.name

    # The first configured source whose name occurs (case-insensitively)
    # anywhere in the filename wins.
    log_source = next(
        (
            source
            for source in PROJECT_CONFIG['log_sources']
            if source.lower() in filename.lower()
        ),
        None,
    )

    if log_source is None:
        print(f"Could not identify log source for {filename}")
        continue

    # Best-effort: a dataset that fails to load is reported and skipped so the
    # remaining datasets are still registered.
    try:
        normalized_filename = filename.replace('_labeled.csv', '_normalized.csv')
        normalized_file_path = NORMALIZED_DATA_PATH / normalized_filename

        if normalized_file_path.exists():
            print(f"Loading normalized file: {normalized_filename}")
            file_to_register_path = normalized_file_path
        else:
            print(f"Normalized file not found. Loading labeled file: {filename}")
            file_to_register_path = file_path
        df = pd.read_csv(file_to_register_path)

        n_records = len(df)
        n_columns = len(df.columns)

        has_labels = 'AnomalyLabel' in df.columns

        label_stats = {}
        if has_labels:
            if 'AnomalyLabelName' in df.columns:
                label_counts = df['AnomalyLabelName'].value_counts()
            else:
                # Only the numeric code column is present: translate codes to
                # names through LABELS (unknown codes keep their string form)
                # instead of failing on the missing name column.
                label_counts = (
                    df['AnomalyLabel']
                    .dropna()
                    .map(lambda code: LABELS.get(code, str(code)))
                    .value_counts()
                )

            normal_count = int(label_counts.get("normal", 0))

            # Every record that is not labeled "normal" counts as an anomaly.
            anomaly_count = n_records - normal_count
            anomaly_rate = (anomaly_count / n_records) * 100 if n_records > 0 else 0

            label_stats = {
                'normal_count': normal_count,
                'anomaly_count': anomaly_count,
                'anomaly_rate': anomaly_rate,
                'unique_labels': {str(k): int(v) for k, v in label_counts.to_dict().items()}
            }

            total_records += n_records
            total_anomalies += anomaly_count

        has_normalized_timestamp = 'timestamp_normalized' in df.columns

        # Register the file that was actually read, so that the path and size
        # agree with the recorded columns and record count.
        dataset_registry[log_source] = {
            'file_path': file_to_register_path,
            'n_records': n_records,
            'n_columns': n_columns,
            'columns': list(df.columns),
            'has_labels': has_labels,
            'has_normalized_timestamp': has_normalized_timestamp,
            'label_stats': label_stats,
            'size_mb': file_to_register_path.stat().st_size / (1024 * 1024)
        }

        # Always print the summary line; the anomaly rate is appended only
        # when labels are available.
        summary_line = (
            f"{log_source:<12} | {n_records:>8,} records | {n_columns:>2} cols | "
            f"{'Labels' if has_labels else 'No Labels':<8}"
        )
        if has_labels:
            summary_line += f" | {label_stats['anomaly_rate']:>5.1f}% anomalies"
        print(summary_line)

    except Exception as e:
        print(f"Error processing {filename}: {e}")

Loading normalized file: Apache_2k_normalized.csv
Apache       |    2,000 records | 10 cols | Labels   |  29.8% anomalies
Loading normalized file: BGL_2k_normalized.csv
BGL          |    2,000 records | 17 cols | Labels   |  75.0% anomalies
Loading normalized file: HPC_2k_normalized.csv
HPC          |    2,000 records | 14 cols | Labels   |   9.8% anomalies
Loading normalized file: OpenSSH_2k_normalized.csv
OpenSSH      |    2,000 records | 13 cols | Labels   |  78.8% anomalies
Loading normalized file: Proxifier_2k_normalized.csv
Proxifier    |    2,000 records | 10 cols | Labels   |   4.9% anomalies
Loading normalized file: Zookeeper_2k_normalized.csv
Zookeeper    |    2,000 records | 14 cols | Labels   |  46.2% anomalies


In [9]:
# Headline totals across all registered datasets.
n_registered_sources = len(dataset_registry)
n_expected_sources = len(PROJECT_CONFIG['log_sources'])
print(f"\nSuccessfully registered: {n_registered_sources}/{n_expected_sources} log sources")
print(f"Total records: {total_records:,}")
print(f"Total anomalies: {total_anomalies:,}")

# The overall rate is only defined when at least one labeled record was counted.
if total_records > 0:
    overall_anomaly_rate = total_anomalies / total_records * 100
    print(f"Overall anomaly rate: {overall_anomaly_rate:.2f}%")


Successfully registered: 6/16 log sources
Total records: 12,000
Total anomalies: 4,888
Overall anomaly rate: 40.73%


In [10]:
# Configured sources for which no dataset was registered.
registered_sources = list(dataset_registry.keys())
missing_sources = set(PROJECT_CONFIG['log_sources']).difference(registered_sources)

# Narrow the experiment configuration to the sources that are actually available.
if len(missing_sources) > 0:
    print(f"⚠️  Missing sources: {missing_sources}")
    print("Updating project config to only include available sources...")
    PROJECT_CONFIG['log_sources'] = registered_sources

⚠️  Missing sources: {'HDFS', 'Android', 'Thunderbird', 'Linux', 'Hadoop', 'HealthApp', 'OpenStack', 'Windows', 'Mac', 'Spark'}
Updating project config to only include available sources...


In [11]:
print(f"Leave-one-source-out experiments: {len(PROJECT_CONFIG['log_sources'])}")
print(f"Few-shot adaptation sizes: {PROJECT_CONFIG['few_shot_sizes']}")
print(f"Random seeds for reproducibility: {PROJECT_CONFIG['random_seeds']}")

# Persist the dataset registry together with the experiment configuration.
registry_path = RESULTS_PATH / "dataset_registry.json"

# Fields copied from each registry entry, in export order.
exported_fields = (
    'file_path', 'n_records', 'n_columns', 'columns',
    'has_labels', 'has_normalized_timestamp', 'label_stats', 'size_mb',
)

registry_for_export = {}
for source, info in dataset_registry.items():
    entry = {field: info[field] for field in exported_fields}
    # Path objects are not JSON serializable; store the path as a string.
    entry['file_path'] = str(info['file_path'])
    registry_for_export[source] = entry

# Serialize completely before touching the file: if any value turns out not to
# be JSON serializable, an existing registry file is left intact instead of
# being truncated or half-written.
registry_json = json.dumps({
    'dataset_registry': registry_for_export,
    'project_config': PROJECT_CONFIG,
    'total_records': total_records,
    'total_anomalies': total_anomalies,
    'analysis_timestamp': datetime.now().isoformat()
}, indent=2)

with open(registry_path, 'w', encoding='utf-8') as f:
    f.write(registry_json)

Leave-one-source-out experiments: 6
Few-shot adaptation sizes: [10, 50]
Random seeds for reproducibility: [42, 123, 456]
