Setup

In [28]:
import os
import sys
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import json
import pickle
import re
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import Counter
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import *

import torch
from transformers import AutoTokenizer, AutoModel

import drain3
from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig
from drain3.masking import MaskingInstruction

In [3]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

PROJECT_ROOT = Path(r"C:\Computer Science\AIMLDL\log-anomaly-detection")
DATASET_PATH = PROJECT_ROOT / "dataset"
LABELED_DATA_PATH = DATASET_PATH / "labeled_data"
NORMALIZED_DATA_PATH = LABELED_DATA_PATH / "normalized"
RESULTS_PATH = PROJECT_ROOT / "results" / "cross_source_transfer"
MODELS_PATH = PROJECT_ROOT / "models"
FEATURES_PATH = PROJECT_ROOT / "features"

In [5]:
FEATURES_PATH.mkdir(parents=True, exist_ok=True)

registry_path = RESULTS_PATH / "dataset_registry.json"
if registry_path.exists():
    with open(registry_path, 'r') as f:
        registry_data = json.load(f)
        dataset_registry = registry_data['dataset_registry']
        PROJECT_CONFIG = registry_data['project_config']
    print(f"Loaded dataset registry with {len(dataset_registry)} sources")
    print(f"Project config loaded for {len(PROJECT_CONFIG['log_sources'])} log sources")
else:
    print("Dataset registry not found. Please run data processing notebook first.")
    sys.exit()

Loaded dataset registry with 6 sources
Project config loaded for 6 log sources


In [7]:
os.environ['HADOOP_HOME'] = 'C:\\hadoop'
os.environ['PATH'] = f"{os.environ['HADOOP_HOME']}\\bin;{os.environ['PATH']}"

spark = SparkSession.builder \
    .master("local[*]") \
    .config("spark.driver.memory", "18g") \
    .config("spark.executor.memory", "16g") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .appName("FeatureEngineering") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")

tokenizer = AutoTokenizer.from_pretrained(PROJECT_CONFIG['bert_model_name'])

print(f"Feature engineering environment initialized")
print(f"Spark session ready: {spark.version}")
print(f"BERT tokenizer loaded: {PROJECT_CONFIG['bert_model_name']}")
print(f"Features will be saved to: {FEATURES_PATH}")

Feature engineering environment initialized
Spark session ready: 3.4.1
BERT tokenizer loaded: bert-base-uncased
Features will be saved to: C:\Computer Science\AIMLDL\log-anomaly-detection\features


Template Extraction using Drain Algorithm

In [19]:
drain_config = TemplateMinerConfig()
drain_config.drain_sim_th = 0.4
drain_config.drain_depth = 4
drain_config.drain_max_children = 100
drain_config.masking_instructions = [
    MaskingInstruction(r'\d+', "<NUM>"),
    MaskingInstruction(r'[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}', "<UUID>"),
    MaskingInstruction(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b', "<IP>"),
    MaskingInstruction(r'/[^\s]*', "<PATH>")
]

print("Drain3 configuration set successfully!")
print(f"Similarity threshold: {drain_config.drain_sim_th}")
print(f"Masking instructions loaded: {len(drain_config.masking_instructions)}")

Drain3 configuration set successfully!
Similarity threshold: 0.4
Masking instructions loaded: 4


In [20]:
def extract_templates_per_source(log_source, content_data, max_lines=10000):
    print(f"\n--- Processing {log_source} ---")
    
    template_miner = TemplateMiner(config=drain_config)
    
    if len(content_data) > max_lines:
        sampled_data = content_data.sample(n=max_lines, random_state=RANDOM_SEED)
        print(f"Sampled {max_lines} lines from {len(content_data)} total lines")
    else:
        sampled_data = content_data
        print(f"Processing all {len(content_data)} lines")
    
    templates = {}
    template_ids = []
    
    for idx, content in enumerate(sampled_data):
        if pd.isna(content) or content.strip() == "":
            template_ids.append(-1)  
            continue
            
        result = template_miner.add_log_message(str(content).strip())
        template_id = result["cluster_id"]
        template_ids.append(template_id)
        
        if template_id not in templates:
            templates[template_id] = {
                'template': result["template_mined"],
                'count': 1,
                'size': result["cluster_size"]
            }
        else:
            templates[template_id]['count'] += 1
    
    print(f"Extracted {len(templates)} unique templates")
    print(f"Template coverage: {len([t for t in template_ids if t != -1])/len(template_ids)*100:.1f}%")
    
    sorted_templates = sorted(templates.items(), key=lambda x: x[1]['count'], reverse=True)
    print(f"\nTop 5 templates:")
    for i, (tid, info) in enumerate(sorted_templates[:5], 1):
        print(f"   {i}. [ID: {tid}] ({info['count']} instances) {info['template'][:100]}...")
    
    return templates, template_ids, template_miner

In [23]:
template_data = {}
all_templates = {}

for log_source in PROJECT_CONFIG['log_sources']:
    if log_source not in dataset_registry:
        continue
        
    file_path = dataset_registry[log_source]['file_path']
    df = pd.read_csv(file_path)
    
    content_col = None
    for col in ['Content', 'content', 'Message', 'message', 'Text', 'text']:
        if col in df.columns:
            content_col = col
            break
    
    if content_col is None:
        print(f"No content column found for {log_source}")
        continue
    
    content_data = df[content_col].fillna("")
    templates, template_ids, miner = extract_templates_per_source(log_source, content_data)
    
    template_data[log_source] = {
        'templates': templates,
        'template_ids': template_ids,
        'miner': miner,
        'content_column': content_col,
        'original_data': df
    }
    
    for tid, template_info in templates.items():
        global_key = f"{log_source}_{tid}"
        all_templates[global_key] = {
            'source': log_source,
            'template_id': tid,
            'template': template_info['template'],
            'count': template_info['count'],
            'frequency': template_info['count'] / len(content_data)
        }


--- Processing Apache ---
Processing all 2000 lines
Extracted 6 unique templates
Template coverage: 100.0%

Top 5 templates:
   1. [ID: 3] (836 instances) jk<<NUM>>_init() Found child <<NUM>> in scoreboard slot <<NUM>>...
   2. [ID: 1] (569 instances) workerEnv.init() ok <<PATH>>...
   3. [ID: 2] (539 instances) mod_jk child workerEnv in error state <<NUM>>...
   4. [ID: 4] (32 instances) [client <<NUM>>.<<NUM>>.<<NUM>>.<<NUM>>] Directory index forbidden by rule: <<PATH>>...
   5. [ID: 5] (12 instances) jk<<NUM>>_init() Can't find child <<NUM>> in scoreboard...

--- Processing BGL ---
Processing all 2000 lines
Extracted 105 unique templates
Template coverage: 100.0%

Top 5 templates:
   1. [ID: 5] (721 instances) generating core.<<NUM>>...
   2. [ID: 69] (208 instances) iar <<NUM>>a<<NUM>>fc dear <<NUM>>b<<NUM>>e<<NUM>>...
   3. [ID: 55] (121 instances) <<NUM>> floating point alignment exceptions...
   4. [ID: 2] (109 instances) <<NUM>> double-hummer alignment exceptions...
   5. [ID:

In [24]:
print(f"Similarity threshold: {drain_config.drain_sim_th}")
print(f"Masking instructions loaded: {len(drain_config.masking_instructions)}")

Similarity threshold: 0.4
Masking instructions loaded: 4


In [25]:
print(f"Processed {len(template_data)} log sources")
print(f"Total unique templates across all sources: {len(all_templates)}")

template_counts = [len(info['templates']) for info in template_data.values()]
print(f"Templates per source - Min: {min(template_counts)}, Max: {max(template_counts)}, Avg: {np.mean(template_counts):.1f}")

templates_path = FEATURES_PATH / "template_extraction_results.pkl"
with open(templates_path, 'wb') as f:
    pickle.dump({
        'template_data': template_data,
        'all_templates': all_templates,
        'config': drain_config
    }, f)

Processed 6 log sources
Total unique templates across all sources: 628
Templates per source - Min: 6, Max: 403, Avg: 104.7


BERT Text Embeddings Generation

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

from transformers import AutoModel
bert_model = AutoModel.from_pretrained(PROJECT_CONFIG['bert_model_name'])
bert_model.to(device)
bert_model.eval()

Using device: cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [27]:
def generate_bert_embeddings(texts, batch_size=16, max_length=512):
    all_embeddings = []
    
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            
            encoded = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt'
            ).to(device)
            
            outputs = bert_model(**encoded)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            
            all_embeddings.append(cls_embeddings.cpu().numpy())
    
    return np.vstack(all_embeddings)

In [29]:
def process_source_for_embeddings(log_source, max_samples=5000):
    print(f"\n--- Generating BERT embeddings for {log_source} ---")
    
    if log_source not in template_data:
        print(f"Template data not available for {log_source}")
        return None
    
    source_data = template_data[log_source]
    df = source_data['original_data']
    content_col = source_data['content_column']
    
    texts = df[content_col].fillna("").astype(str).tolist()
    
    if len(texts) > max_samples:
        sample_indices = np.random.choice(len(texts), max_samples, replace=False)
        sample_indices.sort() 
        texts = [texts[i] for i in sample_indices]
        sampled_df = df.iloc[sample_indices].copy()
        print(f"Sampled {max_samples} texts from {len(df)} total")
    else:
        sampled_df = df.copy()
        sample_indices = None
        print(f"Processing all {len(texts)} texts")
    
    print("Generating BERT embeddings...")
    embeddings = generate_bert_embeddings(texts, batch_size=16) 
    print(f"Generated embeddings shape: {embeddings.shape}")
    
    original_labels = None
    binary_labels = None
    
    if 'AnomalyLabel' in sampled_df.columns:
        original_labels = sampled_df['AnomalyLabel'].values
        
        binary_labels = (original_labels != 0).astype(int)
        
        anomaly_rate = np.mean(binary_labels) * 100
        print(f"Label distribution: {np.sum(binary_labels == 0)} normal, {np.sum(binary_labels == 1)} anomaly ({anomaly_rate:.1f}%)")
    
    return {
        'embeddings': embeddings,
        'labels': binary_labels, 
        'texts': texts,
        'sample_indices': sample_indices,
        'original_labels': original_labels, 
        'dataframe': sampled_df
    }

In [30]:
bert_embeddings_data = {}
embedding_stats = []

for log_source in PROJECT_CONFIG['log_sources']:
    if log_source not in dataset_registry:
        continue
    
    try:
        embedding_result = process_source_for_embeddings(log_source)
        if embedding_result is not None:
            bert_embeddings_data[log_source] = embedding_result
            
            stats = {
                'source': log_source,
                'n_samples': embedding_result['embeddings'].shape[0],
                'embedding_dim': embedding_result['embeddings'].shape[1],
                'has_labels': embedding_result['labels'] is not None,
                'anomaly_rate': np.mean(embedding_result['labels']) * 100 if embedding_result['labels'] is not None else None
            }
            embedding_stats.append(stats)
            
    except Exception as e:
        print(f"Error processing {log_source}: {e}")


--- Generating BERT embeddings for Apache ---
Processing all 2000 texts
Generating BERT embeddings...
Generated embeddings shape: (2000, 768)
Label distribution: 1405 normal, 595 anomaly (29.8%)

--- Generating BERT embeddings for BGL ---
Processing all 2000 texts
Generating BERT embeddings...
Generated embeddings shape: (2000, 768)
Label distribution: 501 normal, 1499 anomaly (75.0%)

--- Generating BERT embeddings for HPC ---
Processing all 2000 texts
Generating BERT embeddings...
Generated embeddings shape: (2000, 768)
Label distribution: 1804 normal, 196 anomaly (9.8%)

--- Generating BERT embeddings for OpenSSH ---
Processing all 2000 texts
Generating BERT embeddings...
Generated embeddings shape: (2000, 768)
Label distribution: 424 normal, 1576 anomaly (78.8%)

--- Generating BERT embeddings for Proxifier ---
Processing all 2000 texts
Generating BERT embeddings...
Generated embeddings shape: (2000, 768)
Label distribution: 1903 normal, 97 anomaly (4.9%)

--- Generating BERT embe

In [31]:
bert_embeddings_path = FEATURES_PATH / "bert_embeddings.pkl"
with open(bert_embeddings_path, 'wb') as f:
    pickle.dump({
        'embeddings_data': bert_embeddings_data,
        'model_name': PROJECT_CONFIG['bert_model_name'],
        'max_length': PROJECT_CONFIG['max_sequence_length'],
        'stats': embedding_stats
    }, f)

print(f"Successfully processed {len(bert_embeddings_data)} sources")
for stats in embedding_stats:
    print(f"   {stats['source']:<12} | {stats['n_samples']:>6,} samples | "
          f"Dim: {stats['embedding_dim']:>3} | Labels: {'✓' if stats['has_labels'] else '✗'} | "
          f"Anomaly: {stats['anomaly_rate']:>5.1f}%" if stats['anomaly_rate'] is not None else "")

Successfully processed 6 sources
   Apache       |  2,000 samples | Dim: 768 | Labels: ✓ | Anomaly:  29.8%
   BGL          |  2,000 samples | Dim: 768 | Labels: ✓ | Anomaly:  75.0%
   HPC          |  2,000 samples | Dim: 768 | Labels: ✓ | Anomaly:   9.8%
   OpenSSH      |  2,000 samples | Dim: 768 | Labels: ✓ | Anomaly:  78.8%
   Proxifier    |  2,000 samples | Dim: 768 | Labels: ✓ | Anomaly:   4.9%
   Zookeeper    |  2,000 samples | Dim: 768 | Labels: ✓ | Anomaly:  46.2%


Template + BERT Hybrid Features

In [38]:
def create_template_embeddings(templates, embedding_dim=50):
    template_ids = list(templates.keys())
    n_templates = len(template_ids)
    np.random.seed(RANDOM_SEED)
    template_embeddings = np.random.normal(0, 0.1, (n_templates, embedding_dim))
    template_to_idx = {tid: idx for idx, tid in enumerate(template_ids)}
    return template_embeddings, template_to_idx

In [39]:
def create_template_features(template_ids, templates, method='frequency', embedding_dim=50):
    features = {}
    template_counts = Counter(template_ids)
    total_logs = len(template_ids)
    
    if method == 'frequency':
        template_freq = {}
        for tid in template_ids:
            if tid == -1:
                template_freq[tid] = 0.0
            else:
                template_freq[tid] = template_counts[tid] / total_logs
        
        unique_templates = sorted(set(tid for tid in template_ids if tid != -1))
        feature_vector = []
        for tid in template_ids:
            vec = [0.0] * len(unique_templates)
            if tid != -1 and tid in unique_templates:
                idx = unique_templates.index(tid)
                vec[idx] = template_freq[tid]
            feature_vector.append(vec)
        
        return np.array(feature_vector), unique_templates
    
    elif method == 'onehot':
        unique_templates = sorted(set(tid for tid in template_ids if tid != -1))
        feature_vector = []
        for tid in template_ids:
            vec = [0.0] * len(unique_templates)
            if tid != -1 and tid in unique_templates:
                idx = unique_templates.index(tid)
                vec[idx] = 1.0
            feature_vector.append(vec)
        
        return np.array(feature_vector), unique_templates
    
    elif method == 'embedding':
        unique_templates = sorted(set(tid for tid in template_ids if tid != -1))
        template_embeddings, template_to_idx = create_template_embeddings(
            {tid: None for tid in unique_templates},
            embedding_dim=embedding_dim
        )
        
        feature_vector = []
        for tid in template_ids:
            if tid == -1 or tid not in template_to_idx:
                vec = np.zeros(template_embeddings.shape[1])
            else:
                vec = template_embeddings[template_to_idx[tid]]
            feature_vector.append(vec)
        
        return np.array(feature_vector), unique_templates

In [40]:
def combine_template_bert_features(bert_emb, template_features, method='concat'):
    if method == 'concat':
        return np.hstack([bert_emb, template_features])
    
    elif method == 'weighted':
        bert_normalized = bert_emb / np.linalg.norm(bert_emb, axis=1, keepdims=True)
        template_normalized = template_features / (np.linalg.norm(template_features, axis=1, keepdims=True) + 1e-8)
        return 0.7 * bert_normalized + 0.3 * template_normalized
    
    elif method == 'attention':
        return np.hstack([bert_emb, template_features])

In [41]:
hybrid_features_data = {}

for log_source in PROJECT_CONFIG['log_sources']:
    if log_source not in bert_embeddings_data or log_source not in template_data:
        print(f"⚠️  Missing data for {log_source}, skipping hybrid features")
        continue
    
    print(f"\n--- Creating hybrid features for {log_source} ---")
    
    bert_data = bert_embeddings_data[log_source]
    bert_embeddings = bert_data['embeddings']
    bert_dim = bert_embeddings.shape[1]
    
    template_info = template_data[log_source]
    template_ids = template_info['template_ids']
    templates = template_info['templates']
    
    if bert_data['sample_indices'] is not None:
        template_ids = [template_ids[i] for i in bert_data['sample_indices']]
    
    template_variants = {}
    
    template_onehot, unique_templates = create_template_features(
        template_ids, templates, method='onehot'
    )
    template_variants['onehot'] = template_onehot
    
    template_freq, _ = create_template_features(
        template_ids, templates, method='frequency'
    )
    template_variants['frequency'] = template_freq
    
    template_emb, _ = create_template_features(
        template_ids, templates, method='embedding', embedding_dim=bert_dim
    )
    template_variants['embedding'] = template_emb
    
    print(f"BERT embeddings shape: {bert_embeddings.shape}")
    print(f"Template features - OneHot: {template_onehot.shape}, "
          f"Frequency: {template_freq.shape}, Embedding: {template_emb.shape}")
    print(f"Unique templates: {len(unique_templates)}")
    
    hybrid_variants = {}
    
    for template_type, template_feats in template_variants.items():
        hybrid_concat = combine_template_bert_features(
            bert_embeddings, template_feats, method='concat'
        )
        hybrid_variants[f'bert_{template_type}_concat'] = hybrid_concat
        
        if template_type == 'embedding':
            hybrid_weighted = combine_template_bert_features(
                bert_embeddings, template_feats, method='weighted'
            )
            hybrid_variants[f'bert_{template_type}_weighted'] = hybrid_weighted
    
    hybrid_features_data[log_source] = {
        'bert_only': bert_embeddings,
        'template_variants': template_variants,
        'hybrid_variants': hybrid_variants,
        'labels': bert_data['labels'],
        'unique_templates': unique_templates,
        'template_ids': template_ids,
        'texts': bert_data['texts']
    }
    
    print(f"Created {len(hybrid_variants)} hybrid feature variants")


--- Creating hybrid features for Apache ---
BERT embeddings shape: (2000, 768)
Template features - OneHot: (2000, 6), Frequency: (2000, 6), Embedding: (2000, 768)
Unique templates: 6
Created 4 hybrid feature variants

--- Creating hybrid features for BGL ---
BERT embeddings shape: (2000, 768)
Template features - OneHot: (2000, 105), Frequency: (2000, 105), Embedding: (2000, 768)
Unique templates: 105
Created 4 hybrid feature variants

--- Creating hybrid features for HPC ---
BERT embeddings shape: (2000, 768)
Template features - OneHot: (2000, 45), Frequency: (2000, 45), Embedding: (2000, 768)
Unique templates: 45
Created 4 hybrid feature variants

--- Creating hybrid features for OpenSSH ---
BERT embeddings shape: (2000, 768)
Template features - OneHot: (2000, 23), Frequency: (2000, 23), Embedding: (2000, 768)
Unique templates: 23
Created 4 hybrid feature variants

--- Creating hybrid features for Proxifier ---
BERT embeddings shape: (2000, 768)
Template features - OneHot: (2000, 403

In [42]:
hybrid_features_path = FEATURES_PATH / "hybrid_features.pkl"
with open(hybrid_features_path, 'wb') as f:
    pickle.dump({
        'hybrid_features_data': hybrid_features_data,
        'feature_types': ['bert_only', 'template_onehot', 'template_frequency', 
                          'template_embedding', 'bert_onehot_concat', 'bert_frequency_concat',
                          'bert_embedding_concat', 'bert_embedding_weighted'],
        'creation_timestamp': datetime.now().isoformat()
    }, f)

In [43]:
print(f"Successfully created hybrid features for {len(hybrid_features_data)} sources")

for log_source, data in hybrid_features_data.items():
    print(f"\n{log_source}:")
    print(f"    BERT only: {data['bert_only'].shape}")
    print(f"    Template variants: {len(data['template_variants'])}")
    print(f"    Hybrid variants: {len(data['hybrid_variants'])}")
    if data['labels'] is not None:
        anomaly_rate = np.mean(data['labels']) * 100
        print(f"    Anomaly rate: {anomaly_rate:.1f}%")

Successfully created hybrid features for 6 sources

Apache:
    BERT only: (2000, 768)
    Template variants: 3
    Hybrid variants: 4
    Anomaly rate: 29.8%

BGL:
    BERT only: (2000, 768)
    Template variants: 3
    Hybrid variants: 4
    Anomaly rate: 75.0%

HPC:
    BERT only: (2000, 768)
    Template variants: 3
    Hybrid variants: 4
    Anomaly rate: 9.8%

OpenSSH:
    BERT only: (2000, 768)
    Template variants: 3
    Hybrid variants: 4
    Anomaly rate: 78.8%

Proxifier:
    BERT only: (2000, 768)
    Template variants: 3
    Hybrid variants: 4
    Anomaly rate: 4.9%

Zookeeper:
    BERT only: (2000, 768)
    Template variants: 3
    Hybrid variants: 4
    Anomaly rate: 46.2%


Cross-Source Feature Analysis and Leave-One-Out Preparation

In [45]:
bert_dims = []
template_counts = []
sample_counts = []

print("Source compatibility analysis:")
for source, data in hybrid_features_data.items():
    bert_dim = data['bert_only'].shape[1]
    n_templates = len(data['unique_templates'])
    n_samples = data['bert_only'].shape[0]
    has_labels = data['labels'] is not None
    
    bert_dims.append(bert_dim)
    template_counts.append(n_templates)
    sample_counts.append(n_samples)
    
    print(f"   {source:<12} | BERT: {bert_dim} | Templates: {n_templates:>3} | Samples: {n_samples:>6,} | Labels: {'Yes' if has_labels else 'NA'}")

print(f"\nCompatibility check:")
print(f"   BERT dimensions consistent: {len(set(bert_dims)) == 1} (all {bert_dims[0] if len(set(bert_dims)) == 1 else 'different'})")
print(f"   Template counts - Min: {min(template_counts)}, Max: {max(template_counts)}, Avg: {np.mean(template_counts):.1f}")
print(f"   Sample counts - Min: {min(sample_counts):,}, Max: {max(sample_counts):,}, Total: {sum(sample_counts):,}")


Source compatibility analysis:
   Apache       | BERT: 768 | Templates:   6 | Samples:  2,000 | Labels: Yes
   BGL          | BERT: 768 | Templates: 105 | Samples:  2,000 | Labels: Yes
   HPC          | BERT: 768 | Templates:  45 | Samples:  2,000 | Labels: Yes
   OpenSSH      | BERT: 768 | Templates:  23 | Samples:  2,000 | Labels: Yes
   Proxifier    | BERT: 768 | Templates: 403 | Samples:  2,000 | Labels: Yes
   Zookeeper    | BERT: 768 | Templates:  46 | Samples:  2,000 | Labels: Yes

Compatibility check:
   BERT dimensions consistent: True (all 768)
   Template counts - Min: 6, Max: 403, Avg: 104.7
   Sample counts - Min: 2,000, Max: 2,000, Total: 12,000


In [46]:
def create_leave_one_out_splits():
    splits = []
    
    for test_source in hybrid_features_data.keys():
        train_sources = [s for s in hybrid_features_data.keys() if s != test_source]
        
        splits.append({
            'test_source': test_source,
            'train_sources': train_sources,
            'test_samples': hybrid_features_data[test_source]['bert_only'].shape[0],
            'train_samples': sum(hybrid_features_data[s]['bert_only'].shape[0] for s in train_sources)
        })
    
    return splits

cross_source_splits = create_leave_one_out_splits()

In [47]:
print(f"Total experiments: {len(cross_source_splits)}")

for i, split in enumerate(cross_source_splits, 1):
    print(f"{i:2d}. Test: {split['test_source']:<12} | Train: {len(split['train_sources'])} sources | "
          f"Test samples: {split['test_samples']:>6,} | Train samples: {split['train_samples']:>7,}")

cross_source_path = FEATURES_PATH / "cross_source_splits.pkl"
with open(cross_source_path, 'wb') as f:
    pickle.dump({
        'splits': cross_source_splits,
        'feature_compatibility': {
            'bert_dims': bert_dims,
            'template_counts': template_counts,
            'sample_counts': sample_counts
        }
    }, f)

Total experiments: 6
 1. Test: Apache       | Train: 5 sources | Test samples:  2,000 | Train samples:  10,000
 2. Test: BGL          | Train: 5 sources | Test samples:  2,000 | Train samples:  10,000
 3. Test: HPC          | Train: 5 sources | Test samples:  2,000 | Train samples:  10,000
 4. Test: OpenSSH      | Train: 5 sources | Test samples:  2,000 | Train samples:  10,000
 5. Test: Proxifier    | Train: 5 sources | Test samples:  2,000 | Train samples:  10,000
 6. Test: Zookeeper    | Train: 5 sources | Test samples:  2,000 | Train samples:  10,000
