Setup

In [1]:
import os
import sys
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import json
import pickle
from collections import Counter
import re
from scipy.stats import entropy

import findspark
findspark.init()

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StandardScaler

import torch
from transformers import AutoTokenizer, AutoModel

import drain3
from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig
from drain3.masking import MaskingInstruction

from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier

In [2]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

PROJECT_ROOT = Path(r"C:\Computer Science\AIMLDL\log-anomaly-detection")
DATASET_PATH = PROJECT_ROOT / "dataset"
LABELED_DATA_PATH = DATASET_PATH / "labeled_data"
NORMALIZED_DATA_PATH = LABELED_DATA_PATH / "normalized"
FEATURES_PATH = PROJECT_ROOT / "features"
FEATURES_PATH.mkdir(parents=True, exist_ok=True)

os.environ['HADOOP_HOME'] = 'C:\\hadoop'
os.environ['PATH'] = f"{os.environ['HADOOP_HOME']}\\bin;{os.environ['PATH']}"

In [3]:
spark = SparkSession.builder \
    .master("local[*]") \
    .config("spark.driver.memory", "18g") \
    .config("spark.executor.memory", "16g") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.default.parallelism", "8") \
    .appName("MultiClassFeatureEngineering") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")

print(f"Spark version: {spark.version}")

Spark version: 3.4.1


In [4]:
PROJECT_CONFIG = {
    'bert_model_name': 'bert-base-uncased',
    'max_sequence_length': 512,
    'num_classes': 7,
    'label_map': {
        0: 'normal',
        1: 'security_anomaly',
        2: 'system_failure',
        3: 'performance_issue',
        4: 'network_anomaly',
        5: 'config_error',
        6: 'hardware_issue'
    },
    'log_sources': []
}

dataset_registry = {}

enhanced_files = list(NORMALIZED_DATA_PATH.glob("*_enhanced.csv"))
for file_path in enhanced_files:
    source_name = file_path.stem.replace('_enhanced', '')
    PROJECT_CONFIG['log_sources'].append(source_name)
    dataset_registry[source_name] = {
        'file_path': str(file_path),
        'log_type': source_name
    }

print(f"Loaded {len(dataset_registry)} log sources")
print(f"Label mapping: {PROJECT_CONFIG['label_map']}")

Loaded 9 log sources
Label mapping: {0: 'normal', 1: 'security_anomaly', 2: 'system_failure', 3: 'performance_issue', 4: 'network_anomaly', 5: 'config_error', 6: 'hardware_issue'}


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

tokenizer = AutoTokenizer.from_pretrained(PROJECT_CONFIG['bert_model_name'])
bert_model = AutoModel.from_pretrained(PROJECT_CONFIG['bert_model_name'])
bert_model.to(device)
bert_model.eval()

drain_configs = {
    'hdfs': {'sim_th': 0.5, 'depth': 4},
    'bgl': {'sim_th': 0.3, 'depth': 5},
    'hadoop': {'sim_th': 0.4, 'depth': 4},
    'apache': {'sim_th': 0.4, 'depth': 4},
    'default': {'sim_th': 0.4, 'depth': 4}
}

Using device: cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Temporal and Statistical Feature Extraction

In [6]:
def calculate_shannon_entropy(text):
    """Calculate Shannon entropy - higher for more random/error messages"""
    if not text or pd.isna(text):
        return 0
    text = str(text)
    if len(text) == 0:
        return 0
    prob = [text.count(c) / len(text) for c in set(text)]
    return -sum(p * np.log2(p) for p in prob if p > 0)

def count_repeated_words(text):
    """Count repeated words in text (common in system failures)"""
    if not text or pd.isna(text):
        return 0
    words = str(text).lower().split()
    if len(words) <= 1:
        return 0
    word_counts = Counter(words)
    return sum(1 for count in word_counts.values() if count > 1)

def count_repeated_chars(text):
    """Count repeated character sequences (common in error messages)"""
    if not text or pd.isna(text):
        return 0
    text = str(text)
    repeated_count = 0
    for i in range(len(text) - 1):
        if text[i] == text[i + 1]:
            repeated_count += 1
    return repeated_count

def get_error_patterns_by_source(source_type):
    """Source-specific error patterns for minority class detection"""
    patterns = {
        'apache': {
            'error_level': r'\b(error|critical|alert|emergency)\b',
            'http_error': r'\b(40[0-9]|50[0-9])\b',
            'security_threat': r'\b(attack|intrusion|unauthorized|forbidden|hack)\b',
            'resource_issue': r'\b(timeout|memory|disk|space|limit)\b'
        },
        'linux': {
            'kernel_panic': r'\b(kernel|panic|oops|segfault|core dump)\b',
            'auth_failure': r'\b(authentication failed|login failed|access denied)\b',
            'resource_exhaustion': r'\b(out of memory|disk full|no space|quota exceeded)\b',
            'hardware_error': r'\b(hardware|disk error|i/o error|bad sector)\b'
        },
        'hadoop': {
            'job_failure': r'\b(job failed|task failed|exception|error)\b',
            'performance_issue': r'\b(slow|timeout|latency|performance)\b',
            'network_problem': r'\b(connection|unreachable|network|socket)\b',
            'config_error': r'\b(configuration|config|property|setting)\b'
        },
        'openssh': {
            'security_breach': r'\b(failed password|invalid user|break-in|attack)\b',
            'connection_issue': r'\b(connection closed|timeout|refused)\b'
        },
        'bgl': {
            'system_failure': r'\b(failure|failed|error|exception)\b',
            'hardware_issue': r'\b(hardware|disk|memory|cpu|node)\b',
            'config_error': r'\b(config|configuration|parameter)\b'
        },
        'hdfs': {
            'system_failure': r'\b(block|replica|datanode|namenode|error)\b',
            'network_problem': r'\b(connection|network|timeout)\b'
        },
        'hpc': {
            'system_failure': r'\b(node|job|task|error|failure)\b',
            'performance_issue': r'\b(slow|performance|latency|timeout)\b',
            'network_problem': r'\b(network|connection|communication)\b',
            'hardware_issue': r'\b(hardware|memory|disk|cpu)\b'
        },
        'proxifier': {
            'network_anomaly': r'\b(connection|proxy|tunnel|network)\b'
        },
        'zookeeper': {
            'system_failure': r'\b(error|exception|failure)\b',
            'performance_issue': r'\b(timeout|slow|latency)\b',
            'network_problem': r'\b(connection|network|socket)\b',
            'config_error': r'\b(config|configuration|property)\b'
        }
    }
    return patterns.get(source_type.lower(), {})

def detect_holiday_patterns(timestamps):
    """Detect holiday/special time periods (simplified implementation)"""
    # This is a simplified version - in practice, you'd use a holiday calendar
    if timestamps is None or len(timestamps) == 0:
        return [0] * len(timestamps) if hasattr(timestamps, '__len__') else 0
    
    # For now, just mark weekends as "holiday periods"
    try:
        if hasattr(timestamps, 'dt'):
            return (timestamps.dt.dayofweek >= 5).astype(int)
        else:
            return [0] * len(timestamps)
    except:
        return [0] * len(timestamps) if hasattr(timestamps, '__len__') else 0

def add_imbalance_aware_features(df_pandas, source_type, content_col):
    """Add features specifically designed for imbalanced anomaly detection"""
    
    print(f"  Adding imbalance-aware features for {source_type}")
    
    # Message complexity features (anomalies often have different complexity)
    if content_col in df_pandas.columns:
        df_pandas['msg_length'] = df_pandas[content_col].str.len().fillna(0)
        df_pandas['msg_word_count'] = df_pandas[content_col].str.split().str.len().fillna(0)
        df_pandas['msg_unique_chars'] = df_pandas[content_col].apply(
            lambda x: len(set(str(x))) if pd.notna(x) else 0
        )
        df_pandas['msg_entropy'] = df_pandas[content_col].apply(calculate_shannon_entropy)
        
        # Anomaly-specific patterns (source-dependent)
        error_patterns = get_error_patterns_by_source(source_type)
        for pattern_name, pattern in error_patterns.items():
            df_pandas[f'has_{pattern_name}'] = df_pandas[content_col].str.contains(
                pattern, case=False, na=False
            ).astype(int)
        
        # Character distribution features
        df_pandas['special_char_ratio'] = (
            df_pandas[content_col].str.count(r'[^a-zA-Z0-9\s]') / 
            (df_pandas['msg_length'] + 1)
        ).fillna(0)
        df_pandas['number_ratio'] = (
            df_pandas[content_col].str.count(r'\d') / 
            (df_pandas['msg_length'] + 1)
        ).fillna(0)
        df_pandas['uppercase_ratio'] = (
            df_pandas[content_col].str.count(r'[A-Z]') / 
            (df_pandas['msg_length'] + 1)
        ).fillna(0)
        
        # Repetition patterns (common in system failures)
        df_pandas['repeated_words'] = df_pandas[content_col].apply(count_repeated_words)
        df_pandas['repeated_chars'] = df_pandas[content_col].apply(count_repeated_chars)
    
    return df_pandas

def add_temporal_anomaly_features(df_pandas):
    """Add temporal features that capture anomaly patterns"""
    
    print("  Adding enhanced temporal anomaly features")
    
    # Enhanced anomaly-specific temporal features
    if 'hour' in df_pandas.columns:
        df_pandas['is_off_hours'] = ((df_pandas['hour'] < 6) | (df_pandas['hour'] > 22)).astype(int)
    
    if 'is_weekend' in df_pandas.columns and 'is_night' in df_pandas.columns:
        df_pandas['is_weekend_night'] = (df_pandas['is_weekend'] & df_pandas['is_night']).astype(int)
    
    # Holiday patterns (simplified)
    if 'timestamp_dt' in df_pandas.columns:
        df_pandas['is_holiday_period'] = detect_holiday_patterns(df_pandas['timestamp_dt'])
    
    # Time gap analysis (anomalies often have unusual timing)
    if 'timestamp_dt' in df_pandas.columns:
        df_pandas = df_pandas.sort_values('timestamp_dt').reset_index(drop=True)
        df_pandas['time_gap_seconds'] = df_pandas['timestamp_dt'].diff().dt.total_seconds().fillna(0)
        df_pandas['is_burst'] = (df_pandas['time_gap_seconds'] < 1).astype(int)  # Rapid succession
        df_pandas['is_isolated'] = (df_pandas['time_gap_seconds'] > 300).astype(int)  # Isolated events
        
        # Rolling window features for different time scales
        windows = ['1min', '5min', '15min', '1H', '6H']
        df_pandas_indexed = df_pandas.set_index('timestamp_dt')
        
        for window in windows:
            try:
                # Log frequency in window
                df_pandas[f'log_count_{window}'] = df_pandas_indexed.rolling(window).size().values
                
                # Message diversity in window
                if any(col.startswith('has_') for col in df_pandas.columns):
                    error_cols = [col for col in df_pandas.columns if col.startswith('has_')]
                    df_pandas[f'error_density_{window}'] = (
                        df_pandas_indexed[error_cols].rolling(window).sum().sum(axis=1).values
                    )
            except Exception as e:
                print(f"    Warning: Could not create {window} features: {e}")
                df_pandas[f'log_count_{window}'] = 0
                df_pandas[f'error_density_{window}'] = 0
    
    return df_pandas

def add_statistical_anomaly_features(df_pandas):
    """Add statistical features that capture anomaly distributions"""
    
    print("  Adding statistical anomaly features")
    
    # Message length distribution features
    if 'msg_length' in df_pandas.columns:
        windows = [10, 50, 100]
        for w in windows:
            df_pandas[f'msg_len_mean_{w}'] = df_pandas['msg_length'].rolling(w, min_periods=1).mean()
            df_pandas[f'msg_len_std_{w}'] = df_pandas['msg_length'].rolling(w, min_periods=1).std()
            df_pandas[f'msg_len_zscore_{w}'] = (
                (df_pandas['msg_length'] - df_pandas[f'msg_len_mean_{w}']) / 
                (df_pandas[f'msg_len_std_{w}'] + 1e-6)
            )
            df_pandas[f'is_length_outlier_{w}'] = (
                np.abs(df_pandas[f'msg_len_zscore_{w}']) > 2
            ).astype(int)
    
    # Temporal distribution features
    for col in ['time_diff_seconds', 'logs_last_minute']:
        if col in df_pandas.columns:
            windows = [10, 50, 100]
            for w in windows:
                df_pandas[f'{col}_mean_{w}'] = df_pandas[col].rolling(w, min_periods=1).mean()
                df_pandas[f'{col}_std_{w}'] = df_pandas[col].rolling(w, min_periods=1).std()
                df_pandas[f'{col}_zscore_{w}'] = (
                    (df_pandas[col] - df_pandas[f'{col}_mean_{w}']) / 
                    (df_pandas[f'{col}_std_{w}'] + 1e-6)
                )
                df_pandas[f'is_{col}_outlier_{w}'] = (
                    np.abs(df_pandas[f'{col}_zscore_{w}']) > 2
                ).astype(int)
    
    return df_pandas

def create_imbalance_aware_template_features(template_data, labels):
    """Create template features that emphasize minority class patterns"""
    
    enhanced_features = []
    
    for log_source, data_dict in template_data.items():
        templates = data_dict['templates']
        template_ids = data_dict['template_ids']
        
        # Enhanced features for each log entry
        template_counts = Counter(template_ids)
        total = len(template_ids)
        
        for i, template_id in enumerate(template_ids):
            if template_id == -1:
                enhanced_features.append([0] * 15)  # Increased feature count
                continue
            
            # Basic features (existing)
            frequency = template_counts[template_id] / total
            rarity = 1.0 / (frequency + 1e-6)
            template_text = templates[template_id]['template']
            length = len(template_text.split())
            n_wildcards = sum([template_text.count(w) for w in ['<NUM>', '<IP>', '<PATH>', '<UUID>']])
            
            # NEW: Class-specific template features
            class_probs = np.array(templates[template_id]['class_dist']) / (templates[template_id]['count'] + 1e-6)
            
            # Minority class indicators
            minority_score = sum(class_probs[c] for c in [1, 3, 6])  # Classes with low coverage
            anomaly_score = sum(class_probs[c] for c in [1, 2, 3, 4, 5, 6])  # All non-normal
            
            # Template complexity features
            complexity_score = length * n_wildcards / (frequency + 1e-6)
            uniqueness_score = rarity * (1 - max(class_probs))
            
            # Combine features
            features = [
                rarity, length, n_wildcards, frequency,  # Original 4
                minority_score, anomaly_score,           # Class-specific 2
                complexity_score, uniqueness_score,      # Complexity 2
                *class_probs.tolist()                    # Class probabilities 7
            ]  # Total: 15 features
            
            enhanced_features.append(features)
    
    return np.array(enhanced_features)

def analyze_class_imbalance(df_pandas):
    """Comprehensive class imbalance analysis"""
    
    if 'AnomalyLabel' not in df_pandas.columns:
        return None
        
    analysis = {}
    
    # Basic distribution
    label_counts = df_pandas['AnomalyLabel'].value_counts().sort_index()
    total_samples = len(df_pandas)
    
    analysis['class_distribution'] = {}
    analysis['class_percentages'] = {}
    
    for label in range(7):
        count = label_counts.get(label, 0)
        analysis['class_distribution'][label] = count
        analysis['class_percentages'][label] = (count / total_samples) * 100
    
    # Imbalance metrics
    present_classes = [label for label in range(7) if label_counts.get(label, 0) > 0]
    if len(present_classes) > 1:
        counts = [label_counts[label] for label in present_classes]
        analysis['imbalance_ratio'] = max(counts) / min(counts)
        analysis['minority_classes'] = [
            label for label in present_classes 
            if label_counts[label] < total_samples * 0.05  # Less than 5%
        ]
        analysis['extreme_minority'] = [
            label for label in present_classes 
            if label_counts[label] < total_samples * 0.01  # Less than 1%
        ]
    else:
        analysis['imbalance_ratio'] = 1.0
        analysis['minority_classes'] = []
        analysis['extreme_minority'] = []
    
    return analysis

In [7]:
pyspark_features_data = {}

for log_source in PROJECT_CONFIG['log_sources']:
    if log_source not in dataset_registry:
        continue
    
    print(f"\n{'='*60}")
    print(f"Processing: {log_source}")
    print(f"{'='*60}")
    
    file_path = dataset_registry[log_source]['file_path']
    
    df_spark = spark.read.csv(file_path, header=True, inferSchema=True)
    
    print(f"Loaded {df_spark.count()} rows with {len(df_spark.columns)} columns")
    
    content_col = None
    for col in ['Content', 'content', 'Message', 'message', 'Text', 'text']:
        if col in df_spark.columns:
            content_col = col
            break
    
    if content_col is None:
        print(f"No content column found, skipping")
        continue
    
    if 'timestamp_dt' in df_spark.columns:
        df_spark = df_spark.withColumn('timestamp', F.col('timestamp_dt').cast(TimestampType()))
    elif 'timestamp_normalized' in df_spark.columns:
        df_spark = df_spark.withColumn('timestamp', F.to_timestamp('timestamp_normalized'))
    else:
        print("No timestamp column found, skipping")
        continue
    
    print("Extracting basic temporal features")
    
    df_spark = df_spark.withColumn('hour', F.hour('timestamp')) \
                       .withColumn('day_of_week', F.dayofweek('timestamp')) \
                       .withColumn('day_of_month', F.dayofmonth('timestamp')) \
                       .withColumn('month', F.month('timestamp')) \
                       .withColumn('is_weekend', F.when(F.dayofweek('timestamp').isin([1, 7]), 1).otherwise(0)) \
                       .withColumn('is_business_hours', F.when(F.hour('timestamp').between(9, 17), 1).otherwise(0)) \
                       .withColumn('is_night', F.when(F.hour('timestamp').between(0, 6), 1).otherwise(0))
    
    window_spec = Window.orderBy('timestamp')
    window_spec_1min = Window.orderBy(F.col('timestamp').cast('long')).rangeBetween(-60, 0)
    
    df_spark = df_spark.withColumn('log_index', F.monotonically_increasing_id())
    
    df_spark = df_spark.withColumn('prev_timestamp', F.lag('timestamp', 1).over(window_spec))
    df_spark = df_spark.withColumn('time_diff_seconds', 
                                    F.when(F.col('prev_timestamp').isNotNull(), 
                                           F.unix_timestamp('timestamp') - F.unix_timestamp('prev_timestamp'))
                                    .otherwise(0))
    
    df_spark = df_spark.withColumn('logs_last_minute', 
                                    F.count('*').over(window_spec_1min))
    
    print("Calculating basic statistical features")
    
    df_spark = df_spark.withColumn('content_length', F.length(F.col(content_col)))
    df_spark = df_spark.withColumn('word_count', F.size(F.split(F.col(content_col), ' ')))
    
    window_10 = Window.orderBy('timestamp').rowsBetween(-9, 0)
    
    df_spark = df_spark.withColumn('content_length_mean_10', F.avg('content_length').over(window_10))
    df_spark = df_spark.withColumn('content_length_std_10', F.stddev('content_length').over(window_10))
    df_spark = df_spark.withColumn('time_diff_mean_10', F.avg('time_diff_seconds').over(window_10))
    df_spark = df_spark.withColumn('time_diff_std_10', F.stddev('time_diff_seconds').over(window_10))
    
    hour_counts = df_spark.groupBy('hour').count().withColumnRenamed('count', 'hour_frequency')
    df_spark = df_spark.join(hour_counts, on='hour', how='left')
    
    if 'AnomalyLabel' in df_spark.columns:
        df_spark = df_spark.withColumn('AnomalyLabel', F.col('AnomalyLabel').cast(IntegerType()))
        df_spark = df_spark.withColumn('AnomalyLabel', 
                                        F.when(F.col('AnomalyLabel').isNull(), 0)
                                        .when(F.col('AnomalyLabel') < 0, 0)
                                        .when(F.col('AnomalyLabel') > 6, 0)
                                        .otherwise(F.col('AnomalyLabel')))
    
    df_spark.cache()
    
    total_count = df_spark.count()
    if 'AnomalyLabel' in df_spark.columns:
        label_dist = df_spark.groupBy('AnomalyLabel').count().orderBy('AnomalyLabel').collect()
        print(f"Total: {total_count:,}")
        print("Label distribution:")
        for row in label_dist:
            lbl = row['AnomalyLabel']
            cnt = row['count']
            lbl_name = PROJECT_CONFIG['label_map'].get(lbl, 'unknown')
            print(f"  {lbl} ({lbl_name}): {cnt:,} ({cnt/total_count*100:.2f}%)")
    
    max_samples = 5000
    if total_count > max_samples:
        sample_fraction = max_samples / total_count
        df_spark_sampled = df_spark.sample(withReplacement=False, fraction=sample_fraction, seed=RANDOM_SEED)
        print(f"Sampled {df_spark_sampled.count()} rows")
    else:
        df_spark_sampled = df_spark
    
    df_pandas = df_spark_sampled.toPandas()
    
    print(f"Converted to Pandas: {df_pandas.shape}")
    
    # ========================================================================
    # APPLY ENHANCED FEATURE ENGINEERING FOR IMBALANCED DATA
    # ========================================================================
    
    # Phase 1: Add imbalance-aware features
    df_pandas = add_imbalance_aware_features(df_pandas, log_source, content_col)
    
    # Phase 2: Enhanced temporal features
    df_pandas = add_temporal_anomaly_features(df_pandas)
    
    # Phase 3: Statistical anomaly features
    df_pandas = add_statistical_anomaly_features(df_pandas)
    
    # Analyze class imbalance
    imbalance_analysis = analyze_class_imbalance(df_pandas)
    
    if imbalance_analysis:
        print(f"IMBALANCE ANALYSIS:")
        print(f"Classes present: {len([c for c in range(7) if imbalance_analysis['class_distribution'][c] > 0])}/7")
        print(f"Imbalance ratio: {imbalance_analysis['imbalance_ratio']:.2f}:1")
        
        if imbalance_analysis['minority_classes']:
            print(f"Minority classes: {imbalance_analysis['minority_classes']}")
        if imbalance_analysis['extreme_minority']:
            print(f"Extreme minority: {imbalance_analysis['extreme_minority']}")
    
    print(f"Enhanced features shape: {df_pandas.shape}")
    print()
    
    pyspark_features_data[log_source] = {
        'spark_df': df_spark,
        'pandas_df': df_pandas,
        'content_col': content_col,
        'total_count': total_count,
        'imbalance_analysis': imbalance_analysis
    }


Processing: Apache_2k
Loaded 2000 rows with 21 columns
Extracting basic temporal features
Calculating basic statistical features
Total: 2,000
Label distribution:
  0 (normal): 1,405 (70.25%)
  1 (security_anomaly): 32 (1.60%)
  2 (system_failure): 563 (28.15%)
Converted to Pandas: (2000, 31)
  Adding imbalance-aware features for Apache_2k
  Adding enhanced temporal anomaly features
  Adding statistical anomaly features
IMBALANCE ANALYSIS:
Classes present: 3/7
Imbalance ratio: 43.91:1
Minority classes: [1]
Enhanced features shape: (2000, 92)


Processing: BGL_2k
Loaded 2000 rows with 28 columns
Extracting basic temporal features
Calculating basic statistical features
Total: 2,000
Label distribution:
  0 (normal): 501 (25.05%)
  1 (security_anomaly): 150 (7.50%)
  2 (system_failure): 913 (45.65%)
  5 (config_error): 73 (3.65%)
  6 (hardware_issue): 363 (18.15%)
Converted to Pandas: (2000, 37)
  Adding imbalance-aware features for BGL_2k
  Adding enhanced temporal anomaly features
  Addi

Drain Template Extraction

In [8]:
template_data = {}

for log_source, data_dict in pyspark_features_data.items():
    print(f"Enhanced Template Extraction for {log_source}")
    
    df_pandas = data_dict['pandas_df']
    content_col = data_dict['content_col']
    
    source_config = drain_configs.get(log_source, drain_configs['default'])
    
    drain_config = TemplateMinerConfig()
    drain_config.drain_sim_th = source_config['sim_th']
    drain_config.drain_depth = source_config['depth']
    drain_config.drain_max_children = 100
    
    # Enhanced masking instructions for better anomaly detection
    drain_config.masking_instructions = [
        MaskingInstruction(r'\d+', "<NUM>"),
        MaskingInstruction(r'[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}', "<UUID>"),
        MaskingInstruction(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b', "<IP>"),
        MaskingInstruction(r'/[^\s]*', "<PATH>"),
        MaskingInstruction(r'\b[0-9a-fA-F]{8,}\b', "<HEX>"),  # Hexadecimal values
        MaskingInstruction(r'\b\d{4}-\d{2}-\d{2}\b', "<DATE>"),  # Date patterns
        MaskingInstruction(r'\b\d{2}:\d{2}:\d{2}\b', "<TIME>"),  # Time patterns
    ]
    
    template_miner = TemplateMiner(config=drain_config)
    
    templates = {}
    template_ids = []
    
    labels = df_pandas['AnomalyLabel'].values if 'AnomalyLabel' in df_pandas.columns else None
    
    print("Processing log messages for template extraction...")
    for idx, content in enumerate(df_pandas[content_col].fillna("").astype(str)):
        if content.strip() == "":
            template_ids.append(-1)
            continue
        
        result = template_miner.add_log_message(content.strip())
        tid = result["cluster_id"]
        template_ids.append(tid)
        
        if tid not in templates:
            templates[tid] = {
                'template': result["template_mined"],
                'count': 1,
                'class_dist': [0] * 7,
                'anomaly_score': 0.0,
                'minority_score': 0.0
            }
        else:
            templates[tid]['count'] += 1
        
        if labels is not None:
            lbl = int(labels[idx])
            templates[tid]['class_dist'][lbl] += 1
    
    print(f"Extracted {len(templates)} unique templates")
    
    # Calculate enhanced template scores
    for tid, template_info in templates.items():
        class_probs = np.array(template_info['class_dist']) / (template_info['count'] + 1e-6)
        
        # Anomaly score (all non-normal classes)
        template_info['anomaly_score'] = sum(class_probs[c] for c in [1, 2, 3, 4, 5, 6])
        
        # Minority score (focus on classes with low coverage)
        template_info['minority_score'] = sum(class_probs[c] for c in [1, 3, 6])
    
    # Create enhanced template features using the improved function
    enhanced_template_features = create_imbalance_aware_template_features(
        {log_source: {'templates': templates, 'template_ids': template_ids}}, 
        labels
    )
    
    template_data[log_source] = {
        'templates': templates,
        'template_ids': template_ids,
        'enhanced_features': enhanced_template_features
    }
    
    print(f"Enhanced template features shape: {template_data[log_source]['enhanced_features'].shape}")
    print(f"Template features per sample: {template_data[log_source]['enhanced_features'].shape[1]}")
    
    # Template analysis for imbalanced data
    if labels is not None:
        minority_templates = []
        anomaly_templates = []
        
        for tid, template_info in templates.items():
            if template_info['minority_score'] > 0.5:
                minority_templates.append(tid)
            if template_info['anomaly_score'] > 0.3:
                anomaly_templates.append(tid)
        
        print(f"Templates with high minority class association: {len(minority_templates)}")
        print(f"Templates with high anomaly association: {len(anomaly_templates)}")
    
    print()

Enhanced Template Extraction for Apache_2k
Processing log messages for template extraction...
Extracted 6 unique templates
Enhanced template features shape: (2000, 15)
Template features per sample: 15
Templates with high minority class association: 1
Templates with high anomaly association: 4

Enhanced Template Extraction for BGL_2k
Processing log messages for template extraction...
Extracted 105 unique templates
Enhanced template features shape: (2000, 15)
Template features per sample: 15
Templates with high minority class association: 32
Templates with high anomaly association: 59

Enhanced Template Extraction for Hadoop_2k
Processing log messages for template extraction...
Extracted 102 unique templates
Enhanced template features shape: (2000, 15)
Template features per sample: 15
Templates with high minority class association: 1
Templates with high anomaly association: 21

Enhanced Template Extraction for HDFS_2k
Processing log messages for template extraction...
Extracted 16 unique

Bert Embeddings and Statistical Features

In [9]:
bert_features_data = {}

for log_source, data_dict in pyspark_features_data.items():
    print(f"Enhanced BERT Feature Generation for {log_source}")
    
    df_pandas = data_dict['pandas_df']
    content_col = data_dict['content_col']
    
    texts = df_pandas[content_col].fillna("").astype(str).tolist()
    labels = df_pandas['AnomalyLabel'].values if 'AnomalyLabel' in df_pandas.columns else None
    
    print(f"Processing {len(texts)} texts...")
    all_embeddings = []
    batch_size = 16
    
    # Generate BERT embeddings
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            
            encoded = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            ).to(device)
            
            outputs = bert_model(**encoded)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            all_embeddings.append(cls_embeddings.cpu().numpy())
            
            if (i // batch_size) % 10 == 0:
                print(f"  Processed {i}/{len(texts)} texts")
    
    bert_embeddings = np.vstack(all_embeddings)
    print(f"BERT embeddings shape: {bert_embeddings.shape}")
    
    # Enhanced statistical features for imbalanced data
    print("Calculating enhanced statistical features...")
    
    # Multiple window sizes for different temporal patterns
    window_sizes = [5, 10, 20, 50]
    statistical_features = []
    
    for i in range(len(bert_embeddings)):
        sample_stats = []
        
        for window_size in window_sizes:
            start = max(0, i - window_size)
            window = bert_embeddings[start:i+1]
            
            # Basic statistics
            mean_emb = np.mean(window, axis=0)
            std_emb = np.std(window, axis=0)
            distance_from_mean = np.linalg.norm(bert_embeddings[i] - mean_emb)
            avg_std = np.mean(std_emb)
            
            # Distance-based features
            if len(window) > 1:
                distances = [np.linalg.norm(bert_embeddings[i] - w) for w in window]
                min_dist = np.min(distances)
                max_dist = np.max(distances)
                median_dist = np.median(distances)
                
                # Outlier detection features
                q75, q25 = np.percentile(distances, [75, 25])
                iqr = q75 - q25
                outlier_threshold = q75 + 1.5 * iqr
                is_outlier = 1 if distance_from_mean > outlier_threshold else 0
            else:
                min_dist = max_dist = median_dist = 0
                is_outlier = 0
            
            # Anomaly-specific features
            cosine_sim_mean = np.dot(bert_embeddings[i], mean_emb) / (
                np.linalg.norm(bert_embeddings[i]) * np.linalg.norm(mean_emb) + 1e-8
            )
            
            sample_stats.extend([
                distance_from_mean, avg_std, min_dist, max_dist, 
                median_dist, is_outlier, cosine_sim_mean
            ])
        
        # Class-specific features (if labels available)
        if labels is not None:
            current_label = labels[i]
            
            # Find similar class samples in window
            window_labels = labels[start:i+1] if i > 0 else [current_label]
            same_class_ratio = sum(1 for l in window_labels if l == current_label) / len(window_labels)
            minority_class_indicator = 1 if current_label in [1, 3, 6] else 0
            
            sample_stats.extend([same_class_ratio, minority_class_indicator])
        else:
            sample_stats.extend([0, 0])
        
        statistical_features.append(sample_stats)
    
    statistical_features = np.array(statistical_features)
    print(f"Enhanced statistical features shape: {statistical_features.shape}")
    
    # Additional anomaly-specific BERT features
    print("Calculating anomaly-specific BERT features...")
    
    # Sentence-level features
    sentence_features = []
    for i, text in enumerate(texts):
        text_len = len(text)
        word_count = len(text.split())
        
        # Embedding magnitude (anomalies might have different magnitudes)
        emb_magnitude = np.linalg.norm(bert_embeddings[i])
        
        # Embedding sparsity (count of near-zero values)
        emb_sparsity = np.sum(np.abs(bert_embeddings[i]) < 0.01) / len(bert_embeddings[i])
        
        # Embedding entropy (measure of information content)
        emb_normalized = np.abs(bert_embeddings[i]) / (np.sum(np.abs(bert_embeddings[i])) + 1e-8)
        emb_entropy = -np.sum(emb_normalized * np.log(emb_normalized + 1e-8))
        
        sentence_features.append([
            text_len, word_count, emb_magnitude, emb_sparsity, emb_entropy
        ])
    
    sentence_features = np.array(sentence_features)
    print(f"Sentence-level features shape: {sentence_features.shape}")
    
    bert_features_data[log_source] = {
        'embeddings': bert_embeddings,
        'statistical_features': statistical_features,
        'sentence_features': sentence_features
    }
    
    print(f"Total BERT-based features: {bert_embeddings.shape[1] + statistical_features.shape[1] + sentence_features.shape[1]}")
    print()

Enhanced BERT Feature Generation for Apache_2k
Processing 2000 texts...
  Processed 0/2000 texts
  Processed 160/2000 texts
  Processed 320/2000 texts
  Processed 480/2000 texts
  Processed 640/2000 texts
  Processed 800/2000 texts
  Processed 960/2000 texts
  Processed 1120/2000 texts
  Processed 1280/2000 texts
  Processed 1440/2000 texts
  Processed 1600/2000 texts
  Processed 1760/2000 texts
  Processed 1920/2000 texts
BERT embeddings shape: (2000, 768)
Calculating enhanced statistical features...
Enhanced statistical features shape: (2000, 30)
Calculating anomaly-specific BERT features...
Sentence-level features shape: (2000, 5)
Total BERT-based features: 803

Enhanced BERT Feature Generation for BGL_2k
Processing 2000 texts...
  Processed 0/2000 texts
  Processed 160/2000 texts
  Processed 320/2000 texts
  Processed 480/2000 texts
  Processed 640/2000 texts
  Processed 800/2000 texts
  Processed 960/2000 texts
  Processed 1120/2000 texts
  Processed 1280/2000 texts
  Processed 14

Hybrid Feature

In [10]:
def select_features_for_imbalanced_classes(X, y, feature_names, top_k=200):
    """Select features that are most informative for minority classes"""
    
    print(f"Selecting top {top_k} features for imbalanced learning...")
    
    # Method 1: Mutual Information (handles imbalanced data well)
    mi_selector = SelectKBest(mutual_info_classif, k=min(top_k, X.shape[1]))
    mi_selector.fit(X, y)
    mi_scores = mi_selector.scores_
    
    # Method 2: Random Forest Feature Importance (with balanced class weights)
    rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42, n_jobs=-1)
    rf.fit(X, y)
    rf_importance = rf.feature_importances_
    
    # Combine scores (60% MI, 40% RF)
    combined_scores = 0.6 * (mi_scores / np.max(mi_scores)) + 0.4 * (rf_importance / np.max(rf_importance))
    
    # Select top features
    top_indices = np.argsort(combined_scores)[-top_k:]
    selected_features = [feature_names[i] for i in top_indices] if feature_names else list(range(top_k))
    
    print(f"Selected {len(selected_features)} features for imbalanced learning")
    
    return top_indices, selected_features, combined_scores

hybrid_features_data = {}

for log_source in pyspark_features_data.keys():
    if log_source not in bert_features_data or log_source not in template_data:
        continue
    
    print(f"Enhanced Feature Assembly for {log_source}")
    
    df_pandas = pyspark_features_data[log_source]['pandas_df']
    bert_emb = bert_features_data[log_source]['embeddings']
    bert_stat_features = bert_features_data[log_source]['statistical_features']
    bert_sentence_features = bert_features_data[log_source]['sentence_features']
    template_features = template_data[log_source]['enhanced_features']
    
    # Enhanced feature collection
    temporal_cols = [
        'hour', 'day_of_week', 'is_weekend', 'is_business_hours', 
        'time_diff_seconds', 'logs_last_minute', 'is_night',
        'is_off_hours', 'is_weekend_night', 'is_holiday_period',
        'is_burst', 'is_isolated'
    ]
    
    statistical_cols = [
        'content_length', 'word_count', 'content_length_mean_10', 
        'content_length_std_10', 'time_diff_mean_10', 'time_diff_std_10',
        'hour_frequency'
    ]
    
    # Anomaly-specific features
    anomaly_cols = [col for col in df_pandas.columns if col.startswith('has_')]
    complexity_cols = [
        'msg_length', 'msg_word_count', 'msg_unique_chars', 'msg_entropy',
        'special_char_ratio', 'number_ratio', 'uppercase_ratio',
        'repeated_words', 'repeated_chars'
    ]
    
    # Rolling window features
    rolling_cols = [col for col in df_pandas.columns if any(x in col for x in ['_mean_', '_std_', '_zscore_', '_outlier_'])]
    
    # Time-based features
    time_cols = [col for col in df_pandas.columns if 'log_count_' in col or 'error_density_' in col]
    
    # Collect available features
    available_temporal = [c for c in temporal_cols if c in df_pandas.columns]
    available_statistical = [c for c in statistical_cols if c in df_pandas.columns]
    available_anomaly = [c for c in anomaly_cols if c in df_pandas.columns]
    available_complexity = [c for c in complexity_cols if c in df_pandas.columns]
    available_rolling = [c for c in rolling_cols if c in df_pandas.columns]
    available_time = [c for c in time_cols if c in df_pandas.columns]
    
    # Extract feature arrays
    temporal_features = df_pandas[available_temporal].fillna(0).values if available_temporal else None
    statistical_features = df_pandas[available_statistical].fillna(0).values if available_statistical else None
    anomaly_features = df_pandas[available_anomaly].fillna(0).values if available_anomaly else None
    complexity_features = df_pandas[available_complexity].fillna(0).values if available_complexity else None
    rolling_features = df_pandas[available_rolling].fillna(0).values if available_rolling else None
    time_features = df_pandas[available_time].fillna(0).values if available_time else None
    
    print(f"Enhanced feature dimensions:")
    print(f"  BERT embeddings: {bert_emb.shape}")
    print(f"  BERT statistical: {bert_stat_features.shape}")
    print(f"  BERT sentence: {bert_sentence_features.shape}")
    print(f"  Template enhanced: {template_features.shape}")
    if temporal_features is not None:
        print(f"  Temporal: {temporal_features.shape}")
    if statistical_features is not None:
        print(f"  Statistical: {statistical_features.shape}")
    if anomaly_features is not None:
        print(f"  Anomaly patterns: {anomaly_features.shape}")
    if complexity_features is not None:
        print(f"  Message complexity: {complexity_features.shape}")
    if rolling_features is not None:
        print(f"  Rolling statistics: {rolling_features.shape}")
    if time_features is not None:
        print(f"  Time-based: {time_features.shape}")
    
    # Create enhanced feature variants
    feature_variants = {}
    
    # Basic variants
    feature_variants['bert_only'] = bert_emb
    feature_variants['bert_enhanced'] = np.hstack([bert_emb, bert_stat_features, bert_sentence_features])
    feature_variants['template_enhanced'] = template_features
    
    # Imbalance-aware variants
    imbalance_components = [bert_emb, bert_stat_features, template_features]
    
    if anomaly_features is not None:
        imbalance_components.append(anomaly_features)
        feature_variants['anomaly_focused'] = np.hstack([bert_emb, anomaly_features, template_features])
    
    if complexity_features is not None:
        imbalance_components.append(complexity_features)
    
    if temporal_features is not None:
        imbalance_components.append(temporal_features)
    
    if statistical_features is not None:
        imbalance_components.append(statistical_features)
    
    if rolling_features is not None:
        imbalance_components.append(rolling_features)
    
    if time_features is not None:
        imbalance_components.append(time_features)
    
    # Comprehensive feature set
    feature_variants['imbalance_aware_full'] = np.hstack(imbalance_components)
    
    # Sentence-level focused variant
    if bert_sentence_features is not None:
        sentence_components = [bert_emb, bert_sentence_features, template_features]
        if complexity_features is not None:
            sentence_components.append(complexity_features)
        feature_variants['sentence_focused'] = np.hstack(sentence_components)
    
    labels = df_pandas['AnomalyLabel'].values if 'AnomalyLabel' in df_pandas.columns else None
    
    # Feature selection for imbalanced data
    if labels is not None and len(np.unique(labels)) > 1:
        print("\nApplying feature selection for imbalanced classes...")
        
        # Create feature names
        feature_names = []
        feature_names.extend([f'bert_{i}' for i in range(bert_emb.shape[1])])
        feature_names.extend([f'bert_stat_{i}' for i in range(bert_stat_features.shape[1])])
        feature_names.extend([f'template_{i}' for i in range(template_features.shape[1])])
        feature_names.extend(available_anomaly)
        feature_names.extend(available_complexity)
        feature_names.extend(available_temporal)
        feature_names.extend(available_statistical)
        feature_names.extend(available_rolling)
        feature_names.extend(available_time)
        
        # Apply feature selection
        full_features = feature_variants['imbalance_aware_full']
        top_indices, selected_features, feature_scores = select_features_for_imbalanced_classes(
            full_features, labels, feature_names, top_k=min(200, full_features.shape[1])
        )
        
        feature_variants['selected_imbalanced'] = full_features[:, top_indices]
        
        # Store feature selection info
        feature_selection_info = {
            'selected_indices': top_indices,
            'selected_features': selected_features,
            'feature_scores': feature_scores,
            'total_features': full_features.shape[1]
        }
    else:
        feature_selection_info = None
    
    hybrid_features_data[log_source] = {
        'feature_variants': feature_variants,
        'labels': labels,
        'texts': df_pandas[pyspark_features_data[log_source]['content_col']].tolist(),
        'feature_selection_info': feature_selection_info,
        'imbalance_analysis': pyspark_features_data[log_source]['imbalance_analysis']
    }
    
    print(f"\nCreated {len(feature_variants)} enhanced feature variants:")
    for variant_name, features in feature_variants.items():
        print(f"  - {variant_name}: {features.shape[1]} features")
    
    if labels is not None:
        unique, counts = np.unique(labels, return_counts=True)
        print(f"\nLabel distribution:")
        for lbl, cnt in zip(unique, counts):
            lbl_name = PROJECT_CONFIG['label_map'].get(int(lbl), 'unknown')
            print(f"  {int(lbl)} ({lbl_name}): {cnt} ({cnt/len(labels)*100:.2f}%)")
    
    print()

Enhanced Feature Assembly for Apache_2k
Enhanced feature dimensions:
  BERT embeddings: (2000, 768)
  BERT statistical: (2000, 30)
  BERT sentence: (2000, 5)
  Template enhanced: (2000, 15)
  Temporal: (2000, 12)
  Statistical: (2000, 7)
  Message complexity: (2000, 9)
  Rolling statistics: (2000, 40)
  Time-based: (2000, 10)

Applying feature selection for imbalanced classes...
Selecting top 200 features for imbalanced learning...
Selected 200 features for imbalanced learning

Created 6 enhanced feature variants:
  - bert_only: 768 features
  - bert_enhanced: 803 features
  - template_enhanced: 15 features
  - imbalance_aware_full: 891 features
  - sentence_focused: 797 features
  - selected_imbalanced: 200 features

Label distribution:
  0 (normal): 1405 (70.25%)
  1 (security_anomaly): 32 (1.60%)
  2 (system_failure): 563 (28.15%)

Enhanced Feature Assembly for BGL_2k
Enhanced feature dimensions:
  BERT embeddings: (2000, 768)
  BERT statistical: (2000, 30)
  BERT sentence: (2000, 5

In [11]:
features_save_path = FEATURES_PATH / "enhanced_imbalanced_features.pkl"
with open(features_save_path, 'wb') as f:
    pickle.dump({
        'hybrid_features_data': hybrid_features_data,
        'template_data': template_data,
        'bert_features_data': bert_features_data,
        'pyspark_features_data': {k: {
            'imbalance_analysis': v['imbalance_analysis'],
            'total_count': v['total_count'],
            'content_col': v['content_col']
        } for k, v in pyspark_features_data.items()},
        'feature_types': list(hybrid_features_data[list(hybrid_features_data.keys())[0]]['feature_variants'].keys()),
        'config': PROJECT_CONFIG,
        'enhancement_info': {
            'anomaly_patterns_added': True,
            'temporal_features_enhanced': True,
            'statistical_features_enhanced': True,
            'template_features_enhanced': True,
            'bert_features_enhanced': True,
            'feature_selection_applied': True,
            'imbalance_analysis_included': True
        },
        'timestamp': datetime.now().isoformat()
    }, f)

print(f"\nSaved: {features_save_path}")


Saved: C:\Computer Science\AIMLDL\log-anomaly-detection\features\enhanced_imbalanced_features.pkl


In [12]:
cross_source_splits = []

for test_source in hybrid_features_data.keys():
    train_sources = [s for s in hybrid_features_data.keys() if s != test_source]
    
    if hybrid_features_data[test_source]['labels'] is None:
        continue
    
    test_samples = len(hybrid_features_data[test_source]['labels'])
    train_samples = sum(len(hybrid_features_data[s]['labels']) 
                       for s in train_sources 
                       if hybrid_features_data[s]['labels'] is not None)
    
    # Add imbalance analysis for this split
    test_imbalance = hybrid_features_data[test_source]['imbalance_analysis']
    
    # Calculate combined train imbalance
    train_label_counts = Counter()
    for s in train_sources:
        if hybrid_features_data[s]['labels'] is not None:
            for label in hybrid_features_data[s]['labels']:
                train_label_counts[label] += 1
    
    train_imbalance_ratio = max(train_label_counts.values()) / min(train_label_counts.values()) if train_label_counts else 1.0
    
    cross_source_splits.append({
        'test_source': test_source,
        'train_sources': train_sources,
        'test_samples': test_samples,
        'train_samples': train_samples,
        'test_imbalance_analysis': test_imbalance,
        'train_imbalance_ratio': train_imbalance_ratio,
        'train_label_distribution': dict(train_label_counts)
    })

splits_save_path = FEATURES_PATH / "enhanced_cross_source_splits.pkl"
with open(splits_save_path, 'wb') as f:
    pickle.dump({'splits': cross_source_splits}, f)

print(f"Saved: {splits_save_path}")

Saved: C:\Computer Science\AIMLDL\log-anomaly-detection\features\enhanced_cross_source_splits.pkl


In [13]:
total_sources = len(hybrid_features_data)
print(f"Processed {total_sources} log sources with enhanced features")

# Feature variant analysis
if hybrid_features_data:
    sample_source = list(hybrid_features_data.keys())[0]
    feature_variants = hybrid_features_data[sample_source]['feature_variants']
    
    print(f"\n🔧 Enhanced Feature Variants Created:")
    for variant_name, features in feature_variants.items():
        print(f"  - {variant_name}: {features.shape[1]} features")
    
    # Imbalance analysis summary
    print(f"\nImbalance Analysis Summary:")
    extreme_imbalance_sources = []
    high_imbalance_sources = []
    minority_class_coverage = {i: 0 for i in range(7)}
    
    for source, data in hybrid_features_data.items():
        if data['imbalance_analysis']:
            imbalance_ratio = data['imbalance_analysis']['imbalance_ratio']
            if imbalance_ratio > 100:
                extreme_imbalance_sources.append(source)
            elif imbalance_ratio > 10:
                high_imbalance_sources.append(source)
            
            # Count class coverage
            for class_id, count in data['imbalance_analysis']['class_distribution'].items():
                if count > 0:
                    minority_class_coverage[class_id] += 1
    
    print(f"  - Sources with extreme imbalance (>100:1): {len(extreme_imbalance_sources)}")
    print(f"  - Sources with high imbalance (>10:1): {len(high_imbalance_sources)}")
    
    print(f"\nClass Coverage Across Sources:")
    for class_id, coverage in minority_class_coverage.items():
        class_name = PROJECT_CONFIG['label_map'][class_id]
        coverage_pct = (coverage / total_sources) * 100
        status = "✅" if coverage_pct > 50 else "⚠️" if coverage_pct > 25 else "🔴"
        print(f"  {status} Class {class_id} ({class_name}): {coverage}/{total_sources} sources ({coverage_pct:.1f}%)")
    
    print(f"Files Saved:")
    print(f"  - Enhanced features: {features_save_path}")
    print(f"  - Enhanced splits: {splits_save_path}")

Processed 9 log sources with enhanced features

🔧 Enhanced Feature Variants Created:
  - bert_only: 768 features
  - bert_enhanced: 803 features
  - template_enhanced: 15 features
  - imbalance_aware_full: 891 features
  - sentence_focused: 797 features
  - selected_imbalanced: 200 features

Imbalance Analysis Summary:
  - Sources with extreme imbalance (>100:1): 5
  - Sources with high imbalance (>10:1): 4

Class Coverage Across Sources:
  ✅ Class 0 (normal): 9/9 sources (100.0%)
  ⚠️ Class 1 (security_anomaly): 4/9 sources (44.4%)
  ✅ Class 2 (system_failure): 7/9 sources (77.8%)
  ⚠️ Class 3 (performance_issue): 4/9 sources (44.4%)
  ✅ Class 4 (network_anomaly): 5/9 sources (55.6%)
  ✅ Class 5 (config_error): 6/9 sources (66.7%)
  ⚠️ Class 6 (hardware_issue): 4/9 sources (44.4%)
Files Saved:
  - Enhanced features: C:\Computer Science\AIMLDL\log-anomaly-detection\features\enhanced_imbalanced_features.pkl
  - Enhanced splits: C:\Computer Science\AIMLDL\log-anomaly-detection\features\e