Setup

In [2]:
import os
import sys
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import json
import pickle
from collections import Counter

import findspark
findspark.init()

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StandardScaler

import torch
from transformers import AutoTokenizer, AutoModel

import drain3
from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig
from drain3.masking import MaskingInstruction

In [3]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

PROJECT_ROOT = Path(r"C:\Computer Science\AIMLDL\log-anomaly-detection")
DATASET_PATH = PROJECT_ROOT / "dataset"
LABELED_DATA_PATH = DATASET_PATH / "labeled_data"
NORMALIZED_DATA_PATH = LABELED_DATA_PATH / "normalized"
FEATURES_PATH = PROJECT_ROOT / "features"
FEATURES_PATH.mkdir(parents=True, exist_ok=True)

os.environ['HADOOP_HOME'] = 'C:\\hadoop'
os.environ['PATH'] = f"{os.environ['HADOOP_HOME']}\\bin;{os.environ['PATH']}"

In [4]:
spark = SparkSession.builder \
    .master("local[*]") \
    .config("spark.driver.memory", "18g") \
    .config("spark.executor.memory", "16g") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.default.parallelism", "8") \
    .appName("MultiClassFeatureEngineering") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")

print(f"Spark version: {spark.version}")

Spark version: 3.4.1


In [5]:
PROJECT_CONFIG = {
    'bert_model_name': 'bert-base-uncased',
    'max_sequence_length': 512,
    'num_classes': 7,
    'label_map': {
        0: 'normal',
        1: 'security_anomaly',
        2: 'system_failure',
        3: 'performance_issue',
        4: 'network_anomaly',
        5: 'config_error',
        6: 'hardware_issue'
    },
    'log_sources': []
}

dataset_registry = {}

enhanced_files = list(NORMALIZED_DATA_PATH.glob("*_enhanced.csv"))
for file_path in enhanced_files:
    source_name = file_path.stem.replace('_enhanced', '')
    PROJECT_CONFIG['log_sources'].append(source_name)
    dataset_registry[source_name] = {
        'file_path': str(file_path),
        'log_type': source_name
    }

print(f"Loaded {len(dataset_registry)} log sources")
print(f"Label mapping: {PROJECT_CONFIG['label_map']}")

Loaded 9 log sources
Label mapping: {0: 'normal', 1: 'security_anomaly', 2: 'system_failure', 3: 'performance_issue', 4: 'network_anomaly', 5: 'config_error', 6: 'hardware_issue'}


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

tokenizer = AutoTokenizer.from_pretrained(PROJECT_CONFIG['bert_model_name'])
bert_model = AutoModel.from_pretrained(PROJECT_CONFIG['bert_model_name'])
bert_model.to(device)
bert_model.eval()

drain_configs = {
    'hdfs': {'sim_th': 0.5, 'depth': 4},
    'bgl': {'sim_th': 0.3, 'depth': 5},
    'hadoop': {'sim_th': 0.4, 'depth': 4},
    'apache': {'sim_th': 0.4, 'depth': 4},
    'default': {'sim_th': 0.4, 'depth': 4}
}

Using device: cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Temporal and Statistical Feature Extraction

In [7]:
pyspark_features_data = {}

for log_source in PROJECT_CONFIG['log_sources']:
    if log_source not in dataset_registry:
        continue
    
    print(f"\nProcessing: {log_source}")
    
    file_path = dataset_registry[log_source]['file_path']
    
    df_spark = spark.read.csv(file_path, header=True, inferSchema=True)
    
    print(f"Loaded {df_spark.count()} rows with {len(df_spark.columns)} columns")
    
    content_col = None
    for col in ['Content', 'content', 'Message', 'message', 'Text', 'text']:
        if col in df_spark.columns:
            content_col = col
            break
    
    if content_col is None:
        print(f"No content column found, skipping")
        continue
    
    if 'timestamp_dt' in df_spark.columns:
        df_spark = df_spark.withColumn('timestamp', F.col('timestamp_dt').cast(TimestampType()))
    elif 'timestamp_normalized' in df_spark.columns:
        df_spark = df_spark.withColumn('timestamp', F.to_timestamp('timestamp_normalized'))
    else:
        print("No timestamp column found, skipping")
        continue
    
    print("Extracting temporal features")
    
    df_spark = df_spark.withColumn('hour', F.hour('timestamp')) \
                       .withColumn('day_of_week', F.dayofweek('timestamp')) \
                       .withColumn('day_of_month', F.dayofmonth('timestamp')) \
                       .withColumn('month', F.month('timestamp')) \
                       .withColumn('is_weekend', F.when(F.dayofweek('timestamp').isin([1, 7]), 1).otherwise(0)) \
                       .withColumn('is_business_hours', F.when(F.hour('timestamp').between(9, 17), 1).otherwise(0)) \
                       .withColumn('is_night', F.when(F.hour('timestamp').between(0, 6), 1).otherwise(0))
    
    window_spec = Window.orderBy('timestamp')
    window_spec_1min = Window.orderBy(F.col('timestamp').cast('long')).rangeBetween(-60, 0)
    
    df_spark = df_spark.withColumn('log_index', F.monotonically_increasing_id())
    
    df_spark = df_spark.withColumn('prev_timestamp', F.lag('timestamp', 1).over(window_spec))
    df_spark = df_spark.withColumn('time_diff_seconds', 
                                    F.when(F.col('prev_timestamp').isNotNull(), 
                                           F.unix_timestamp('timestamp') - F.unix_timestamp('prev_timestamp'))
                                    .otherwise(0))
    
    df_spark = df_spark.withColumn('logs_last_minute', 
                                    F.count('*').over(window_spec_1min))
    
    print("Calculating statistical features")
    
    df_spark = df_spark.withColumn('content_length', F.length(F.col(content_col)))
    df_spark = df_spark.withColumn('word_count', F.size(F.split(F.col(content_col), ' ')))
    
    window_10 = Window.orderBy('timestamp').rowsBetween(-9, 0)
    
    df_spark = df_spark.withColumn('content_length_mean_10', F.avg('content_length').over(window_10))
    df_spark = df_spark.withColumn('content_length_std_10', F.stddev('content_length').over(window_10))
    df_spark = df_spark.withColumn('time_diff_mean_10', F.avg('time_diff_seconds').over(window_10))
    df_spark = df_spark.withColumn('time_diff_std_10', F.stddev('time_diff_seconds').over(window_10))
    
    hour_counts = df_spark.groupBy('hour').count().withColumnRenamed('count', 'hour_frequency')
    df_spark = df_spark.join(hour_counts, on='hour', how='left')
    
    if 'AnomalyLabel' in df_spark.columns:
        df_spark = df_spark.withColumn('AnomalyLabel', F.col('AnomalyLabel').cast(IntegerType()))
        df_spark = df_spark.withColumn('AnomalyLabel', 
                                        F.when(F.col('AnomalyLabel').isNull(), 0)
                                        .when(F.col('AnomalyLabel') < 0, 0)
                                        .when(F.col('AnomalyLabel') > 6, 0)
                                        .otherwise(F.col('AnomalyLabel')))
    
    df_spark.cache()
    
    total_count = df_spark.count()
    if 'AnomalyLabel' in df_spark.columns:
        label_dist = df_spark.groupBy('AnomalyLabel').count().orderBy('AnomalyLabel').collect()
        print(f"Total: {total_count:,}")
        print("Label distribution:")
        for row in label_dist:
            lbl = row['AnomalyLabel']
            cnt = row['count']
            lbl_name = PROJECT_CONFIG['label_map'].get(lbl, 'unknown')
            print(f"  {lbl} ({lbl_name}): {cnt:,} ({cnt/total_count*100:.2f}%)")
    
    max_samples = 5000
    if total_count > max_samples:
        sample_fraction = max_samples / total_count
        df_spark_sampled = df_spark.sample(withReplacement=False, fraction=sample_fraction, seed=RANDOM_SEED)
        print(f"Sampled {df_spark_sampled.count()} rows")
    else:
        df_spark_sampled = df_spark
    
    df_pandas = df_spark_sampled.toPandas()
    
    print(f"Converted to Pandas: {df_pandas.shape}")
    print()
    
    pyspark_features_data[log_source] = {
        'spark_df': df_spark,
        'pandas_df': df_pandas,
        'content_col': content_col,
        'total_count': total_count
    }


Processing: Apache_2k
Loaded 2000 rows with 21 columns
Extracting temporal features
Calculating statistical features
Total: 2,000
Label distribution:
  0 (normal): 1,405 (70.25%)
  1 (security_anomaly): 32 (1.60%)
  2 (system_failure): 563 (28.15%)
Converted to Pandas: (2000, 31)


Processing: BGL_2k
Loaded 2000 rows with 28 columns
Extracting temporal features
Calculating statistical features
Total: 2,000
Label distribution:
  0 (normal): 501 (25.05%)
  1 (security_anomaly): 150 (7.50%)
  2 (system_failure): 913 (45.65%)
  5 (config_error): 73 (3.65%)
  6 (hardware_issue): 363 (18.15%)
Converted to Pandas: (2000, 37)


Processing: Hadoop_2k
Loaded 2000 rows with 24 columns
Extracting temporal features
Calculating statistical features
Total: 2,000
Label distribution:
  0 (normal): 1,217 (60.85%)
  2 (system_failure): 156 (7.80%)
  3 (performance_issue): 1 (0.05%)
  4 (network_anomaly): 625 (31.25%)
  5 (config_error): 1 (0.05%)
Converted to Pandas: (2000, 34)


Processing: HDFS_2k
Loa

Drain Template Extraction

In [8]:
template_data = {}

for log_source, data_dict in pyspark_features_data.items():
    print(f"\nExtracting templates for {log_source}")
    
    df_pandas = data_dict['pandas_df']
    content_col = data_dict['content_col']
    
    source_config = drain_configs.get(log_source, drain_configs['default'])
    
    drain_config = TemplateMinerConfig()
    drain_config.drain_sim_th = source_config['sim_th']
    drain_config.drain_depth = source_config['depth']
    drain_config.drain_max_children = 100
    drain_config.masking_instructions = [
        MaskingInstruction(r'\d+', "<NUM>"),
        MaskingInstruction(r'[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}', "<UUID>"),
        MaskingInstruction(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b', "<IP>"),
        MaskingInstruction(r'/[^\s]*', "<PATH>")
    ]
    
    template_miner = TemplateMiner(config=drain_config)
    
    templates = {}
    template_ids = []
    template_class_dist = {}
    
    labels = df_pandas['AnomalyLabel'].values if 'AnomalyLabel' in df_pandas.columns else None
    
    for idx, content in enumerate(df_pandas[content_col].fillna("").astype(str)):
        if content.strip() == "":
            template_ids.append(-1)
            continue
        
        result = template_miner.add_log_message(content.strip())
        tid = result["cluster_id"]
        template_ids.append(tid)
        
        if tid not in templates:
            templates[tid] = {
                'template': result["template_mined"],
                'count': 1,
                'class_dist': [0] * 7
            }
        else:
            templates[tid]['count'] += 1
        
        if labels is not None:
            lbl = int(labels[idx])
            templates[tid]['class_dist'][lbl] += 1
    
    print(f"Extracted {len(templates)} unique templates")
    
    template_counts = Counter(template_ids)
    total = len(template_ids)
    
    enhanced_template_features = []
    for idx, tid in enumerate(template_ids):
        if tid == -1:
            enhanced_template_features.append([0, 0, 0, 0] + [0]*7)
            continue
        
        frequency = template_counts[tid] / total
        rarity = 1.0 / (frequency + 1e-6)
        template_text = templates[tid]['template']
        length = len(template_text.split())
        n_wildcards = sum([template_text.count(w) for w in ['<NUM>', '<IP>', '<PATH>', '<UUID>']])
        
        class_probs = np.array(templates[tid]['class_dist']) / (templates[tid]['count'] + 1e-6)
        
        enhanced_template_features.append([rarity, length, n_wildcards, frequency] + class_probs.tolist())
    
    template_data[log_source] = {
        'templates': templates,
        'template_ids': template_ids,
        'enhanced_features': np.array(enhanced_template_features)
    }
    
    print(f"Enhanced template features shape: {template_data[log_source]['enhanced_features'].shape}")


Extracting templates for Apache_2k
Extracted 6 unique templates
Enhanced template features shape: (2000, 11)

Extracting templates for BGL_2k
Extracted 105 unique templates
Enhanced template features shape: (2000, 11)

Extracting templates for Hadoop_2k
Extracted 102 unique templates
Enhanced template features shape: (2000, 11)

Extracting templates for HDFS_2k
Extracted 16 unique templates
Enhanced template features shape: (2000, 11)

Extracting templates for HPC_2k
Extracted 45 unique templates
Enhanced template features shape: (2000, 11)

Extracting templates for Linux_2k
Extracted 110 unique templates
Enhanced template features shape: (2000, 11)

Extracting templates for OpenSSH_2k
Extracted 23 unique templates
Enhanced template features shape: (2000, 11)

Extracting templates for Proxifier_2k
Extracted 403 unique templates
Enhanced template features shape: (2000, 11)

Extracting templates for Zookeeper_2k
Extracted 46 unique templates
Enhanced template features shape: (2000, 11)


Bert Embeddings and Statistical Features

In [9]:
bert_features_data = {}

for log_source, data_dict in pyspark_features_data.items():
    print(f"\n--- Generating BERT embeddings for {log_source} ---")
    
    df_pandas = data_dict['pandas_df']
    content_col = data_dict['content_col']
    
    texts = df_pandas[content_col].fillna("").astype(str).tolist()
    
    print(f"Processing {len(texts)} texts...")
    all_embeddings = []
    batch_size = 16
    
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            
            encoded = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            ).to(device)
            
            outputs = bert_model(**encoded)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            all_embeddings.append(cls_embeddings.cpu().numpy())
            
            if (i // batch_size) % 10 == 0:
                print(f"  Processed {i}/{len(texts)} texts")
    
    bert_embeddings = np.vstack(all_embeddings)
    print(f"BERT embeddings shape: {bert_embeddings.shape}")
    
    window_size = 10
    statistical_features = []
    
    for i in range(len(bert_embeddings)):
        start = max(0, i - window_size)
        window = bert_embeddings[start:i+1]
        
        mean_emb = np.mean(window, axis=0)
        std_emb = np.std(window, axis=0)
        distance_from_mean = np.linalg.norm(bert_embeddings[i] - mean_emb)
        avg_std = np.mean(std_emb)
        
        if len(window) > 1:
            distances = [np.linalg.norm(bert_embeddings[i] - w) for w in window]
            min_dist = np.min(distances)
            max_dist = np.max(distances)
        else:
            min_dist = max_dist = 0
        
        statistical_features.append([distance_from_mean, avg_std, min_dist, max_dist])
    
    statistical_features = np.array(statistical_features)
    print(f"Statistical features shape: {statistical_features.shape}")
    
    bert_features_data[log_source] = {
        'embeddings': bert_embeddings,
        'statistical_features': statistical_features
    }


--- Generating BERT embeddings for Apache_2k ---
Processing 2000 texts...
  Processed 0/2000 texts
  Processed 160/2000 texts
  Processed 320/2000 texts
  Processed 480/2000 texts
  Processed 640/2000 texts
  Processed 800/2000 texts
  Processed 960/2000 texts
  Processed 1120/2000 texts
  Processed 1280/2000 texts
  Processed 1440/2000 texts
  Processed 1600/2000 texts
  Processed 1760/2000 texts
  Processed 1920/2000 texts
BERT embeddings shape: (2000, 768)
Statistical features shape: (2000, 4)

--- Generating BERT embeddings for BGL_2k ---
Processing 2000 texts...
  Processed 0/2000 texts
  Processed 160/2000 texts
  Processed 320/2000 texts
  Processed 480/2000 texts
  Processed 640/2000 texts
  Processed 800/2000 texts
  Processed 960/2000 texts
  Processed 1120/2000 texts
  Processed 1280/2000 texts
  Processed 1440/2000 texts
  Processed 1600/2000 texts
  Processed 1760/2000 texts
  Processed 1920/2000 texts
BERT embeddings shape: (2000, 768)
Statistical features shape: (2000, 

Hybrid Feature

In [10]:
hybrid_features_data = {}

for log_source in pyspark_features_data.keys():
    if log_source not in bert_features_data or log_source not in template_data:
        continue
    
    print(f"\n--- Assembling features for {log_source} ---")
    
    df_pandas = pyspark_features_data[log_source]['pandas_df']
    bert_emb = bert_features_data[log_source]['embeddings']
    stat_features = bert_features_data[log_source]['statistical_features']
    template_features = template_data[log_source]['enhanced_features']
    
    temporal_cols = ['hour', 'day_of_week', 'is_weekend', 'is_business_hours', 
                     'time_diff_seconds', 'logs_last_minute', 'is_night']
    
    statistical_cols = ['content_length', 'word_count', 'content_length_mean_10', 
                       'content_length_std_10', 'time_diff_mean_10', 'time_diff_std_10',
                       'hour_frequency']
    
    available_temporal = [c for c in temporal_cols if c in df_pandas.columns]
    available_statistical = [c for c in statistical_cols if c in df_pandas.columns]
    
    temporal_features = df_pandas[available_temporal].fillna(0).values if available_temporal else None
    pyspark_statistical = df_pandas[available_statistical].fillna(0).values if available_statistical else None
    
    unique_templates = sorted(set(tid for tid in template_data[log_source]['template_ids'] if tid != -1))
    template_onehot = np.zeros((len(template_data[log_source]['template_ids']), len(unique_templates)))
    
    for i, tid in enumerate(template_data[log_source]['template_ids']):
        if tid != -1 and tid in unique_templates:
            idx = unique_templates.index(tid)
            template_onehot[i, idx] = 1.0
    
    print(f"Feature dimensions:")
    print(f"  BERT embeddings: {bert_emb.shape}")
    print(f"  BERT statistical: {stat_features.shape}")
    print(f"  Template enhanced: {template_features.shape}")
    print(f"  Template one-hot: {template_onehot.shape}")
    if temporal_features is not None:
        print(f"  Temporal: {temporal_features.shape}")
    if pyspark_statistical is not None:
        print(f"  PySpark statistical: {pyspark_statistical.shape}")
    
    feature_variants = {}
    
    feature_variants['bert_only'] = bert_emb
    
    feature_variants['bert_statistical'] = np.hstack([bert_emb, stat_features])
    
    feature_variants['bert_template_enhanced'] = np.hstack([bert_emb, template_features])
    
    feature_variants['bert_statistical_template'] = np.hstack([bert_emb, stat_features, template_features])
    
    if temporal_features is not None:
        feature_variants['bert_statistical_template_temporal'] = np.hstack([
            bert_emb, stat_features, template_features, temporal_features
        ])
    
    all_feature_components = [bert_emb, stat_features, template_features]
    if temporal_features is not None:
        all_feature_components.append(temporal_features)
    if pyspark_statistical is not None:
        all_feature_components.append(pyspark_statistical)
    
    feature_variants['all_features'] = np.hstack(all_feature_components)
    
    labels = df_pandas['AnomalyLabel'].values if 'AnomalyLabel' in df_pandas.columns else None
    
    hybrid_features_data[log_source] = {
        'feature_variants': feature_variants,
        'labels': labels,
        'texts': df_pandas[pyspark_features_data[log_source]['content_col']].tolist()
    }
    
    print(f"Created {len(feature_variants)} feature variants")
    if labels is not None:
        unique, counts = np.unique(labels, return_counts=True)
        print(f"Label distribution:")
        for lbl, cnt in zip(unique, counts):
            lbl_name = PROJECT_CONFIG['label_map'].get(int(lbl), 'unknown')
            print(f"  {int(lbl)} ({lbl_name}): {cnt} ({cnt/len(labels)*100:.2f}%)")


--- Assembling features for Apache_2k ---
Feature dimensions:
  BERT embeddings: (2000, 768)
  BERT statistical: (2000, 4)
  Template enhanced: (2000, 11)
  Template one-hot: (2000, 6)
  Temporal: (2000, 7)
  PySpark statistical: (2000, 7)
Created 6 feature variants
Label distribution:
  0 (normal): 1405 (70.25%)
  1 (security_anomaly): 32 (1.60%)
  2 (system_failure): 563 (28.15%)

--- Assembling features for BGL_2k ---
Feature dimensions:
  BERT embeddings: (2000, 768)
  BERT statistical: (2000, 4)
  Template enhanced: (2000, 11)
  Template one-hot: (2000, 105)
  Temporal: (2000, 7)
  PySpark statistical: (2000, 7)
Created 6 feature variants
Label distribution:
  0 (normal): 501 (25.05%)
  1 (security_anomaly): 150 (7.50%)
  2 (system_failure): 913 (45.65%)
  5 (config_error): 73 (3.65%)
  6 (hardware_issue): 363 (18.15%)

--- Assembling features for Hadoop_2k ---
Feature dimensions:
  BERT embeddings: (2000, 768)
  BERT statistical: (2000, 4)
  Template enhanced: (2000, 11)
  Templ

In [11]:
features_save_path = FEATURES_PATH / "multiclass_hybrid_features.pkl"
with open(features_save_path, 'wb') as f:
    pickle.dump({
        'hybrid_features_data': hybrid_features_data,
        'template_data': template_data,
        'feature_types': list(hybrid_features_data[list(hybrid_features_data.keys())[0]]['feature_variants'].keys()),
        'config': PROJECT_CONFIG,
        'timestamp': datetime.now().isoformat()
    }, f)

print(f"\nSaved: {features_save_path}")


Saved: C:\Computer Science\AIMLDL\log-anomaly-detection\features\multiclass_hybrid_features.pkl


In [12]:
cross_source_splits = []

for test_source in hybrid_features_data.keys():
    train_sources = [s for s in hybrid_features_data.keys() if s != test_source]
    
    if hybrid_features_data[test_source]['labels'] is None:
        continue
    
    test_samples = len(hybrid_features_data[test_source]['labels'])
    train_samples = sum(len(hybrid_features_data[s]['labels']) 
                       for s in train_sources 
                       if hybrid_features_data[s]['labels'] is not None)
    
    cross_source_splits.append({
        'test_source': test_source,
        'train_sources': train_sources,
        'test_samples': test_samples,
        'train_samples': train_samples
    })

splits_save_path = FEATURES_PATH / "multiclass_cross_source_splits.pkl"
with open(splits_save_path, 'wb') as f:
    pickle.dump({'splits': cross_source_splits}, f)

print(f"Saved: {splits_save_path}")

Saved: C:\Computer Science\AIMLDL\log-anomaly-detection\features\multiclass_cross_source_splits.pkl


In [13]:
for ft in list(hybrid_features_data[list(hybrid_features_data.keys())[0]]['feature_variants'].keys()):
    shape = hybrid_features_data[list(hybrid_features_data.keys())[0]]['feature_variants'][ft].shape
    print(f"  - {ft}: {shape[1]} features")

  - bert_only: 768 features
  - bert_statistical: 772 features
  - bert_template_enhanced: 779 features
  - bert_statistical_template: 783 features
  - bert_statistical_template_temporal: 790 features
  - all_features: 797 features
