Setup

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Function
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import f1_score, matthews_corrcoef, roc_auc_score, precision_recall_curve, auc, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from transformers import AdamW
import matplotlib.pyplot as plt

In [24]:
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)

ROOT = Path(r"C:\Computer Science\AIMLDL\log-anomaly-detection")
FEAT_PATH = ROOT / "features"
RES_PATH = ROOT / "results" / "cross_source_transfer" / "advanced"
RES_PATH.mkdir(parents=True, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

with open(FEAT_PATH / "hybrid_features.pkl", 'rb') as f:
    feat_data = pickle.load(f)['hybrid_features_data']

with open(FEAT_PATH / "cross_source_splits.pkl", 'rb') as f:
    splits = pickle.load(f)['splits']

cfg = {
    'hidden': 768,
    'batch': 32,
    'lr': 1e-3,
    'epochs': 15,
    'dropout': 0.3,
    'patience': 3
}

print(f"Loaded {len(feat_data)} sources")

Device: cuda
Loaded 6 sources


Data Preparation

In [26]:
exp = splits[0]
test_src = exp['test_source']
train_srcs = exp['train_sources']

test_X = feat_data[test_src]['bert_only']
test_y = feat_data[test_src]['labels']

train_X_list = []
train_y_list = []

for src in train_srcs:
    if feat_data[src]['labels'] is not None:
        train_X_list.append(feat_data[src]['bert_only'])
        train_y_list.append(feat_data[src]['labels'])

train_X = np.vstack(train_X_list)
train_y = np.concatenate(train_y_list)

print(f"Test: {test_src} ({len(test_y)} samples)")
print(f"Train: {len(train_srcs)} sources ({len(train_y)} samples)")

Test: Apache (2000 samples)
Train: 5 sources (10000 samples)


In [27]:
train_counts = np.bincount(train_y)
test_counts = np.bincount(test_y)
print(f"\nTrain class distribution: Normal={train_counts[0]}, Anomaly={train_counts[1]}")
print(f"Test class distribution: Normal={test_counts[0]}, Anomaly={test_counts[1]}")
print(f"Train anomaly rate: {np.mean(train_y):.3f}")
print(f"Test anomaly rate: {np.mean(test_y):.3f}")


Train class distribution: Normal=5707, Anomaly=4293
Test class distribution: Normal=1405, Anomaly=595
Train anomaly rate: 0.429
Test anomaly rate: 0.297


In [29]:
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)
test_X_scaled = scaler.transform(test_X)

class_weights = compute_class_weight('balanced', classes=np.unique(train_y), y=train_y)
class_weights = torch.FloatTensor(class_weights).to(device)
print(f"Class weights: {class_weights.cpu().numpy()}")

Class weights: [0.87611705 1.1646867 ]


In [30]:
train_X_t = torch.FloatTensor(train_X_scaled)
train_y_t = torch.LongTensor(train_y)
test_X_t = torch.FloatTensor(test_X_scaled)
test_y_t = torch.LongTensor(test_y)

train_ds = TensorDataset(train_X_t, train_y_t)
test_ds = TensorDataset(test_X_t, test_y_t)
train_dl = DataLoader(train_ds, batch_size=cfg['batch'], shuffle=True)
test_dl = DataLoader(test_ds, batch_size=cfg['batch'])

Bert Classifier

In [31]:
class BertCls(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(cfg['hidden'], 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(cfg['dropout']),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(cfg['dropout']),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(cfg['dropout']),
            nn.Linear(64, 2)
        )
    
    def forward(self, x):
        return self.net(x)

model = BertCls().to(device)
optimizer = AdamW(model.parameters(), lr=cfg['lr'])
criterion = nn.CrossEntropyLoss(weight=class_weights)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2, verbose=True)

In [32]:
best_f1 = 0
patience_counter = 0

for epoch in range(cfg['epochs']):
    # Training
    model.train()
    total_loss = 0
    
    for X_batch, y_batch in train_dl:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        out = model(X_batch)
        loss = criterion(out, y_batch)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_dl)
    
    # Evaluation
    model.eval()
    val_preds = []
    val_probs = []
    val_labels = []
    
    with torch.no_grad():
        for X_batch, y_batch in test_dl:
            X_batch = X_batch.to(device)
            out = model(X_batch)
            prob = F.softmax(out, dim=1)
            pred = torch.argmax(out, dim=1)
            
            val_preds.extend(pred.cpu().numpy())
            val_probs.extend(prob[:, 1].cpu().numpy())
            val_labels.extend(y_batch.numpy())
    
    val_f1 = f1_score(val_labels, val_preds, zero_division=0)
    val_mcc = matthews_corrcoef(val_labels, val_preds)
    
    print(f"Epoch {epoch+1:2d}: Loss={avg_loss:.4f} | Val F1={val_f1:.3f} | Val MCC={val_mcc:.3f}")
    
    # Learning rate scheduling
    scheduler.step(val_f1)
    
    # Early stopping
    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(model.state_dict(), RES_PATH / 'best_bert_cls.pt')
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= cfg['patience']:
            print(f"Early stopping at epoch {epoch+1}")
            break

# Load best model and evaluate
model.load_state_dict(torch.load(RES_PATH / 'best_bert_cls.pt'))
model.eval()

Epoch  1: Loss=0.0817 | Val F1=0.818 | Val MCC=0.750
Epoch  2: Loss=0.0296 | Val F1=0.661 | Val MCC=0.518
Epoch  3: Loss=0.0171 | Val F1=0.816 | Val MCC=0.747
Epoch  4: Loss=0.0161 | Val F1=0.555 | Val MCC=0.350
Early stopping at epoch 4


BertCls(
  (net): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=128, out_features=64, bias=True)
    (9): ReLU()
    (10): Dropout(p=0.3, inplace=False)
    (11): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [33]:
preds = []
probs = []
labels = []

with torch.no_grad():
    for X_batch, y_batch in test_dl:
        X_batch = X_batch.to(device)
        out = model(X_batch)
        prob = F.softmax(out, dim=1)
        pred = torch.argmax(out, dim=1)
        
        preds.extend(pred.cpu().numpy())
        probs.extend(prob[:, 1].cpu().numpy())
        labels.extend(y_batch.numpy())

y_true = np.array(labels)
y_pred = np.array(preds)
y_prob = np.array(probs)

# Calculate metrics
f1 = f1_score(y_true, y_pred, zero_division=0)
mcc = matthews_corrcoef(y_true, y_pred)
acc = accuracy_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
sens = tp / (tp + fn) if (tp + fn) > 0 else 0
spec = tn / (tn + fp) if (tn + fp) > 0 else 0
bal_acc = (sens + spec) / 2

try:
    roc = roc_auc_score(y_true, y_prob)
    p, r, _ = precision_recall_curve(y_true, y_prob)
    pr_auc = auc(r, p)
except:
    roc = 0
    pr_auc = 0

print(f"BERT Classifier Final Results:")
print(f"F1: {f1:.3f} | PR-AUC: {pr_auc:.3f} | MCC: {mcc:.3f} | ROC-AUC: {roc:.3f}")
print(f"Accuracy: {acc:.3f} | Balanced Accuracy: {bal_acc:.3f}")
print(f"Sensitivity: {sens:.3f} | Specificity: {spec:.3f}")

bert_results = {
    'f1': f1, 'pr_auc': pr_auc, 'mcc': mcc, 'roc': roc,
    'acc': acc, 'bal_acc': bal_acc, 'sens': sens, 'spec': spec,
    'preds': y_pred, 'probs': y_prob
}

BERT Classifier Final Results:
F1: 0.818 | PR-AUC: 0.999 | MCC: 0.750 | ROC-AUC: 1.000
Accuracy: 0.868 | Balanced Accuracy: 0.906
Sensitivity: 1.000 | Specificity: 0.812


DANN-BERT (Domain Adversarial)

In [34]:
class GradReverse(Function):
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        return x.view_as(x)
    
    @staticmethod
    def backward(ctx, grad):
        return grad.neg() * ctx.alpha, None

class DANN(nn.Module):
    def __init__(self, n_domains):
        super().__init__()
        
        # Feature extractor
        self.feat = nn.Sequential(
            nn.Linear(cfg['hidden'], 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(cfg['dropout']),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU()
        )
        
        # Anomaly classifier
        self.anomaly = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(cfg['dropout']),
            nn.Linear(64, 2)
        )
        
        # Domain classifier
        self.domain = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(cfg['dropout']),
            nn.Linear(64, n_domains)
        )
    
    def forward(self, x, alpha=1.0):
        feat = self.feat(x)
        anomaly_out = self.anomaly(feat)
        
        # Reverse gradient for domain classifier
        reversed_feat = GradReverse.apply(feat, alpha)
        domain_out = self.domain(reversed_feat)
        
        return anomaly_out, domain_out

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Function
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import f1_score, matthews_corrcoef, roc_auc_score, precision_recall_curve, auc, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from transformers import AdamW
import matplotlib.pyplot as plt

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)

ROOT = Path(r"C:\Computer Science\AIMLDL\log-anomaly-detection")
FEAT_PATH = ROOT / "features"
RES_PATH = ROOT / "results" / "cross_source_transfer" / "advanced"
RES_PATH.mkdir(parents=True, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# Load data
with open(FEAT_PATH / "hybrid_features.pkl", 'rb') as f:
    feat_data = pickle.load(f)['hybrid_features_data']

with open(FEAT_PATH / "cross_source_splits.pkl", 'rb') as f:
    splits = pickle.load(f)['splits']

# Updated config
cfg = {
    'hidden': 768,
    'batch': 32,
    'lr': 5e-4,  # Lower learning rate for stability
    'epochs': 20,
    'dropout': 0.4,  # More dropout to prevent overfitting
    'patience': 5  # More patience
}

print(f"Loaded {len(feat_data)} sources")

# ============================================================================
# DATA PREPARATION - ADD VALIDATION SPLIT
# ============================================================================

exp = splits[0]
test_src = exp['test_source']
train_srcs = exp['train_sources']

test_X = feat_data[test_src]['bert_only']
test_y = feat_data[test_src]['labels']

train_X_list = []
train_y_list = []

for src in train_srcs:
    if feat_data[src]['labels'] is not None:
        train_X_list.append(feat_data[src]['bert_only'])
        train_y_list.append(feat_data[src]['labels'])

train_X = np.vstack(train_X_list)
train_y = np.concatenate(train_y_list)

print(f"\nTest: {test_src} ({len(test_y)} samples)")
print(f"Train: {len(train_srcs)} sources ({len(train_y)} samples)")

# Create train/validation split (80/20)
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(
    train_X, train_y, test_size=0.2, random_state=SEED, stratify=train_y
)

print(f"After split: Train={len(train_y)}, Val={len(val_y)}, Test={len(test_y)}")

# Check class distribution
train_counts = np.bincount(train_y)
val_counts = np.bincount(val_y)
test_counts = np.bincount(test_y)
print(f"\nTrain class distribution: Normal={train_counts[0]}, Anomaly={train_counts[1]}")
print(f"Val class distribution: Normal={val_counts[0]}, Anomaly={val_counts[1]}")
print(f"Test class distribution: Normal={test_counts[0]}, Anomaly={test_counts[1]}")
print(f"Train anomaly rate: {np.mean(train_y):.3f}")
print(f"Val anomaly rate: {np.mean(val_y):.3f}")
print(f"Test anomaly rate: {np.mean(test_y):.3f}")

# Normalize features
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)
val_X_scaled = scaler.transform(val_X)
test_X_scaled = scaler.transform(test_X)

# Compute class weights for imbalanced data
class_weights = compute_class_weight('balanced', classes=np.unique(train_y), y=train_y)
class_weights = torch.FloatTensor(class_weights).to(device)
print(f"\nClass weights: {class_weights.cpu().numpy()}")

# Convert to tensors
train_X_t = torch.FloatTensor(train_X_scaled)
train_y_t = torch.LongTensor(train_y)
val_X_t = torch.FloatTensor(val_X_scaled)
val_y_t = torch.LongTensor(val_y)
test_X_t = torch.FloatTensor(test_X_scaled)
test_y_t = torch.LongTensor(test_y)

train_ds = TensorDataset(train_X_t, train_y_t)
val_ds = TensorDataset(val_X_t, val_y_t)
test_ds = TensorDataset(test_X_t, test_y_t)
train_dl = DataLoader(train_ds, batch_size=cfg['batch'], shuffle=True)
val_dl = DataLoader(val_ds, batch_size=cfg['batch'])
test_dl = DataLoader(test_ds, batch_size=cfg['batch'])

# ============================================================================
# MODEL 1: BERT CLASSIFIER
# ============================================================================

print("\n" + "="*80)
print("TRAINING BERT CLASSIFIER")
print("="*80)

class BertCls(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(cfg['hidden'], 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(cfg['dropout']),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(cfg['dropout']),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(cfg['dropout']),
            nn.Linear(64, 2)
        )
    
    def forward(self, x):
        return self.net(x)

model = BertCls().to(device)
optimizer = AdamW(model.parameters(), lr=cfg['lr'])
criterion = nn.CrossEntropyLoss(weight=class_weights)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2, verbose=True)

best_f1 = 0
patience_counter = 0

for epoch in range(cfg['epochs']):
    # Training
    model.train()
    total_loss = 0
    
    for X_batch, y_batch in train_dl:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        out = model(X_batch)
        loss = criterion(out, y_batch)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_dl)
    
    # Evaluation on validation set
    model.eval()
    val_preds = []
    val_probs = []
    val_labels = []
    
    with torch.no_grad():
        for X_batch, y_batch in val_dl:
            X_batch = X_batch.to(device)
            out = model(X_batch)
            prob = F.softmax(out, dim=1)
            pred = torch.argmax(out, dim=1)
            
            val_preds.extend(pred.cpu().numpy())
            val_probs.extend(prob[:, 1].cpu().numpy())
            val_labels.extend(y_batch.numpy())
    
    val_f1 = f1_score(val_labels, val_preds, zero_division=0)
    val_mcc = matthews_corrcoef(val_labels, val_preds)
    
    print(f"Epoch {epoch+1:2d}: Loss={avg_loss:.4f} | Val F1={val_f1:.3f} | Val MCC={val_mcc:.3f}")
    
    # Learning rate scheduling
    scheduler.step(val_f1)
    
    # Early stopping
    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(model.state_dict(), RES_PATH / 'best_bert_cls.pt')
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= cfg['patience']:
            print(f"Early stopping at epoch {epoch+1}")
            break

# Load best model and evaluate
model.load_state_dict(torch.load(RES_PATH / 'best_bert_cls.pt'))
model.eval()

preds = []
probs = []
labels = []

with torch.no_grad():
    for X_batch, y_batch in test_dl:
        X_batch = X_batch.to(device)
        out = model(X_batch)
        prob = F.softmax(out, dim=1)
        pred = torch.argmax(out, dim=1)
        
        preds.extend(pred.cpu().numpy())
        probs.extend(prob[:, 1].cpu().numpy())
        labels.extend(y_batch.numpy())

y_true = np.array(labels)
y_pred = np.array(preds)
y_prob = np.array(probs)

# Calculate metrics
f1 = f1_score(y_true, y_pred, zero_division=0)
mcc = matthews_corrcoef(y_true, y_pred)
acc = accuracy_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
sens = tp / (tp + fn) if (tp + fn) > 0 else 0
spec = tn / (tn + fp) if (tn + fp) > 0 else 0
bal_acc = (sens + spec) / 2

try:
    roc = roc_auc_score(y_true, y_prob)
    p, r, _ = precision_recall_curve(y_true, y_prob)
    pr_auc = auc(r, p)
except:
    roc = 0
    pr_auc = 0

print(f"\nBERT Classifier Final Results:")
print(f"F1: {f1:.3f} | PR-AUC: {pr_auc:.3f} | MCC: {mcc:.3f} | ROC-AUC: {roc:.3f}")
print(f"Accuracy: {acc:.3f} | Balanced Accuracy: {bal_acc:.3f}")
print(f"Sensitivity: {sens:.3f} | Specificity: {spec:.3f}")

bert_results = {
    'f1': f1, 'pr_auc': pr_auc, 'mcc': mcc, 'roc': roc,
    'acc': acc, 'bal_acc': bal_acc, 'sens': sens, 'spec': spec,
    'preds': y_pred, 'probs': y_prob
}

# ============================================================================
# MODEL 2: DANN-BERT (Domain Adversarial)
# ============================================================================

print("\n" + "="*80)
print("TRAINING DANN-BERT")
print("="*80)

class GradReverse(Function):
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        return x.view_as(x)
    
    @staticmethod
    def backward(ctx, grad):
        return grad.neg() * ctx.alpha, None

class DANN(nn.Module):
    def __init__(self, n_domains):
        super().__init__()
        
        # Feature extractor
        self.feat = nn.Sequential(
            nn.Linear(cfg['hidden'], 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(cfg['dropout']),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU()
        )
        
        # Anomaly classifier
        self.anomaly = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(cfg['dropout']),
            nn.Linear(64, 2)
        )
        
        # Domain classifier
        self.domain = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(cfg['dropout']),
            nn.Linear(64, n_domains)
        )
    
    def forward(self, x, alpha=1.0):
        feat = self.feat(x)
        anomaly_out = self.anomaly(feat)
        
        # Reverse gradient for domain classifier
        reversed_feat = GradReverse.apply(feat, alpha)
        domain_out = self.domain(reversed_feat)
        
        return anomaly_out, domain_out

# Prepare domain labels - use validation split
from sklearn.model_selection import train_test_split as tts

domain_map = {src: i for i, src in enumerate(train_srcs)}
train_domains_full = []
train_X_dann_list = []
train_y_dann_list = []

for src in train_srcs:
    if feat_data[src]['labels'] is not None:
        n_samples = len(feat_data[src]['labels'])
        train_domains_full.extend([domain_map[src]] * n_samples)

# Split with domains preserved
combined_indices = np.arange(len(train_X))
train_idx, val_idx = tts(
    combined_indices, test_size=0.2, random_state=SEED, stratify=train_y
)

# Get validation data for DANN
val_X_dann = train_X[val_idx]
val_y_dann = train_y[val_idx]
train_X_dann = train_X[train_idx]
train_y_dann = train_y[train_idx]
train_domains = [train_domains_full[i] for i in train_idx]

# Scale
train_X_dann_scaled = scaler.transform(train_X_dann)
val_X_dann_scaled = scaler.transform(val_X_dann)

train_X_dann_t = torch.FloatTensor(train_X_dann_scaled)
train_y_dann_t = torch.LongTensor(train_y_dann)
val_X_dann_t = torch.FloatTensor(val_X_dann_scaled)
val_y_dann_t = torch.LongTensor(val_y_dann)
train_dom_t = torch.LongTensor(train_domains)

train_ds_dann = TensorDataset(train_X_dann_t, train_y_dann_t, train_dom_t)
val_ds_dann = TensorDataset(val_X_dann_t, val_y_dann_t)
train_dl_dann = DataLoader(train_ds_dann, batch_size=cfg['batch'], shuffle=True)
val_dl_dann = DataLoader(val_ds_dann, batch_size=cfg['batch'])

# Initialize DANN
dann = DANN(len(domain_map)).to(device)
opt = AdamW(dann.parameters(), lr=cfg['lr'])
crit_anom = nn.CrossEntropyLoss(weight=class_weights)
crit_dom = nn.CrossEntropyLoss()
scheduler_dann = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='max', factor=0.5, patience=2, verbose=True)

best_f1_dann = 0
patience_counter_dann = 0

for epoch in range(cfg['epochs']):
    # Training
    dann.train()
    total_anom_loss = 0
    total_dom_loss = 0
    
    # Dynamic lambda (standard DANN schedule: 0 to 1)
    p = float(epoch) / float(cfg['epochs'])
    alpha = 2. / (1. + np.exp(-10. * p)) - 1.
    
    for X_batch, y_batch, d_batch in train_dl_dann:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        d_batch = d_batch.to(device)
        
        opt.zero_grad()
        
        # Forward
        anom_out, dom_out = dann(X_batch, alpha)
        
        # Losses
        anom_loss = crit_anom(anom_out, y_batch)
        dom_loss = crit_dom(dom_out, d_batch)
        total_loss = anom_loss + dom_loss
        
        total_loss.backward()
        opt.step()
        
        total_anom_loss += anom_loss.item()
        total_dom_loss += dom_loss.item()
    
    avg_anom = total_anom_loss / len(train_dl_dann)
    avg_dom = total_dom_loss / len(train_dl_dann)
    
    # Evaluation on validation set
    dann.eval()
    val_preds_dann = []
    val_probs_dann = []
    val_labels_dann = []
    
    with torch.no_grad():
        for X_batch, y_batch in val_dl_dann:
            X_batch = X_batch.to(device)
            anom_out, _ = dann(X_batch)
            prob = F.softmax(anom_out, dim=1)
            pred = torch.argmax(anom_out, dim=1)
            
            val_preds_dann.extend(pred.cpu().numpy())
            val_probs_dann.extend(prob[:, 1].cpu().numpy())
            val_labels_dann.extend(y_batch.numpy())
    
    val_f1_dann = f1_score(val_labels_dann, val_preds_dann, zero_division=0)
    val_mcc_dann = matthews_corrcoef(val_labels_dann, val_preds_dann)
    
    print(f"Epoch {epoch+1:2d}: Anom={avg_anom:.4f} | Dom={avg_dom:.4f} | Alpha={alpha:.3f} | Val F1={val_f1_dann:.3f} | Val MCC={val_mcc_dann:.3f}")
    
    # Learning rate scheduling
    scheduler_dann.step(val_f1_dann)
    
    # Early stopping
    if val_f1_dann > best_f1_dann:
        best_f1_dann = val_f1_dann
        torch.save(dann.state_dict(), RES_PATH / 'best_dann.pt')
        patience_counter_dann = 0
    else:
        patience_counter_dann += 1
        if patience_counter_dann >= cfg['patience']:
            print(f"Early stopping at epoch {epoch+1}")
            break

# Load best model and evaluate
dann.load_state_dict(torch.load(RES_PATH / 'best_dann.pt'))
dann.eval()

preds_dann = []
probs_dann = []
labels_dann = []

with torch.no_grad():
    for X_batch, y_batch in test_dl:
        X_batch = X_batch.to(device)
        anom_out, _ = dann(X_batch)
        prob = F.softmax(anom_out, dim=1)
        pred = torch.argmax(anom_out, dim=1)
        
        preds_dann.extend(pred.cpu().numpy())
        probs_dann.extend(prob[:, 1].cpu().numpy())
        labels_dann.extend(y_batch.numpy())

y_pred_dann = np.array(preds_dann)
y_prob_dann = np.array(probs_dann)

# Calculate metrics
f1_dann = f1_score(y_true, y_pred_dann, zero_division=0)
mcc_dann = matthews_corrcoef(y_true, y_pred_dann)
acc_dann = accuracy_score(y_true, y_pred_dann)
cm_dann = confusion_matrix(y_true, y_pred_dann)
tn_d, fp_d, fn_d, tp_d = cm_dann.ravel()
sens_dann = tp_d / (tp_d + fn_d) if (tp_d + fn_d) > 0 else 0
spec_dann = tn_d / (tn_d + fp_d) if (tn_d + fp_d) > 0 else 0
bal_acc_dann = (sens_dann + spec_dann) / 2

try:
    roc_dann = roc_auc_score(y_true, y_prob_dann)
    p_d, r_d, _ = precision_recall_curve(y_true, y_prob_dann)
    pr_auc_dann = auc(r_d, p_d)
except:
    roc_dann = 0
    pr_auc_dann = 0

print(f"\nDANN-BERT Final Results:")
print(f"F1: {f1_dann:.3f} | PR-AUC: {pr_auc_dann:.3f} | MCC: {mcc_dann:.3f} | ROC-AUC: {roc_dann:.3f}")
print(f"Accuracy: {acc_dann:.3f} | Balanced Accuracy: {bal_acc_dann:.3f}")
print(f"Sensitivity: {sens_dann:.3f} | Specificity: {spec_dann:.3f}")

dann_results = {
    'f1': f1_dann, 'pr_auc': pr_auc_dann, 'mcc': mcc_dann, 'roc': roc_dann,
    'acc': acc_dann, 'bal_acc': bal_acc_dann, 'sens': sens_dann, 'spec': spec_dann,
    'preds': y_pred_dann, 'probs': y_prob_dann
}

# ============================================================================
# MODEL 3: HYBRID CLASSIFIER (Template + BERT)
# ============================================================================

print("\n" + "="*80)
print("TRAINING HYBRID CLASSIFIER")
print("="*80)

test_X_hyb = feat_data[test_src]['hybrid_variants']['bert_embedding_concat']

train_X_hyb_list = []
train_y_hyb_list = []

for src in train_srcs:
    if (feat_data[src]['labels'] is not None and 
        'bert_embedding_concat' in feat_data[src]['hybrid_variants']):
        train_X_hyb_list.append(feat_data[src]['hybrid_variants']['bert_embedding_concat'])
        train_y_hyb_list.append(feat_data[src]['labels'])

train_X_hyb = np.vstack(train_X_hyb_list)
train_y_hyb = np.concatenate(train_y_hyb_list)

# Split hybrid data for validation
train_X_hyb, val_X_hyb, train_y_hyb, val_y_hyb = train_test_split(
    train_X_hyb, train_y_hyb, test_size=0.2, random_state=SEED, stratify=train_y_hyb
)

print(f"Hybrid features: {train_X_hyb.shape[1]} dimensions")

# Normalize hybrid features
scaler_hyb = StandardScaler()
train_X_hyb_scaled = scaler_hyb.fit_transform(train_X_hyb)
val_X_hyb_scaled = scaler_hyb.transform(val_X_hyb)
test_X_hyb_scaled = scaler_hyb.transform(test_X_hyb)

train_X_hyb_t = torch.FloatTensor(train_X_hyb_scaled)
train_y_hyb_t = torch.LongTensor(train_y_hyb)
val_X_hyb_t = torch.FloatTensor(val_X_hyb_scaled)
val_y_hyb_t = torch.LongTensor(val_y_hyb)
test_X_hyb_t = torch.FloatTensor(test_X_hyb_scaled)

train_ds_hyb = TensorDataset(train_X_hyb_t, train_y_hyb_t)
val_ds_hyb = TensorDataset(val_X_hyb_t, val_y_hyb_t)
test_ds_hyb = TensorDataset(test_X_hyb_t, test_y_t)
train_dl_hyb = DataLoader(train_ds_hyb, batch_size=cfg['batch'], shuffle=True)
val_dl_hyb = DataLoader(val_ds_hyb, batch_size=cfg['batch'])
test_dl_hyb = DataLoader(test_ds_hyb, batch_size=cfg['batch'])

class HybridCls(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(cfg['dropout']),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(cfg['dropout']),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(cfg['dropout']),
            nn.Linear(64, 2)
        )
    
    def forward(self, x):
        return self.net(x)

hybrid = HybridCls(train_X_hyb.shape[1]).to(device)
opt_hyb = AdamW(hybrid.parameters(), lr=cfg['lr'])
crit_hyb = nn.CrossEntropyLoss(weight=class_weights)
scheduler_hyb = torch.optim.lr_scheduler.ReduceLROnPlateau(opt_hyb, mode='max', factor=0.5, patience=2, verbose=True)

best_f1_hyb = 0
patience_counter_hyb = 0

for epoch in range(cfg['epochs']):
    # Training
    hybrid.train()
    total_loss = 0
    
    for X_batch, y_batch in train_dl_hyb:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        opt_hyb.zero_grad()
        out = hybrid(X_batch)
        loss = crit_hyb(out, y_batch)
        loss.backward()
        opt_hyb.step()
        
        total_loss += loss.item()
    
    avg_loss_hyb = total_loss / len(train_dl_hyb)
    
    # Evaluation on validation set
    hybrid.eval()
    val_preds_hyb = []
    val_probs_hyb = []
    val_labels_hyb = []
    
    with torch.no_grad():
        for X_batch, y_batch in val_dl_hyb:
            X_batch = X_batch.to(device)
            out = hybrid(X_batch)
            prob = F.softmax(out, dim=1)
            pred = torch.argmax(out, dim=1)
            
            val_preds_hyb.extend(pred.cpu().numpy())
            val_probs_hyb.extend(prob[:, 1].cpu().numpy())
            val_labels_hyb.extend(y_batch.numpy())
    
    val_f1_hyb = f1_score(val_labels_hyb, val_preds_hyb, zero_division=0)
    val_mcc_hyb = matthews_corrcoef(val_labels_hyb, val_preds_hyb)
    
    print(f"Epoch {epoch+1:2d}: Loss={avg_loss_hyb:.4f} | Val F1={val_f1_hyb:.3f} | Val MCC={val_mcc_hyb:.3f}")
    
    # Learning rate scheduling
    scheduler_hyb.step(val_f1_hyb)
    
    # Early stopping
    if val_f1_hyb > best_f1_hyb:
        best_f1_hyb = val_f1_hyb
        torch.save(hybrid.state_dict(), RES_PATH / 'best_hybrid.pt')
        patience_counter_hyb = 0
    else:
        patience_counter_hyb += 1
        if patience_counter_hyb >= cfg['patience']:
            print(f"Early stopping at epoch {epoch+1}")
            break

# Load best model and evaluate
hybrid.load_state_dict(torch.load(RES_PATH / 'best_hybrid.pt'))
hybrid.eval()

preds_hyb = []
probs_hyb = []

with torch.no_grad():
    for X_batch, y_batch in test_dl_hyb:
        X_batch = X_batch.to(device)
        out = hybrid(X_batch)
        prob = F.softmax(out, dim=1)
        pred = torch.argmax(out, dim=1)
        
        preds_hyb.extend(pred.cpu().numpy())
        probs_hyb.extend(prob[:, 1].cpu().numpy())

y_pred_hyb = np.array(preds_hyb)
y_prob_hyb = np.array(probs_hyb)

# Calculate metrics
f1_hyb = f1_score(y_true, y_pred_hyb, zero_division=0)
mcc_hyb = matthews_corrcoef(y_true, y_pred_hyb)
acc_hyb = accuracy_score(y_true, y_pred_hyb)
cm_hyb = confusion_matrix(y_true, y_pred_hyb)
tn_h, fp_h, fn_h, tp_h = cm_hyb.ravel()
sens_hyb = tp_h / (tp_h + fn_h) if (tp_h + fn_h) > 0 else 0
spec_hyb = tn_h / (tn_h + fp_h) if (tn_h + fp_h) > 0 else 0
bal_acc_hyb = (sens_hyb + spec_hyb) / 2

try:
    roc_hyb = roc_auc_score(y_true, y_prob_hyb)
    p_h, r_h, _ = precision_recall_curve(y_true, y_prob_hyb)
    pr_auc_hyb = auc(r_h, p_h)
except:
    roc_hyb = 0
    pr_auc_hyb = 0

print(f"\nHybrid Classifier Final Results:")
print(f"F1: {f1_hyb:.3f} | PR-AUC: {pr_auc_hyb:.3f} | MCC: {mcc_hyb:.3f} | ROC-AUC: {roc_hyb:.3f}")
print(f"Accuracy: {acc_hyb:.3f} | Balanced Accuracy: {bal_acc_hyb:.3f}")
print(f"Sensitivity: {sens_hyb:.3f} | Specificity: {spec_hyb:.3f}")

hybrid_results = {
    'f1': f1_hyb, 'pr_auc': pr_auc_hyb, 'mcc': mcc_hyb, 'roc': roc_hyb,
    'acc': acc_hyb, 'bal_acc': bal_acc_hyb, 'sens': sens_hyb, 'spec': spec_hyb,
    'preds': y_pred_hyb, 'probs': y_prob_hyb
}

# ============================================================================
# RESULTS COMPARISON AND VISUALIZATION
# ============================================================================

print("\n" + "="*80)
print("FINAL COMPARISON")
print("="*80)

# Collect all results
results = {
    'BERT Classifier': bert_results,
    'DANN-BERT': dann_results,
    'Hybrid Classifier': hybrid_results
}

# Create comparison DataFrame
comparison_data = []
for model, res in results.items():
    comparison_data.append({
        'Model': model,
        'F1': res['f1'],
        'PR-AUC': res['pr_auc'],
        'MCC': res['mcc'],
        'ROC-AUC': res['roc'],
        'Accuracy': res['acc'],
        'Bal-Acc': res['bal_acc'],
        'Sensitivity': res['sens'],
        'Specificity': res['spec']
    })

comp_df = pd.DataFrame(comparison_data)
print("\n=== MODEL COMPARISON ===")
print(comp_df.round(3).to_string(index=False))

Device: cuda
Loaded 6 sources

Test: Apache (2000 samples)
Train: 5 sources (10000 samples)
After split: Train=8000, Val=2000, Test=2000

Train class distribution: Normal=4566, Anomaly=3434
Val class distribution: Normal=1141, Anomaly=859
Test class distribution: Normal=1405, Anomaly=595
Train anomaly rate: 0.429
Val anomaly rate: 0.429
Test anomaly rate: 0.297

Class weights: [0.8760403 1.1648223]

TRAINING BERT CLASSIFIER
Epoch  1: Loss=0.1437 | Val F1=0.988 | Val MCC=0.980
Epoch  2: Loss=0.0299 | Val F1=0.988 | Val MCC=0.980
Epoch  3: Loss=0.0186 | Val F1=0.988 | Val MCC=0.980
Epoch  4: Loss=0.0229 | Val F1=0.991 | Val MCC=0.985
Epoch  5: Loss=0.0169 | Val F1=0.994 | Val MCC=0.989
Epoch  6: Loss=0.0150 | Val F1=0.994 | Val MCC=0.989
Epoch  7: Loss=0.0097 | Val F1=0.995 | Val MCC=0.992
Epoch  8: Loss=0.0086 | Val F1=0.995 | Val MCC=0.991
Epoch  9: Loss=0.0079 | Val F1=0.992 | Val MCC=0.987
Epoch 10: Loss=0.0092 | Val F1=0.996 | Val MCC=0.993
Epoch 11: Loss=0.0098 | Val F1=0.995 | Val