In [None]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
warnings.filterwarnings('ignore')

# Read the output of the regex stage; rows it left unclassified have a null `regex_label`.
df = pd.read_csv('../results/nova_logs_with_regex.csv')
print(f"Total logs: {len(df)}")

# Medium-sized clusters whose unclassified rows become the BERT training set.
bert_target_clusters = [3, 5, 6, 9, 13]

# A row qualifies only if it is both unlabelled by regex and in a target cluster.
missing_regex_label = df['regex_label'].isna()
in_target_cluster = df['cluster_id'].isin(bert_target_clusters)
bert_logs = df.loc[missing_regex_label & in_target_cluster]

print(f"BERT training logs: {len(bert_logs)}")
print("Cluster distribution:")
cluster_sizes = bert_logs['cluster_id'].value_counts().sort_index()
print(cluster_sizes)


Total logs: 54646
BERT training logs: 9933
Cluster distribution:
cluster_id
3     2967
5     2467
6     2462
9     1100
13     937
Name: count, dtype: int64


In [8]:
# Semantic category assigned to each target cluster (from the cluster analysis).
cluster_to_label = {
    3: 'Network_Operations',       # os_vif operations
    5: 'Resource_Management',      # compute claims
    6: 'Scheduler_Operations',     # scheduler reports
    9: 'Network_VIF_Operations',   # VIF operations
    13: 'Error_Handling',          # error patterns
}

# `assign` returns a new frame, so the filtered slice of `df` is never written to.
bert_logs = bert_logs.assign(
    semantic_label=bert_logs['cluster_id'].map(cluster_to_label)
)

print("Semantic label distribution:")
semantic_counts = bert_logs['semantic_label'].value_counts()
print(semantic_counts)

# Turn the category names into integer class ids for the classifier.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(bert_logs['semantic_label'])
bert_logs['encoded_label'] = encoded_labels

print(f"Label mapping: {({name: idx for idx, name in enumerate(label_encoder.classes_)})}")


Semantic label distribution:
semantic_label
Network_Operations        2967
Resource_Management       2467
Scheduler_Operations      2462
Network_VIF_Operations    1100
Error_Handling             937
Name: count, dtype: int64
Label mapping: {'Error_Handling': 0, 'Network_Operations': 1, 'Network_VIF_Operations': 2, 'Resource_Management': 3, 'Scheduler_Operations': 4}


In [9]:
class LogDataset(Dataset):
    """Map-style dataset that tokenizes one log message per item.

    Args:
        texts: Iterable of log messages. Each item is passed through ``str()``
            before tokenization.
        labels: Iterable of integer class ids, aligned position-by-position
            with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every example is truncated / padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        # Materialize both inputs as lists so that __getitem__ always indexes
        # by position. A pandas Series taken from a filtered frame keeps its
        # original index labels, so `series[idx]` would look up by label and
        # either raise KeyError or silently return a different row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (each flattened to
            1-D) and ``labels`` (0-D ``torch.long`` tensor).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize and encode; padding to max_length gives every item the same
        # shape so the default DataLoader collation can stack them.
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch dimension of size 1;
        # flatten() drops it so the DataLoader supplies the batch dimension.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [10]:
# Checkpoint name used to load the DistilBERT tokenizer.
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# Extract the raw log text and the integer class ids as plain Python lists.
texts = bert_logs['raw_log_text'].to_list()
labels = bert_logs['encoded_label'].to_list()

# Hold out 20% for validation; stratifying on the labels keeps the class
# proportions the same in both splits, and the fixed random_state makes the
# split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=labels)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, **split_options
)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Training samples: 7946
Validation samples: 1987


In [12]:
# Wrap each split in a LogDataset; tokenization happens per item on access.
train_dataset = LogDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = LogDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# Batch both splits. Only the training loader shuffles; validation keeps a
# fixed order.
batch_size = 16
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")


Training batches: 497
Validation batches: 125


In [14]:
# Seed PyTorch before anything random happens. The classification head is not
# part of the pretrained checkpoint and is randomly initialised, and the
# training DataLoader shuffles using PyTorch's global RNG. Without a seed, two
# runs of this notebook give different results. The value matches the
# random_state used for the train/validation split.
SEED = 42
torch.manual_seed(SEED)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize DistilBERT model with one output per semantic label
num_labels = len(label_encoder.classes_)
model = DistilBertForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=num_labels
)
model.to(device)

# Setup optimizer over all model parameters (encoder and classification head)
optimizer = AdamW(model.parameters(), lr=2e-5)

print(f"Model initialized with {num_labels} labels")


Using device: cpu


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model initialized with 5 labels


In [None]:
def train_epoch(model, data_loader, optimizer, device, log_every=10):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to before the forward pass.
        log_every: Print a progress line every ``log_every`` batches. Pass
            ``0`` or ``None`` to disable progress output.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the mean of the per-batch losses and
        the fraction of correctly predicted examples. Both are ``nan`` when the
        loader yields no examples, instead of raising ZeroDivisionError.
    """
    model.train()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    num_batches = len(data_loader)

    for batch_idx, batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Gradients accumulate by default, so clear them before each step.
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        # Compute the per-batch statistics once and reuse them for both the
        # running totals and the progress line.
        predictions = torch.argmax(logits, dim=-1)
        batch_correct = (predictions == labels).sum().item()
        batch_count = labels.size(0)
        correct_predictions += batch_correct
        total_predictions += batch_count

        if log_every and (batch_idx + 1) % log_every == 0:
            batch_acc = batch_correct / batch_count
            print(f"  Batch {batch_idx+1}/{num_batches} - Loss: {batch_loss:.4f}, Accuracy: {batch_acc:.4f}")

    # An empty loader has no defined loss or accuracy; report NaN so the
    # problem is visible in the logs rather than crashing on a division.
    if num_batches == 0 or total_predictions == 0:
        return float('nan'), float('nan')

    avg_loss = total_loss / num_batches
    accuracy = correct_predictions / total_predictions

    return avg_loss, accuracy

# Train the model
num_epochs = 3
print("Starting training...")

for epoch in range(num_epochs):
    # Announce the epoch before running it. train_epoch prints per-batch
    # progress while it runs, so printing the header afterwards would leave
    # those lines without an epoch heading.
    print(f"Epoch {epoch+1}/{num_epochs}")
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, device)
    print(f"  Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")


Starting training...


In [None]:
def evaluate_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(avg_loss, accuracy, predictions, labels, confidences)``:
        the mean of the per-batch losses, the overall accuracy, and three
        per-example lists (predicted class id, true class id, softmax
        probability of the predicted class). For an empty loader the two
        metrics are ``nan`` and the lists are empty, instead of raising
        ZeroDivisionError.
    """
    model.eval()
    total_loss = 0.0
    all_predictions = []
    all_labels = []
    all_confidences = []
    num_batches = len(data_loader)

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()

            # Predicted class is the arg-max logit; its confidence is the
            # softmax probability of that class.
            probabilities = torch.softmax(logits, dim=-1)
            predictions = torch.argmax(logits, dim=-1)
            confidences = torch.max(probabilities, dim=-1).values

            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_confidences.extend(confidences.cpu().numpy())

    # Nothing was evaluated: metrics are undefined, so report NaN rather than
    # dividing by zero.
    if num_batches == 0 or not all_labels:
        return float('nan'), float('nan'), all_predictions, all_labels, all_confidences

    avg_loss = total_loss / num_batches
    accuracy = accuracy_score(all_labels, all_predictions)

    return avg_loss, accuracy, all_predictions, all_labels, all_confidences

# Run the validation pass. Note that `val_labels` is rebound here to the true
# labels collected during evaluation.
eval_outputs = evaluate_model(model, val_loader, device)
val_loss, val_acc, val_preds, val_labels, val_confidences = eval_outputs
print(f"\nValidation Results:")
print(f"  Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}")

# Per-class report, with rows named after the label encoder's classes.
print("\nClassification Report:")
validation_report = classification_report(
    val_labels,
    val_preds,
    target_names=label_encoder.classes_,
)
print(validation_report)


In [None]:
# Save model weights first so a trained model is never lost to a later failure
# in this cell. torch.save does not create missing directories, so make sure
# the target folder exists.
Path('../models').mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), '../models/distilbert_log_classifier.pth')
print("Model saved!")

# NOTE(review): nothing earlier in this notebook writes `df['bert_label']` or
# defines `high_confidence_mask` / `bert_confidences`. They presumably come
# from an inference cell that is not part of this notebook; confirm and
# restore it. Check up front so we fail with a clear message *before* writing
# a CSV that claims to contain BERT classifications.
if 'bert_label' not in df.columns:
    raise RuntimeError(
        "df has no 'bert_label' column: run the BERT inference step that writes "
        "predictions into df before exporting the dataset."
    )

# Summary statistics. These are computed before the export so that missing
# inference state raises here, not after the CSV has been written.
bert_classified = df['bert_label'].notnull().sum()
high_confidence_count = sum(high_confidence_mask)
low_confidence_count = len(bert_confidences) - high_confidence_count

# Save updated dataset
Path('../results').mkdir(parents=True, exist_ok=True)
df.to_csv('../results/nova_logs_with_bert.csv', index=False)
print("Dataset with BERT classifications saved!")

print(f"\n=== STAGE 4 SUMMARY ===")
print(f"Total logs: {len(df)}")
print(f"Regex classified: {df['regex_label'].notnull().sum()}")
print(f"BERT classified (high confidence): {bert_classified}")
# Rows with neither a regex label nor a BERT label are left for the LLM stage.
print(f"Remaining for LLM: {df['regex_label'].isnull().sum() - bert_classified}")
print(f"Average BERT confidence: {np.mean(bert_confidences):.3f}")

print(f"\nBERT Label Distribution:")
bert_label_counts = df['bert_label'].value_counts()
for label, count in bert_label_counts.items():
    print(f"  {label}: {count}")
