In [1]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
import numpy as np
import torch.nn.functional as F

In [None]:
# Directory holding the fine-tuned checkpoint; the tokenizer and the model
# weights are both read from it.
model_dir = "./logbert_finetuned"
tokenizer = BertTokenizer.from_pretrained(model_dir)
# nn.Module.eval() returns the module itself, so the switch to evaluation
# mode can be chained directly onto the load.
model = BertForMaskedLM.from_pretrained(model_dir).eval()

def get_log_embeddings(logs):
    """Embed log lines as L2-normalised [CLS] vectors from the BERT encoder.

    Uses the module-level ``tokenizer`` and ``model``; only the encoder
    (``model.bert``) is run, the masked-LM head is not involved.

    Args:
        logs: A log line (str) or a sequence of log lines. Lines are padded
            to the longest one in the batch and truncated to the tokenizer's
            maximum length.

    Returns:
        numpy.ndarray of shape (batch, hidden_size), dtype float32. Each row
        is the final hidden state at position 0 ([CLS]) divided by its L2
        norm. An empty sequence of logs yields an array of shape
        (0, hidden_size).
    """
    # An empty batch cannot be tokenised into a 2-D tensor; return an empty
    # result with the right width instead of failing inside the tokenizer/model.
    if not isinstance(logs, str) and len(logs) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    inputs = tokenizer(logs, padding=True, truncation=True, return_tensors="pt")
    # Tokenizer output is created on the CPU; move it to wherever the model
    # lives so the forward pass also works when the model is on an accelerator.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model.bert(**inputs)

    # Position 0 of every sequence is the [CLS] token.
    cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # The small epsilon keeps the division finite for an all-zero vector.
    norms = np.linalg.norm(cls_embeddings, axis=1, keepdims=True)
    return cls_embeddings / (norms + 1e-10)

# Three sample log lines, one per severity prefix (ERROR / INFO / WARN),
# used as a small smoke-test batch.
logs = [
    "ERROR: failed to connect to database",
    "INFO: user login successful",
    "WARN: disk space low",
]

# Embed the sample batch and print the resulting array shape
# (one row per log line).
embeddings = get_log_embeddings(logs)
print("Shape embeddings:", embeddings.shape)

In [None]:
def get_mlm_loss(logs):
    """Score each log line by the MLM head's mean per-token cross-entropy.

    Uses the module-level ``tokenizer`` and ``model``. Every log line is run
    through the model once, unmasked, and the logits at position *i* are
    compared with the input token at position *i*. BERT is a bidirectional
    masked LM, so logits and labels are aligned position-for-position; the
    next-token shift used for causal language models does not apply here.

    Only real word-piece tokens are scored: padding and the special tokens
    added by the tokenizer ([CLS], [SEP]) are excluded from the average.

    NOTE(review): because the inputs are not masked, this is a reconstruction
    loss, not a pseudo-perplexity. A true pseudo-log-likelihood would mask one
    token at a time, at the cost of one forward pass per token.

    Args:
        logs: A log line (str) or a sequence of log lines. Lines are padded
            to the longest one in the batch and truncated to the tokenizer's
            maximum length.

    Returns:
        numpy.ndarray of shape (batch,), dtype float32, holding the mean
        cross-entropy over the scored tokens of each line. A line with no
        scored tokens (e.g. an empty string) gets 0.0. An empty sequence of
        logs yields an empty array.
    """
    # An empty batch cannot be tokenised into a 2-D tensor; short-circuit.
    if not isinstance(logs, str) and len(logs) == 0:
        return np.empty(0, dtype=np.float32)

    inputs = tokenizer(
        logs,
        padding=True,
        truncation=True,
        return_tensors="pt",
        return_special_tokens_mask=True,
    )
    # The special-tokens mask is bookkeeping for scoring, not a model input,
    # so take it out of the batch before the forward pass.
    special_tokens_mask = inputs.pop("special_tokens_mask")
    # Tokenizer output is created on the CPU; follow the model's device.
    inputs = inputs.to(model.device)

    # No `labels=` here: the model's built-in loss is a single batch-wide
    # scalar, whereas a per-sequence score is needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    labels = inputs["input_ids"]

    # Keep positions that are attended to AND are not special tokens.
    scored = inputs["attention_mask"].bool() & ~special_tokens_mask.to(logits.device).bool()
    scored = scored.to(logits.dtype)

    # Per-token cross-entropy with logits and labels aligned (no shift).
    loss_per_token = F.cross_entropy(
        logits.reshape(-1, logits.size(-1)),
        labels.reshape(-1),
        reduction="none",
    ).reshape(labels.size())

    # Mean over scored tokens; the clamp avoids 0/0 for lines with none.
    loss_per_seq = (loss_per_token * scored).sum(dim=1) / scored.sum(dim=1).clamp(min=1)

    return loss_per_seq.cpu().numpy()

# Score the sample batch, then report each score next to the log line it
# belongs to.
losses = get_mlm_loss(logs)
for loss, log in zip(losses, logs):
    print(f"Loss: {loss:.4f} | Log: {log}")