In [None]:
import os
import re

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict, ClassLabel, load_dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import (
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

In [None]:
# Shared tokenizer loaded from the "bert-base-cased" checkpoint. It is used as a
# notebook-level global by generate_bio_labels (BIO file generation),
# tokenize_and_align_labels (model inputs), the Trainer, and the final save step.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
def extract_fixed_segments(template):
    """Return the literal (non-placeholder) pieces of a log template.

    The template is cut at every placeholder (``<*>``, ``<>`` or ``<<...>>``).
    Each remaining piece is whitespace-stripped, and pieces that are empty
    after stripping are dropped. Order of appearance is preserved.

    Args:
        template: Template string containing zero or more placeholders.

    Returns:
        list[str]: The stripped fixed-text pieces, in template order.
    """
    pattern = r"(<\*?>|<>|<<.*?>>)"
    placeholder = re.compile(pattern)

    fixed_parts = []
    for piece in placeholder.split(template):
        # The capturing group makes split() return the placeholders too; skip them.
        if placeholder.match(piece):
            continue
        text = piece.strip()
        if text:
            fixed_parts.append(text)
    return fixed_parts

In [None]:
def _build_template_regex(template):
    """Translate a log template into ``(regex_source, placeholder_count)``.

    Every placeholder (``<*>``, ``<>`` or ``<<...>>``) becomes one capture group,
    including placeholders at the very start or end of the template. Fixed text
    is whitespace-stripped and escaped; the surrounding whitespace is absorbed
    by the neighbouring groups and stripped by the caller.
    """
    # With one capturing group, re.split alternates: text, placeholder, text, ...
    parts = re.split(r"(<\*?>|<>|<<.*?>>)", template)
    variable_count = len(parts) // 2
    ends_with_variable = variable_count > 0 and not parts[-1].strip()

    pieces = []
    for idx, part in enumerate(parts):
        if idx % 2 == 1:
            # A trailing placeholder must be greedy, otherwise a lazy group with
            # nothing after it would always capture the empty string.
            is_final = ends_with_variable and idx == len(parts) - 2
            pieces.append("(.*)" if is_final else "(.*?)")
        elif part.strip():
            pieces.append(re.escape(part.strip()))
        elif part and 0 < idx < len(parts) - 1:
            # Whitespace-only text between two placeholders still separates them.
            pieces.append(r"\s+")
    return "".join(pieces), variable_count


def _find_token_span(haystack, needle, start):
    """Return the first index >= ``start`` where ``needle`` occurs in ``haystack``, or -1."""
    width = len(needle)
    for i in range(max(start, 0), len(haystack) - width + 1):
        if haystack[i:i + width] == needle:
            return i
    return -1


def generate_bio_labels(content, template):
    """Tokenize ``content`` and BIO-tag the tokens that fill template placeholders.

    The template is turned into a regex with one capture group per placeholder
    and matched against the start of ``content``. Each captured value is then
    located in the token sequence produced by the notebook-level ``tokenizer``
    and tagged ``B-VAR`` / ``I-VAR``; all other tokens are tagged ``O``.

    Compared with a plain "first occurrence" search, this version:
      * supports placeholders at the start/end of the template, templates made
        only of placeholders, and whitespace-separated adjacent placeholders;
      * ignores placeholders that captured nothing (previously the first token
        of the message was tagged ``B-VAR`` for an empty value);
      * starts each search at the variable's own position (number of tokens in
        the text preceding it), so repeated values and values that also occur in
        fixed text are tagged at the right place.

    Args:
        content: Raw log message.
        template: Event template for that message.

    Returns:
        list[tuple[str, str]]: ``(token, label)`` pairs for the whole message, or
        ``[]`` when the template cannot be matched against the message (a
        diagnostic is printed in that case).

    Note:
        A value whose isolated tokenization differs from its in-context
        tokenization (e.g. a value glued to a preceding word) cannot be located
        and is left tagged ``O``.
    """
    split_pattern, variable_count = _build_template_regex(template)
    # An empty pattern (blank template) would match anything; treat it as a failure.
    match = re.match(split_pattern, content) if split_pattern else None

    if match is None:
        print(f"❗ Regex split failed: expected {variable_count} vars, but could not match.")
        print(f"  Template: {template}")
        print(f"  Content : {content}")
        return []

    # Tokenize content
    content_tokens = tokenizer.tokenize(content)
    labels = ["O"] * len(content_tokens)

    cursor = 0  # first token index not yet claimed by an earlier variable
    for group_idx in range(1, variable_count + 1):
        var = match.group(group_idx).strip()
        var_toks = tokenizer.tokenize(var) if var else []
        if not var_toks:
            continue  # empty capture: nothing to tag

        # Tokens in the text before the variable give its expected token index
        # (exact whenever the variable starts on a tokenizer word boundary).
        expected = len(tokenizer.tokenize(content[:match.start(group_idx)]))
        start = _find_token_span(content_tokens, var_toks, max(cursor, expected))
        if start < 0:
            # Boundary estimate overshot; fall back to anything after the previous variable.
            start = _find_token_span(content_tokens, var_toks, cursor)
        if start < 0:
            continue

        labels[start] = "B-VAR"
        for offset in range(1, len(var_toks)):
            labels[start + offset] = "I-VAR"
        cursor = start + len(var_toks)

    return list(zip(content_tokens, labels))

In [None]:
def process_file(csv_path, output_path):
    """Convert a structured-log CSV into a BIO-tagged text file.

    Reads ``csv_path`` with pandas, labels each row's ``Content`` against its
    ``EventTemplate`` via ``generate_bio_labels``, and writes the labelled
    sentences to ``output_path`` as one ``"<token> <label>"`` line per token,
    with a blank line after every sentence. Rows that yield no labels or raise
    during labelling are skipped (an error message is printed for the latter).

    Args:
        csv_path: Path of the input CSV (needs ``Content`` and ``EventTemplate`` columns).
        output_path: Path of the text file to create or overwrite.
    """
    frame = pd.read_csv(csv_path)
    labelled_sentences = []

    for _, record in frame.iterrows():
        raw_content, raw_template = str(record["Content"]), str(record["EventTemplate"])
        try:
            pairs = generate_bio_labels(raw_content, raw_template)
        except Exception as e:
            print(f"❌ Skipping line due to error: {e}")
            continue
        if pairs:
            labelled_sentences.append(pairs)

    with open(output_path, "w", encoding="utf-8") as out:
        for sentence in labelled_sentences:
            out.writelines(f"{token} {label}\n" for token, label in sentence)
            out.write("\n")  # blank line terminates the sentence

In [None]:
# input_folder = "../dataset/structured_data"
# output_folder = "../dataset/bert_format"
# os.makedirs(output_folder, exist_ok=True)

# for file in os.listdir(input_folder):
#     if file.endswith(".csv"):
#         log_type = file.replace(".csv", "")
#         print(f"Processing {log_type}...")
#         process_file(
#             csv_path=os.path.join(input_folder, file),
#             output_path=os.path.join(output_folder, f"{log_type}.txt")
#         )

In [None]:
def load_bio_data(file_path):
    """Load a BIO-tagged text file into parallel token / label lists.

    The file holds one ``"<token> <label>"`` pair per line, with sentences
    separated by blank lines. Lines with fewer than two whitespace-separated
    fields are ignored; for longer lines only the first two fields are used.
    A final sentence that is not followed by a blank line is kept as well.

    Args:
        file_path: Path to the UTF-8 encoded BIO file.

    Returns:
        dict: ``{"tokens": [[str, ...], ...], "labels": [[str, ...], ...]}`` with
        one inner list per sentence.
    """
    all_tokens = []
    all_labels = []
    tokens = []
    labels = []

    def flush():
        # Commit the sentence collected so far (if any) and start a new one.
        nonlocal tokens, labels
        if tokens:
            all_tokens.append(tokens)
            all_labels.append(labels)
            tokens = []
            labels = []

    with open(file_path, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                flush()  # blank line marks the end of a sentence
            elif len(parts) >= 2:
                tokens.append(parts[0])
                labels.append(parts[1])

    # Without this, a file lacking a trailing blank line loses its last sentence.
    flush()
    return {"tokens": all_tokens, "labels": all_labels}

In [None]:
folder = "../dataset/bert_format"

# Build one Dataset per BIO file, skipping any file whose name contains "mismatches".
# os.listdir returns entries in arbitrary order, so the names are sorted: the
# concatenation order (and therefore the seeded train/test split that follows)
# is then the same on every machine and every run.
dataset_list = []
for file in sorted(os.listdir(folder)):
    if file.endswith(".txt") and "mismatches" not in file:
        data = load_bio_data(os.path.join(folder, file))
        dataset_list.append(Dataset.from_dict(data))

In [None]:
# Merge the per-file datasets into one, then hold out 10% of the examples as
# the "test" split; the fixed seed makes the shuffle/split repeatable.
full_dataset = concatenate_datasets(dataset_list)
dataset = full_dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# BIO tag set: the list position of each tag is its integer class id.
unique_labels = ["O", "B-VAR", "I-VAR"]
num_labels = len(unique_labels)

# Lookup tables in both directions (id -> tag, tag -> id).
id2label = dict(enumerate(unique_labels))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split examples and align their BIO labels to the sub-tokens.

    Each example's ``tokens`` are encoded with the notebook-level ``tokenizer``
    (``is_split_into_words=True``), and one label id is produced per resulting
    sub-token:

      * positions with no source word (``word_ids`` gives ``None``) get ``-100``;
      * the first sub-token of a word gets the word's own label;
      * further sub-tokens of the same word get the word's label with a leading
        ``B-`` turned into ``I-``, so a split ``B-VAR`` word yields
        ``B-VAR, I-VAR, ...`` rather than several ``B-VAR`` in a row.

    Args:
        examples: Batch dict with ``"tokens"`` (list of word lists) and
            ``"labels"`` (list of tag-string lists of matching lengths).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one list
        of label ids per example.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding=True,
        is_split_into_words=True
    )

    all_labels = []
    for i in range(len(examples["tokens"])):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label = examples["labels"][i]
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                tag = label[word_idx]
                if word_idx == previous_word_idx and tag.startswith("B-"):
                    # Continuation piece of the same word: an entity can only begin once.
                    tag = "I-" + tag[2:]
                label_ids.append(label2id[tag])
            previous_word_idx = word_idx
        all_labels.append(label_ids)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

In [None]:
# Apply tokenization + label alignment to every split, one batch of examples at a time.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
# Show the first processed training example as a quick sanity check.
tokenized_dataset["train"][0]

In [None]:
# Token-classification model built on the same checkpoint as the tokenizer;
# the label maps are passed in so the model config carries the BIO tag names.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Training configuration: checkpoints go to ./bert-log-parser, logs to ./logs.
training_args = TrainingArguments(
    output_dir="./bert-log-parser",
    do_train=True,
    do_eval=True,
    # NOTE(review): no evaluation strategy is set here. do_eval/eval_steps on
    # their own presumably do not schedule periodic evaluation during training,
    # in which case compute_metrics would never run -- confirm against the
    # installed transformers version (the strategy argument's name varies by version).
    eval_steps=500,  # intended evaluation interval, in optimizer steps
    save_steps=500,  # checkpoint interval, in optimizer steps
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs"
)

In [None]:
def compute_metrics(eval_pred):
    """Compute token-level precision / recall / F1 for the variable tags.

    Positions whose gold label is ``-100`` are ignored. Scores are
    micro-averaged over the non-``O`` tags only: micro-averaging over every
    class (``O`` included) makes precision, recall and F1 all equal plain token
    accuracy, which is dominated by the ``O`` class and says little about how
    well variables are found.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` holds
            per-token class scores with the class dimension on axis 2;
            ``labels`` holds the gold label ids with ``-100`` at ignored positions.

    Returns:
        dict: ``{"precision": ..., "recall": ..., "f1": ...}``.
    """
    from sklearn.metrics import precision_recall_fscore_support
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)
    labels = np.asarray(labels)

    # Boolean-mask selection flattens both arrays in one pass (row-major order),
    # replacing the quadratic sum(list_of_lists, []) flattening.
    keep = labels != -100
    all_preds = [id2label[int(p)] for p in predictions[keep]]
    all_labels = [id2label[int(l)] for l in labels[keep]]

    entity_labels = [name for name in id2label.values() if name != "O"]
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels,
        all_preds,
        labels=entity_labels,
        average="micro",
        zero_division=0,  # no predicted/true variable tokens -> score 0 instead of a warning
    )
    return {"precision": precision, "recall": recall, "f1": f1}

In [None]:
# The default collator chosen when a tokenizer is passed pads the model inputs
# but not the per-token "labels". The map step only pads within each map batch,
# so shuffled training batches can mix sequences of different lengths and the
# labels would then fail to stack into a tensor. The token-classification
# collator pads inputs and labels together (labels with -100, which the loss ignores).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
# Persist the fine-tuned model and the tokenizer side by side in the same
# directory, so both can later be loaded from "./bert-log-parser/final".
trainer.save_model("./bert-log-parser/final")
tokenizer.save_pretrained("./bert-log-parser/final")