# Preprocessing

In [None]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from transformers import GPT2Tokenizer, AutoTokenizer
from tokenizers import Tokenizer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances
from drain3 import TemplateMiner

## Helper functions

In [None]:
def load_data(path, max_line=1000):
    """Load .log file and extract structured data."""
    data = []
    count_line = 0
    
    with open(path, "r") as file:
        for line in file:
            if count_line > max_line:
                continue
            parts = line.split(maxsplit=9)  # Split into max 10 parts (first 9 + rest)
            if len(parts) < 10:  
                parts.extend([""] * (10 - len(parts)))  # Pad if fewer than 10 columns
            data.append(parts)
            count_line += 1
    
    return data

def extract_unique_templates(logs):
    """Convert logs to templates using Drain3."""
    template_miner = TemplateMiner()
    templates = []
    for log in logs:
        result = template_miner.add_log_message(log)
        templates.append(result['template_mined'])
    return set(templates)

def tokenize(logs, tokenizer_type="gpt2"):
    if tokenizer_type == "gpt2":
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        tokenized_logs = [tokenizer.encode(log, add_special_tokens=False) for log in logs]
    elif tokenizer_type == "bpe":
        tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
        tokenized_logs = [tokenizer.encode(log).ids for log in logs]  # Extract token IDs
    elif tokenizer_type == "logbert":
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # Replace with LogBERT tokenizer path if available
        tokenized_logs = [tokenizer.encode(log, add_special_tokens=False) for log in logs]
    else:
        raise ValueError("Invalid tokenizer type. Choose 'gpt2', 'bpe', or 'logbert'.")
    
    return tokenized_logs

def tokenize_and_reduce(logs, tokenizer_type="gpt2"):
    """Tokenize log templates using the selected tokenizer (GPT-2, BPE, or LogBERT) and reduce dimensions to 3 using PCA."""
    tokenized_logs = tokenize(logs, tokenizer_type=tokenizer_type)
    
    # Pad sequences to the same length
    max_len = max(len(seq) for seq in tokenized_logs)
    tokenized_logs = [seq + [0] * (max_len - len(seq)) for seq in tokenized_logs]
    
    token_array = np.array(tokenized_logs)
    
    pca = PCA(n_components=3)
    reduced_vectors = pca.fit_transform(token_array)
    
    return reduced_vectors

def compute_color_intensity(data):
    """Compute color intensity based on closeness using Euclidean distance."""
    distances = euclidean_distances(data)
    mean_distances = distances.mean(axis=1)  # Compute the mean distance for each point
    normalized_distances = (mean_distances - mean_distances.min()) / (mean_distances.max() - mean_distances.min())
    return normalized_distances

def plot_3d(data, logs):
    """Plot the reduced data in 3D with interactivity using Plotly and color by closeness."""
    colors = compute_color_intensity(data)
    
    fig = go.Figure()
    
    fig.add_trace(go.Scatter3d(
        x=data[:, 0], y=data[:, 1], z=data[:, 2],
        mode='markers+text',
        marker=dict(size=8, color=colors, colorscale='Viridis', showscale=True),
        text=logs,
        textposition='top center'
    ))
    
    fig.update_layout(
        title='3D Visualization of Tokenized Log Templates (Color by Closeness)',
        scene=dict(
            xaxis_title='PC1', yaxis_title='PC2', zaxis_title='PC3',
        )
    )
    
    fig.show()



## Loading log templates

In [99]:
data = load_data("../data/BGL/BGL.log", max_line=10000)
logs = list(set(log[9] for log in data))
templates = list(extract_unique_templates(logs))

config file not found: drain3.ini


## 3D visualization of the tokenization of 3 different techniques

In [92]:
tokenizer_choice = "gpt2"  # Change to "bpe" if needed
reduced_data = tokenize_and_reduce(templates, tokenizer_choice)
plot_3d(reduced_data, templates)

In [93]:
tokenizer_choice = "bpe"  # Change to "bpe" if needed
reduced_data = tokenize_and_reduce(templates, tokenizer_choice)
plot_3d(reduced_data, templates)

In [94]:
tokenizer_choice = "logbert"  # Change to "bpe" if needed
reduced_data = tokenize_and_reduce(templates, tokenizer_choice)
plot_3d(reduced_data, templates)

## Tokenization consistency and efficiency

In [95]:
for tokenizer_type in ["gpt2", "bpe", "logbert"]:
    print(f"Tokenizer: {tokenizer_type}")
    tokenized = tokenize_and_reduce(logs, tokenizer_type)
    print(tokenized[:3])  

Tokenizer: gpt2
[[-11735.3954867   -5582.79902807  -1368.07241951]
 [ 14914.37141619  17125.04881403  34358.66720025]
 [  1838.16897564  10576.53049148  -7421.93601989]]
Tokenizer: bpe
[[ 10462.23182174  -6118.59026378   3164.10217792]
 [    27.81575744  -7048.31449088 -13183.53393172]
 [ -7164.4204581   12395.18955971    209.72520841]]
Tokenizer: logbert
[[ 10445.49166647  -6131.13529086   3140.33160963]
 [    37.46974452  -7020.12133658 -13232.59644325]
 [ -7158.16151674  12418.92626685    195.24766861]]


In [96]:
for tokenizer_type in ["gpt2", "bpe", "logbert"]:
    tokenized_logs = tokenize(logs, tokenizer_type)
    avg_length = np.mean([len(seq) for seq in tokenized_logs])
    print(f"{tokenizer_type}: Avg tokenized length = {avg_length}")

gpt2: Avg tokenized length = 10.689873417721518
bpe: Avg tokenized length = 11.856540084388186
logbert: Avg tokenized length = 9.856540084388186


## Conclusion

As can be seen, the consistency and 3d visualization of the data shows that Logbert and BPE perform similarly, whereas GPT2 has a completely different tokenized vector. Logbert achieves this with fewer tokenis while preserving the meaning of logs, showing it performs better than BPE. 