In [8]:
import numpy as np
import pandas as pd
import os
from logtools import (
    get_runs_files_df,
    reference_holdout_split,
    get_embedded_ref_entries_df,
    get_embedded_holdout_entries_df,
    calculate_distances_all_log_types,
    calculate_distances,
    calculate_test_anomaly_scores,
    calculate_baselines,
    replace_substrings,
)
from sentence_transformers import SentenceTransformer
import faiss

In [None]:
# Root directory holding the log data; each run is expected to live in
# its own sub-directory beneath it.
data_dir = "data"

# Sentence-transformer model handed to the embedding helpers.
model_identifier = "all-MiniLM-L6-v2"
m = SentenceTransformer(model_identifier)

# Window size handed to the embedding helpers.
window_size = 4

# k: how many nearest matches to look up in the reference data.
k = 4

# Distance metric handed to the distance and anomaly-score helpers.
metric = "squared_euclidean"

# faiss resource object (a single GPU).
res = faiss.StandardGpuResources()

In [None]:
# Build a DataFrame of the log files found across all runs under data_dir.
runs_files_df = get_runs_files_df(data_dir)

# Split the files into a reference set and a holdout set.
reference_files_df, holdout_files_df = reference_holdout_split(runs_files_df)

# Log types present in the reference set.
reference_log_types = reference_files_df["log_type"].unique()

# Runs present in the holdout set.
holdout_runs = holdout_files_df["run"].unique()

# Fail early with a clear message; indexing an empty array below would
# otherwise surface as an opaque IndexError.
if len(holdout_runs) == 0:
    raise ValueError(f"No holdout runs found under data_dir={data_dir!r}.")

# Use the first holdout run as the test run and collect its log types.
test_run = holdout_runs[0]
test_log_types = holdout_files_df.loc[
    holdout_files_df["run"] == test_run, "log_type"
].unique()

In [None]:
# Build the embedded-entries DataFrame for the reference files, using the
# model and window size configured above.
ref_embeddings = get_embedded_ref_entries_df(
    reference_files_df, data_dir, m, window_size
)


In [None]:
# Build the embedded-entries DataFrame for the holdout files. The reference
# log types are passed along as the first argument.
holdout_embeddings = get_embedded_holdout_entries_df(
    reference_log_types, holdout_files_df, data_dir, m, window_size
)


In [None]:
# Distances from the holdout embeddings to the reference embeddings, using
# the faiss resource, k and metric configured above.
holdout_distances = calculate_distances_all_log_types(
    holdout_embeddings,
    ref_embeddings,
    res,
    k,
    keep_highest_only=False,
    metric=metric,
)

In [None]:
# Baseline distances per log type, derived from the holdout distances for
# the log types of the chosen test run.
baselines = calculate_baselines(test_log_types, holdout_distances, test_run)

In [None]:
# Keep only the holdout embedding rows whose `run` equals the chosen test run.
# `@test_run` makes DataFrame.query read the Python variable of that name.
test_run_df = holdout_embeddings.query("run == @test_run")

In [None]:
# Anomaly scores for the test run's entries, computed against the reference
# embeddings and the per-log-type baselines.
scores_df = calculate_test_anomaly_scores(
    test_run_df, ref_embeddings, k, res, baselines, metric=metric
)

In [None]:
# Rank entries from highest to lowest anomaly score, renumber the index,
# and display the ten highest-scoring rows.
sorted_scores_df = (
    scores_df
    .sort_values(by="anomaly_score", ascending=False)
    .reset_index(drop=True)
)
sorted_scores_df.head(10)