In [None]:
import pandas as pd
import re

In [None]:
def parse_log_line(line: str) -> dict:
    """
    Flatten one raw trace line into a single prompt-safe example string.

    Every field is extracted independently and falls back to a default when
    it is missing, so a malformed line never raises:

    - timestamp: text inside the first ``[...]`` pair ("" if absent)
    - time elapsed: text inside the first ``(...)`` pair, without its
      leading "+" ("" if absent)
    - host name / event name: third and fourth whitespace-separated tokens
      (the event name loses its trailing ":")
    - cpu id: ``cpu_id = N`` from the first ``{...}`` group ("0" if absent)
    - process / kernel details: second and third ``{...}`` groups with all
      single and double quotes removed

    Args:
        line: One line of the trace text.

    Returns:
        dict: ``{"example": text}`` where every literal brace in ``text`` is
        doubled so it can be used inside a PromptTemplate.
    """
    # Timestamp: contents of the first [...] pair.
    timestamp_match = re.search(r"\[(.*?)\]", line)
    timestamp = timestamp_match.group(1) if timestamp_match else ""

    # Elapsed time: contents of the first (...) pair. The trace prints the
    # value as "(+0.000003750)" and the output template below adds its own
    # "+", so the captured sign is dropped to avoid emitting "(++...)".
    time_elapsed_match = re.search(r"\((.*?)\)", line)
    time_elapsed = (
        time_elapsed_match.group(1).lstrip("+") if time_elapsed_match else ""
    )

    # Host and event names are positional tokens after "[ts] (+elapsed)".
    parts = line.split()
    host_name = parts[2] if len(parts) > 2 else ""
    event_name = parts[3].rstrip(":") if len(parts) > 3 else ""

    # All {...} groups, in order: cpu context, process context, event payload.
    curly_groups = re.findall(r"\{(.*?)\}", line)

    # Defaults used when a group is missing.
    cpu_id = "0"
    process_details = ""
    kernel_details = ""

    if len(curly_groups) >= 1:
        cpu_match = re.search(r"cpu_id\s*=\s*(\d+)", curly_groups[0])
        cpu_id = cpu_match.group(1) if cpu_match else "0"

    # Quotes are removed so the details read as plain "key = value" text.
    if len(curly_groups) >= 2:
        process_details = curly_groups[1].replace('"', '').replace("'", "").strip()
    if len(curly_groups) >= 3:
        kernel_details = curly_groups[2].replace('"', '').replace("'", "").strip()

    formatted = (
        f"timestamp:{timestamp}, time elapsed: (+{time_elapsed}), Host Name: {host_name}, "
        f"Event Name: {event_name}, Cpu Id: {cpu_id}, process_details: {process_details}, "
        f"kernel_details: {kernel_details}"
    )

    # Double every brace so PromptTemplate treats them as literal characters.
    escaped_formatted = formatted.replace("{", "{{").replace("}", "}}")

    return {"example": escaped_formatted}

In [None]:
# Build one prompt-ready example per non-blank line of the real trace.
logs = []
with open("kernel_trace_1K.txt", "r") as file:
    for line in file:
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no event; skip them.
            continue
        try:
            parsed = parse_log_line(stripped)
        except Exception as e:
            # Best effort: report the offending line and keep going.
            print(f"Error parsing line: {line}\n{e}")
        else:
            logs.append(parsed)

In [None]:
def parse_key_value_block(block):
    """
    Convert text such as 'key1 = val1, key2 = "val 2"' into a dict of strings.

    A value is either a double-quoted string (which may contain commas) or
    everything up to the next comma. Surrounding quotes and whitespace are
    removed. For the ``procname`` key, everything from the first "[" onwards
    is dropped as well. When a key occurs more than once, the last value wins.
    """
    fields = {}
    for key, raw_value in re.findall(r'(\w+)\s*=\s*("[^"]*"|[^,]+)', block):
        value = raw_value.strip('"').strip()
        if key == "procname":
            # Cut a bracketed suffix off the process name.
            value = re.sub(r"\[.*", "", value)
        fields[key] = value
    return fields

def parse_lttng_logs(file_path):
    """
    Parse a text trace file into a DataFrame with one row per event line.

    Each line is expected to look like::

        [HH:MM:SS.nnnnnnnnn] (+elapsed) host event: { cpu_id = N }, { ... }, { ... }

    The fixed fields (timestamp, time_elapsed, host_name, event_name, cpu_id)
    become columns as strings. The "key = value" pairs of the second and third
    brace groups are flattened into further columns via
    ``parse_key_value_block``. A group with no payload (``{ }``) is accepted
    and simply contributes no columns.

    Args:
        file_path: Path of the trace file to read.

    Returns:
        pandas.DataFrame: One row per matching line. Columns are the union of
        all keys seen; keys missing from a line are NaN in that row.

    Warns:
        UserWarning: If any non-blank line does not match the expected format.
        Such lines are skipped, and the warning reports how many, so a
        comparison is not silently run on a truncated trace.
    """
    import warnings  # local import keeps this cell self-contained

    log_pattern = re.compile(
        r"\[(?P<timestamp>[\d:.]+)\] "
        # "?" is allowed because the first event prints "?.?????????".
        r"\(\+(?P<time_elapsed>[\d.?]+)\) "
        r"(?P<host_name>[\w\-.]+) "
        r"(?P<event_name>\w+): "
        r"\{ cpu_id = (?P<cpu_id>\d+) \}, "
        # The payload is optional so that an empty "{ }" group still matches;
        # the group is then None and is treated as "" below.
        r"\{ (?:(?P<process_details>.*?) )?\}, "
        r"\{ (?:(?P<kernel_details>.*?) )?\}"
    )

    records = []
    skipped = 0

    with open(file_path, 'r') as trace:
        for line in trace:
            stripped = line.strip()
            if not stripped:
                continue

            match = log_pattern.match(stripped)
            if match is None:
                skipped += 1
                continue

            base = match.groupdict()

            # Flatten process and kernel details into top-level columns.
            process_fields = parse_key_value_block(base.pop("process_details") or "")
            kernel_fields = parse_key_value_block(base.pop("kernel_details") or "")

            # Later dicts win on key clashes: kernel > process > fixed fields.
            records.append({**base, **process_fields, **kernel_fields})

    if skipped:
        warnings.warn(
            f"{file_path}: skipped {skipped} line(s) not matching the expected trace format",
            stacklevel=2,
        )

    return pd.DataFrame(records)

# Parse the real trace into a DataFrame (one row per matched event line).
file_path = "kernel_trace_1K.txt"
real_data = parse_lttng_logs(file_path)

# Parse the synthetic trace with the same parser (presumably LSTM-generated,
# judging by the file name — confirm). Note that file_path is rebound here, so
# after this cell it refers to the synthetic file.
file_path = "synthetic_data_lstm_1K.txt"
synthetic_data = parse_lttng_logs(file_path)

In [None]:
# Inspect the parsed real trace; all values are still strings at this point.
real_data

Unnamed: 0,timestamp,time_elapsed,host_name,event_name,cpu_id,procname,pid,tid,fd,upeer_addrlen,...,ubuf,op_enum,event,msg,offset,nbytes,advice,ufd,utmr,otmr
0,06:13:02.227908688,?.?????????,mendax,syscall_entry_accept,2,elasticsearch,11822,11859,553,246916502706640,...,,,,,,,,,,
1,06:13:02.227912438,0.000003750,mendax,syscall_exit_accept,2,elasticsearch,11822,11859,,246916502706640,...,,,,,,,,,,
2,06:13:02.227914646,0.000002208,mendax,syscall_entry_fcntl,2,elasticsearch,11822,11859,576,,...,,,,,,,,,,
3,06:13:02.227914980,0.000000334,mendax,syscall_exit_fcntl,2,elasticsearch,11822,11859,,,...,,,,,,,,,,
4,06:13:02.227916313,0.000001333,mendax,syscall_entry_getsockname,2,elasticsearch,11822,11859,576,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965,06:13:02.232285404,0.000000250,mendax,syscall_exit_sync_file_range,0,lttng-consumerd,14323,14330,,,...,,,,,,,,,,
966,06:13:02.232285613,0.000000209,mendax,syscall_entry_fadvise64_64,0,lttng-consumerd,14323,14330,35,,...,,,,,143360,,4,,,
967,06:13:02.232286446,0.000000833,mendax,syscall_exit_fadvise64_64,0,lttng-consumerd,14323,14330,35,,...,,,,,143360,,4,,,
968,06:13:02.232286779,0.000000333,mendax,syscall_entry_ioctl,0,lttng-consumerd,14323,14330,34,,...,,,,,,,,,,


In [None]:
# Inspect the parsed synthetic trace; all values are still strings at this point.
synthetic_data

Unnamed: 0,timestamp,time_elapsed,host_name,event_name,cpu_id,procname,pid,tid,ret,arg,...,val,utime,uaddr2,val3,brk,addr,buff,ubuf,msg,utmr
0,03:46:45.928,0.000123178,mendax,syscall_exit_fcntl,0,curl,14323,14330,0,275115444656760,...,,,,,,,,,,
1,03:46:45.928,0.000844226,mendax,syscall_entry_epoll_ctl,0,curl,14323,14330,,275115444656768,...,,,,,,,,,,
2,03:46:45.928,0.000675907,mendax,syscall_exit_fcntl,0,curl,14323,14330,0,275115444656768,...,,,,,,,,,,
3,03:46:45.928,0.000579098,mendax,syscall_entry_epoll_ctl,0,curl,14323,14330,,275115444656688,...,,,,,,,,,,
4,03:46:45.929,0.000724858,mendax,syscall_exit_fcntl,0,curl,14323,14330,0,275115444656688,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,03:46:45.978,0.000329043,mendax,syscall_exit_brk,0,data-loop.0,3820,3924,1773,4253,...,2169,626,1667,2562,1708,647,917,980,611,542
996,03:46:45.978,0.000610811,mendax,syscall_exit_brk,0,data-loop.0,3820,3924,1773,4253,...,2169,626,1667,2562,1708,647,917,980,611,542
997,03:46:45.978,0.000338697,mendax,syscall_exit_brk,0,data-loop.0,3820,3924,1773,4253,...,2169,626,1667,2562,1708,647,917,980,611,542
998,03:46:45.978,0.000817468,mendax,syscall_exit_brk,0,data-loop.0,3820,3924,1773,4253,...,2169,626,1667,2562,1708,647,918,980,611,542


In [None]:
# Row 0 of the real trace has "?.?????????" as its elapsed time (see the
# real_data display above), which is not a number. Replace it with a numeric
# string so the later pd.to_numeric conversion does not turn it into NaN.
# NOTE(review): the origin of 0.000003735 is not shown in this notebook — confirm.
# This writes into real_data itself, not into a copy.
real_data.at[0, 'time_elapsed'] = "0.000003735"

In [None]:
# Work on independent (deep) copies so that in-place changes made to real_df /
# synthetic_df do not alter the parsed frames real_data / synthetic_data.
real_df = real_data.copy()
synthetic_df = synthetic_data.copy()

In [None]:
# Convert the "HH:MM:SS.fffffffff" time-of-day strings to float seconds since
# midnight.
#
# Why not pd.to_datetime(...).astype(int) / 10**9:
#   * the strings carry no date, so to_datetime fills in the date of the run
#     and the resulting "epoch seconds" change from day to day;
#   * astype(int) is platform-width dependent;
#   * at ~1.7e9 seconds a float64 can no longer resolve the nanosecond digits.
# A timedelta has none of these problems. Values differ from the epoch-based
# ones only by a constant offset per run.
#
# The source columns are read from the untouched parsed frames, so this cell
# gives the same result when it is re-run.
real_df['timestamp'] = pd.to_timedelta(real_data['timestamp']).dt.total_seconds()
synthetic_df['timestamp'] = pd.to_timedelta(synthetic_data['timestamp']).dt.total_seconds()

  real_df['timestamp'] = pd.to_datetime(real_df['timestamp']).astype(int) / 10**9
  synthetic_df['timestamp'] = pd.to_datetime(synthetic_df['timestamp']).astype(int) / 10**9


In [None]:
# cpu_id was parsed as text; make it numeric in both frames. Values that
# cannot be parsed become NaN instead of raising.
for frame in (real_df, synthetic_df):
    frame["cpu_id"] = pd.to_numeric(frame["cpu_id"], errors="coerce")

In [None]:
# time_elapsed was parsed as text; make it numeric in both frames. Values that
# cannot be parsed (e.g. "?.?????????") become NaN instead of raising.
for frame in (real_df, synthetic_df):
    frame['time_elapsed'] = pd.to_numeric(frame['time_elapsed'], errors='coerce')

In [None]:
# Spot-check the converted real frame: timestamp, time_elapsed and cpu_id
# should now be numeric.
real_df.head()

Unnamed: 0,timestamp,time_elapsed,host_name,event_name,cpu_id,procname,pid,tid,fd,upeer_addrlen,...,ubuf,op_enum,event,msg,offset,nbytes,advice,ufd,utmr,otmr
0,1746512000.0,3.735e-06,mendax,syscall_entry_accept,2,elasticsearch,11822,11859,553.0,246916502706640.0,...,,,,,,,,,,
1,1746512000.0,3.75e-06,mendax,syscall_exit_accept,2,elasticsearch,11822,11859,,246916502706640.0,...,,,,,,,,,,
2,1746512000.0,2.208e-06,mendax,syscall_entry_fcntl,2,elasticsearch,11822,11859,576.0,,...,,,,,,,,,,
3,1746512000.0,3.34e-07,mendax,syscall_exit_fcntl,2,elasticsearch,11822,11859,,,...,,,,,,,,,,
4,1746512000.0,1.333e-06,mendax,syscall_entry_getsockname,2,elasticsearch,11822,11859,576.0,,...,,,,,,,,,,


In [None]:
# Check the resulting dtypes; columns that were not converted above remain
# object (string) columns.
synthetic_df.dtypes

Unnamed: 0,0
timestamp,float64
time_elapsed,float64
host_name,object
event_name,object
cpu_id,int64
procname,object
pid,object
tid,object
ret,object
arg,object


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import wasserstein_distance
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score
from scipy.stats import chi2_contingency

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the categorical columns with one encoder per column, fitted on
# the union of real and synthetic values so a given category maps to the same
# integer code in both frames.
#
# The string values are always taken from the untouched parsed frames
# (real_data / synthetic_data). Encoding real_df in place from its own
# current contents would not survive a re-run: the integer codes would be
# stringified and re-encoded in lexicographic order ("10" < "2"), silently
# permuting them.
categorical_columns = ["host_name", "event_name", "procname"]
for col in categorical_columns:
    # astype(str) also turns missing values into the literal category "nan".
    real_categories = real_data[col].astype(str)
    synthetic_categories = synthetic_data[col].astype(str)

    encoder = LabelEncoder()
    encoder.fit(pd.concat([real_categories, synthetic_categories], axis=0))

    real_df[col] = encoder.transform(real_categories)
    synthetic_df[col] = encoder.transform(synthetic_categories)


In [None]:
# ----- 1. Basic Statistical Comparison -----
# Print describe() for both frames under matching headings so the summaries
# can be compared side by side.
summaries = (
    ("\nReal Data Summary:\n", real_df),
    ("\nSynthetic Data Summary:\n", synthetic_df),
)
print("\n--- Basic Statistical Comparison ---")
for heading, frame in summaries:
    print(heading, frame.describe())


--- Basic Statistical Comparison ---

Real Data Summary:
           timestamp  time_elapsed  host_name  event_name      cpu_id  \
count  9.700000e+02  9.700000e+02      970.0  970.000000  970.000000   
mean   1.746512e+09  4.162677e-06        0.0   24.987629    0.588660   
std    1.290041e-03  2.475441e-05        0.0   13.205366    1.066845   
min    1.746512e+09  0.000000e+00        0.0    0.000000    0.000000   
25%    1.746512e+09  2.080000e-07        0.0   11.000000    0.000000   
50%    1.746512e+09  3.330000e-07        0.0   28.000000    0.000000   
75%    1.746512e+09  1.030750e-06        0.0   36.000000    1.000000   
max    1.746512e+09  5.419020e-04        0.0   46.000000    3.000000   

         procname  
count  970.000000  
mean     8.512371  
std      2.889137  
min      0.000000  
25%     10.000000  
50%     10.000000  
75%     10.000000  
max     12.000000  

Synthetic Data Summary:
           timestamp  time_elapsed  host_name   event_name  cpu_id     procname
count  

In [None]:
# ----- 2. Wasserstein Distance -----
# Per-column distance between the real and synthetic value distributions.
# Caveat: event_name and procname are label-encoded integers at this point, so
# their distances depend on the encoder's code ordering rather than on any
# natural scale.
compared_columns = ["timestamp", "time_elapsed", "cpu_id", 'event_name', 'procname']
print("\n--- Wasserstein Distance (Numerical Feature Distributions) ---")
for col in compared_columns:
    real_column, synthetic_column = real_df[col], synthetic_df[col]
    dist = wasserstein_distance(real_column, synthetic_column)
    print(f"Wasserstein Distance for {col}: {dist}")


--- Wasserstein Distance (Numerical Feature Distributions) ---
Wasserstein Distance for timestamp: 8776.277441619208
Wasserstein Distance for time_elapsed: 0.0005607990116804125
Wasserstein Distance for cpu_id: 0.588659793814433
Wasserstein Distance for event_name: 11.643659793814432
Wasserstein Distance for procname: 5.706237113402063


In [None]:
# ----- 4. Domain Classifier Test -----
# Tag every row with its origin (1 = real, 0 = synthetic) and stack both
# frames into one dataset for a real-vs-synthetic classifier. The label column
# is added to real_df / synthetic_df in place, and the original row indices
# are kept in the stacked frame.
print("\n--- Domain Classifier Test (Discriminability) ---")
for frame, origin_label in ((real_df, 1), (synthetic_df, 0)):
    frame["label"] = origin_label
combined_df = pd.concat([real_df, synthetic_df])


--- Domain Classifier Test (Discriminability) ---


In [None]:
# Features for classification
features = ["timestamp", "time_elapsed", "cpu_id", "host_name", "event_name"]
X = combined_df[features]
y = combined_df["label"]

In [None]:
# Hold out 30% of the combined rows for evaluation (fixed seed).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest to distinguish real (1) from synthetic (0) rows.
# The forest is seeded like the split above; without a seed the reported AUC
# could change from run to run.
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Score with the predicted probability of the "real" class. Note that y_pred
# holds probabilities, not hard class labels.
y_pred = clf.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred)
print(f"AUC Score (closer to 0.5 means high similarity): {auc_score}")

AUC Score (closer to 0.5 means high similarity): 1.0


In [None]:
# ----- 5. Anomaly Detection Test -----
# Fit an Isolation Forest on the real rows only, then score the synthetic rows
# with it, using the same feature columns as the domain classifier.
print("\n--- Anomaly Detection Test ---")
iso_forest = IsolationForest(contamination=0.05, random_state=42).fit(real_df[features])

synthetic_anomaly_scores = iso_forest.decision_function(synthetic_df[features])

# NOTE(review): -0.1 is a hand-picked cut-off on the decision score. Confirm
# it is intended, rather than using the model's own predict() labelling, which
# is what the contamination setting is meant to control.
is_anomalous = synthetic_anomaly_scores < -0.1
anomalies = 100 * np.mean(is_anomalous)  # share of flagged rows, in percent
print(f"Percentage of anomalies in synthetic data: {anomalies:.2f}%")


--- Anomaly Detection Test ---
Percentage of anomalies in synthetic data: 100.00%
