In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from tqdm import tqdm
import os

import subprocess

In [2]:
TSP_CLIENT_PATH = "C:\\Users\\eavkhas\\OneDrive - Ericsson\\Documents\\Projects\\TC Projects\\tsp-python-client-personal"

In [9]:
EXPERIMENT_UUID = "8092e69f-6413-3816-912b-4185058bd421"
EXPERIMENT_START_TIMESTAMP = 1713924790254076392
EXPERIMENT_END_TIMESTAMP = 1713924791800370832

In [15]:
XY_TYPE = "org.eclipse.tracecompass.internal.analysis.timing.core.segmentstore.scatter.dataprovider:lttng.analysis.irq"
XY_X_TYPE = int
XY_Y_TYPE = float

TABLE_TYPE = "org.eclipse.tracecompass.internal.provisional.tmf.core.model.events.TmfEventTableDataProvider"

In [3]:
def load_experiment(uuid: str):
    subprocess.run(['python', 'tsp_cli_client', '--list-outputs', uuid], cwd=TSP_CLIENT_PATH, capture_output=True)

In [4]:
def get_outputs(uuid:str):
    process = subprocess.run(['python', 'tsp_cli_client', '--list-outputs', uuid], cwd=TSP_CLIENT_PATH, capture_output=True)
    outputs = process.stdout.decode('utf-8').split('\n')

    results = {}
    for output in outputs:
        # Get the text between two last parentheses
        matches = re.findall(r'\(([^)]+)', output)
        if matches:
            output_id = matches[-1]
            rest = output.split(output_id)[0].split('(')[0].strip()

            results[rest] = output_id

    return results

In [5]:
def fetch_tree(experiment_uuid: str, tree_type: str):
    # Fetch the tree
    process = subprocess.run(['python', 'tsp_cli_client', '--get-tree', tree_type, '--uuid', experiment_uuid], 
    cwd=TSP_CLIENT_PATH, shell=True, capture_output=True)
    if process.stderr:
        print(process.stderr)
        return None
    
    # Parse the tree
    tree = process.stdout.decode('utf-8').split('\n')
    tree = [line.strip() for line in tree if line.strip() != '']

    output_tree = {}
    
    # Parse each line based on their regex
    pattern = re.compile(r'([\w\W\d]+)\s*\(([\w\W\d\s*]+?)\)\s*(-?\d+)')
    for line in tree:
        match = pattern.match(line)
        if match:
            tree_name = match.group(1).replace(' ', '')
            tree_id = int(match.group(2).split(',')[1].replace(' ', ''))
            tree_parent_id = int(match.group(3))

            output_tree[tree_name] = {'id': tree_id, 'parent_id': tree_parent_id if tree_parent_id != -1 else None}

    return output_tree

In [6]:
def fetch_xy_tree(experiment_uuid: str, xy_type: str):
    # Fetch the tree
    process = subprocess.run(['python', 'tsp_cli_client', '--get-xy-tree', xy_type, '--uuid', experiment_uuid], 
    cwd=TSP_CLIENT_PATH, shell=True, capture_output=True)
    if process.stderr:
        print(process.stderr)
        return None
    
    # Parse the tree
    tree = process.stdout.decode('utf-8').split('\n')
    tree = [line.strip() for line in tree if line.strip() != '']
    tree = [line.replace('|____', '') for line in tree]

    output_tree = {}
    
    # Parse each line based on their regex
    pattern = re.compile(r'([\w\W\d]+)\s*\(([\w\W\d\s*]+?)\)\s*(-?\d+)')
    for line in tree:
        match = pattern.match(line)
        if match:
            tree_name = match.group(1).replace(' ', '')
            tree_id = int(match.group(2).split(',')[1].replace(' ', ''))
            tree_parent_id = int(match.group(3))

            output_tree[tree_name] = {'id': tree_id, 'parent_id': tree_parent_id if tree_parent_id != -1 else None}

    return output_tree

In [7]:
def fetch_xy_data(experiment_uuid: str, xy_type: str, items: list, start: int, end: int, num_items: int, xtype: type, ytype: type):
    output_x_values = []
    output_y_values = []

    while True:
        process = subprocess.run([
            'python',
            'tsp_cli_client',
            '--get-xy',
            xy_type,
            '--uuid',
            experiment_uuid,
            '--items',
            ' '.join(items),
            '--time-range',
            str(start),
            str(end),
            str(num_items)
        ], cwd=TSP_CLIENT_PATH, shell=True, capture_output=True)

        output = process.stdout.decode('utf-8')
        
        # Extract the data from the output
        # Find the lines where they start with "Series X-values" and "Series Y-values"
        try:
            x_values = re.findall(
            r"Series X-values: \[(.*)\]", output)[0].split(',')
            y_values = re.findall(
                r"Series Y-values: \[(.*)\]", output)[0].split(',')
        except Exception as e:
            print("Error: ", e)
            break
        
        if len(x_values) == 0 or len(y_values) == 0:
            break

        if len(x_values) == 1 and x_values[0] == '':
            break

        # Convert the values to floats
        x_values = list(map(xtype, x_values))
        y_values = list(map(ytype, y_values))

        output_x_values.extend(x_values)
        output_y_values.extend(y_values)

        if len(x_values) == 0 or len(y_values) == 0:
            break

        last_x_value = x_values[-1] + 1
        start = last_x_value

    return output_x_values, output_y_values

In [8]:
def fetch_virtual_table_lines(experiment_uuid: str, table_type: str, index: int, num_items: int, column_ids: list[int] = [], recursive: bool = False):
    virtual_table = {}

    while True:
        command = ['python','tsp_cli_client','--get-virtual-table-lines',table_type,'--uuid',experiment_uuid,'--table-line-index',str(index),'--table-line-count',str(num_items)]
        if len(column_ids) > 0:
            command.append('--table-column-ids')
            for id in column_ids:
                command.append(str(id))

        process = subprocess.run(command, cwd=TSP_CLIENT_PATH, shell=True, capture_output=True)
        if process.stderr:
            print(process.stderr)
            return {}

        output = process.stdout.decode('utf-8').split('\n')

        line_index = -1
        for line in output:
            line = line.strip()
            if line.startswith('index:'):
                line_index = int(line.split(':')[1].strip())
                virtual_table[line_index] = []

            if line.startswith('"content":'):
                content = line[10:].strip().replace('\"', '')
                virtual_table[line_index].append(content)

        if line_index == -1 or not recursive:
            break

        index = line_index + 1
        print("Index: ", index, len(virtual_table))

    return virtual_table

In [14]:
load_experiment(EXPERIMENT_UUID)
analysis_outputs = get_outputs(EXPERIMENT_UUID)

In [None]:
xy_tree = fetch_xy_tree(experiment_uuid=EXPERIMENT_UUID, xy_type=XY_TYPE)
virtual_table_tree = fetch_tree(experiment_uuid=EXPERIMENT_UUID, tree_type=TABLE_TYPE)

In [16]:
xy_df = pd.DataFrame(columns=['category', 'x', 'y'])

for item_name, item_data in tqdm(xy_tree.items()):
    if item_data['parent_id'] is not None:
        x_values, y_values = fetch_xy_data(
            experiment_uuid=EXPERIMENT_UUID,
            xy_type=XY_TYPE,
            items=[str(item_data['id'])],
            start=EXPERIMENT_START_TIMESTAMP,
            end=EXPERIMENT_END_TIMESTAMP,
            num_items=1000,
            xtype=XY_X_TYPE,
            ytype=XY_Y_TYPE
        )

        for x, y in zip(x_values, y_values):
            xy_df.loc[len(xy_df)] = [item_name, x, y]

0it [00:00, ?it/s]


In [19]:
DESIRED_COLUMNS = ['Channel', 'CPU', 'Eventtype', 'Contents', 'TracePacketHeader', 'PacketContext', 'Timestampns']

column_ids = []
for column_name, column_data in virtual_table_tree.items():
    if column_name in DESIRED_COLUMNS:
        column_ids.append(column_data['id'])
column_ids.sort()

virtual_table_df = pd.DataFrame(columns=DESIRED_COLUMNS)

table = fetch_virtual_table_lines(
    experiment_uuid=EXPERIMENT_UUID,
    table_type=TABLE_TYPE,
    index=0,
    num_items=100000,
    column_ids=column_ids,
    recursive=True
)

virtual_table_df = pd.DataFrame([value for value in table.values()], columns=DESIRED_COLUMNS)

Index:  100000 100000
Index:  200000 200000
Index:  300000 300000
Index:  355008 355008


In [40]:
# Extracting "size" and "ptr" from Contents column and adding them as separate columns
# The size structure is this: size=1234 => column size = 1234
# The ptr structure is this: ptr=0x1234 => column ptr = 0x1234
import re
virtual_table_df['Size'] = virtual_table_df['Contents'].apply(lambda x: int(re.findall(r'size=(\d+)', x)[0]) if len(re.findall(r'size=(\d+)', x)) > 0 else 0)
virtual_table_df['Ptr'] = virtual_table_df['Contents'].apply(lambda x: int(re.findall(r'ptr=0x(\w+)', x)[0], 16) if len(re.findall(r'ptr=0x(\w+)', x)) > 0 else 0)

In [41]:
virtual_table_df[virtual_table_df['Eventtype'] == 'lttng_ust_libc:malloc'].head()

Unnamed: 0,Channel,CPU,Eventtype,Contents,TracePacketHeader,PacketContext,Timestampns,Size,Ptr
66,u_0,0,lttng_ust_libc:malloc,"size=59, ptr=0x5629b63fc0b0, context.packet_se...","[magic=3254525889, uuid=[87, 68, 213, 51, 89, ...","[timestamp_begin=8512570527590, timestamp_end=...",1713924790267586429,59,94737151279280
197,u_0,0,lttng_ust_libc:malloc,"size=19, ptr=0x5629b63fc1b0, context.packet_se...","[magic=3254525889, uuid=[87, 68, 213, 51, 89, ...","[timestamp_begin=8512570527590, timestamp_end=...",1713924790267864159,19,94737151279536
342,u_0,0,lttng_ust_libc:malloc,"size=60, ptr=0x5629b63fc6d0, context.packet_se...","[magic=3254525889, uuid=[87, 68, 213, 51, 89, ...","[timestamp_begin=8512570527590, timestamp_end=...",1713924790268265767,60,94737151280848
486,u_0,0,lttng_ust_libc:malloc,"size=53, ptr=0x5629b63fc070, context.packet_se...","[magic=3254525889, uuid=[87, 68, 213, 51, 89, ...","[timestamp_begin=8512570527590, timestamp_end=...",1713924790268563544,53,94737151279216
614,u_0,0,lttng_ust_libc:malloc,"size=61, ptr=0x5629b63fc760, context.packet_se...","[magic=3254525889, uuid=[87, 68, 213, 51, 89, ...","[timestamp_begin=8512570527590, timestamp_end=...",1713924790268839933,61,94737151280992


In [67]:
target_df = virtual_table_df[virtual_table_df['Eventtype'] == 'lttng_ust_libc:malloc'][['Timestampns', 'Size', 'Ptr']]

# Apply a simple clustering algorithm to the data
from sklearn.metrics import silhouette_score
from pycaret.clustering import ClusteringExperiment

df = target_df.drop(columns=['category', 'cluster'], errors='ignore')

exp = ClusteringExperiment()
exp.setup(data=df, normalize=True, normalize_method='zscore', session_id=42, verbose=False, encoding_method='ordinal', preprocess=True)

min_improvement = 0.01  # Minimum improvement threshold for silhouette score
num_clusters = 2  # Optimal number of clusters
best_score = -1  # Best silhouette score

for k in range(2, 10):
  print(f"Trying with {k} clusters, current number of clusters: {num_clusters}")
  model = exp.create_model('kmeans', num_clusters=k, verbose=False)
  cluster_labels = exp.assign_model(model)
  cluster_labels = cluster_labels['Cluster']

  silhouette = silhouette_score(df, cluster_labels)

  # Check for improvement and minimum clusters
  if silhouette > best_score + min_improvement and k > 2:
    print(f"Improvement found: {silhouette} > {best_score} + {min_improvement}")
    best_score = silhouette
    num_clusters = k

print(f"Optimal number of clusters: {num_clusters}")

optimal_model = exp.create_model('kmeans', num_clusters=num_clusters, verbose=False)
cluster_labels = exp.assign_model(optimal_model)
cluster_labels = cluster_labels['Cluster']

target_df.loc['cluster'] = cluster_labels

# Plot the clustering results
exp.plot_model(optimal_model, plot='tsne')

Trying with 2 clusters, current number of clusters: 2
Trying with 3 clusters, current number of clusters: 2
Improvement found: -0.022841554676284133 > -1 + 0.01
Trying with 4 clusters, current number of clusters: 3
Trying with 5 clusters, current number of clusters: 3
Trying with 6 clusters, current number of clusters: 3
Trying with 7 clusters, current number of clusters: 3
Trying with 8 clusters, current number of clusters: 3
Trying with 9 clusters, current number of clusters: 3
Optimal number of clusters: 3


In [68]:
from pycaret.anomaly import AnomalyExperiment

df = target_df.drop(columns=['category', 'anomaly'], errors='ignore')

if len(df) > 2:
    exp = AnomalyExperiment()
    exp.setup(data=df, normalize=True, normalize_method='zscore', session_id=42, verbose=False)

    iforest = exp.create_model('iforest', verbose=False)
    anomaly_labels = exp.assign_model(iforest)
    anomaly_labels = anomaly_labels['Anomaly']

    target_df.loc['anomaly'] = anomaly_labels

    exp.plot_model(iforest, plot='tsne')

In [27]:
from tabulate import tabulate

# Print the anomaly results
for category in xy_df['category'].unique():
    print(f"Anomaly results for {category}:")
    
    df = xy_df[xy_df['category'] == category]

    if 'anomaly' not in df.columns:
        print("No anomaly data available.")
        continue

    anomaly_df = df[df['anomaly'] == 1]
    if len(anomaly_df) == 0:
        print("No anomalies detected.")
        continue

    table = []
    headers = ['Time', 'Latency']

    for _, row in anomaly_df.iterrows():
        table.append([row['x'], row['y']])

    print(tabulate(table, headers=headers, tablefmt='grid'))
