# Complexity Metric Problematization - Examples

In [1]:
# imports

import datetime
import random
import math
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from pm4py.objects.log.obj import Event, EventLog, Trace

from complexity_sample_size_correlation_analysis import (
    compute_metrics_for_samples, sample_random_traces_with_replacement)

from pm4py.objects.bpmn.importer import importer as bpmn_importer
from pm4py.objects.conversion.bpmn import converter as bpmn_converter
from pm4py.algo.simulation.playout.petri_net import algorithm as simulator



: 

In [None]:
# helpers
def make_trace(variant, trace_id: int):
    """Build a single pm4py ``Trace`` from a sequence of activity labels.

    Args:
        variant: Iterable of activity labels; one event is created per label,
            in iteration order.
        trace_id: Number used to form the trace's ``concept:name``
            attribute (``Case<trace_id>``).

    Returns:
        The populated ``Trace``. Every event carries ``concept:name`` (the
        activity label) and ``time:timestamp``, spaced one minute apart.
    """
    # Arbitrary fixed starting point; the i-th event is stamped i minutes later.
    start = datetime.datetime(2020, 1, 1, 0, 0, 0)
    one_minute = datetime.timedelta(minutes=1)

    # The trace-level attribute gives every case its own name.
    case = Trace(attributes={"concept:name": f"Case{trace_id}"})
    for position, activity in enumerate(variant):
        case.append(
            Event({
                "concept:name": activity,
                "time:timestamp": start + position * one_minute,
            })
        )
    return case

## P1 Strict Monotone growth

In [None]:
# Registry of the metrics DataFrame computed for each example log; every
# problem section below stores its result here under its own key.
sample_metrics_per_log = {}

# Create a trace variant A-B-C-D-E
variant = ["A", "B", "C", "D", "E"]

# Build EventLog with 10,000 identical traces.
# The loop variable is `trace_id` (not `id`) so the builtin `id` is not shadowed.
event_log = EventLog([make_trace(variant, trace_id) for trace_id in range(10000)])

# Sampling setup: window sizes 50, 100, ..., 500 with a fixed random state
# so the run can be repeated.
sizes = range(50, 501, 50)
samples_per_size = 200
random_state = 1
samples = sample_random_traces_with_replacement(event_log, sizes, samples_per_size, random_state)
adapters = ["vidgof_sample"]
df_metrics = compute_metrics_for_samples(samples, adapters)
sample_metrics_per_log['strict_monotone_growth'] = df_metrics
print(df_metrics)

# plot Number of events
fig, ax = plt.subplots(figsize=(6, 4))
df_metrics.boxplot(
    column="Number of Events",
    by="window_size",
    ax=ax,
    grid=False
)
ax.set_xlabel("Window Size")
ax.set_ylabel("Number of Events")
ax.set_title("Number of Events vs Window Size (Single Variant Log)")
plt.suptitle("")  # remove automatic title from pandas

# Ensure output directory exists
out_path = Path("results/correlations/problematization/p1.png")
out_path.parent.mkdir(parents=True, exist_ok=True)

# Save figure, then close it so it does not stay open in the session
plt.savefig(out_path, dpi=300, bbox_inches="tight")
plt.close(fig)

## P2 Infinite Support

In [None]:
## P2 Infinite support
# Looped models cause unbounded number of distinct variants

# Path to the BPMN file exported from BPMN.io (A->B->C with self-loop on B).
# Assembled with pathlib so the separator matches the host OS: a hard-coded
# backslash path is read as one single file name on Linux/macOS.
bpmn_path = str(Path("data", "synthetic", "simple_loop", "simple_loop.bpmn"))
# Import BPMN
bpmn_graph = bpmn_importer.apply(bpmn_path)

# Convert BPMN -> (Petri net, initial marking, final marking)
net, im, fm = bpmn_converter.apply(bpmn_graph)

# Play out 10,000 traces from the net with a fixed seed
event_log = simulator.apply(
    net,
    im,
    fm,
    parameters = {
        "no_traces": 10000,
        "random_seed": 1
    },
    variant=simulator.Variants.BASIC_PLAYOUT,
)

# quick peek at the activity sequences of the first five traces
for i, trace in enumerate(event_log[:5], 1):
    print(f"Trace {i}:", [ev["concept:name"] for ev in trace])

# Sampling setup: window sizes 50, 100, ..., 500 with a fixed random state
sizes = range(50, 501, 50)
samples_per_size = 200
random_state = 1
samples = sample_random_traces_with_replacement(event_log, sizes, samples_per_size, random_state)
adapters = ["vidgof_sample"]
df_metrics = compute_metrics_for_samples(samples, adapters)
sample_metrics_per_log['infinite_support'] = df_metrics
print(df_metrics)

# Boxplot Distinct traces vs Window Size
fig, ax = plt.subplots(figsize=(6, 4))
df_metrics.boxplot(
    column="Number of Distinct Traces",
    by="window_size",
    ax=ax,
    grid=False
)

ax.set_xlabel("Window Size")
ax.set_ylabel("Number of Distinct Traces")
ax.set_title("Number of Distinct Traces vs Window Size (Looping Log)")
plt.suptitle("")  # remove automatic title from pandas

# Ensure output directory exists
out_path = Path("results/correlations/problematization/p2.png")
out_path.parent.mkdir(parents=True, exist_ok=True)

# Save figure, then close it so it does not stay open in the session
plt.savefig(out_path, dpi=300, bbox_inches="tight")
plt.close(fig)

## P3 Rare occurrences

In [None]:
## P3 Rare occurrences
# The complexity metric under-estimates rarely occurring behavior due to skewed occurrence distributions.

# Create trace variants
frequent_variant = ["A"]
once_occuring_variants = [[f"B_{i}"] for i in range(0, 10)]  # 10 variants that occur only once

# Build EventLog with 10,000 traces: 9,990 copies of the frequent variant
# followed by one trace for each of the 10 single-occurrence variants.
# The loop variable is `trace_id` (not `id`): a module-level `for id in ...`
# would rebind the builtin `id` for the rest of the session.
traces = []
for trace_id in range(10000):
    if trace_id < 9990:
        traces.append(make_trace(frequent_variant, trace_id))
    else:
        traces.append(make_trace(once_occuring_variants[trace_id - 9990], trace_id))

# shuffle traces (should not matter due to random sampling but just to be sure)
random.seed(1)
random.shuffle(traces)

event_log = EventLog(traces)

# Sampling setup: window sizes 50, 100, ..., 500 with a fixed random state
sizes = range(50, 501, 50)
samples_per_size = 200
random_state = 1
samples = sample_random_traces_with_replacement(event_log, sizes, samples_per_size, random_state)
adapters = ["vidgof_sample"]
df_metrics = compute_metrics_for_samples(samples, adapters)
sample_metrics_per_log['rare_occurrences'] = df_metrics
print(df_metrics)

# Boxplot Variety vs Window Size
fig, ax = plt.subplots(figsize=(6, 4))
df_metrics.boxplot(
    column="Number of Distinct Activities",
    by="window_size",
    ax=ax,
    grid=False
)

ax.set_xlabel("Window Size")
ax.set_ylabel("Number of Distinct Activities")
ax.set_title("Number of Distinct Activities vs Window Size (Skewed Variant Log)")
plt.suptitle("")  # remove automatic title from pandas

# Ensure output directory exists
out_path = Path("results/correlations/problematization/p3.png")
out_path.parent.mkdir(parents=True, exist_ok=True)

# Save figure, then close it so it does not stay open in the session
plt.savefig(out_path, dpi=300, bbox_inches="tight")
plt.close(fig)

# Bare expression kept from the notebook (echoes the path when evaluated interactively)
out_path

## P4 Variance
# The complexity metric fluctuates at small sample sizes due to the law of small numbers.
# Only at larger sample window sizes does the metric stabilize.

# Create two trace variants of different length
variant_a = ["A", "B", "C", "D", "E"]  # length: 5
variant_b = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]  # length: 10


# Build EventLog with 10,000 traces alternating between the two variants
# (even ids -> variant_a, odd ids -> variant_b).
# The loop variable is `trace_id` (not `id`): a module-level `for id in ...`
# would rebind the builtin `id` for the rest of the session.
traces = []
for trace_id in range(10000):
    if trace_id % 2 == 0:
        traces.append(make_trace(variant_a, trace_id))
    else:
        traces.append(make_trace(variant_b, trace_id))

event_log = EventLog(traces)

# Sampling setup: window sizes 50, 100, ..., 500 with a fixed random state
sizes = range(50, 501, 50)
samples_per_size = 200
random_state = 1
samples = sample_random_traces_with_replacement(event_log, sizes, samples_per_size, random_state)
adapters = ["vidgof_sample"]
df_metrics = compute_metrics_for_samples(samples, adapters)
sample_metrics_per_log['variance'] = df_metrics
print(df_metrics)

# plot Trace length
fig, ax = plt.subplots(figsize=(6, 4))
df_metrics.boxplot(
    column="Avg. Trace Length",
    by="window_size",
    ax=ax,
    grid=False
)
ax.set_xlabel("Window Size")
ax.set_ylabel("Avg. Trace Length")
ax.set_title("Avg. Trace Length vs Window Size (Two Variant Log)")
plt.suptitle("")  # remove automatic title from pandas

# Ensure output directory exists
out_path = Path("results/correlations/problematization/p4.png")
out_path.parent.mkdir(parents=True, exist_ok=True)

# Save figure, then close it so it does not stay open in the session
plt.savefig(out_path, dpi=300, bbox_inches="tight")
plt.close(fig)

NameError: name 'make_trace' is not defined

## Get correlations

In [None]:
def get_correlations_for_dictionary(sample_metrics_per_log):
    """Correlate every measure with ``window_size`` for each log.

    Args:
        sample_metrics_per_log: Mapping of log name to a metrics DataFrame.
            Each frame must contain a ``window_size`` column; every other
            column except ``sample_id`` and ``Time Granularity`` is treated
            as a measure.

    Returns:
        ``(corr_df, pval_df)``: two DataFrames with measures as index and one
        column per log, holding Pearson's r and the matching p-value. The
        four known log names are relabelled P1..P4; any other log name is
        used as the column label unchanged. Entries are NaN where no
        correlation can be computed (fewer than two complete rows, or a
        constant column).
    """
    # Function-scope imports keep this cell runnable on its own.
    import pandas as pd
    from scipy import stats

    # mapping of the known dict keys to desired column names
    rename_map = {
        "strict_monotone_growth": "P1",
        "infinite_support": "P2",
        "rare_occurrences": "P3",
        "variance": "P4",
    }
    # columns that are bookkeeping rather than measures
    non_measures = {"window_size", "sample_id", "Time Granularity"}
    undefined = float("nan")

    r_results, p_results = {}, {}

    for key, df in sample_metrics_per_log.items():
        # Fall back to the key itself so logs outside the four known names
        # do not abort the whole analysis with a KeyError.
        col_tag = rename_map.get(key, key)
        r_results[col_tag] = {}
        p_results[col_tag] = {}

        for col in df.columns:
            if col in non_measures:
                continue
            # drop missing values pairwise
            tmp = df[["window_size", col]].dropna()
            # Pearson's r is undefined for fewer than two points or when one
            # side is constant; report NaN directly instead of relying on
            # scipy's constant-input warning path.
            if (
                len(tmp) < 2
                or tmp["window_size"].nunique() < 2
                or tmp[col].nunique() < 2
            ):
                r, p = undefined, undefined
            else:
                r, p = stats.pearsonr(tmp["window_size"], tmp[col])
            r_results[col_tag][col] = r
            p_results[col_tag][col] = p

    # DataFrames: measures as index, one column per log (P1..P4 for known logs)
    corr_df = pd.DataFrame(r_results)
    pval_df = pd.DataFrame(p_results).reindex(corr_df.index)
    print("Correlations:")
    print(corr_df)
    print()
    print("P-values:")
    print(pval_df)

    return corr_df, pval_df

# Correlate every measure with the window size for all example logs.
# The helper needs the {log name: metrics DataFrame} mapping; `df_metrics`
# is only the last single frame, so iterating it would walk over its columns
# and fail on the log-name lookup.
corr_df, pval_df = get_correlations_for_dictionary(sample_metrics_per_log)

# Save r values and p-values as CSV next to the plots
out_dir = Path("results/correlations/problematization")
out_dir.mkdir(parents=True, exist_ok=True)
corr_df.to_csv(out_dir / "correlations_r.csv")
pval_df.to_csv(out_dir / "correlations_p.csv")


# Create Latex output
import pandas as pd
import numpy as np
from pathlib import Path

def _stars(p: float) -> str:
    if pd.isna(p):
        return ""
    if p < 0.001:
        return "***"
    if p < 0.01:
        return "**"
    if p < 0.05:
        return "*"
    return ""

def corr_p_to_latex_stars(corr_df: pd.DataFrame, pval_df: pd.DataFrame, out_path: Path) -> None:
    """Write a LaTeX table of correlations annotated with significance stars.

    Args:
        corr_df: Correlation coefficients, measures as index and P1..P4 as
            columns (any subset of those columns may be present).
        pval_df: P-values with the same columns as ``corr_df``.
        out_path: Target ``.tex`` file; missing parent directories are created.

    Each cell is rendered as the signed r value with two decimals followed by
    the stars from ``_stars``; cells whose r is missing stay empty.
    """
    # keep P1..P4 order if present
    ordered = [tag for tag in ("P1", "P2", "P3", "P4") if tag in corr_df.columns]
    r_values = corr_df[ordered].copy()
    p_values = pval_df[ordered].copy()
    # Outer-align on the index so both frames share the same measure rows.
    r_values, p_values = r_values.align(p_values, join="outer", axis=0)

    # Build display DataFrame with "+/-r***" format
    cells = r_values.copy().astype(object)
    for tag in ordered:
        cells[tag] = [
            "" if pd.isna(r) else f"{r:+.2f}{_stars(p)}"
            for r, p in zip(r_values[tag], p_values[tag])
        ]

    latex_body = cells.to_latex(
        escape=True,
        na_rep="",
        index=True,
        column_format="l" + "c" * len(ordered),
        bold_rows=False,
    )

    # Table environment wrapped around the tabular produced by pandas.
    wrapped = rf"""
    \begin{{table}}[htbp]
    \centering
    \caption{{Pearson correlation ($r$) between window size and each measure.}}
    \scriptsize
    \setlength{{\tabcolsep}}{{6pt}}
    \renewcommand{{\arraystretch}}{{1.15}}
    {latex_body}
    \vspace{{2pt}}
    \begin{{minipage}}{{0.95\linewidth}}\footnotesize
    Stars denote significance: $^*p<0.05$, $^{{**}}p<0.01$, $^{{***}}p<0.001$.
    \end{{minipage}}
    \end{{table}}
    """

    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(wrapped, encoding="utf-8")

# Render the correlation table (with significance stars) to a LaTeX file.
corr_p_to_latex_stars(
    corr_df=corr_df,
    pval_df=pval_df,
    out_path=Path("results/correlations/problematization/correlations_table.tex"),
)

## Fix problems

In [None]:
def fix_p1(metrics_dict):
    """Return a copy of the metrics with the P1 normalisations applied.

    Measures are rescaled by the ``Number of Traces`` column; the input is
    left untouched.

    Args:
        metrics_dict: Metrics table (used via ``.copy()`` and column lookup)
            containing all measure columns referenced below.

    Returns:
        The copied table with the affected columns overwritten.
    """
    fixed = metrics_dict.copy()
    n_traces = metrics_dict['Number of Traces']

    # normalize by number of traces to get avg trace length
    fixed['Number of Events'] = metrics_dict['Number of Events'] / n_traces
    # do not report number of traces as this will always correlate with trace
    # count based sample sizes (alternatively, divide by number of traces
    # which would lead to constant 1)
    fixed['Number of Traces'] = None
    # this gives the same as number of distinct traces
    fixed['Percentage of Distinct Traces'] = (
        metrics_dict['Percentage of Distinct Traces'] * n_traces
    )
    # division by the worst case scenario (single activity transition)
    worst_case_deviation = np.sqrt(1 - (1 / n_traces))
    fixed['Deviation from Random'] = (
        metrics_dict['Deviation from Random'] / worst_case_deviation
    )
    # see Kaspar and Schuster 1987
    # NOTE(review): that bound is usually written n / log_k(n) with
    # log_k(n) = log(n) / log(k); here the ratio is log(k) / log(n) and n is
    # the trace count -- confirm this is the intended normalisation.
    lz_scale = n_traces / (
        np.log(metrics_dict['Number of Distinct Activities']) / np.log(n_traces)
    )
    fixed['Lempel-Ziv Complexity'] = metrics_dict['Lempel-Ziv Complexity'] / lz_scale

    # Remaining measures are simply divided by the number of traces.
    # TODO better approach desired
    per_trace_measures = (
        'Number of Ties in Paths to Goal',
        'Variant Entropy',
        'Normalized Variant Entropy',
        'Trace Entropy',
        'Normalized Trace Entropy',
    )
    for measure in per_trace_measures:
        fixed[measure] = metrics_dict[measure] / n_traces

    return fixed

def apply_fixes_all_metrics_dicts(metrics_dicts_per_log, problems_to_fix=('p1',)):
    """Apply the requested problem fixes to the metrics of every log.

    Args:
        metrics_dicts_per_log: Mapping of log name to its metrics table.
        problems_to_fix: Collection of problem tags to fix; only ``'p1'`` is
            implemented so far. The default is an immutable tuple so it
            cannot be altered between calls.

    Returns:
        A new dict with one entry per input log. Logs for which no requested
        fix applies are passed through unchanged instead of being dropped
        from the result.
    """
    fixed_metrics_per_log = {}
    for log_name, metrics in metrics_dicts_per_log.items():
        if 'p1' in problems_to_fix:
            # fix p1
            metrics = fix_p1(metrics)
        # TODO fix further problems
        fixed_metrics_per_log[log_name] = metrics

    return fixed_metrics_per_log

## Expectation 1: convergence

# Apply the default fixes to the metrics of every collected log
fixed_metrics_per_log = apply_fixes_all_metrics_dicts(sample_metrics_per_log)

# get new correlations
corr_df, pval_df = get_correlations_for_dictionary(fixed_metrics_per_log)

# Save both tables next to the earlier results, under their own file names
out_dir = Path("results/correlations/problematization")
out_dir.mkdir(parents=True, exist_ok=True)
for table, file_name in (
    (corr_df, "correlations_r_p1_fixed.csv"),
    (pval_df, "correlations_p_p1_fixed.csv"),
):
    table.to_csv(out_dir / file_name)


NameError: name 'sample_metrics_per_log' is not defined