In [None]:
%run shared.ipynb

In [None]:
# Headline statistics of the full log, followed by a bar chart of the most
# frequent process variants.
cell_log = log.copy()

case_durations = pm4py.get_all_case_durations(cell_log)
median_case_duration = get_median_case_duration(cell_log)
# The chained floor divisions turn the value into whole days, as the printed
# label states (assumes the helper returns seconds -- TODO confirm).
print(f"Median case duration: {median_case_duration // 60 // 60 // 24} days")
single_case_duration = pm4py.get_case_duration(
    cell_log, cell_log["case:concept:name"].iloc[0]
)

case_arrival_average = pm4py.get_case_arrival_average(cell_log)
print(
    f"Average distance between the arrival of two consecutive cases: {case_arrival_average // 60 // 60} hours"
)

case_dispersion_ratio = get_case_dispersion_avg(cell_log)
print(
    f"Average distance between the finishing of two consecutive cases: {case_dispersion_ratio // 60 // 60} hours"
)

activity_position_summary = pm4py.get_activity_position_summary(cell_log, "completed")

# NOTE(review): variants are counted on `filter_endpoints_events_log`, not on
# `cell_log` -- confirm this is intended.
variants = pm4py.statistics.variants.pandas.get.get_variants_count(
    filter_endpoints_events_log
)

# Sort once (most frequent first) and reuse the result for printing and plotting.
sorted_variants = sorted(variants.items(), key=lambda item: item[1], reverse=True)
num_variants = 10
top_variants = sorted_variants[:num_variants]
for k, v in top_variants:
    print(k, v)

labels = [" -> ".join(k) for k, _ in top_variants]
counts = [v for _, v in top_variants]

plt.figure(figsize=(12, 6))
plt.barh(labels[::-1], counts[::-1])  # Reversed so the most frequent variant is on top
plt.xlabel("Count")
plt.ylabel("Variant")
# Derive the number from the data so the title stays correct if `num_variants`
# changes or fewer variants exist.
plt.title(f"Top {len(top_variants)} Process Variants")

if SAVE_VIS:
    # Variant labels are long; a tight bounding box keeps them inside the image.
    plt.savefig("top_variants", bbox_inches="tight")

if VIEW_VIS:
    plt.show()

In [None]:
# Monthly event counts of the most active resources in the `filter_bot` log.
cell_log = filter_bot.copy()

# Make sure the timestamp column is a datetime so the `.dt` accessor works
cell_log["time:timestamp"] = pandas.to_datetime(cell_log["time:timestamp"])

# Resources with the highest number of events
num_top_users = 10
top_users = cell_log["org:resource"].value_counts().head(num_top_users).index

# Keep only the events of those resources
filtered_cell_log = cell_log[cell_log["org:resource"].isin(top_users)]

# Rows: month, columns: resource, values: number of events
grouped_df = (
    filtered_cell_log.groupby(
        [filtered_cell_log["time:timestamp"].dt.to_period("M"), "org:resource"]
    )
    .size()
    .unstack()
)

# DataFrame.plot creates its own figure; the pyplot calls below act on it
grouped_df.plot(kind="line", figsize=(12, 6), marker="o")
plt.xlabel("Time")
plt.ylabel("Event Count")
plt.title("Top Users Event Distribution Over Time")
plt.legend(title="User", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.xticks(rotation=45)
plt.grid(True)

if SAVE_VIS:
    # The legend is anchored outside the axes; without a tight bounding box it
    # is cut off in the saved image.
    plt.savefig("top_users_over_time", bbox_inches="tight")

if VIEW_VIS:
    plt.show()

In [None]:
# Monthly number of events per activity (`concept:name`) over the full log.
cell_log = log.copy()

cell_log["time:timestamp"] = pandas.to_datetime(cell_log["time:timestamp"])
# Rows: month, columns: activity, values: number of events
grouped_df = (
    cell_log.groupby([cell_log["time:timestamp"].dt.to_period("M"), "concept:name"])
    .size()
    .unstack()
)

# Draw into an explicitly created axes. Calling plt.figure() and then
# DataFrame.plot() without `ax` opens a second figure and leaves the first
# one empty.
fig, ax = plt.subplots(figsize=(12, 6))
grouped_df.plot(kind="line", marker="o", ax=ax)

plt.xlabel("Time")
plt.ylabel("Event Frequency")
plt.title("Event Frequency Over Time")
plt.legend(title="Event", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.xticks(rotation=45)
plt.grid(True)

if SAVE_VIS:
    # The legend is anchored outside the axes; a tight bounding box keeps it
    # in the saved image.
    plt.savefig("event_frequency_over_time_by_event", bbox_inches="tight")

if VIEW_VIS:
    plt.show()

In [None]:
# Monthly share of cases whose last event is "completed" vs. "not_planned".
cell_log = filter_endpoints_events_log.copy()

cell_log["time:timestamp"] = pandas.to_datetime(cell_log["time:timestamp"])
event_log = pm4py.convert_to_event_log(cell_log)

# One record per case, taken from the last event of its trace. Empty traces
# have no last event and are skipped.
case_endings = [
    {
        "time:timestamp": trace[-1]["time:timestamp"],
        "concept:name": trace[-1]["concept:name"],
    }
    for trace in event_log
    if len(trace) > 0
]

endings_df = pandas.DataFrame(case_endings)
endings_df["time:timestamp"] = pandas.to_datetime(endings_df["time:timestamp"])
endings_df["period"] = endings_df["time:timestamp"].dt.to_period("M")  # Group by month

# Rows: month, columns: final activity, values: number of cases
summary = endings_df.groupby(["period", "concept:name"]).size().unstack(fill_value=0)
# "total" must be computed before the percentage columns are added, otherwise
# they would be included in the sum.
summary["total"] = summary.sum(axis=1)
# .get(..., 0) keeps the cell working when one of the two endings never occurs
summary["completed_pct"] = summary.get("completed", 0) / summary["total"] * 100
summary["not_planned_pct"] = summary.get("not_planned", 0) / summary["total"] * 100

plt.figure(figsize=(12, 6))
plt.plot(
    summary.index.astype(str), summary["completed_pct"], marker="o", label="Completed"
)
plt.plot(
    summary.index.astype(str),
    summary["not_planned_pct"],
    marker="o",
    label="Not Planned",
)

plt.xlabel("Time")
plt.ylabel("Percentage of Cases")
plt.title('Percentage of Cases Ending in "Completed" vs "Not Planned" Over Time')
plt.legend()
plt.xticks(rotation=45)
plt.grid(True)

# Follow the same SAVE_VIS / VIEW_VIS convention as the other plotting cells
# instead of always calling plt.show().
if SAVE_VIS:
    plt.savefig("case_endings_over_time")

if VIEW_VIS:
    plt.show()

In [None]:
# pm4py overview charts for the `filter_bot` log: event distribution per
# weekday and per hour, events over time, and case durations.
cell_log = filter_bot.copy()

# Distribution type -> file name used when the chart is written to disk
distribution_files = {
    "days_week": "events_over_days_of_week.png",
    "hours": "events_over_hour_of_day.png",
}

if VIEW_VIS:
    for distribution in distribution_files:
        pm4py.view_events_distribution_graph(cell_log, distr_type=distribution)
    pm4py.view_events_per_time_graph(cell_log)
    pm4py.view_case_duration_graph(cell_log)

if SAVE_VIS:
    for distribution, target in distribution_files.items():
        pm4py.save_vis_events_distribution_graph(
            cell_log, distr_type=distribution, file_path=target
        )
    pm4py.save_vis_events_per_time_graph(cell_log, file_path="events_over_time.png")
    pm4py.save_vis_case_duration_graph(cell_log, file_path="case_duration.png")

In [None]:
%matplotlib inline
# Two charts: the number of simultaneously open cases over time, and a
# histogram of all events of the full log over time.
cell_log = filter_bot_endpoints_noisy_events_top_variants_log.copy()

# First and last timestamp of every case, ordered by case start
case_durations = (
    cell_log.groupby("case:concept:name")["time:timestamp"]
    .agg(["min", "max"])
    .rename(columns={"min": "start_time", "max": "end_time"})
)
case_durations = case_durations.sort_values(by="start_time")

# Every case contributes a +1 at its start and a -1 at its end; the running
# sum over the time-ordered points is the number of active cases.
case_count = len(case_durations)
event_points = pandas.concat(
    [case_durations["start_time"], case_durations["end_time"]]
).to_frame(name="timestamp")
event_points["change"] = [1] * case_count + [-1] * case_count

event_points = event_points.sort_values(by="timestamp")
event_points["active_cases"] = event_points["change"].cumsum()


fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    event_points["timestamp"],
    event_points["active_cases"],
    marker="o",
    linestyle="-",
    color="blue",
)
ax.set_xlabel("Time")
ax.set_ylabel("Active Cases")
ax.set_title("Active Cases Over Time")
ax.grid()

if SAVE_VIS:
    fig.savefig("active_cases_over_time.png")

if VIEW_VIS:
    plt.show()


# The histogram is built from the unfiltered `log`, not from `cell_log`
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(log["time:timestamp"], bins=200, color="blue", alpha=0.7, edgecolor="black")
ax.set_xlabel("Time")
ax.set_ylabel("Number of Events")
ax.set_title("Histogram of Events Over Time")
ax.grid()

if SAVE_VIS:
    fig.savefig("histogram_events_over_time.png")

if VIEW_VIS:
    plt.show()

In [None]:
# Dotted chart drawn on a sample containing one third of the cases of `log`.
third_of_cases = len(log["case:concept:name"].unique()) // 3
cell_log = pm4py.sample_cases(log, num_cases=third_of_cases)

# Options shared by the viewing and the saving call
dotted_chart_options = {"show_legend": False}

if VIEW_VIS:
    pm4py.view_dotted_chart(cell_log, **dotted_chart_options)

if SAVE_VIS:
    pm4py.save_vis_dotted_chart(
        cell_log, file_path="dotted_line_chart.png", **dotted_chart_options
    )

In [None]:
# Discover a Petri net with the inductive miner and render it plain, decorated
# with frequencies, and decorated with performance information.
cell_log = filter_endpoints_and_noisy_events_and_top_variants_log

noise_threshold = 0.5
petri_net, initial_marking, final_marking = pm4py.discover_petri_net_inductive(
    cell_log, noise_threshold=noise_threshold
)

if VIEW_VIS:
    pm4py.view_petri_net(petri_net, initial_marking, final_marking)

# Model quality evaluation, currently disabled.
# fitness = pm4py.fitness_token_based_replay(
#     log, petri_net, initial_marking, final_marking
# )
# simplicity = pm4py.analysis.simplicity_petri_net(
#     petri_net, initial_marking, final_marking
# )
# precision = pm4py.precision_token_based_replay(
#     log, petri_net, initial_marking, final_marking
# )
# generalization = pm4py.algo.evaluation.generalization.algorithm.apply(
#     log, petri_net, initial_marking, final_marking
# )
# soundness = pm4py.check_soundness(petri_net, initial_marking, final_marking)


# print("Evaluate model: ")
# print(f"percentage_of_fitting_traces {fitness['percentage_of_fitting_traces']}")
# print(f"average_trace_fitness {fitness['average_trace_fitness']}")
# print(f"log_fitness {fitness['log_fitness']}")
# print(f"simplicity: {simplicity}")
# print(f"precision: {precision}")
# print(f"generalization: {generalization}")

gviz_frequency = petri_net_visualizer.apply(
    petri_net,
    initial_marking,
    final_marking,
    variant=petri_net_visualizer.Variants.FREQUENCY,
    log=cell_log,
)

if VIEW_VIS:
    petri_net_visualizer.view(gviz_frequency)

# Decorate with the same log the model was discovered from, so the performance
# view is consistent with the frequency view above (previously the unfiltered
# `log` was passed here).
gviz_performance = petri_net_visualizer.apply(
    petri_net,
    initial_marking,
    final_marking,
    variant=petri_net_visualizer.Variants.PERFORMANCE,
    log=cell_log,
)

if VIEW_VIS:
    petri_net_visualizer.view(gviz_performance)


if SAVE_VIS:
    pm4py.save_vis_petri_net(
        petri_net, initial_marking, final_marking, file_path="petri_net.png"
    )
    petri_net_visualizer.save(
        gviz_frequency, output_file_path="petri_net_frequency.png"
    )
    petri_net_visualizer.save(
        gviz_performance, output_file_path="petri_net_performance.png"
    )

In [None]:
# BPMN model discovered with the inductive miner, without noise filtering.
noise_threshold = 0
cell_log = filter_endpoints_and_noisy_events_and_top_variants_log

bpmn_diagram = pm4py.discover_bpmn_inductive(
    cell_log,
    noise_threshold=noise_threshold,
)

if VIEW_VIS:
    pm4py.view_bpmn(bpmn_diagram)

if SAVE_VIS:
    pm4py.save_vis_bpmn(bpmn_diagram, file_path="bpmn.png")

In [None]:
# Frequency and performance directly-follows graphs (DFG) of the filtered log.
cell_log = filter_endpoints_and_noisy_events_and_top_variants_log.copy()

# Discover the frequency DFG and filter it on activities and paths.
# The activity counts are taken from the same log the DFG is discovered from,
# so the percentage filter does not see activities that are absent from
# `cell_log`.
activities = pm4py.get_event_attribute_values(cell_log, "concept:name")
print(activities)
frequency_dfg, start_activities, end_activities = pm4py.discover_dfg(cell_log)
activities_perc = 0.95
paths_perc = 0.95
max_num_edges = 100
frequency_dfg, start_activities, end_activities, activities = (
    filter_dfg_on_activities_percentage(
        frequency_dfg, start_activities, end_activities, activities, activities_perc
    )
)
frequency_dfg, start_activities, end_activities, activities = (
    filter_dfg_on_paths_percentage(
        frequency_dfg, start_activities, end_activities, activities, paths_perc
    )
)

# dfg_time = clean_dfg_time.apply(cell_log)
# gviz = timeline_gviz_generator.apply(frequency_dfg, dfg_time, parameters={'max_no_of_edges_in_diagram': 10, 'start_activities': start_activities, "end_activities": end_activities})
# dfg_visualizer.view(gviz)

if VIEW_VIS:
    pm4py.view_dfg(
        frequency_dfg,
        start_activities,
        end_activities,
        max_num_edges=max_num_edges,
        rankdir="LR",
    )

# Discover the performance DFG (does not support activities and paths filtering).
# Its unfiltered start/end activities are kept under separate names so they do
# not overwrite the filtered ones, which are still needed for every graph below.
performance_dfg, unfiltered_start_activities, unfiltered_end_activities = (
    pm4py.discover_performance_dfg(cell_log)
)

# Use the frequency DFG to filter the performance DFG: keep only the edges that
# survived the frequency filtering.
performance_dfg = {
    edge: measures
    for edge, measures in performance_dfg.items()
    if edge in frequency_dfg
}

if VIEW_VIS:
    pm4py.view_performance_dfg(
        performance_dfg,
        start_activities,
        end_activities,
        aggregation_measure="sum",
        rankdir="LR",
    )

    pm4py.view_performance_dfg(
        performance_dfg,
        start_activities,
        end_activities,
        aggregation_measure="median",
        rankdir="LR",
    )

if SAVE_VIS:
    pm4py.save_vis_dfg(
        frequency_dfg,
        start_activities,
        end_activities,
        max_num_edges=max_num_edges,
        rankdir="TB",
        file_path="frequency_dfg.png",
    )
    pm4py.save_vis_performance_dfg(
        performance_dfg,
        start_activities,
        end_activities,
        aggregation_measure="sum",
        rankdir="TB",
        file_path="performance_dfg_sum.png",
    )
    pm4py.save_vis_performance_dfg(
        performance_dfg,
        start_activities,
        end_activities,
        aggregation_measure="median",
        rankdir="TB",
        file_path="performance_dfg_median.png",
    )

In [None]:
# Scatter plot of case duration against case start, ignoring every event whose
# activity name contains "commented".
cell_log = filter_bot_endpoints_noisy_events_top_variants_log.copy()

is_comment_event = cell_log["concept:name"].str.contains("commented")
cell_log = cell_log[~is_comment_event]


properties = {"business_hours": False}

cases_description = case_statistics.get_cases_description(
    cell_log, parameters=properties
)


# Only descriptions that carry both a start time and a duration are plotted
plottable_cases = [
    description
    for description in cases_description.values()
    if "startTime" in description and "caseDuration" in description
]
start_times = [description["startTime"] for description in plottable_cases]
# Floor-divide down to whole days, matching the axis label
# (assumes caseDuration is given in seconds -- TODO confirm)
durations = [
    description["caseDuration"] // 60 // 60 // 24 for description in plottable_cases
]

plt.figure(figsize=(10, 6))
plt.scatter(start_times, durations, alpha=0.6)
plt.xlabel("Case Start Time")
plt.ylabel("Case Duration (days)")
plt.title("Case Durations Over Time")
plt.xticks(rotation=45)
plt.tight_layout()

if SAVE_VIS:
    plt.savefig("case_durations_over_time")

if VIEW_VIS:
    plt.show()

In [None]:
# Histogram of the time between a case's first event and the first
# non-"created" event authored by an owner, member or collaborator.
cell_log = log.copy()

# The timestamp arithmetic and the `.dt` accessor below need a datetime column;
# this is a no-op if the column is already converted.
cell_log["time:timestamp"] = pandas.to_datetime(cell_log["time:timestamp"])

# Identify first event per case (assuming case ID column is "case:concept:name")
first_event = (
    cell_log.groupby("case:concept:name")["time:timestamp"].min().reset_index()
)
first_event = first_event.rename(columns={"time:timestamp": "first_event_time"})

maintainer_roles = {"owner", "member", "collaborator"}
# Events whose activity name contains "created" do not count as a response
first_response_log = cell_log[
    ~cell_log["concept:name"].str.contains("created", case=False, na=False)
]
df_maintainer = first_response_log[
    first_response_log["author_association"].isin(maintainer_roles)
]
first_response = (
    df_maintainer.groupby("case:concept:name")["time:timestamp"].min().reset_index()
)
first_response = first_response.rename(
    columns={"time:timestamp": "first_response_time"}
)

# Inner join: cases without any maintainer response are dropped
merged_df = first_event.merge(first_response, on="case:concept:name", how="inner")
merged_df["response_time"] = (
    merged_df["first_response_time"] - merged_df["first_event_time"]
).dt.total_seconds() / 3600  # Convert to hours

plt.figure(figsize=(10, 5))
# The bins span the whole value range; only the first three days are shown
plt.hist(merged_df["response_time"], bins=10000, edgecolor="black")
plt.xlabel("Time Until First Response (hours)")
plt.ylabel("Frequency")
plt.title("Distribution of Time Until First Response from Maintainer")
ax = plt.gca()
ax.set_xlim([0, 3 * 24])

# The original cell repeated the VIEW_VIS branch twice; the first occurrence is
# replaced by the save branch used by the other plotting cells. Saving must
# happen before plt.show().
if SAVE_VIS:
    plt.savefig("time_until_first_response")

if VIEW_VIS:
    plt.show()

In [None]:
# Scatter plot of the time until the first collaborator event of a case,
# plotted against the time of the case's first event.
# Work on a copy: the column assignment below would otherwise modify the shared
# `log` frame that other cells read.
cell_log = log.copy()

# Convert timestamp columns to datetime
cell_log["time:timestamp"] = pandas.to_datetime(cell_log["time:timestamp"])

# cell_log = cell_log[~cell_log['concept:name'].str.contains("not_planned", case=False, na=False)]

# Identify first event per case
first_event = (
    cell_log.groupby("case:concept:name")["time:timestamp"].min().reset_index()
)
first_event = first_event.rename(columns={"time:timestamp": "first_event_time"})

# Identify first response from a maintainer
# NOTE(review): only "collaborator" counts as maintainer here, and "created"
# events are not excluded -- confirm both are intended.
maintainer_roles = {"collaborator"}
df_maintainer = cell_log[cell_log["author_association"].isin(maintainer_roles)]

first_response = (
    df_maintainer.groupby("case:concept:name")["time:timestamp"].min().reset_index()
)
first_response = first_response.rename(
    columns={"time:timestamp": "first_response_time"}
)

# Merge datasets (inner join: cases without a collaborator event are dropped)
merged_df = first_event.merge(first_response, on="case:concept:name", how="inner")
merged_df["response_time"] = (
    merged_df["first_response_time"] - merged_df["first_event_time"]
).dt.total_seconds() / 3600  # Convert to hours

# Sort by first event time
merged_df = merged_df.sort_values("first_event_time")


# The ten cases with the longest response time
print(merged_df.nlargest(10, "response_time"))

# Plot response time over time
plt.figure(figsize=(12, 6))
plt.scatter(merged_df["first_event_time"], merged_df["response_time"], alpha=0.6)
plt.xlabel("Time of Issue Creation")
plt.ylabel("Time Until First Response (hours)")
plt.title("Time Until First Response Over Time")
plt.xticks(rotation=45)
plt.grid(True)

if VIEW_VIS:
    plt.show()

In [None]:
# Discover a Petri net from the top-variants log, replay a sample of cases on
# it, and print duration diagnostics for the activities reported by the replay.
cell_log = filter_top_variants_log

num_cases = 1000  # len(log)
# Set explicitly instead of relying on the value left behind by an earlier
# cell; 0 is what a top-to-bottom run of this notebook used here.
noise_threshold = 0

petri_net, initial_marking, final_marking = pm4py.discover_petri_net_inductive(
    cell_log, noise_threshold=noise_threshold
)
if VIEW_VIS:
    pm4py.view_petri_net(petri_net, initial_marking, final_marking)

parameters_tbr = {
    token_based_replay_algorithm.Variants.TOKEN_REPLAY.value.Parameters.DISABLE_VARIANTS: True,
    token_based_replay_algorithm.Variants.TOKEN_REPLAY.value.Parameters.ENABLE_PLTR_FITNESS: True,
}

# Draw the random sample once so the replay and the diagnostics below look at
# the same cases (two separate sample_cases calls return different samples).
sampled_log = pm4py.sample_cases(filter_endpoints_events_log, num_cases=num_cases)

replayed_traces, place_fitness, trans_fitness, unwanted_activities_from_replay = (
    token_based_replay_algorithm.apply(
        sampled_log,
        petri_net,
        initial_marking,
        final_marking,
        parameters=parameters_tbr,
    )
)

# Map every activity reported by the replay to the traces of `log` containing it.
formatted_unwanted_activities = {}
for trace in pm4py.convert.convert_to_event_log(log):
    # dict.fromkeys removes repeated activity names while keeping their order,
    # so a trace is listed once per activity even if the activity occurs in it
    # several times.
    for activity_name in dict.fromkeys(event["concept:name"] for event in trace):
        if activity_name in unwanted_activities_from_replay:
            formatted_unwanted_activities.setdefault(activity_name, []).append(trace)


act_diagnostics = diagnose_from_notexisting_activities(
    sampled_log,
    formatted_unwanted_activities,
)
sorted_items = sorted(
    act_diagnostics.items(), key=lambda x: x[1]["relative_throughput"], reverse=True
)

print("For each problematic activity, diagnostics about case duration")
# Print the sorted elements
for key, value in sorted_items:
    print(f"{key}: {json.dumps(value, indent=4)}")

In [None]:
# Performance spectrum between "created" and "not_planned" on a sample of
# 10000 cases from `log`.
spectrum_activities = ["created", "not_planned"]

sampled_cases = pm4py.sample_cases(log, num_cases=10000)
cell_log = pm4py.convert_to_event_log(sampled_cases)

if VIEW_VIS:
    pm4py.view_performance_spectrum(cell_log, spectrum_activities)

if SAVE_VIS:
    pm4py.save_vis_performance_spectrum(
        cell_log,
        spectrum_activities,
        file_path="performance_spectrum.png",
    )

In [None]:
# Compare how the cases of 2023 and 2024 use the elements of the Petri net.
# `petri_net`, `initial_marking` and `final_marking` are the ones left by the
# most recently executed discovery cell.
# The ranges end at 23:59:59 so the last day of each year is covered completely
# (the previous 11:59:59 cut it off at noon).
loga = pm4py.convert_to_event_log(
    pm4py.filter_time_range(
        log, "2023-01-01 00:00:00", "2023-12-31 23:59:59", mode="traces_contained"
    )
)
logb = pm4py.convert_to_event_log(
    pm4py.filter_time_range(
        log, "2024-01-01 00:00:00", "2024-12-31 23:59:59", mode="traces_contained"
    )
)

# Sample both years down to the size of the smaller one. The size is computed
# once, before either log is replaced by its sample.
common_num_cases = min(len(loga), len(logb))
loga = pm4py.sample_cases(loga, num_cases=common_num_cases)
logb = pm4py.sample_cases(logb, num_cases=common_num_cases)

statistics = compare_element_usage_two_logs(
    petri_net, initial_marking, final_marking, loga, logb
)
gviz = petri_net_visualizer.apply(
    petri_net,
    initial_marking,
    final_marking,
    variant=petri_net_visualizer.Variants.FREQUENCY,
    aggregated_statistics=statistics,
)

if VIEW_VIS:
    petri_net_visualizer.view(gviz)

if SAVE_VIS:
    petri_net_visualizer.save(gviz, output_file_path="compare_petri_nets_yearly.png")