In [None]:
import os
import pm4py
from src.data_pipeline.s3 import fetch_file

# --- Configuration ------------------------------------------------------------
# Repository whose event log is analysed, and the S3 bucket it is fetched from.
OWNER = "facebook"
REPO = "react"
S3_BUCKET = "process-mining-github-issues-staging"
# Visualisation toggles; not read in this cell (presumably consumed by later
# cells -- confirm).
VIEW_VIS = True
SAVE_VIS = True

# The project root is taken to be the parent of the current working directory,
# i.e. this assumes the kernel was started from the notebooks folder.
NOTEBOOK_DIR = os.path.abspath(os.getcwd())
ROOT_DIR = os.path.dirname(NOTEBOOK_DIR)

# The XES event log lives directly in the project root.
file_name = f"{OWNER}_{REPO}_event_log.xes"
file_path = os.path.join(ROOT_DIR, file_name)

# --- Load ---------------------------------------------------------------------
# fetch_file is a project helper; it is expected to download from S3 only when
# the file is not already present locally (TODO confirm). A falsy return value
# is treated as failure.
local_file = fetch_file(file_path, S3_BUCKET, file_name)

# Fail fast: every later cell needs `log`. Printing a message and carrying on
# would leave `log`/`legacy_log` unbound and surface as a NameError far away
# from the real cause.
if not local_file:
    raise FileNotFoundError(
        "Failed to fetch or process the XES file. "
        f"(file={file_name!r}, bucket={S3_BUCKET!r}, local path={file_path!r})"
    )

# `log` is the default object returned by read_xes (used as a DataFrame below);
# `legacy_log` is the same file loaded as pm4py's legacy log object.
log = pm4py.read_xes(local_file)
legacy_log = pm4py.read_xes(local_file, return_legacy_log_object=True)
print(log.head())

In [None]:
import json
import matplotlib.pyplot as plt
import os
import pandas
import pm4py

from pm4py.algo.conformance.tokenreplay import algorithm as token_based_replay_algorithm
from pm4py.algo.conformance.tokenreplay.diagnostics import duration_diagnostics
from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness_algorithm
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator_algorithm
from pm4py.algo.comparison.petrinet.element_usage_comparison import (
    compare_element_usage_two_logs,
)
from pm4py.algo.discovery.dfg.variants import clean_time as clean_dfg_time
from pm4py.algo.filtering.dfg.dfg_filtering import clean_dfg_based_on_noise_thresh
from pm4py.algo.filtering.dfg.dfg_filtering import filter_dfg_on_activities_percentage
from pm4py.algo.filtering.dfg.dfg_filtering import filter_dfg_on_paths_percentage
from pm4py.algo.filtering.log.attributes import attributes_filter
from pm4py.algo.organizational_mining.sna import util as sna_util
from pm4py.algo.organizational_mining.sna import algorithm as sna_algorithm
from pm4py.algo.organizational_mining.roles import algorithm as roles_algorithm


from pm4py.statistics.traces.generic.log import case_statistics
from pm4py.statistics.traces.generic.log.case_statistics import get_median_case_duration
from pm4py.statistics.traces.generic.log.case_arrival import get_case_dispersion_avg

from pm4py.stats import get_case_duration
from pm4py.stats import get_cycle_time
from pm4py.stats import get_activity_position_summary
from pm4py.stats import get_case_arrival_average

from pm4py.util import variants_util

from pm4py.visualization.dfg.variants import timeline as timeline_gviz_generator
from pm4py.visualization.dfg import visualizer as dfg_visualizer
from pm4py.visualization.petri_net import visualizer as petri_net_visualizer
from pm4py.visualization.sna import visualizer as sna_vis

In [None]:
# Activity-name sets and sizes shared by the filters in this block.
NOISY_ACTIVITIES = {"subscribed", "unsubscribed", "pinned", "unpinned"}
END_ACTIVITIES = {"closed", "not_planned"}
TOP_VARIANTS_K = 100

# Event-level filter: drop the subscription / pin events from the full log.
filter_noisy_events_log = pm4py.filter_event_attribute_values(
    log, "concept:name", NOISY_ACTIVITIES, level="event", retain=False
)

# End-activity filter, applied to the raw log and to the de-noised log.
filter_endpoints_events_log = pm4py.filter_end_activities(log, END_ACTIVITIES)
filter_endpoints_and_noisy_events_log = pm4py.filter_end_activities(
    filter_noisy_events_log, END_ACTIVITIES
)

# Event-level filter on three activity names; `retain` is left at pm4py's default.
filter_event_attributes_log = pm4py.filter_event_attribute_values(
    log,
    "concept:name",
    {"created", "labeled", "closed"},
    level="event",
)
# Disabled experiment, kept for reference:
# filter_trace_attributes_log = pm4py.filter_trace_attribute_values(log, 'case:Label', {'React Core Team'})

# Filter on the directly-follows pair closed -> commented.
filter_directly_follows_log = pm4py.filter_directly_follows_relation(
    log,
    [("closed", "commented")],
    retain=True,
)

# Top-k variants, on the raw log and on the de-noised + end-activity-filtered log.
filter_top_variants_log = pm4py.filter_variants_top_k(log, TOP_VARIANTS_K)
filter_endpoints_and_noisy_events_and_top_variants_log = pm4py.filter_variants_top_k(
    filter_endpoints_and_noisy_events_log, TOP_VARIANTS_K
)

# Time-window filter using mode "traces_contained".
filter_time_log = pm4py.filter_time_range(
    log,
    "2023-01-01 00:00:00",
    "2025-01-31 00:00:00",
    mode="traces_contained",
)

# Drop every event whose "author_association" contains "bot" (case-insensitive).
# Events with a missing value are kept (na=False).
# NOTE(review): GitHub's author_association normally carries role values such as
# OWNER / MEMBER / CONTRIBUTOR / NONE, none of which contain "bot" -- confirm
# that this column really identifies bots in this log; otherwise this filter
# removes nothing and a user login/type column should be used instead.
filter_bot = log[~log["author_association"].str.contains("bot", case=False, na=False)]

# Remove the noisy subscription / pin events from the bot-free log.
# This step previously called filter_end_activities, which made the next
# statement an exact duplicate and meant noisy events were never removed,
# contrary to the variable names. It now uses the same noisy-event filter as
# filter_noisy_events_log.
filter_bot_noisy_events_log = pm4py.filter_event_attribute_values(
    filter_bot,
    "concept:name",
    {"subscribed", "unsubscribed", "pinned", "unpinned"},
    retain=False,
    level="event",
)

# Apply the end-activity filter ("closed" / "not_planned") to the de-noised log.
filter_bot_endpoints_noisy_events_log = pm4py.filter_end_activities(
    filter_bot_noisy_events_log, {"closed", "not_planned"}
)

# Top-10 variants. The non-bot pipeline elsewhere in this notebook uses 100;
# the difference is left as written.
filter_bot_endpoints_noisy_events_top_variants_log = pm4py.filter_variants_top_k(
    filter_bot_endpoints_noisy_events_log, 10
)

# Disabled sanity check, kept for reference:
# print(len(log.loc[~log['pr_merged_at'].isnull()]))

# Case-level filter on the de-noised log, matching has_merged_pr == True.
merged_pr_values = {True}
filter_noisy_events_case_pr_merged = pm4py.filter_event_attribute_values(
    filter_noisy_events_log,
    "has_merged_pr",
    merged_pr_values,
    level="case",
    retain=True,
)

# Then apply the end-activity filter with "closed" / "not_planned".
closing_activities = {"closed", "not_planned"}
filter_noisy_events_case_pr_merged_endpoints = pm4py.filter_end_activities(
    filter_noisy_events_case_pr_merged,
    closing_activities,
)

print(filter_noisy_events_case_pr_merged_endpoints)