# Cleaning of RequestForPayment.xes

In [None]:
import pm4py
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the XES event log and make sure it is available as a pandas DataFrame.
# The path is relative, so it is resolved against the current working directory.
xes_filename = '../datasets/RequestForPayment.xes'
log = pm4py.read_xes(xes_filename)
df = pm4py.convert_to_dataframe(log)

# Show the loaded events as the cell output
df

In [None]:
# Use 'case:id' as the case identifier column (originally 'case:Rfp_id')
# and store its values as strings.
df = (
    df.rename(columns={'case:Rfp_id': 'case:id'})
    .astype({'case:id': str})
)

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Show the column labels of the DataFrame as the cell output
df.columns

In [None]:
# How often does each activity (event name) occur in the log?
activity_column = df["concept:name"]
activity_counts = activity_column.value_counts()

print("Frequency of Activities:\n", activity_counts)
# Sanity check on the type of the result
print(type(activity_counts))

In [None]:
# Number of distinct cases in the log (missing identifiers are not counted)
case_id_column = df['case:id']
unique_case_ids = case_id_column.nunique(dropna=True)

print(f"Number of unique case id: {unique_case_ids}")

In [None]:
# Missing-value report: number of NaN entries in every column
nan_counts = df.isna().sum()
total_nan = nan_counts.sum()
columns_with_nan = nan_counts[nan_counts > 0]

# Only the columns that actually contain NaN values are listed
print("Count of NaN values per column:")
print(columns_with_nan)

if total_nan != 0:
    print(f"\nTotal NaN values found: {nan_counts.sum()}")
else:
    print("\nNo NaN values found in the DataFrame.")

In [None]:
# Flag every row whose 'case:Activity' value contains the text 'UNKNOWN'
activity_as_text = df['case:Activity'].astype(str)
is_unknown = activity_as_text.str.contains('UNKNOWN')

# Number of flagged rows and their share of all rows in the log
unknwon_count = is_unknown.sum()
total_rows = len(df)
percentage_unknown = (unknwon_count / total_rows) * 100

print(f"Found {unknwon_count} UNKNOWN actvities, {percentage_unknown:.2f}% of total rows")

In [None]:
# Distribution of the 'case:Activity' attribute: absolute counts and percentages
case_activity_column = df['case:Activity']

activity_counts = case_activity_column.value_counts()
activity_percentages = case_activity_column.value_counts(normalize=True) * 100

# Put both measures side by side, one row per distinct value
combined_activity_info = pd.DataFrame(
    {'Count': activity_counts, 'Percentage': activity_percentages}
)

print("Counts and Percentages of each unique value in 'case:Activity':")
print(combined_activity_info)

In [None]:
# Compute the variants of the log and print them

# Column names that tell pm4py how to read the DataFrame as an event log
variant_keys = {
    'activity_key': 'concept:name',
    'case_id_key': 'case:id',
    'timestamp_key': 'time:timestamp',
}
variants = pm4py.get_variants(df, **variant_keys)

print(variants)

In [None]:
# Variants again, this time as a DataFrame, which gives a better overview of the
# variants and their frequencies

# Column names that tell pm4py how to read the DataFrame as an event log
variant_path_keys = {
    'activity_key': 'concept:name',
    'case_id_key': 'case:id',
    'timestamp_key': 'time:timestamp',
}
variants_df = pm4py.get_variants_paths_duration(df, **variant_path_keys)

variants_df

In [None]:
# One row per variant with its frequency, most frequent variant first

# Collapse the table to a single row per variant, keeping the first count of each group
variant_groups = variants_df.groupby('@@variant_column')
variants = variant_groups.agg({'@@variant_count': 'first'}).reset_index()

# Order by frequency (descending); ignore_index=True renumbers the rows 0..n-1
# so the previous index is discarded instead of being kept as a column
variants = variants.sort_values(
    by='@@variant_count',
    ascending=False,
    ignore_index=True,
)

variants

In [None]:
# Frequencies of the variants, already sorted from most to least frequent
distribution = variants['@@variant_count']

# Bar chart of the raw frequencies on a linear scale (x axis = rank of the variant)
variant_ranks = range(1, len(distribution) + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(variant_ranks, distribution, color='red', alpha=0.7)
ax.set_title("Actual Variant Frequencies (Linear Scale)")
ax.set_xlabel("Variant Rank (sorted by frequency)")
ax.set_ylabel("Frequency")
ax.grid(True, axis='y', ls="--", alpha=0.5)
plt.show()

In [None]:
# Same distribution with both axes on a logarithmic scale
variant_ranks = range(1, len(distribution) + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.loglog(variant_ranks, distribution, 'bo', alpha=0.7)
ax.set_title("Variant Frequencies (Log-Log Scale)")
ax.set_xlabel("Variant Rank")
ax.set_ylabel("Frequency")
ax.grid(True, which="both", ls="--", alpha=0.5)
plt.show()

In [None]:
# Show the top_n most frequent variants
top_n = 5
top_variants = variants.head(top_n)
print(top_n, " more frequent variants")
print(top_variants)

# Share of all cases that follow one of those top_n variants
variant_counts = variants['@@variant_count']
total_cases = variant_counts.sum()
cases_in_top_n = variant_counts.iloc[:top_n].sum()
percentage_coverage = (cases_in_top_n / total_cases) * 100
print(f"\nTop {top_n} variants cover {percentage_coverage:.2f}% of total cases")

In [None]:
# Keep only the cases that follow one of the top_n most frequent variants.
#
# top_n is the value set in the coverage cell above, so the coverage that was
# printed there always describes exactly the filter applied here (previously the
# number of variants was repeated as a literal and could drift apart).

filtered_df = pm4py.filter_variants_top_k(
    df,
    top_n,
    activity_key='concept:name',
    case_id_key='case:id',
    timestamp_key='time:timestamp'
)

filtered_df

In [None]:
# Size of the log before and after the variant filter
original_rows, filtered_rows = len(df), len(filtered_df)

case_ids_before = df["case:id"]
case_ids_after = filtered_df["case:id"]
original_cases = case_ids_before.nunique()
filtered_cases = case_ids_after.nunique()

# What the filter removed, in events (rows) and in cases
removed_rows = original_rows - filtered_rows
removed_cases = original_cases - filtered_cases

print(f"Original DataFrame: {original_rows} rows, {original_cases} unique case IDs")
print(f"Filtered DataFrame (Top K variants): {filtered_rows} rows, {filtered_cases} unique case IDs")
print(f"\nRemoved: {removed_rows} rows")
print(f"Removed: {removed_cases} unique case IDs")

# The same figures relative to the original log
print(f"\nPercentage of rows removed: {(removed_rows / original_rows) * 100:.2f}%")
print(f"Percentage of cases removed: {(removed_cases / original_cases) * 100:.2f}%")

In [None]:
# Give every event the number of the variant (ordered activity sequence) of its case.
#
# NOTE: groupby(...).transform(lambda x: tuple(x)) must not be used for this.
# The returned tuple has the same length as the group, so pandas spreads it
# element-wise over the rows and each event simply gets its own activity name
# back; the "variant number" would then identify activities, not variants.
# The sequence is therefore aggregated once per case and mapped back via 'case:id'.

# Work on an independent copy so that adding a column never writes into a slice of df
filtered_df = filtered_df.copy()

# One tuple of activities per case, in chronological order. The stable sort keeps
# the original order of events that share the same timestamp.
case_variants = (
    filtered_df.sort_values(by='time:timestamp', kind='stable')
    .groupby('case:id', sort=False)['concept:name']
    .agg(tuple)
)

# Number the distinct variants starting from 1, in order of first appearance
variant_to_number = {}
for variant_tuple in case_variants:
    variant_to_number.setdefault(variant_tuple, len(variant_to_number) + 1)

# Variant number of every case (indexed by case id), then copied to each of its events.
# A list comprehension is used for the lookup because the dictionary keys are tuples.
case_variant_number = pd.Series(
    [variant_to_number[variant_tuple] for variant_tuple in case_variants],
    index=case_variants.index,
)
filtered_df['variant_number'] = filtered_df['case:id'].map(case_variant_number)

# Display the number of unique variant numbers assigned and their counts
# (the counts are numbers of events, because the column is stored per event)
print(f"\nNumber of unique variant numbers assigned: {filtered_df['variant_number'].nunique()}")
print("Counts of each variant number:")
print(filtered_df['variant_number'].value_counts().sort_index())

filtered_df

In [None]:
# Compute the duration of every case and attach it to each of its events.
# A separate DataFrame is used so that filtered_df itself stays unchanged.
filtered_df_dur = filtered_df.copy()

# First and last timestamp of every case
case_timestamps = filtered_df_dur.groupby('case:id')['time:timestamp']
min_timestamps = case_timestamps.min()
max_timestamps = case_timestamps.max()

# Case durations as Timedelta, converted to seconds (one value per case id)
case_durations_timedelta = max_timestamps - min_timestamps
case_durations_seconds = case_durations_timedelta.dt.total_seconds()

# Copy the duration of the case to each of its events, in several units
filtered_df_dur['case_duration_seconds'] = filtered_df_dur['case:id'].map(case_durations_seconds)
filtered_df_dur['case_duration_minutes'] = filtered_df_dur['case_duration_seconds'] / 60
filtered_df_dur['case_duration_hours'] = filtered_df_dur['case_duration_seconds'] / 3600
filtered_df_dur['case_duration_days'] = filtered_df_dur['case_duration_seconds'] / (24 * 3600)

# Statistics for case durations.
# They are taken from the per-case series, not from the event-level column:
# the column repeats the duration once per event, which would weight every case
# by its number of events and distort the mean and the median.
case_durations_days = case_durations_seconds / (24 * 3600)
print(f"Mean duration (days): {case_durations_days.mean():.2f}")
print(f"Median duration (days): {case_durations_days.median():.2f}")
print(f"Max duration (days): {case_durations_days.max():.2f}")
print(f"Min duration (days): {case_durations_days.min():.2f}")
# Rows whose case id could not be mapped to a duration
print(f"N° of NaN created: {filtered_df_dur['case_duration_seconds'].isna().sum()}")

In [None]:
# Find the number of cases with zero duration.
# 'case_duration_seconds' repeats the duration of a case on every one of its
# events, so it is reduced to one value per case before counting; comparing the
# column directly would count events instead of cases.
per_case_duration_seconds = filtered_df_dur.groupby('case:id')['case_duration_seconds'].first()
zero_duration_cases_count = (per_case_duration_seconds == 0).sum()

print(f"Number of cases with zero duration (seconds): {zero_duration_cases_count}")

In [None]:
# Independent copy of the filtered DataFrame (including the duration columns);
# this is the "cleaned" log that the process-discovery cells below work on
df_cleaned = filtered_df_dur.copy()

In [None]:
# Directly Follows Graph (DFG)
#
# The column keys are passed explicitly so that cases are identified by
# 'case:id', exactly as in every other pm4py call of this notebook, instead of
# silently relying on pm4py's default case id column.

# Create graph from original DF and visualise it
dfg, start_activities, end_activities = pm4py.discover_dfg(
    df,
    activity_key='concept:name',
    case_id_key='case:id',
    timestamp_key='time:timestamp'
)
pm4py.view_dfg(dfg, start_activities, end_activities)

# Create graph from filtered DF and visualise it
dfg, start_activities, end_activities = pm4py.discover_dfg(
    df_cleaned,
    activity_key='concept:name',
    case_id_key='case:id',
    timestamp_key='time:timestamp'
)
pm4py.view_dfg(dfg, start_activities, end_activities)

In [None]:
# Heuristic Miner (HM)
#
# The result is stored in heu_net rather than in a variable called 'map', which
# would shadow Python's builtin map() for the rest of the notebook session.
# The column keys are passed explicitly so that cases are identified by
# 'case:id', as in the other pm4py calls of this notebook.

# Discover the HM-map of the original DF and visualise it
heu_net = pm4py.discover_heuristics_net(
    df,
    activity_key='concept:name',
    case_id_key='case:id',
    timestamp_key='time:timestamp'
)
pm4py.view_heuristics_net(heu_net)

# Discover the HM-map of the filtered DF and visualise it
heu_net = pm4py.discover_heuristics_net(
    df_cleaned,
    activity_key='concept:name',
    case_id_key='case:id',
    timestamp_key='time:timestamp'
)
pm4py.view_heuristics_net(heu_net)

In [None]:
# Export the cleaned DataFrame to an XES file.
#
# The export is opt-in and disabled by default: set EXPORT_CLEANED_LOG to True
# and run the cell to write the file. A flag is used instead of wrapping the
# code in a string literal so that the code stays syntax-checked and the cell
# does not echo the string as its output.
EXPORT_CLEANED_LOG = False

if EXPORT_CLEANED_LOG:
    log_name_for_file = "RequestForPayments_cleaned.xes"
    log = pm4py.convert_to_event_log(df_cleaned,
                                     case_id_key='case:id',
                                     activity_key='concept:name',
                                     timestamp_key='time:timestamp')

    pm4py.write_xes(log, log_name_for_file)

    print(f"Log succesfully exported in: {log_name_for_file}")