In [1]:
import pm4py
import pandas

from pm4py.objects.log.importer.xes import importer as xes_import_factory
from pm4py.objects.log.exporter.xes import exporter as xes_export_factory
from pm4py.objects.log.obj import EventLog, Trace

# This Notebook's purpose is to split the process log provided in the Road_Traffic_Fine_Management_Process.xes file. 
# This is necessary because the Git hosting service limits each file to 100 MB, and the original Road_Traffic_Fine_Management_Process.xes file exceeds that limit.
# Attention: This only works if the Road_Traffic_Fine_Management_Process.xes file is put into the Road_Traffic_Fine_Management_Process_Data directory.

# Configuration: every file location used by this cell is defined once here.
# The original XES file must already be inside DATA_DIR when this cell runs.
DATA_DIR = "Road_Traffic_Fine_Management_Process_Data"
FULL_LOG_PATH = f"{DATA_DIR}/Road_Traffic_Fine_Management_Process.xes"
PART_1_PATH = f"{DATA_DIR}/Road_Traffic_Fine_Management_Process_Part1.xes"
PART_2_PATH = f"{DATA_DIR}/Road_Traffic_Fine_Management_Process_Part2.xes"


def _event_log_like(source_log, traces):
    """Build an EventLog from `traces` that carries `source_log`'s log-level metadata.

    `EventLog(traces)` on its own starts with empty attributes, extensions,
    globals (omni_present) and classifiers, so a log built that way (and any
    XES file exported from it) silently drops the header information of the
    log the traces came from. The dictionaries are shallow-copied so the new
    log does not share mutable state with `source_log`.
    """
    return EventLog(
        traces,
        attributes=dict(source_log.attributes),
        extensions=dict(source_log.extensions),
        omni_present=dict(source_log.omni_present),
        classifiers=dict(source_log.classifiers),
    )


# Step 1: Read the XES file
log = xes_import_factory.apply(FULL_LOG_PATH)

# Step 2: Split the log into two smaller parts
split_index = len(log) // 2  # Midpoint of the traces; part 2 gets the extra trace if the count is odd

# Step 3: Create new EventLog objects for both parts (keeping the log-level metadata)
log_part_1 = _event_log_like(log, log[:split_index])  # First half of the log
log_part_2 = _event_log_like(log, log[split_index:])  # Second half of the log

# The two slices must partition the original log: no trace lost, none duplicated.
assert len(log_part_1) + len(log_part_2) == len(log), "split lost or duplicated traces"

# Step 4: Save the smaller logs
xes_export_factory.apply(log_part_1, PART_1_PATH)
xes_export_factory.apply(log_part_2, PART_2_PATH)

# Step 5: Read smaller logs back
log_part_1_imported = xes_import_factory.apply(PART_1_PATH)
log_part_2_imported = xes_import_factory.apply(PART_2_PATH)

# Debug: Check the length of imported logs
print(f"Log Part 1 Length: {len(log_part_1_imported)}")
print(f"Log Part 2 Length: {len(log_part_2_imported)}")

# Step 6: Combine the logs (Note this is how to call the log in the future)
# Metadata is taken from the re-imported part 1, because the original `log`
# will no longer be available once the large file has been removed.
combined_log = _event_log_like(
    log_part_1_imported,
    list(log_part_1_imported) + list(log_part_2_imported),
)

# Debug: Check the combined log length
print(f"Combined Log Length: {len(combined_log)}")

# The export/import round trip must preserve the number of traces.
assert len(combined_log) == len(log), "round trip changed the number of traces"

parsing log, completed traces ::   0%|          | 0/150370 [00:00<?, ?it/s]

exporting log, completed traces ::   0%|          | 0/75185 [00:00<?, ?it/s]

exporting log, completed traces ::   0%|          | 0/75185 [00:00<?, ?it/s]

parsing log, completed traces ::   0%|          | 0/75185 [00:00<?, ?it/s]

parsing log, completed traces ::   0%|          | 0/75185 [00:00<?, ?it/s]

Log Part 1 Length: 75185
Log Part 2 Length: 75185
Combined Log Length: 150370


In [5]:
# Check whether the "new" combined_log equals the "old" log.
# Both logs are materialized as lists of traces and compared element by element,
# so the order of the traces matters as well.
logs_match = list(log) == list(combined_log)
print(f'The old list contains the exact same entries as the new one (list(log)==list(combined_log)={logs_match})')

# The original file is only safe to delete if the round trip was lossless,
# so fail loudly instead of relying on someone reading the printed line.
assert logs_match, "combined_log differs from the original log - do not delete the original XES file"

The old list contains the exact same entries as the new one (list(log)==list(combined_log)=True)


In [7]:
# Visual spot check: print the first five traces of the recombined log,
# followed by the first five traces of the original log.
for event_log in (combined_log, log):
    first_traces = list(event_log)[:5]
    print(first_traces)

[{'attributes': {'concept:name': 'A1'}, 'events': [{'amount': 35.0, 'org:resource': '561', 'dismissal': 'NIL', 'concept:name': 'Create Fine', 'vehicleClass': 'A', 'totalPaymentAmount': 0.0, 'lifecycle:transition': 'complete', 'time:timestamp': datetime.datetime(2006, 7, 24, 0, 0, tzinfo=datetime.timezone.utc), 'article': 157, 'points': 0}, '..', {'concept:name': 'Send Fine', 'lifecycle:transition': 'complete', 'expense': 11.0, 'time:timestamp': datetime.datetime(2006, 12, 5, 0, 0, tzinfo=datetime.timezone.utc)}]}, {'attributes': {'concept:name': 'A100'}, 'events': [{'amount': 35.0, 'org:resource': '561', 'dismissal': 'NIL', 'concept:name': 'Create Fine', 'vehicleClass': 'A', 'totalPaymentAmount': 0.0, 'lifecycle:transition': 'complete', 'time:timestamp': datetime.datetime(2006, 8, 2, 0, 0, tzinfo=datetime.timezone.utc), 'article': 157, 'points': 0}, '..', {'concept:name': 'Send for Credit Collection', 'lifecycle:transition': 'complete', 'time:timestamp': datetime.datetime(2009, 3, 30, 

In [None]:
## After running this notebook and confirming that the original log equals the combined log, the Road_Traffic_Fine_Management_Process.xes file can safely be removed and replaced by the two files Road_Traffic_Fine_Management_Process_Part1.xes and Road_Traffic_Fine_Management_Process_Part2.xes.