In [1]:
import pm4py

# Load the Road Traffic Fine Management event log from its XES file.
# The rest of the notebook uses `el` as a pandas DataFrame with (at least)
# the columns 'case:concept:name' and 'time:timestamp'.
el = pm4py.read_xes(
    '../../raw_eventlogs/Road_Traffic_Fine_Management_Process.xes/Road_Traffic_Fine_Management_Process.xes'
)

In [2]:
import pandas as pd

# Make sure event timestamps are datetimes before doing arithmetic on them.
el['time:timestamp'] = pd.to_datetime(el['time:timestamp'])

# First and last event time of every case (one row per case id).
grouped = el.groupby('case:concept:name')['time:timestamp'].agg(['min', 'max'])

# Case duration expressed in (fractional) days.
grouped['duration_days'] = (
    grouped['max'].sub(grouped['min']).dt.total_seconds().div(24 * 3600)
)

# 95th percentile of the case durations: cases at or above it form the top 5%.
threshold = grouped['duration_days'].quantile(0.95)
top_5_percent_cases = grouped.loc[grouped['duration_days'].ge(threshold)]

# Shortest duration among those longest-running cases.
lowest_value_top_5_percent = top_5_percent_cases['duration_days'].min()

print(f"The lowest value of the top 5% longest durations is: {lowest_value_top_5_percent:.2f} days")

In [3]:
import plotly.express as px

# The first event of every case marks the moment the case was started.
start_times = (
    el.groupby('case:concept:name')['time:timestamp']
    .min()
    .reset_index()
)
start_times['start_date'] = start_times['time:timestamp'].dt.date

# Number of cases started on each calendar day, in chronological order.
case_counts = (
    start_times['start_date']
    .value_counts()
    .sort_index()
    .rename_axis('start_date')
    .reset_index(name='count')
)

fig = px.line(
    case_counts,
    x='start_date',
    y='count',
    title='Number of Cases Started Each Day',
    labels={'start_date': 'Date', 'count': 'Number of Cases Started'},
)
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Number of Cases Started',
    xaxis_tickangle=45,
)
fig.show()

In [4]:
# One row per case: first/last event time, duration in days and start date.
grouped = (
    el.groupby('case:concept:name')['time:timestamp']
    .agg(['min', 'max'])
    .reset_index()
    .assign(
        duration_days=lambda spans: (spans['max'] - spans['min']).dt.total_seconds() / (24 * 3600),
        start_date=lambda spans: spans['min'].dt.date,
    )
)

# Mean case duration over all cases that started on the same day.
average_durations = grouped.groupby('start_date', as_index=False)['duration_days'].mean()

# Line plot of the average duration per start date.
fig = px.line(
    average_durations,
    x='start_date',
    y='duration_days',
    title='Average Duration of Cases by Start Date',
    labels={'start_date': 'Start Date', 'duration_days': 'Average Duration (days)'},
)
fig.update_layout(
    xaxis_title='Start Date',
    yaxis_title='Average Duration (days)',
    xaxis_tickangle=45,
)
fig.show()

In [5]:
import numpy as np


def start_from_date(dataset, start_date):
    '''
    keeps only the cases whose first event lies in or after the given month
    Args:
        dataset: pandas DataFrame with "case:concept:name" and "time:timestamp" columns
        start_date: string "YYYY-MM" (compared as text against the month of each
            case's first event): dataset starts here after removing outliers

    Returns:
        dataset: pandas Dataframe restricted to the retained cases

    '''
    # month ("YYYY-MM") in which each case starts, indexed by case id
    first_event = dataset.groupby("case:concept:name")["time:timestamp"].min()
    start_month = first_event.dt.to_period('M').astype('str')
    # zero-padded "YYYY-MM" strings sort chronologically, so a text comparison suffices
    kept_cases = start_month[start_month >= start_date].index
    return dataset[dataset["case:concept:name"].isin(kept_cases)]

def end_before_date(dataset, end_date):
    '''
    keeps only the cases whose last event lies in or before the given month
    Args:
        dataset: pandas DataFrame with "case:concept:name" and "time:timestamp" columns
        end_date: string "YYYY-MM" (compared as text against the month of each
            case's last event): dataset stops here after removing outliers

    Returns:
        dataset: pandas Dataframe restricted to the retained cases
    '''
    # month ("YYYY-MM") in which each case ends, indexed by case id
    last_event = dataset.groupby("case:concept:name")["time:timestamp"].max()
    end_month = last_event.dt.to_period('M').astype('str')
    # zero-padded "YYYY-MM" strings sort chronologically, so a text comparison suffices
    kept_cases = end_month[end_month <= end_date].index
    return dataset[dataset["case:concept:name"].isin(kept_cases)]

def limited_duration(dataset, max_duration):
    '''
    restricts the dataset to cases lasting at most max_duration days and debiases
    the end of the dataset by dropping cases that start later than
    (last timestamp of the remaining dataset - max_duration)
    Args:
        dataset: pandas DataFrame with "case:concept:name" and "time:timestamp" columns
        max_duration: float: maximal case duration in days

    Returns:
        dataset: pandas Dataframe with a fresh 0..n-1 index
        latest_start: timeStamp with new end time for the dataset

    '''
    # first and last event of every case, indexed by case id
    per_case = dataset.groupby("case:concept:name")["time:timestamp"].agg(["min", "max"])
    span_days = (per_case["max"] - per_case["min"]).dt.total_seconds() / (24 * 60 * 60)

    # step 1: keep cases no longer than max_duration (tiny factor absorbs float noise)
    short_cases = span_days[span_days <= max_duration * 1.00000000001].index
    dataset = dataset[dataset["case:concept:name"].isin(short_cases)].reset_index(drop=True)

    # step 2: the cut-off is measured from the last event that survived step 1
    latest_start = dataset["time:timestamp"].max() - pd.Timedelta(max_duration, unit='D')
    early_cases = per_case[per_case["min"] <= latest_start].index
    dataset = dataset[dataset["case:concept:name"].isin(early_cases)].reset_index(drop=True)

    return dataset, latest_start

def train_test_split(df, test_len, latest_start, targets):
    '''
    splits the dataset in train and test set, applying strict temporal splitting and
    debiasing the test set
    Args:
        df: pandas DataFrame with "case:concept:name" and "time:timestamp" columns
        test_len: float: share of cases (ordered by start time) belonging in test set;
            must be greater than 0, otherwise the split position is out of range
        latest_start: timeStamp with new end time for the dataset; test-set events
            after this moment are dropped
        targets: column label or list of column labels that are set to NaN for
            test-set events happening before the split time; None (or an empty
            list) skips this masking step
    Returns:
        df_train: pandas DataFrame with the cases ending before the split time
        df_test: pandas DataFrame with the cases ending at or after the split time,
            cut off at latest_start
    '''
    timestamps_per_case = df.groupby("case:concept:name")["time:timestamp"]
    case_starts = timestamps_per_case.min()
    case_stops = timestamps_per_case.max()

    # the split time is the start of the first case (in start order) assigned to the test set
    first_test_case_nr = int(len(case_starts) * (1 - test_len))
    first_test_start_time = np.sort(case_starts.values)[first_test_case_nr]

    # strict temporal split: a case is a training case only if it is completely
    # finished before the split time, every other case goes to the test set
    test_case_nrs = case_stops.index[case_stops.values >= first_test_start_time]
    train_case_nrs = case_stops.index[case_stops.values < first_test_start_time]

    df_test_all = df[df["case:concept:name"].isin(test_case_nrs)].reset_index(drop=True)
    # explicit copy: the frame is modified below and must not be a slice of df_test_all
    df_test = df_test_all[df_test_all["time:timestamp"] <= latest_start].copy()

    # events of test cases that happened before the split time keep their features
    # but lose their targets; without targets there is nothing to mask (indexing
    # with a None column label would otherwise create a bogus column)
    no_targets = targets is None or (pd.api.types.is_list_like(targets) and len(targets) == 0)
    if not no_targets:
        before_split = df_test["time:timestamp"].values < first_test_start_time
        df_test.loc[before_split, targets] = np.nan

    df_train = df[df["case:concept:name"].isin(train_case_nrs)].reset_index(drop=True)

    return df_train, df_test

# Preprocessing parameters for this event log.
max_duration = 968  # longest case duration, in days, that is kept
end = '2012-01'  # last month (YYYY-MM) in which a retained case may end

# Drop cases ending after `end`, then over-long and too-late-starting cases.
el = end_before_date(el, end)
el, latest_start = limited_duration(el, max_duration)

# Temporal split into train and test; no target columns are masked here.
train, test = train_test_split(el, test_len=0.2, latest_start=latest_start, targets=None)

# Number of distinct cases in each split.
print(train['case:concept:name'].nunique(dropna=False))
print(test['case:concept:name'].nunique(dropna=False))