In [7]:
# Prepend the parent directory to the module search path before importing
# `ppm_benchmark` (presumably that is where the package lives relative to
# this notebook -- confirm if the notebook is moved).
import sys
sys.path.insert(0, '../')

from ppm_benchmark.DatasetNormalizers import BPI2014Normalizer
from ppm_benchmark.DatasetLoaders import LocalCSV

# Relative paths to the four BPI Challenge 2014 "Detail" CSV exports.
# Only the entry at index 2 (Incident Activity) is loaded below.
csvs = [
    '../../raw_eventlogs/BPI_Challenge_2014_Detail_Change.csv',
    '../../raw_eventlogs/BPI_Challenge_2014_Detail_Incident.csv',
    '../../raw_eventlogs/BPI_Challenge_2014_Detail_Incident_Activity.csv',
    '../../raw_eventlogs/BPI_Challenge_2014_Detail_Interaction.csv'
]

normalizer = BPI2014Normalizer()
loader = LocalCSV()
# Load the Incident Activity CSV (csvs[2]) and pass the result through the
# normalizer. NOTE(review): despite the plural name, `dfs` is whatever
# `load_data` returns for a single path -- confirm whether that is one frame
# or a collection.
dfs = loader.load_data(csvs[2])
# `el` is consumed by the following cells as a table with
# 'case:concept:name' and 'time:timestamp' columns.
el = normalizer.normalize_next_attribute_classification(dfs)

In [8]:
import pandas as pd

# Parse the raw day-month-year timestamp strings into datetimes. The event log
# is updated in place because the later cells rely on the datetime dtype.
el['time:timestamp'] = pd.to_datetime(
    el['time:timestamp'],
    format='%d-%m-%Y %H:%M:%S',
)

# First and last event timestamp of every case, indexed by case id.
timestamps_by_case = el.groupby('case:concept:name')['time:timestamp']
grouped = timestamps_by_case.agg(['min', 'max'])

# Case duration = last event minus first event, expressed in (fractional) days.
elapsed = grouped['max'] - grouped['min']
grouped['duration_days'] = elapsed.dt.total_seconds() / (24 * 3600)

# 95th percentile of the durations; cases at or above it form the top 5%.
threshold = grouped['duration_days'].quantile(0.95)
is_long_running = grouped['duration_days'].ge(threshold)
top_5_percent_cases = grouped.loc[is_long_running]

# Shortest duration that still belongs to that top-5% group.
lowest_value_top_5_percent = top_5_percent_cases['duration_days'].min()

print(f"The lowest value of the top 5% longest durations is: {lowest_value_top_5_percent:.2f} days")

In [9]:
import plotly.express as px

# Start of a case = timestamp of its earliest event.
start_times = el.groupby('case:concept:name')['time:timestamp'].min().reset_index()

# Calendar date (time of day dropped) on which each case started.
start_times['start_date'] = start_times['time:timestamp'].dt.date

# Number of cases that started on each date, in chronological order.
daily_counts = start_times['start_date'].value_counts().sort_index()
daily_counts.index = pd.to_datetime(daily_counts.index)

# value_counts() only yields dates on which at least one case started. Without
# the missing dates the line chart would draw a straight segment across the
# gap and suggest non-zero counts there, so fill every absent day with 0.
if not daily_counts.empty:
    every_day = pd.date_range(daily_counts.index.min(), daily_counts.index.max(), freq='D')
    daily_counts = daily_counts.reindex(every_day, fill_value=0)

# Two-column frame ('start_date', 'count') for plotting.
case_counts = daily_counts.rename_axis('start_date').reset_index(name='count')

fig = px.line(case_counts, x='start_date', y='count', title='Number of Cases Started Each Day',
              labels={'start_date': 'Date', 'count': 'Number of Cases Started'})

fig.update_layout(xaxis_title='Date', yaxis_title='Number of Cases Started', xaxis_tickangle=45)
fig.show()

In [10]:
# Earliest and latest event timestamp per case; the case id is kept as a regular column.
case_timestamps = el.groupby('case:concept:name')['time:timestamp']
grouped = case_timestamps.agg(['min', 'max']).reset_index()

# Per case: elapsed time in (fractional) days and the calendar date of the first event.
grouped = grouped.assign(
    duration_days=lambda cases: (cases['max'] - cases['min']).dt.total_seconds() / (24 * 3600),
    start_date=lambda cases: cases['min'].dt.date,
)

# Mean case duration over all cases that share the same start date.
average_durations = grouped.groupby('start_date', as_index=False)['duration_days'].mean()

# Line chart of the mean duration against the start date.
axis_labels = {'start_date': 'Start Date', 'duration_days': 'Average Duration (days)'}
fig = px.line(
    average_durations,
    x='start_date',
    y='duration_days',
    title='Average Duration of Cases by Start Date',
    labels=axis_labels,
)

fig.update_layout(
    xaxis_title='Start Date',
    yaxis_title='Average Duration (days)',
    xaxis_tickangle=45,
)
fig.show()