<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Dataset-Infos" data-toc-modified-id="Dataset-Infos-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Dataset Infos</a></span></li><li><span><a href="#Results" data-toc-modified-id="Results-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Results</a></span></li></ul></div>

LaTeX export
=====

This notebook prints summary information about the event logs as shown in the paper. In addition, it prints the results of the process prediction experiment.

In [1]:
from mppn.imports import *
from mppn.preprocessing import *
from mppn.baselines import *
from mppn.pipeline import *


## Dataset Infos

In [2]:
# Names of all attributes defined on EventLogs that do not start with an
# underscore; every later cell iterates over this list.
ds=[attr for attr in vars(EventLogs) if not attr.startswith('_')]
ds

['Helpdesk',
 'BPIC_12',
 'BPIC_12_W',
 'BPIC_12_Wcomplete',
 'BPIC_12_A',
 'BPIC_12_O',
 'BPIC_13_CP',
 'BPIC_17_OFFER',
 'BPIC_20_RFP',
 'Mobis']

In [3]:
# Build the dataset overview table: one row per event log listed in `ds`.
cols=('#Traces, #Events, Avg. trace length, Avg. trace duration').split(', ')

rows=[]
for log_attr in ds:
    event_log=import_log(getattr(EventLogs, log_attr))

    # The log index identifies the trace, so unique index values are traces
    # and the row count is the number of events.
    n_traces=len(np.unique(event_log.index))
    n_events=len(event_log)
    mean_trace_len=event_log.groupby(event_log.index).size().mean().round(2)

    # Datetify adds the `timestamp_Relative_elapsed` column read below.
    pp=PPObj(event_log,Datetify,date_names='timestamp')
    later_events=pp.items[pp.items.event_id!=0]
    elapsed=later_events.timestamp_Relative_elapsed
    # Sum the elapsed values per trace, average over traces and divide by the
    # number of seconds per day (presumably the values are in seconds -- TODO confirm).
    days_per_trace=elapsed.groupby(level=0).sum().mean()/(60*60*24)

    rows.append([n_traces,n_events,mean_trace_len,f'{days_per_trace.round(1)} days'])

res_index=[get_ds_name(getattr(EventLogs, log_attr)) for log_attr in ds]
res=pd.DataFrame(rows,columns=cols,index=res_index)


def _join_attr_names(attr_groups):
    """Flatten the attribute lists of one dataset into a comma separated string."""
    return ", ".join([name for group in attr_groups.values() for name in L(group)])


res['Input attributes']=pd.Series(attr_dict).loc[res.index].apply(_join_attr_names)
res

Unnamed: 0,#Traces,#Events,Avg. trace length,Avg. trace duration,Input attributes
Helpdesk,4580,21348,4.66,62.9 days,"activity, resource, timestamp"
BPIC12,13087,262200,20.04,150.2 days,"activity, resource, AMOUNT_REQ, timestamp"
BPIC12_W,9658,170107,17.61,161.7 days,"activity, resource, AMOUNT_REQ, timestamp"
BPIC12_Wc,9658,72413,7.5,95.6 days,"activity, resource, AMOUNT_REQ, timestamp"
BPIC12_A,13087,60849,4.65,14.5 days,"activity, resource, AMOUNT_REQ, timestamp"
BPIC12_O,5015,31244,6.23,37.8 days,"activity, resource, AMOUNT_REQ, timestamp"
BPIC13_CP,1487,6660,4.48,426.5 days,"activity, resource, resource country, organization country, organization involved, impact, product, org:role, timestamp"
BPIC17_O,42995,193849,4.51,23.9 days,"activity, Action, NumberOfTerms, resource, FirstWithdrawalAmount, MonthlyCost, OfferedAmount, CreditScore, timestamp"
BPIC20_RFP,6886,36796,5.34,31.6 days,"org:role, activity, resource, Project, Task, OrganizationalEntity, RequestedAmount, timestamp"
Mobis,6555,166512,25.4,1194.4 days,"activity, resource, type, cost, timestamp"


In [4]:
# Print (rather than display) the table so the raw LaTeX source can be copied.
dataset_table_tex=res.to_latex()
print(dataset_table_tex)

\begin{tabular}{lrrrll}
\toprule
{} &  \#Traces &  \#Events &  Avg. trace length & Avg. trace duration &                                                                                                         Input attributes \\
\midrule
Helpdesk   &     4580 &    21348 &               4.66 &           62.9 days &                                                                                            activity, resource, timestamp \\
BPIC12     &    13087 &   262200 &              20.04 &          150.2 days &                                                                                activity, resource, AMOUNT\_REQ, timestamp \\
BPIC12\_W   &     9658 &   170107 &              17.61 &          161.7 days &                                                                                activity, resource, AMOUNT\_REQ, timestamp \\
BPIC12\_Wc  &     9658 &    72413 &               7.50 &           95.6 days &                                                                           

## Results

In [5]:
# Directory that holds one result CSV per experiment run.
folder='results/'

# Match on the file extension instead of testing whether the substring 'csv'
# occurs anywhere in the path (which would also pick up e.g. 'notcsv.txt' or
# 'run.csv.bak'). Sorting makes the load order independent of the file system.
res=[pd.read_csv(csv_path) for csv_path in sorted(Path(folder).glob('*.csv'))]

len(res)

10

The following cell calculates the mean and the standard deviation over all runs. It also sorts and reorders the rows and the columns. Finally, it prepares the results DataFrame for LaTeX export.

In [6]:
# Todo: clean up cell
def bold_extreme_values(data,mean, format_string="%.2f", max_=True):
    """Wrap the best entry of every first-level index group in ``\\textbf{...}``.

    Parameters
    ----------
    data : pd.Series
        Values to return; entries at an extreme position are wrapped.
    mean : pd.Series
        Numeric values, indexed like ``data``, used to locate the extreme
        within each group of index level 0.
    format_string : str
        Accepted for interface compatibility; not used.
    max_ : bool
        If truthy the per-group maximum is bolded, otherwise the minimum.

    Returns
    -------
    pd.Series
        ``data`` with every entry whose ``mean`` equals its group extreme
        replaced by ``\\textbf{<entry>}``. Ties are all bolded; NaN means
        never compare equal and are therefore left untouched.
    """
    reducer = 'max' if max_ else 'min'
    group_best = mean.groupby(level=0).transform(reducer)
    # True where the row is NOT the group extreme, i.e. where the plain
    # value has to be kept by `where`.
    keep_plain = mean != group_best
    wrapped = data.apply(lambda cell: f"\\textbf{{{cell}}}")
    return data.where(keep_plain, wrapped)

# Columns of the raw result CSVs in the order they should be processed.
change_order=['Unnamed: 0', 'Dataset', 'Model', 'Next Step', 'Next Resource', 'Outcome',
       'Last Resource', 'Next relative Timestamp',
       'Duration to Outcome', 'Activity Suffix', 'Resource Suffix']

# Stack all runs, drop the CSV row counter, set missing scores to 0 and index
# by (Dataset, Model). Chained instead of `inplace=True` so that re-running
# the cell always starts from `res`.
df=(pd.concat(res)[change_order]
      .drop(columns=['Unnamed: 0'])
      .fillna(0)
      .set_index(['Dataset','Model']))
# Drop the last two columns ('Activity Suffix', 'Resource Suffix').
df=df[(df.columns)[:-2]]
# Positional rename of the six remaining metric columns.
cols=['Next activity', 'Next resource', 'Last activity', 'Last resource', 'Time till next', 'Remaining time']
df.columns=cols

# Row order of the final table.
logs_in_order=[get_ds_name(getattr(EventLogs, i)) for i in ds]
# Each model is listed exactly once. The original list contained 'Tax_Shared'
# twice, which left the position of that model to the way pandas resolves
# duplicate keys; the order below is the one shown in the rendered table.
models_in_order='Evermann Camargo_Spezialized Camargo_concat Camargo_fullconcat Tax_Spezialized Tax_Mixed Tax_Shared MiDA MPPN'.split()

# Mean and standard deviation over all runs of each (Dataset, Model) pair.
mean=df.groupby(level=['Dataset','Model']).mean().round(3)
stds=df.groupby(level=['Dataset','Model']).std().round(3)
row_order=pd.IndexSlice[logs_in_order ,models_in_order]
mean=mean.loc(axis=0)[row_order]
stds=stds.loc(axis=0)[row_order]

# Cell text of the exported table: "<mean>+-<std>".
df=mean.astype(str)+"+-"+stds.astype(str)

# The zeros introduced by fillna above must not win the min/max search, so
# they are turned back into NaN before the extremes are located.
mean=mean.replace(0.,np.nan)
export_df=df.copy()
# Per column: True -> bold the per-dataset maximum, False -> bold the minimum.
min_max_cols=[True,True,True,True,False,False]
assert len(min_max_cols)==len(df.columns), "one min/max flag per metric column expected"
for col,bold_max in zip(df.columns,min_max_cols):
    export_df[col] = bold_extreme_values(df[col],mean[col],max_=bold_max)

# Escape underscores for LaTeX. The raw string keeps the replacement a literal
# backslash + underscore without relying on an invalid string escape.
export_df=export_df.reset_index().replace('_',r'\_',regex=True)
export_df=export_df.set_index(['Dataset','Model'])
# Blank out cells whose mean and std are both zero.
export_df=export_df.replace('0.0+-0.0',"")
display_df(export_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Next activity,Next resource,Last activity,Last resource,Time till next,Remaining time
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Helpdesk,Evermann,0.651+-0.128,0.222+-0.005,\textbf{0.994+-0.0},0.811+-0.0,,
Helpdesk,Camargo\_Spezialized,0.693+-0.168,0.289+-0.071,\textbf{0.994+-0.0},0.811+-0.0,7.95+-0.576,6.654+-0.101
Helpdesk,Camargo\_concat,0.696+-0.116,0.421+-0.035,\textbf{0.994+-0.0},0.811+-0.0,7.63+-0.052,6.739+-0.253
Helpdesk,Camargo\_fullconcat,0.774+-0.077,0.432+-0.0,\textbf{0.994+-0.0},0.811+-0.0,5.308+-0.288,7.018+-0.225
Helpdesk,Tax\_Spezialized,0.763+-0.082,,\textbf{0.994+-0.0},,7.777+-0.526,6.895+-0.253
Helpdesk,Tax\_Mixed,0.3+-0.003,,\textbf{0.994+-0.0},,14.849+-0.034,7.197+-0.101
Helpdesk,Tax\_Shared,0.793+-0.004,,\textbf{0.994+-0.0},,5.088+-0.129,6.67+-0.1
Helpdesk,MiDA,0.693+-0.12,0.263+-0.089,\textbf{0.994+-0.0},0.811+-0.0,\textbf{4.898+-0.043},\textbf{6.629+-0.166}
Helpdesk,MPPN,\textbf{0.805+-0.003},\textbf{0.691+-0.006},\textbf{0.994+-0.0},\textbf{0.847+-0.008},5.197+-0.126,6.691+-0.089
BPIC12,Evermann,0.595+-0.107,0.149+-0.0,0.417+-0.0,0.172+-0.0,,


In [7]:
# escape=False keeps the \textbf{...} and \_ markup that is already in the cells.
results_table_tex=export_df.to_latex(escape=False)
print(results_table_tex)

\begin{tabular}{llllllll}
\toprule
      &      &          Next activity &          Next resource &          Last activity &          Last resource &          Time till next &           Remaining time \\
Dataset & Model &                        &                        &                        &                        &                         &                          \\
\midrule
Helpdesk & Evermann &           0.651+-0.128 &           0.222+-0.005 &    \textbf{0.994+-0.0} &             0.811+-0.0 &                         &                          \\
      & Camargo\_Spezialized &           0.693+-0.168 &           0.289+-0.071 &    \textbf{0.994+-0.0} &             0.811+-0.0 &             7.95+-0.576 &             6.654+-0.101 \\
      & Camargo\_concat &           0.696+-0.116 &           0.421+-0.035 &    \textbf{0.994+-0.0} &             0.811+-0.0 &             7.63+-0.052 &             6.739+-0.253 \\
      & Camargo\_fullconcat &           0.774+-0.077 &             0.432+-