In [None]:
import re
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Global seaborn theme for the notebook: monospace font, white grid, fonts scaled by 1.5.
sns.set_theme(font='monospace', style='whitegrid', font_scale=1.5)
# Alternative font settings, currently disabled:
# plt.rcParams['font.family'] = 'monospace'
# plt.rcParams['font.family'] = 'sans-serif'
# plt.rcParams['font.sans-serif'] = ['Liberation Sans']
# Resolution used by plt.savefig for every figure written below.
plt.rcParams['savefig.dpi'] = 300

In [None]:
import matplotlib.font_manager as fm
# Show the name of every font entry in matplotlib's font manager `ttflist`
# (useful for picking a value for the `font` setting of the theme).
[f.name for f in fm.fontManager.ttflist]

In [None]:
# Load seaborn's 'Paired' palette and display its first color.
colors = sns.color_palette('Paired')
colors[0]

In [None]:
# Load the pickled experiment results into `results`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load result files that come from a trusted source.
with open(r'../results/results.pickle', 'rb') as result_file:
    results = pickle.load(result_file)

In [None]:
# Convert the loaded results from a dict to a DataFrame.
r = pd.DataFrame.from_dict(results)
# Only handle the BPIC datasets.
# An explicit copy is taken because later cells add columns to `r`; assigning
# columns on a filtered slice triggers pandas' SettingWithCopyWarning and makes
# it unclear whether the write reaches an independent frame.
r = r.loc[r.dataset.str.contains('BPIC')].copy()

In [None]:
# Output dataset information for papers
# Distinct dataset paths, sorted once so that names and column counts are
# guaranteed to be listed in the same order.
dataset_paths = sorted(set(r['dataset']))
# Dots are escaped so that '.' only matches the literal dots of '../' and '.csv'.
dataset_path_pattern = re.compile(r'\.\./datasets/(.+)\.csv')


def _dataset_name(path):
    """Return `<name>` from a path of the form '../datasets/<name>.csv'.

    Raises:
        ValueError: if `path` does not have the expected form. This happens,
            for example, when this cell is run a second time, because
            r['dataset'] then already holds bare names instead of paths.
    """
    matched = dataset_path_pattern.match(path)
    if matched is None:
        raise ValueError(f'Unexpected dataset path: {path!r}')
    return matched.group(1)


datasets = {}
dataset_names = [_dataset_name(path) for path in dataset_paths]
# datasets['Dataset'] = [re.sub('_', ' ', x) for x in dataset_names]
datasets['dataset'] = dataset_names

# Only the header row is needed to count the columns of each CSV file.
datasets['Number of Columns'] = [len(pd.read_csv(path, nrows=1).columns) for path in dataset_paths]
# Replace the path by the bare dataset name only after the CSV files were read.
r['dataset'] = r['dataset'].apply(_dataset_name)

print(pd.DataFrame.from_dict(datasets).to_latex(index=False))

In [None]:
# Flag the rows whose candidate list contains the correct index tuple.
def _has_correct_candidate(row):
    """Return whether the row's 'correct_indices' value is a member of its 'cands'."""
    return row['correct_indices'] in row['cands']


r['is_in'] = r[['correct_indices', 'cands']].apply(_has_correct_candidate, axis=1)


In [None]:
# Select, per result row, the candidate(s) carrying the highest score.
def if_highest_cands_is_correct(x):
    """Return the list of candidates selected for one result row.

    Args:
        x: row-like object indexable by 'n_cands', 'cands' and 'scores'.

    Returns:
        A list of candidates:
        * ``[[-1, -1, -1]]`` (a dummy entry) when ``n_cands`` is 0;
        * a one-element list holding the flattened ``cands`` when ``n_cands``
          is 1 (a single candidate, so no scores are consulted);
        * otherwise every entry of ``cands`` whose score equals the maximum
          score, so ties yield more than one candidate.
    """
    n_cands = x['n_cands']
    if n_cands == 0:
        # No candidate was identified: return a dummy tuple.
        return [[-1, -1, -1]]
    if n_cands == 1:
        # Only a single candidate exists at the first stage.
        return [np.array(x['cands']).flatten().tolist()]

    scores = x['scores']
    best_score = np.amax(scores)
    # Positions of all scores equal to the maximum (keeps ties).
    winner_positions = np.argwhere(best_score == scores).flatten().tolist()
    return [x['cands'][position] for position in winner_positions]
    
# Store, for every row, the candidate(s) with the highest score.
r['identified_indices'] = r.apply(if_highest_cands_is_correct, axis=1)

In [None]:
# Identification accuracy per key column.
def _attribute_precision(row, position):
    """Return the share of identified candidates matching the correct index at `position`.

    `position` selects the entry inside each index tuple
    (0 for case_id, 1 for timestamp, 2 for activity, as used below).
    """
    identified = row['identified_indices']
    target = row['correct_indices'][position]
    matches = [candidate[position] == target for candidate in identified]
    return sum(matches) / len(identified)


for _position, _column in enumerate(['case_id_precision', 'timestamp_precision', 'activity_precision']):
    r[_column] = r.apply(_attribute_precision, axis=1, args=(_position,))

In [None]:
# Flag the rows where the full correct index tuple is among the identified ones.
def _all_columns_identified(row):
    """Return whether the row's 'correct_indices' value is a member of its 'identified_indices'."""
    return row['correct_indices'] in row['identified_indices']


r['is_all_correct'] = r.apply(_all_columns_identified, axis=1)

In [None]:
# Coverage of the first stage: how often the correct column survives the
# candidate selection, per attribute and per number of top candidates (k).
with open(r'../results/results_first_stage.pickle', 'rb') as result_file:
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only load result files that come from a trusted source.
    results_first_stage = pickle.load(result_file)

# Filter out no candidate cases. The copy keeps the filtered frame independent
# of the unfiltered one.
first_stage = pd.DataFrame.from_dict(results_first_stage)
first_stage = first_stage[first_stage['n_cands'] > 0].copy()


def _first_stage_coverage(frame, position, attribute):
    """Return the mean first-stage coverage per `n_top_cands` for one attribute.

    Args:
        frame: first-stage results with the columns 'dataset', 'n_top_cands',
            'correct_indices' and 'cands'.
        position: entry of the index tuples to inspect
            (0 for case-id, 1 for timestamp, 2 for activity, as used below).
        attribute: label written to the 'attribute' column of the result.

    Returns:
        DataFrame with the columns 'n_top_cands', 'value' and 'attribute';
        'value' is the per-dataset coverage averaged over the datasets and
        rounded to two digits.
    """
    # 'cands' is accessed by label. The previous `x[['cands']][0]` relied on
    # the positional fallback of Series.__getitem__, which pandas deprecated.
    is_kept = pd.Series(
        [
            correct[position] in [candidate[position] for candidate in candidates]
            for correct, candidates in zip(frame['correct_indices'], frame['cands'])
        ],
        index=frame.index,
        dtype=float,
    )
    # The mean of the 0/1 flags is the share of runs that kept the correct column.
    per_dataset = (
        frame.assign(is_kept=is_kept)
        .groupby(['dataset', 'n_top_cands'])['is_kept']
        .mean()
    )
    coverage = (
        per_dataset.groupby(level='n_top_cands')
        .mean()
        .round(2)
        .reset_index(name='value')
    )
    coverage['attribute'] = attribute
    return coverage


result_tmp_case_id = _first_stage_coverage(first_stage, 0, 'case-id')
result_tmp_timestamp = _first_stage_coverage(first_stage, 1, 'timestamp')
result_tmp_activity = _first_stage_coverage(first_stage, 2, 'activity')

tmp = pd.concat([result_tmp_case_id, result_tmp_timestamp, result_tmp_activity])
display(tmp)

plt.figure(figsize=(6, 4))
sns.set_style('darkgrid')
sns.set(font_scale=1.5)
sns.lineplot(data=tmp, x='n_top_cands', y='value', hue='attribute',
            palette=sns.color_palette('deep', n_colors=3), linewidth=3)
plt.legend(title='Attribute', loc='lower right')
plt.xlabel('k')
plt.ylabel('Coverage')
plt.ylim(0, 1.05)
plt.savefig('../plots/coverage_k.pdf', bbox_inches='tight')


In [None]:
# Share of runs, per dataset and k, in which all key columns were identified correctly.
def _share_all_correct(group):
    """Return the fraction of rows in `group` with 'is_all_correct' set, rounded to two digits."""
    return round(sum(group['is_all_correct']) / len(group), ndigits=2)


r.groupby(['dataset', 'n_top_cands']).apply(_share_all_correct)

In [None]:
# Mean precision per dataset for each key column, joined into one table.
def _dataset_precision(precision_column, name):
    """Return a frame ('dataset', `name`) with the rounded mean of `precision_column` per dataset."""
    per_dataset = r.groupby(['dataset']).apply(
        lambda group: round(sum(group[precision_column]) / len(group), ndigits=2)
    )
    return per_dataset.reset_index(name=name)


tmp = pd.merge(
    _dataset_precision('case_id_precision', 'case_id'),
    _dataset_precision('activity_precision', 'activity'),
    on='dataset')
pd.merge(tmp, _dataset_precision('timestamp_precision', 'timestamp'), on='dataset')

In [None]:
# Mean total computation time (candidate selection + score evaluation) per k and metric.
def _mean_total_time(group):
    """Return the mean of 'time_cand_selection' + 'time_score_eval' in `group`, rounded to two digits."""
    total_time = group['time_cand_selection'] + group['time_score_eval']
    return round(np.mean(total_time), ndigits=2)


time_by_k_and_metric = r.groupby(['n_top_cands', 'metric']).apply(_mean_total_time)
r_time = time_by_k_and_metric.to_frame('time').reset_index()
display(r_time)

plt.figure(figsize=(6, 4))
sns.set_style('darkgrid')
# Font scale is intentionally left as is here (disabled call kept for reference):
# sns.set(font_scale=1.5)
metric_order = ['simplicity', 'fitness', 'generalization', 'precision', 'Buijs2014']
sns.barplot(data=r_time, x='metric', y='time', hue='n_top_cands',
            order=metric_order,
            palette=sns.color_palette('Paired'))
plt.legend(title='k', fontsize=20)
plt.xlabel('Metric', fontsize=20)
plt.xticks(fontsize=11)
plt.ylabel('Computation time [s]', fontsize=20)
plt.savefig('../plots/computation_time.pdf', bbox_inches='tight')

In [None]:
def precision_summary(x):
    """Summarize the three precision columns of a (group) frame.

    Args:
        x: DataFrame with the columns 'case_id_precision',
            'timestamp_precision' and 'activity_precision'.

    Returns:
        DataFrame with a single column 'accuracy' and the index
        ['case-id', 'timestamp', 'activity', 'average']. The first three rows
        are the column means rounded to two digits; 'average' is the mean of
        those three rounded values, again rounded to two digits.
    """
    n_rows = len(x)
    per_attribute = [
        round(sum(x[column]) / n_rows, ndigits=2)
        for column in ('case_id_precision', 'timestamp_precision', 'activity_precision')
    ]
    overall = round(np.mean(per_attribute), ndigits=2)
    return pd.DataFrame(
        {'accuracy': per_attribute + [overall]},
        index=['case-id', 'timestamp', 'activity', 'average'],
    )

In [None]:
# metric (accuracy and time)
# Show every row of displayed frames; this option stays in effect for later cells too.
pd.set_option('display.max_rows', None)

# Disabled variant that restricted the rows before grouping:
# tmp = r.loc[(r['miner'] == 'inductive_miner') & (len(r['scores']) > 0)]\
# Accuracy summary (case-id / timestamp / activity / average) per metric and k.
tmp = r.groupby(['metric', 'n_top_cands'])\
.apply(func=precision_summary)\
.reset_index()
display(tmp)

# NOTE(review): precision_summary returns four rows per (metric, k) group and
# all of them are passed to barplot here, so each bar aggregates the three
# attribute accuracies together with their 'average' row. Confirm that this is
# intended rather than plotting only the 'average' rows.
plt.figure(figsize=(6, 4))
sns.set_style('darkgrid')
sns.set(font_scale=1.5)
sns.barplot(data=tmp, x='metric', y='accuracy', hue='n_top_cands',
            order=['simplicity', 'fitness', 'generalization', 'precision', 'Buijs2014'],
            palette=sns.color_palette('Paired'))
plt.legend(title='k', fontsize=14)
plt.xlabel('Metric', fontsize=20)
plt.xticks(fontsize=11)
plt.ylabel('Accuracy', fontsize=20)
plt.savefig('../plots/accuracy_metric.pdf', bbox_inches='tight')

In [None]:
# Accuracy per attribute for each k.
accuracy_by_k = r.groupby(['n_top_cands']).apply(precision_summary)
# After reset_index the summary's row labels are available as column 'level_1'.
tmp = accuracy_by_k.reset_index()

plt.figure(figsize=(6, 4))
sns.set_style('darkgrid')
sns.set(font_scale=1.5)
# `order` lists only the three attributes, so the 'average' rows are not drawn.
attribute_order = ['case-id', 'timestamp', 'activity']
sns.barplot(data=tmp, x='level_1', y='accuracy', hue='n_top_cands',
            order=attribute_order,
            palette=sns.color_palette('Paired'))
plt.legend(title='k')
plt.xlabel('Attribute')
plt.ylabel('Accuracy')
plt.ylim(0, 0.8)
plt.savefig('../plots/accuracy_by_attribute.pdf', bbox_inches='tight')

In [None]:
# Accuracy per attribute for each miner.
accuracy_by_miner = r.groupby(['miner']).apply(func=precision_summary)
# After reset_index the summary's row labels are available as column 'level_1'.
tmp = accuracy_by_miner.reset_index()
display(tmp)

plt.figure(figsize=(6, 4))
sns.set_style('darkgrid')
sns.set(font_scale=1.5)
# `order` lists only the three attributes, so the 'average' rows are not drawn.
attribute_order = ['case-id', 'timestamp', 'activity']
sns.barplot(data=tmp, x='level_1', y='accuracy', hue='miner',
            order=attribute_order,
            palette=sns.color_palette('Paired'))
plt.legend(title='Miner', loc='lower right')
plt.xlabel('Attribute')
plt.ylabel('Accuracy')
plt.ylim(0, 0.7)
plt.savefig('../plots/accuracy_miner.pdf', bbox_inches='tight')

In [None]:
# miner (accuracy and time?)
# Mean precision of each key column per dataset, miner and k.
precision_columns = ['case_id_precision', 'activity_precision', 'timestamp_precision']
# Work on an explicit copy so that `r` is never modified through a slice.
tmp = r[['dataset', 'miner', 'n_top_cands'] + precision_columns].copy()
# Replace miner == None with 'NA' when no miner was used (i.e., only identified
# at stage 1). groupby drops missing keys by default, so the placeholder keeps
# those rows in the table. Only the 'miner' column is filled: filling the whole
# frame would also turn missing numeric values into the string 'NA'.
tmp['miner'] = tmp['miner'].fillna('NA')
# Averaging only the selected precision columns avoids reducing over the
# string columns, which fails (or collapses to a scalar) on pandas >= 2.
tmp.groupby(['dataset', 'miner', 'n_top_cands'])[precision_columns].mean().round(2)

In [None]:
# Average accuracy of each dataset against its number of columns.
accuracy_by_dataset = r.groupby(['dataset']).apply(func=precision_summary)
tmp = accuracy_by_dataset.reset_index()
# Keep only the 'average' row of every dataset's summary.
is_average_row = tmp['level_1'] == 'average'
tmp = tmp.loc[is_average_row]
tmp2 = pd.DataFrame.from_dict(datasets)

# Outer join keeps datasets that appear in only one of the two tables.
r_acc_n_col = tmp.merge(tmp2, on='dataset', how='outer')
display(r_acc_n_col)

plt.figure(figsize=(6, 4))
sns.set_style('darkgrid')
sns.set(font_scale=1.5)
sns.scatterplot(data=r_acc_n_col, x='Number of Columns', y='accuracy', s=100)
plt.xlabel('Number of Attributes')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.savefig('../plots/accuracy_n_columns.pdf', bbox_inches='tight')

In [None]:
# Mean time per k and miner, shown separately for candidate selection and
# for score evaluation (one table each, in that order).
for _time_column in ('time_cand_selection', 'time_score_eval'):
    _mean_time = r.groupby(['n_top_cands', 'miner']).apply(
        lambda group: round(np.mean(group[_time_column]), ndigits=2)
    )
    tmp = _mean_time.to_frame('time').reset_index()
    display(tmp)