In [1]:
import pandas as pd
import numpy as np
import os, sys

idx = pd.IndexSlice

In [2]:
# %load /home/jonathan/.ipython/profile_default/startup/01-setup.py
# start up settings for jupyter notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys

# use plt.style.available() to check out available styles
plt.style.use('seaborn-white')
plt.rcParams['font.size'] = 20
plt.rcParams['axes.labelsize'] = 15.0
plt.rcParams['xtick.labelsize'] = 20
plt.rcParams['ytick.labelsize'] = 20
plt.rcParams['legend.fontsize'] = 20

%matplotlib inline

# set the max column width
pd.options.display.max_colwidth = 1000

# to avoid have warnings from chained assignments
pd.options.mode.chained_assignment = None


fp = '../../prediction-data/clst-IS2017-2-repacked-predictive-data.csv'
df = pd.read_csv(fp, header=[0, 1])
df.rename(columns={
    'Unnamed: 0_level_1': '',
    'Unnamed: 1_level_1': '',
    'Unnamed: 2_level_1': ''
}, level=1, inplace=True)

columns = list(df.columns)
decomposition_type = 'sese_manual'
df['decomposition'] = decomposition_type
columns = [('model', ''), ('log', ''), ('decomposition', ''), ('SP label', '')] + columns[3:]
df = df[columns]

df.to_csv(fp, index=False)

In [19]:
data_dir = '../../prediction-data/'

df_list = []

for f in os.listdir(data_dir):
    if 'inc0' in f or 'final-data' in f:
        continue
    fp = os.path.join(data_dir, f)
    
    if not os.path.isfile(fp):
        continue

    df = pd.read_csv(fp, header=[0, 1])
    df.rename(columns={
        'Unnamed: 0_level_1': '',
        'Unnamed: 1_level_1': '',
        'Unnamed: 2_level_1': '',
        'Unnamed: 3_level_1': ''
    }, level=1, inplace=True)
    df_list.append(df)
    


In [144]:
base_df = pd.concat(df_list, axis=0, sort=False)
base_df.set_index(['model', 'log', 'decomposition', 'SP label'], inplace=True)

# remove traces for which all algorithms are invalid
astar_invalid = (base_df.loc[:,('is_valid', 'astar')] == False) | (base_df.loc[:, ('is_valid', 'astar')].isnull())
inc3_invalid = (base_df.loc[:,('is_valid', 'inc3')] == False) | (base_df.loc[:,('is_valid', 'inc3')].isnull())
recomp_astar_invalid = (base_df.loc[:,('is_valid', 'recomp-astar')] == False) | (base_df.loc[:,('is_valid', 'recomp-astar')].isnull())
recomp_inc3_invalid = (base_df.loc[:,('is_valid', 'recomp-inc3')] == False) | (base_df.loc[:,('is_valid', 'recomp-inc3')].isnull())
all_invalid = astar_invalid & inc3_invalid & recomp_astar_invalid & recomp_inc3_invalid

before_n_rows = base_df.shape[0]
print('Before filtering all invalid no. of rows: {}'.format(before_n_rows))

base_df = base_df.loc[~all_invalid, :]

after_n_rows = base_df.shape[0]
print('After filtering all invalid no. of rows: {}'.format(after_n_rows))
print('{} traces invalid for all algorithms'.format(before_n_rows - after_n_rows))

cost_is_neg_recomp_astar = base_df.loc[:, ('Cost of the alignment', 'recomp-astar')] == -1
cost_is_neg_recomp_inc3 = base_df.loc[:, ('Cost of the alignment', 'recomp-inc3')] == -1

before_n_rows = base_df.shape[0]
print('Before filtering negative cost rows: {}'.format(before_n_rows))

base_df = base_df.loc[~(cost_is_neg_recomp_astar | cost_is_neg_recomp_inc3), :]

after_n_rows = base_df.shape[0]
print('After filtering negative cost rows: {}'.format(after_n_rows))
print('{} traces invalid'.format(before_n_rows - after_n_rows))

Before filtering all invalid no. of rows: 330130
After filtering all invalid no. of rows: 329876
254 traces invalid for all algorithms
Before filtering negative cost rows: 329876
After filtering negative cost rows: 329289
587 traces invalid


In [145]:
base_df.columns.get_level_values(level=0).unique()

Index(['Cost of the alignment', 'Heuristics computed', 'Heuristics derived',
       'Heuristics estimated', 'Markings added to closed set',
       'Markings polled from queue', 'Markings queued', 'Markings reached',
       'Min', 'Number of splits when splitting marking',
       'Number of times replay was restarted', 'Size of the constraintset',
       'Time to compute alignment (us)', 'Total Time including setup (us)',
       'is_valid', 'n_invis_move', 'n_log_move', 'n_model_move', 'n_sync_move',
       'result_dir'],
      dtype='object')

### Find and correct errors with algorithms yielding invalid alignments but has the minimum time

#### Total Time including setup (us)

In [146]:
min_total_astar = base_df.loc[:,('Min', 'Total Time including setup (us)')] == 'astar'
min_total_inc3 = base_df.loc[:,('Min', 'Total Time including setup (us)')] == 'inc3'
min_total_r_astar = base_df.loc[:,('Min', 'Total Time including setup (us)')] == 'recomp-astar'
min_total_r_inc3 = base_df.loc[:,('Min', 'Total Time including setup (us)')] == 'recomp-inc3'

false_min_astar = base_df.loc[astar_invalid & min_total_astar, ]
false_min_inc3 = base_df.loc[inc3_invalid & min_total_inc3, ]
false_min_r_astar = base_df.loc[recomp_astar_invalid & min_total_r_astar, ]
false_min_r_inc3 = base_df.loc[recomp_inc3_invalid & min_total_r_inc3, ]

print('No. of false min astar: {}'.format(false_min_astar.shape[0]))
print('No. of false min inc3: {}'.format(false_min_inc3.shape[0]))
print('No. of false min recomp astar: {}'.format(false_min_r_astar.shape[0]))
print('No. of false min recomp inc3: {}'.format(false_min_r_inc3.shape[0]))

No. of false min astar: 0
No. of false min inc3: 0
No. of false min recomp astar: 0
No. of false min recomp inc3: 0


#### Time to compute alignment (us)

In [147]:
col = ('Min', 'Time to compute alignment (us)')
min_align_astar = base_df.loc[:,col] == 'astar'
min_align_inc3 = base_df.loc[:,col] == 'inc3'
min_align_r_astar = base_df.loc[:,col] == 'recomp-astar'
min_align_r_inc3 = base_df.loc[:,col] == 'recomp-inc3'

false_min_astar = base_df.loc[astar_invalid & min_align_astar, ]
false_min_inc3 = base_df.loc[inc3_invalid & min_align_inc3, ]
false_min_r_astar = base_df.loc[recomp_astar_invalid & min_align_r_astar, ]
false_min_r_inc3 = base_df.loc[recomp_inc3_invalid & min_align_r_inc3, ]

print('No. of false min align time astar: {}'.format(false_min_astar.shape[0]))
print('No. of false min align time inc3: {}'.format(false_min_inc3.shape[0]))
print('No. of false min align time recomp astar: {}'.format(false_min_r_astar.shape[0]))
print('No. of false min align time recomp inc3: {}'.format(false_min_r_inc3.shape[0]))

No. of false min align time astar: 0
No. of false min align time inc3: 0
No. of false min align time recomp astar: 0
No. of false min align time recomp inc3: 0


In [148]:
base_df.describe()

Unnamed: 0_level_0,Cost of the alignment,Cost of the alignment,Cost of the alignment,Cost of the alignment,Heuristics computed,Heuristics computed,Heuristics computed,Heuristics computed,Heuristics derived,Heuristics derived,...,n_log_move,n_log_move,n_model_move,n_model_move,n_model_move,n_model_move,n_sync_move,n_sync_move,n_sync_move,n_sync_move
Unnamed: 0_level_1,astar,inc3,recomp-astar,recomp-inc3,astar,inc3,recomp-astar,recomp-inc3,astar,inc3,...,recomp-astar,recomp-inc3,astar,inc3,recomp-astar,recomp-inc3,astar,inc3,recomp-astar,recomp-inc3
count,329289.0,329289.0,329289.0,329289.0,329289.0,329289.0,329289.0,329289.0,329289.0,329289.0,...,329289.0,329289.0,329289.0,329289.0,329289.0,329289.0,329289.0,329289.0,329289.0,329289.0
mean,57.65861,57.540483,57.722827,57.724036,693.739241,6.518614,757.428879,82.172432,303.494125,266.142868,...,4.261983,4.270969,3.763053,3.741823,3.77575,3.753587,66.944872,66.982675,67.012105,67.003119
std,136.368084,136.173015,136.703001,136.709396,2009.759991,31.558585,2051.419583,43.779062,633.343098,691.859813,...,10.33615,10.379335,8.496652,8.410476,8.535595,8.440035,35.497537,35.645809,35.635604,35.635837
min,0.0,0.0,0.0,0.0,1.0,1.0,12.0,12.0,3.0,3.0,...,0.0,0.0,-1.0,-1.0,0.0,0.0,-1.0,-1.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,1.0,1.0,71.0,71.0,58.0,58.0,...,0.0,0.0,0.0,0.0,0.0,0.0,45.0,45.0,45.0,45.0
50%,0.0,0.0,0.0,0.0,1.0,1.0,74.0,75.0,83.0,83.0,...,0.0,0.0,0.0,0.0,0.0,0.0,57.0,57.0,57.0,57.0
75%,36.0,36.0,36.0,36.0,64.0,2.0,149.0,79.0,181.0,170.0,...,2.0,2.0,2.0,2.0,2.0,2.0,79.0,79.0,79.0,79.0
max,1256.0,1256.0,1256.0,1256.0,38800.0,648.0,25251.0,989.0,23185.0,16646.0,...,100.0,100.0,78.0,78.0,78.0,78.0,419.0,419.0,419.0,419.0


## Model and trace features

In [15]:
fp = '../../prediction-data/features/net1-feature.csv'
df = pd.read_csv(fp)

fp = '../../prediction-data/features/P241-feature.csv'
df = pd.read_csv(fp)

In [16]:
df.head()

Unnamed: 0,caseid,trace_length,n_activity,activity_repeat_mean,activity_repeat_std,snp_n_transition,snp_n_inv_transition,snp_n_dup_transition,snp_n_uniq_transition,snp_inv_transition_in_degree_mean,...,subnet_n_dup_transition_mean,subnet_n_dup_transition_std,subnet_n_uniq_transition_mean,subnet_n_uniq_transition_std,subnet_n_place_mean,subnet_n_place_std,subnet_n_arc_mean,subnet_n_arc_std,decomposition,log
0,case_0,106.0,93.0,1.139785,0.543567,355.0,26.0,40.0,289.0,2.0,...,0.0,0.0,40.0,41.581246,52.0,67.557383,116.666667,141.454351,sese_manual,L50
1,case_1,94.0,91.0,1.032967,0.17954,331.0,26.0,12.0,293.0,2.0,...,0.0,0.0,40.0,41.581246,52.0,67.557383,116.666667,141.454351,sese_manual,L50
2,case_10,112.0,90.0,1.244444,0.768935,367.0,26.0,62.0,279.0,2.0,...,0.0,0.0,40.0,41.581246,52.0,67.557383,116.666667,141.454351,sese_manual,L50
3,case_100,105.0,91.0,1.153846,0.613244,353.0,26.0,42.0,285.0,2.0,...,0.0,0.0,40.0,41.581246,52.0,67.557383,116.666667,141.454351,sese_manual,L50
4,case_101,88.0,87.0,1.011494,0.107211,319.0,26.0,4.0,289.0,2.0,...,0.0,0.0,40.0,41.581246,52.0,67.557383,116.666667,141.454351,sese_manual,L50


In [149]:
feature_dir = '../../prediction-data/features/'
extra_featur_dir = '../../prediction-data/features/extra-extra-feature/'

feature_df_dict = dict()
extra_feature_df_dict = dict()

for f in os.listdir(feature_dir):
    fp = os.path.join(feature_dir, f)
    
    if not os.path.isfile(fp):
        continue
    
    model = f.replace('-feature.csv', '')
    df = pd.read_csv(fp)
    df['model'] = model
    feature_df_dict[model] = df

for f in os.listdir(extra_featur_dir):
    fp = os.path.join(extra_featur_dir, f)
    
    if not os.path.isfile(fp):
        continue
        
    model = f.replace('-extra-extra-feature.csv', '')
    df = pd.read_csv(fp)
    df['model'] = model
    extra_feature_df_dict[model] = df

In [150]:
assert len(set(feature_df_dict.keys()).difference(set(extra_feature_df_dict.keys()))) == 0

In [151]:
# join the feature dfs with the extra feature dfs
feature_df_list = []
for key in feature_df_dict.keys():
    df0 = feature_df_dict[key]
    df1 = extra_feature_df_dict[key]
    df = df0.merge(df1, on=['model', 'log', 'caseid'])
    feature_df_list.append(df)
feature_df = pd.concat(feature_df_list, axis=0, sort=False)

out_fp = '../../prediction-data/features/merged-feature.csv'
# feature_df.reset_index(drop=False).to_csv(out_fp, index=False)

In [152]:
feature_df.rename(columns={'caseid': 'SP label'}, inplace=True) # for later convenience on merging with base_df
feature_df.set_index(['model', 'log', 'decomposition', 'SP label'], inplace=True)
feature_df.columns = pd.MultiIndex.from_product([['model_trace_features'], feature_df.columns])
feature_df.loc[:,('model_trace_features', 'activity_repeat_std')].fillna(0., inplace=True)

In [153]:
full_df = base_df.merge(feature_df, on=('model', 'log', 'decomposition', 'SP label'), how='inner')
out_fp = '../../prediction-data/final-data.csv'

# export dataframe after computing k times differences!
# full_df.reset_index(drop=False).to_csv(out_fp, index=False)

## Find instances where there are k times differences

In [162]:
full_df.loc[:, idx[('Total Time including setup (us)'), :]].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Total Time including setup (us),Total Time including setup (us),Total Time including setup (us),Total Time including setup (us)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,astar,inc3,recomp-astar,recomp-inc3
model,log,decomposition,SP label,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
P430,L19,sese_manual,case_785,22874,37474,35761,45285
P430,L19,sese_manual,case_69,17436,41131,36837,49677
P430,L19,sese_manual,case_33,12774,28984,35404,47586
P430,L19,sese_manual,case_797,26403,52129,40037,52829
P430,L19,sese_manual,case_784,13038,28965,37256,47108


In [163]:
import itertools as itls

algos = [
    'astar', 'inc3', 'recomp-astar', 'recomp-inc3'
]

c_2 = itls.combinations(algos, 2)
colnames = []

for a0, a1 in c_2:
    # compute the time[a0] / time[a1]
    col_name = '{}_over_{}'.format(a0, a1)
    colnames.append(col_name)
    full_df[('Total Time including setup (us)', col_name)] = full_df[('Total Time including setup (us)', a0)] / full_df[('Total Time including setup (us)', a1)]

k = 5
full_df[('Total Time including setup (us)', 'max_diff')] = full_df.loc[:, idx[('Total Time including setup (us)'), tuple(colnames)]].max(axis=1)
# base_df[('Total Time including setup (us)', 'at_least_{}'.format(k))] = base_df[('Total Time including setup (us)', 'max_diff')] >= k


In [164]:
full_df.loc[:, idx['Total Time including setup (us)', :]].head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Total Time including setup (us),Total Time including setup (us),Total Time including setup (us),Total Time including setup (us),Total Time including setup (us),Total Time including setup (us),Total Time including setup (us),Total Time including setup (us),Total Time including setup (us),Total Time including setup (us),Total Time including setup (us)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,astar,inc3,recomp-astar,recomp-inc3,astar_over_inc3,astar_over_recomp-astar,astar_over_recomp-inc3,inc3_over_recomp-astar,inc3_over_recomp-inc3,recomp-astar_over_recomp-inc3,max_diff
model,log,decomposition,SP label,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
P430,L19,sese_manual,case_785,22874,37474,35761,45285,0.610397,0.639635,0.505112,1.047901,0.827515,0.789688,1.047901
P430,L19,sese_manual,case_69,17436,41131,36837,49677,0.423914,0.473328,0.350987,1.116568,0.827969,0.74153,1.116568
P430,L19,sese_manual,case_33,12774,28984,35404,47586,0.440726,0.360807,0.26844,0.818665,0.609087,0.744,0.818665
P430,L19,sese_manual,case_797,26403,52129,40037,52829,0.506494,0.659465,0.499782,1.302021,0.98675,0.75786,1.302021
P430,L19,sese_manual,case_784,13038,28965,37256,47108,0.450129,0.349957,0.276768,0.777459,0.614864,0.790864,0.790864


### Export full dataframe

In [165]:
out_fp = '../../prediction-data/final-data.csv'

# export dataframe after computing k times differences!
full_df.reset_index(drop=False).to_csv(out_fp, index=False)

### Getting the no-duplicates dataframe
Two rows are considered duplicates if they have the values in:
- all of the model_trace_features columns
- same cost of the alignment

Note that they do not need the same alignment, i.e., same number of legal move types

In [166]:
print('Number of rows before filtering duplicates: {}'.format(full_df.shape[0]))

select_level_0 = ('model_trace_features', 'Cost of the alignment')
select_level_1 = slice(None)

duplicate_rows = full_df.loc[:, idx[select_level_0, select_level_1]].duplicated()
uniq_full_df = full_df[~duplicate_rows]

print('Number of rows after filtering duplicates: {}'.format(uniq_full_df.shape[0]))
print('Number of rows removed: {}'.format(full_df.shape[0] - uniq_full_df.shape[0]))

out_fp = '../../prediction-data/uniq-final-data.csv'
uniq_full_df.reset_index(drop=False).to_csv(out_fp, index=False)

Number of rows before filtering duplicates: 245478
Number of rows after filtering duplicates: 185848
Number of rows removed: 59630


In [None]:
full_df_description = full_df.describe()

In [None]:
col = ('Min', 'Total Time including setup (us)')
min_total_astar = full_df.loc[:, col] == 'astar'
min_total_inc3 = full_df.loc[:, col] == 'inc3'
min_total_r_astar = full_df.loc[:, col] == 'recomp-astar'
min_total_r_inc3 = full_df.loc[:, col] == 'recomp-inc3'

total_win_astar_secs = full_df.loc[min_total_astar, idx[('Total Time including setup (us)'), :]] / 1000000
total_win_inc3_secs = full_df.loc[min_total_inc3, idx[('Total Time including setup (us)'), :]] / 1000000
total_win_r_astar_secs = full_df.loc[min_total_r_astar, idx[('Total Time including setup (us)'), :]] / 1000000
total_win_r_inc3_secs = full_df.loc[min_total_r_inc3, idx[('Total Time including setup (us)'), :]] / 1000000

In [None]:
total_win_r_inc3_secs.describe()

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

class_dist = full_df.loc[:, idx[('Min'), ('Total Time including setup (us)')]].value_counts()
class_dist.plot(kind='bar', ax=ax, color='dodgerblue', alpha=0.5, edgecolor='black', linewidth=1.5);

xticklabels = [
    'CLASSIC', 'CLASSIC-SP', 'RECOMPOSE', 'RECOMPOSE-SP'
]
ax.set_xticklabels(xticklabels, size=20, rotation=0)

total_wins = [
    total_win_astar_secs,
    total_win_inc3_secs,
    total_win_r_astar_secs,
    total_win_r_inc3_secs
]

algo = [
    'astar', 'inc3', 'recomp-astar', 'recomp-inc3'
]

means = [total_wins[i].loc[:, ('Total Time including setup (us)', algo[i])].mean() for i in range(4)]
std = [ss.std() for ss in total_wins]

# ax_twin = ax.twinx()

# ax_twin.plot(means, marker='^', linestyle='', )

fig.tight_layout()
fig.savefig('./class-dist.svg', bbox_inches='tight', rasterized=True)

# Scrap... 

In [None]:
concat_df.head(2)

### Class distribution

In [None]:
concat_df.loc[(concat_df[('Min', 'Total Time including setup (us)')] == 'astar'),('Cost of the alignment','astar')].describe()

In [None]:
concat_df.loc[(concat_df[('Min', 'Total Time including setup (us)')] == 'inc3'),('Cost of the alignment', 'inc3')].describe()

In [None]:
temp_df.describe()

In [None]:
temp_df = concat_df['n_sync_move'] + concat_df['n_invis_move'] + concat_df['n_model_move'] + concat_df['n_log_move']
temp_df.columns = pd.MultiIndex.from_product([['alignment_length'], ['astar', 'inc3', 'recomp-astar', 'recomp-inc3']])
concat_df = pd.concat([concat_df, temp_df], axis=1)

In [None]:
idx = pd.IndexSlice

astar_invalid = concat_df.loc[:,('is_valid','astar')] == False
min_is_astar = concat_df.loc[:,('Min', 'Total Time including setup (us)')] == 'astar'
concat_df.loc[(astar_invalid)&(min_is_astar),idx[('is_valid', slice(None))]]

In [None]:
concat_df.columns.get_level_values(0)

In [None]:
idx = pd.IndexSlice
concat_df.loc[:, idx[('Min', 'is_valid'),('Total Time including setup (us)', 'astar', 'inc3', 'recomp-astar', 'recomp-inc3')]]

In [None]:
astar_invalid = concat_df.loc[:,('is_valid', 'astar')] == False
inc3_invalid = concat_df.loc[:,('is_valid', 'inc3')] == False
recomp_astar_invalid = concat_df.loc[:,('is_valid', 'recomp-astar')] == False
recomp_inc3_invalid = concat_df.loc[:,('is_valid', 'recomp-inc3')] == False
min_total_astar = concat_df.loc[:,('Min', 'Total Time including setup (us)')] == 'astar'
min_total_inc3 = concat_df.loc[:,('Min', 'Total Time including setup (us)')] == 'inc3'
min_total_recomp_astar = concat_df.loc[:,('Min', 'Total Time including setup (us)')] == 'recomp-astar'
min_total_recomp_inc3 = concat_df.loc[:,('Min', 'Total Time including setup (us)')] == 'recomp-inc3'
min_align_astar = concat_df.loc[:,('Min', 'Time to compute alignment (us)')] == 'astar'
min_align_inc3 = concat_df.loc[:,('Min', 'Time to compute alignment (us)')] == 'inc3'
min_align_recomp_astar = concat_df.loc[:,('Min', 'Time to compute alignment (us)')] == 'recomp-astar'
min_align_recomp_inc3 = concat_df.loc[:,('Min', 'Time to compute alignment (us)')] == 'recomp-inc3'

concat_df.loc[astar_invalid & inc3_invalid & recomp_astar_invalid & recomp_inc3_invalid,:].shape

In [None]:
concat_df.loc[astar_invalid | inc3_invalid | recomp_astar_invalid | recomp_inc3_invalid,:].shape

In [None]:
concat_df.loc[astar_invalid & min_astar, idx[('Min', 'is_valid'),('Total Time including setup (us)', 'astar')]]

In [None]:
concat_df.loc[astar_invalid & min_align_astar,:].shape

In [None]:
concat_df.loc[inc3_invalid & min_total_inc3,:].shape

In [None]:
concat_df.loc[recomp_astar_invalid & min_total_recomp_astar, :].shape

In [None]:
concat_df.loc[recomp_inc3_invalid & min_total_recomp_inc3, :].shape

In [None]:
concat_df['Total Time including setup (us)'].max()

In [None]:
concat_df.loc[(concat_df[('Min', 'Total Time including setup (us)')] == 'recomp-inc3'),[('Cost of the alignment','astar'), ('')]].describe()