In [1]:
import os
from sys import maxsize
import numpy as np
import pandas as pd
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Working directory of the kernel and its "ref" sub-folder path.
dir_home = os.getcwd()
dir_ref = "".join([dir_home, "/ref"])

In [2]:

def async_move(event):
  """Return the cost of an asynchronous move for `event`.

  The cost is the constant 1; `event` itself is not inspected.
  """
  unit_cost = 1
  return unit_cost

def sync_move(event1, event2):
  """Return the cost of aligning `event1` with `event2`.

  Events are dicts with a "label" and a "data" dict. Different labels cost
  `maxsize` (standing in for infinity). Otherwise the cost is the number of
  data keys that are missing on one side or whose values are not equal.
  """
  if event1["label"] != event2["label"]:
    return maxsize

  left, right = event1["data"], event2["data"]
  mismatches = 0
  for key in set(left.keys()).union(right.keys()):
    if key not in left or key not in right:
      # Key present on one side only.
      mismatches += 1
    elif not (left[key] == right[key]):
      mismatches += 1
  return mismatches

In [3]:
def sync_move_standard(event1, event2):
  """Synchronous-move cost that also compares event data.

  Returns `maxsize` when the labels differ. For equal labels, returns one
  penalty point per data key that is absent from either event or whose two
  values are not equal.
  """
  if event1["label"] == event2["label"]:
    first, second = event1["data"], event2["data"]
    every_key = set(first.keys()) | set(second.keys())
    return sum(
      1
      for name in every_key
      if name not in first or name not in second or not (first[name] == second[name])
    )
  return maxsize

def sync_move_levenshtein(event1, event2):
  """Label-only synchronous-move cost: 0 for equal labels, `maxsize` otherwise."""
  if event1["label"] == event2["label"]:
    return 0
  return maxsize

def distance_faster(trace1, trace2, sync_move, async_cost=None):
  """Edit distance between two traces via dynamic programming.

  Parameters
  ----------
  trace1, trace2 : sequences of events.
  sync_move : callable(event1, event2) -> cost of aligning the two events.
  async_cost : optional callable(event) -> cost of skipping one event.
      Defaults to the notebook-level `async_move`.

  Returns
  -------
  The minimal total cost of turning `trace1` into `trace2`.

  The first row and column hold the cumulative cost of skipping a whole
  prefix. The previous version stored only the cost of a single move there
  and read `trace1[j-1]` with `j == 0`, i.e. the last event of `trace1`.
  """
  if async_cost is None:
    async_cost = async_move

  rows, columns = len(trace1), len(trace2)
  delta = [[0] * (columns + 1) for _ in range(rows + 1)]

  # Borders: aligning a prefix against the empty trace skips every event in it.
  for i in range(1, rows + 1):
    delta[i][0] = delta[i-1][0] + async_cost(trace1[i-1])
  for j in range(1, columns + 1):
    delta[0][j] = delta[0][j-1] + async_cost(trace2[j-1])

  for i in range(1, rows + 1):
    for j in range(1, columns + 1):
      delta[i][j] = min(
              delta[i-1][j-1] + sync_move(trace1[i-1], trace2[j-1]),
              delta[i-1][j] + async_cost(trace1[i-1]),
              delta[i][j-1] + async_cost(trace2[j-1]))
  return delta[rows][columns]

def distance_standard(trace1, trace2):
  """Trace distance using `sync_move_standard` as the synchronous-move cost."""
  cost = distance_faster(trace1, trace2, sync_move=sync_move_standard)
  return cost

def distance_levenshtein(trace1, trace2):
  """Trace distance using `sync_move_levenshtein` as the synchronous-move cost."""
  cost = distance_faster(trace1, trace2, sync_move=sync_move_levenshtein)
  return cost

# 1. Prepare Sepsis event log

In [4]:
# First load of the Sepsis training log with pandas-inferred dtypes;
# `data` is assigned again by the following cell's read_csv call.
data = pd.read_csv(filepath_or_buffer="training_set_sepsis.csv")

In [5]:
# Read the Sepsis training log, forcing the six ':low'/':high' columns to float64.
_sepsis_float_cols = ['pL:low', 'pL:high', 'pW:low', 'pW:high', 'sM:low', 'sM:high']
data = pd.read_csv("training_set_sepsis.csv",
                   dtype=dict.fromkeys(_sepsis_float_cols, np.float64))

In [6]:
# Keep only the case id, the activity and the six ':low'/':high' columns.
_sepsis_train_cols = ['Case ID', 'Activity',
                      'pL:low', 'pL:high', 'pW:low', 'pW:high',
                      'sM:low', 'sM:high']
df_train = data[_sepsis_train_cols]

In [47]:
# Both Sepsis test files are read with the same float64 dtypes and cut down
# to the same column selection.
_sepsis_bounds = ['pL:low', 'pL:high', 'pW:low', 'pW:high', 'sM:low', 'sM:high']
_sepsis_dtypes = dict.fromkeys(_sepsis_bounds, np.float64)
_sepsis_keep = ['Case ID', 'Activity'] + _sepsis_bounds

df_test1 = pd.read_csv("test_set_sepsis.csv", dtype=_sepsis_dtypes)
df_test2 = pd.read_csv("test_set_result_sepsis.csv", dtype=_sepsis_dtypes)

df_test1 = df_test1[_sepsis_keep]
df_test2 = df_test2[_sepsis_keep]

In [48]:
def f(x):
    """Turn the rows of one case into event dicts carrying all six ':low'/':high' values."""
    events = []
    for _, y in x.iterrows():
        events.append({"label": y['Activity'],
                       "data": {"pL:low": y['pL:low'], "pL:high": y['pL:high'],
                                "pW:low": y['pW:low'], "pW:high": y['pW:high'],
                                "sM:low": y['sM:low'], "sM:high": y['sM:high']}})
    return events


def g(x):
    """Turn the rows of one case into event dicts that keep only the ':low' value per attribute."""
    events = []
    for _, y in x.iterrows():
        events.append({"label": y['Activity'],
                       "data": {"pL": y['pL:low'],
                                "pW": y['pW:low'],
                                "sM": y['sM:low']}})
    return events


_trace_cols = ['Activity', 'pL:low', 'pL:high', 'pW:low', 'pW:high',
               'sM:low', 'sM:high']

# One row per case: index is the Case ID, column 0 holds the list of event dicts.
trace_train = df_train.groupby('Case ID')[_trace_cols].apply(f).reset_index().set_index('Case ID')
trace_test1 = df_test1.groupby('Case ID')[_trace_cols].apply(g).reset_index().set_index('Case ID')
trace_test2 = df_test2.groupby('Case ID')[_trace_cols].apply(f).reset_index().set_index('Case ID')

# 2. Save the ground truth of original traces (Sepsis)

In [75]:
# Ground truth: for every trace in trace_test2, the Case IDs of all training
# traces whose string representation is identical.
#
# The training traces are indexed by their string form once, so each test
# trace is a dictionary lookup instead of a scan over all training traces.
# Iterating the column directly also avoids integer keys on a Series whose
# index holds Case ID labels.
train_ids_by_trace = {}
for case_id, trace in zip(trace_train.index, trace_train[0]):
    train_ids_by_trace.setdefault(str(trace), []).append(case_id)

# One list per test trace, in trace_test2 order; matches keep training order.
label = [list(train_ids_by_trace.get(str(trace), [])) for trace in trace_test2[0]]

In [76]:
import pickle
# Write the label lists to the binary file "label_sepsis" with pickle.
with open("label_sepsis", "wb") as label_file:
    pickle.dump(label, label_file)

# 3. Generate Encoded matrix (Sepsis)

In [18]:
acts = df_train.Activity.unique().tolist()
attrs = ['pL', 'pW', 'sM']
# One column per (activity, attribute) pair, named "<activity>:<attribute>".
attrs_expand = [x + ":" + y for x in acts for y in attrs]

pd.options.display.float_format = '{:.0f}'.format

# Value stored when the row's activity is not the column's activity.
nan_interval = str([str(np.nan), str(np.nan)])

# Each "<activity>:<attribute>" column holds the string "['<low>', '<high>']"
# (one decimal place) on rows of that activity, and the nan interval elsewhere.
# Columns are built as whole lists instead of chained per-cell assignment
# (df[col][row] = ...), which does not reliably write through to the frame.
# Rows are matched on the exact activity name rather than by substring.
encoded_frames = []
for frame in (df_train, df_test1):
    interval_cols = {}
    for act in acts:
        is_act = (frame['Activity'] == act).to_numpy()
        for name in attrs:
            lows = frame[name + ':low'].to_numpy()
            highs = frame[name + ':high'].to_numpy()
            interval_cols[act + ":" + name] = [
                str([format(lo, '.1f'), format(hi, '.1f')]) if hit else nan_interval
                for hit, lo, hi in zip(is_act, lows, highs)
            ]
    # Drop any previous version of these columns so re-running the cell is safe.
    base = frame.drop(columns=list(interval_cols), errors='ignore')
    encoded_frames.append(
        pd.concat([base, pd.DataFrame(interval_cols, index=frame.index)], axis=1))

df_train, df_test1 = encoded_frames

In [19]:
import ast

# dtype=int keeps the indicator columns numeric on pandas versions where
# get_dummies produces booleans by default.
dt_train_transformed = pd.get_dummies(df_train[ ['Case ID', 'Activity'] + attrs_expand], columns=attrs_expand, dtype=int)
# Not read again in this cell; kept because the name was defined here before.
dt_test_transformed = pd.get_dummies(df_test1[ ['Case ID', 'Activity'] + attrs_expand], columns=attrs_expand)

cols = dt_train_transformed.columns[2:].tolist()

# Parse each indicator column name "<attr>_['<low>', '<high>']" once.
# The interval is taken from everything after the "<attr>_" prefix, so
# underscores inside activity names do not break the parsing.
intervals_by_attr = {attr: [] for attr in attrs_expand}
for tcol in cols:
    for attr in attrs_expand:
        prefix = attr + '_'
        if tcol.startswith(prefix):
            y = ast.literal_eval(tcol[len(prefix):])
            intervals_by_attr[attr].append((tcol, float(y[0]), float(y[1])))

dt_test1_transformed = pd.DataFrame(0, index=np.arange(len(df_test1)) , columns= cols)

# Set an indicator to 1 wherever the row's interval lies inside the column's
# interval. Comparisons against nan are False, so nan intervals never match.
# NOTE(review): the training frame starts from the get_dummies one-hot (which
# already marks the "['nan', 'nan']" columns) while the test frame starts from
# zeros, so those nan columns can only be 1 for training rows - confirm intended.
for target, source in ((dt_train_transformed, df_train), (dt_test1_transformed, df_test1)):
    for attr in attrs_expand:
        bounds = [ast.literal_eval(s) for s in source[attr]]
        x_low = np.array([float(b[0]) for b in bounds], dtype=float)
        x_high = np.array([float(b[1]) for b in bounds], dtype=float)
        for tcol, y_low, y_high in intervals_by_attr[attr]:
            contained = (x_low >= y_low) & (x_high <= y_high)
            # .loc writes into the frame itself (no chained assignment).
            target.loc[contained, tcol] = 1

dt_test1_transformed = pd.concat([df_test1[['Case ID', 'Activity']], dt_test1_transformed], axis=1)
# Prefix test case ids so they cannot collide with training case ids.
dt_test1_transformed['Case ID'] = ['test_' + i for i in dt_test1_transformed['Case ID']]

df = pd.concat([dt_train_transformed, dt_test1_transformed]).reset_index(drop=True)

In [32]:
from transformers.LastStateTransformer import LastStateTransformer
from transformers.AggregateTransformer import AggregateTransformer
from transformers.IndexBasedTransformer import IndexBasedTransformer
from transformers.AggregateNgramTransformer20 import AggregateNgramTransformer

from sklearn.pipeline import Pipeline
# Working directory; the cells below prefix their data_trans output paths with it.
dir_home = os.getcwd()

In [33]:
# Number embedded in the output file names of this and the following cells.
m = 1

# Run AggregateTransformer over df and write the result to data_trans.
aggregate_step = AggregateTransformer(case_id_col='Case ID',
                                      cat_cols=['Activity'],
                                      num_cols=cols)
pipe = Pipeline(steps=[("AggregateTransformer", aggregate_step)])
encoded_df = pipe.fit_transform(df)

out_path = dir_home + "/data_trans/sepsis8_aggregate_" + str(m) + ".csv"
encoded_df.to_csv(out_path, index=True)

In [21]:
# Run AggregateTransformer with boolean=True over df and write the result to data_trans.
bool_step = AggregateTransformer(case_id_col='Case ID',
                                 cat_cols=['Activity'],
                                 num_cols=cols,
                                 boolean=True)
pipe = Pipeline(steps=[("AggregateTransformer", bool_step)])
encoded_df = pipe.fit_transform(df)

out_path = dir_home + "/data_trans/sepsis8_bool_" + str(m) + ".csv"
encoded_df.to_csv(out_path, index=True)

In [23]:
# Run IndexBasedTransformer over df and write the result to data_trans.
index_step = IndexBasedTransformer(case_id_col='Case ID',
                                   cat_cols=['Activity'],
                                   num_cols=cols)
pipe = Pipeline(steps=[("IndexBasedTransformer", index_step)])
encoded_df = pipe.fit_transform(df)

out_path = dir_home + "/data_trans/sepsis8_index_" + str(m) + ".csv"
encoded_df.to_csv(out_path, index=True)

In [None]:
# Run AggregateNgramTransformer (n=2, v=0.7) over df and write the result to data_trans.
# NOTE(review): 'A1_Diagnose' and 'A2_CRP' do not appear among the columns this
# notebook builds for `df` ('Case ID', 'Activity' and the entries of `cols`), and
# this cell carries no execution count - confirm the intended cat_cols/num_cols
# before running it.
pipe = Pipeline(steps=[
        ("AggregateNgramTransformer", AggregateNgramTransformer(case_id_col = 'Case ID',
                        act_col = 'Activity', n=2 , v= 0.7,
                        cat_cols = ['A1_Diagnose'],
                        num_cols = ['A2_CRP']))
        ])
encoded_df = pipe.fit_transform(df)

encoded_df.to_csv(dir_home+"/data_trans/sepsis8_aggngram2_" + str(m) + ".csv", index= True)

In [25]:
# Run LastStateTransformer over df and write the result to data_trans.
last_state_step = LastStateTransformer(case_id_col='Case ID',
                                       cat_cols=['Activity'],
                                       num_cols=cols)
pipe = Pipeline(steps=[("LastStateTransformer", last_state_step)])
encoded_df = pipe.fit_transform(df)

out_path = dir_home + "/data_trans/sepsis8_laststate_" + str(m) + ".csv"
encoded_df.to_csv(out_path, index=True)

================================================================================

# 1. Prepare Road Fines event log

In [4]:
# Read the Road Fines training log, forcing the ten ':low'/':high' columns to float64.
_road_float_cols = ['amount:low', 'amount:high',
                    'dismissal:low', 'dismissal:high',
                    'points:low', 'points:high',
                    'paymentAmount:low', 'paymentAmount:high',
                    'expense:low', 'expense:high']
data = pd.read_csv("training_set_roadfines.csv",
                   dtype=dict.fromkeys(_road_float_cols, np.float64))

In [5]:
# Keep only the case id, the activity and the ten ':low'/':high' columns.
_road_train_cols = ['Case ID', 'Activity',
                    'amount:low', 'amount:high', 'dismissal:low', 'dismissal:high',
                    'points:low', 'points:high', 'paymentAmount:low',
                    'paymentAmount:high', 'expense:low', 'expense:high']
df_train = data[_road_train_cols]

In [6]:
# Both Road Fines test files are read with the same float64 dtypes.
_road_dtypes = dict.fromkeys(['amount:low', 'amount:high',
                              'dismissal:low', 'dismissal:high',
                              'points:low', 'points:high',
                              'paymentAmount:low', 'paymentAmount:high',
                              'expense:low', 'expense:high'], np.float64)

df_test1 = pd.read_csv("test_set_roadfines.csv", dtype=_road_dtypes)
df_test2 = pd.read_csv("test_set_result_roadfines.csv", dtype=_road_dtypes)
df_test2.head()

Unnamed: 0,Case ID,Activity,Variant,Variant index,amount:low,amount:high,dismissal:low,dismissal:high,points:low,points:high,paymentAmount:low,paymentAmount:high,expense:low,expense:high
0,case 0,Create Fine,Variant 42,42,-9.223372e+18,9.223372e+18,-9.223372e+18,9.223372e+18,-9.223372e+18,9.223372e+18,,,,
1,case 1,Create Fine,Variant 42,42,-9.223372e+18,9.223372e+18,-9.223372e+18,9.223372e+18,-9.223372e+18,9.223372e+18,,,,
2,case 2,Create Fine,Variant 60,60,38.0,41.0,-9.223372e+18,9.223372e+18,-9.223372e+18,9.223372e+18,,,,
3,case 2,Payment,Variant 60,60,,,,,,,-9.223372e+18,9.223372e+18,,
4,case 3,Create Fine,Variant 42,42,-9.223372e+18,9.223372e+18,-9.223372e+18,9.223372e+18,-9.223372e+18,9.223372e+18,,,,


In [7]:
# Cut both test frames down to the case id, the activity and the ten ':low'/':high' columns.
_road_keep = ['Case ID', 'Activity',
              'amount:low', 'amount:high', 'dismissal:low', 'dismissal:high',
              'points:low', 'points:high', 'paymentAmount:low',
              'paymentAmount:high', 'expense:low', 'expense:high']

df_test1 = df_test1[_road_keep]
df_test2 = df_test2[_road_keep]
df_test2.head()

Unnamed: 0,Case ID,Activity,amount:low,amount:high,dismissal:low,dismissal:high,points:low,points:high,paymentAmount:low,paymentAmount:high,expense:low,expense:high
0,case 0,Create Fine,-9.223372e+18,9.223372e+18,-9.223372e+18,9.223372e+18,-9.223372e+18,9.223372e+18,,,,
1,case 1,Create Fine,-9.223372e+18,9.223372e+18,-9.223372e+18,9.223372e+18,-9.223372e+18,9.223372e+18,,,,
2,case 2,Create Fine,38.0,41.0,-9.223372e+18,9.223372e+18,-9.223372e+18,9.223372e+18,,,,
3,case 2,Payment,,,,,,,-9.223372e+18,9.223372e+18,,
4,case 3,Create Fine,-9.223372e+18,9.223372e+18,-9.223372e+18,9.223372e+18,-9.223372e+18,9.223372e+18,,,,


In [8]:
def f(x):
    """Turn the rows of one case into event dicts carrying all ten ':low'/':high' values."""
    events = []
    for _, y in x.iterrows():
        events.append({"label": y['Activity'],
                       "data": {"amount:low": y['amount:low'], "amount:high": y['amount:high'],
                                "dismissal:low": y['dismissal:low'], "dismissal:high": y['dismissal:high'],
                                "points:low": y['points:low'], "points:high": y['points:high'],
                                "paymentAmount:low": y['paymentAmount:low'], "paymentAmount:high": y['paymentAmount:high'],
                                "expense:low": y['expense:low'], "expense:high": y['expense:high']}})
    return events


def g(x):
    """Turn the rows of one case into event dicts that keep only the ':low' value per attribute."""
    events = []
    for _, y in x.iterrows():
        events.append({"label": y['Activity'],
                       "data": {"amount": y['amount:low'],
                                "dismissal": y['dismissal:low'],
                                "points": y['points:low'],
                                "paymentAmount": y['paymentAmount:low'],
                                "expense": y['expense:low']}})
    return events


_trace_cols = ['Activity', 'amount:low', 'amount:high', 'dismissal:low', 'dismissal:high',
               'points:low', 'points:high', 'paymentAmount:low',
               'paymentAmount:high', 'expense:low', 'expense:high']

# One row per case: index is the Case ID, column 0 holds the list of event dicts.
trace_train = df_train.groupby('Case ID')[_trace_cols].apply(f).reset_index().set_index('Case ID')
trace_test1 = df_test1.groupby('Case ID')[_trace_cols].apply(g).reset_index().set_index('Case ID')
trace_test2 = df_test2.groupby('Case ID')[_trace_cols].apply(f).reset_index().set_index('Case ID')

In [9]:
# For statistics

attrs = ['amount', 'dismissal', 'points', 'paymentAmount', 'expense']
df_train2 = df_train.copy()

# Collapse each attribute's ':low'/':high' pair into one string column
# "['<low>', '<high>']". The column is assigned as a whole list, replacing the
# per-cell chained assignment (df[col][row] = ...), which does not reliably
# write through to the frame.
for attr in attrs:
    lows = df_train2[attr + ':low'].to_numpy()
    highs = df_train2[attr + ':high'].to_numpy()
    df_train2[attr] = [str([str(low), str(high)]) for low, high in zip(lows, highs)]

# The separate bound columns are no longer needed once the pairs are collapsed.
df_train2 = df_train2.drop(columns = ['amount:low', 'amount:high', 'dismissal:low', 'dismissal:high',
                 'points:low', 'points:high', 'paymentAmount:low',
                 'paymentAmount:high', 'expense:low', 'expense:high']).reset_index(drop=True)

# 2. Save the ground truth of original traces (Road Fines)

In [14]:
# Ground truth: for every trace in trace_test2, the Case IDs of all training
# traces whose string representation is identical.
#
# The training traces are indexed by their string form once, so each test
# trace is a dictionary lookup instead of a scan over all training traces.
# Iterating the column directly also avoids integer keys on a Series whose
# index holds Case ID labels.
train_ids_by_trace = {}
for case_id, trace in zip(trace_train.index, trace_train[0]):
    train_ids_by_trace.setdefault(str(trace), []).append(case_id)

# One list per test trace, in trace_test2 order; matches keep training order.
label = [list(train_ids_by_trace.get(str(trace), [])) for trace in trace_test2[0]]

In [15]:
import pickle

# Write the label lists to the binary file "label_road" with pickle.
with open("label_road", "wb") as label_file:
    pickle.dump(label, label_file)

# 3. Generate Encoded matrix (Road Fines)

In [16]:
acts = df_train.Activity.unique().tolist()
attrs = ['amount', 'dismissal', 'points', 'paymentAmount', 'expense']
# One column per (activity, attribute) pair, named "<activity>:<attribute>".
attrs_expand = [x + ":" + y for x in acts for y in attrs]

pd.options.display.float_format = '{:.0f}'.format

# Value stored when the row's activity is not the column's activity.
nan_interval = str([str(np.nan), str(np.nan)])

# Each "<activity>:<attribute>" column holds the string "['<low>', '<high>']"
# (one decimal place) on rows of that activity, and the nan interval elsewhere.
# Columns are built as whole lists instead of chained per-cell assignment
# (df[col][row] = ...), which does not reliably write through to the frame.
# Rows are matched on the exact activity name rather than by substring.
encoded_frames = []
for frame in (df_train, df_test1):
    interval_cols = {}
    for act in acts:
        is_act = (frame['Activity'] == act).to_numpy()
        for name in attrs:
            lows = frame[name + ':low'].to_numpy()
            highs = frame[name + ':high'].to_numpy()
            interval_cols[act + ":" + name] = [
                str([format(lo, '.1f'), format(hi, '.1f')]) if hit else nan_interval
                for hit, lo, hi in zip(is_act, lows, highs)
            ]
    # Drop any previous version of these columns so re-running the cell is safe.
    base = frame.drop(columns=list(interval_cols), errors='ignore')
    encoded_frames.append(
        pd.concat([base, pd.DataFrame(interval_cols, index=frame.index)], axis=1))

df_train, df_test1 = encoded_frames

In [17]:
import ast

# dtype=int keeps the indicator columns numeric on pandas versions where
# get_dummies produces booleans by default.
dt_train_transformed = pd.get_dummies(df_train[ ['Case ID', 'Activity'] + attrs_expand], columns=attrs_expand, dtype=int)
# Not read again in this cell; kept because the name was defined here before.
dt_test_transformed = pd.get_dummies(df_test1[ ['Case ID', 'Activity'] + attrs_expand], columns=attrs_expand)

cols = dt_train_transformed.columns[2:].tolist()

# Parse each indicator column name "<attr>_['<low>', '<high>']" once.
# The interval is taken from everything after the "<attr>_" prefix, so
# underscores inside activity names do not break the parsing.
intervals_by_attr = {attr: [] for attr in attrs_expand}
for tcol in cols:
    for attr in attrs_expand:
        prefix = attr + '_'
        if tcol.startswith(prefix):
            y = ast.literal_eval(tcol[len(prefix):])
            intervals_by_attr[attr].append((tcol, float(y[0]), float(y[1])))

dt_test1_transformed = pd.DataFrame(0, index=np.arange(len(df_test1)) , columns= cols)

# Set an indicator to 1 wherever the row's interval lies inside the column's
# interval. Comparisons against nan are False, so nan intervals never match.
# NOTE(review): the training frame starts from the get_dummies one-hot (which
# already marks the "['nan', 'nan']" columns) while the test frame starts from
# zeros, so those nan columns can only be 1 for training rows - confirm intended.
for target, source in ((dt_train_transformed, df_train), (dt_test1_transformed, df_test1)):
    for attr in attrs_expand:
        bounds = [ast.literal_eval(s) for s in source[attr]]
        x_low = np.array([float(b[0]) for b in bounds], dtype=float)
        x_high = np.array([float(b[1]) for b in bounds], dtype=float)
        for tcol, y_low, y_high in intervals_by_attr[attr]:
            contained = (x_low >= y_low) & (x_high <= y_high)
            # .loc writes into the frame itself (no chained assignment).
            target.loc[contained, tcol] = 1

dt_test1_transformed = pd.concat([df_test1[['Case ID', 'Activity']], dt_test1_transformed], axis=1)
# Prefix test case ids so they cannot collide with training case ids.
dt_test1_transformed['Case ID'] = ['test_' + i for i in dt_test1_transformed['Case ID']]

df = pd.concat([dt_train_transformed, dt_test1_transformed]).reset_index(drop=True)

In [18]:
from transformers.LastStateTransformer import LastStateTransformer
from transformers.AggregateTransformer import AggregateTransformer
from transformers.IndexBasedTransformer import IndexBasedTransformer
from transformers.AggregateNgramTransformer20 import AggregateNgramTransformer
from sklearn.pipeline import Pipeline
# Working directory; the cells below prefix their data_trans output paths with it.
dir_home = os.getcwd()

In [23]:
# Number embedded in the output file names of this and the following cells.
m = 1

# Run AggregateTransformer over df, passing Activity plus every indicator
# column as categorical, and write the result to data_trans.
aggregate_step = AggregateTransformer(case_id_col='Case ID',
                                      cat_cols=['Activity'] + cols,
                                      num_cols=[])
pipe = Pipeline(steps=[("AggregateTransformer", aggregate_step)])
encoded_df = pipe.fit_transform(df)

out_path = dir_home + "/data_trans/road_aggregate_" + str(m) + ".csv"
encoded_df.to_csv(out_path, index=True)

In [24]:
# Run AggregateTransformer with boolean=True over df and write the result to data_trans.
bool_step = AggregateTransformer(case_id_col='Case ID',
                                 cat_cols=['Activity'] + cols,
                                 num_cols=[],
                                 boolean=True)
pipe = Pipeline(steps=[("AggregateTransformer", bool_step)])
encoded_df = pipe.fit_transform(df)

out_path = dir_home + "/data_trans/road_bool_" + str(m) + ".csv"
encoded_df.to_csv(out_path, index=True)

In [25]:
# Run IndexBasedTransformer over df and write the result to data_trans.
index_step = IndexBasedTransformer(case_id_col='Case ID',
                                   cat_cols=['Activity'] + cols,
                                   num_cols=[])
pipe = Pipeline(steps=[("IndexBasedTransformer", index_step)])
encoded_df = pipe.fit_transform(df)

out_path = dir_home + "/data_trans/road_index_" + str(m) + ".csv"
encoded_df.to_csv(out_path, index=True)

In [21]:
# Run AggregateNgramTransformer (n=2, v=0.7) over df and write the result to data_trans.
ngram_step = AggregateNgramTransformer(case_id_col='Case ID',
                                       act_col='Activity', n=2, v=0.7,
                                       cat_cols=cols,
                                       num_cols=[])
pipe = Pipeline(steps=[("AggregateNgramTransformer", ngram_step)])
encoded_df = pipe.fit_transform(df)

out_path = dir_home + "/data_trans/road_aggngram2_" + str(m) + ".csv"
encoded_df.to_csv(out_path, index=True)

In [32]:
# Run LastStateTransformer over df and write the result to data_trans.
last_state_step = LastStateTransformer(case_id_col='Case ID',
                                       cat_cols=['Activity'] + cols,
                                       num_cols=[])
pipe = Pipeline(steps=[("LastStateTransformer", last_state_step)])
encoded_df = pipe.fit_transform(df)

out_path = dir_home + "/data_trans/road_laststate_" + str(m) + ".csv"
encoded_df.to_csv(out_path, index=True)

================================================================