In [1]:
from transformers.PreviousStateTransformer import PreviousStateTransformer
from transformers.LastStateTransformer import LastStateTransformer
from transformers.AggregateTransformer import AggregateTransformer
from transformers.IndexBasedTransformer import IndexBasedTransformer
from transformers.ComplexIndexBasedTransformer import ComplexIndexBasedTransformer
from transformers.ComplexIndexNgramTransformer import ComplexIndexNgramTransformer
from transformers.StaticTransformer import StaticTransformer
import pm4py
import os
from sklearn.pipeline import Pipeline
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import pickle
import warnings
from scipy.spatial import distance
warnings.filterwarnings(action='ignore')
dir_home = os.getcwd()

dir_ref = dir_home +"/ref"
import time 

In [2]:
from sklearn.neighbors import NearestNeighbors

def mydist(x, y):
    return np.sum(x*y) # for minimizing problem

def nearest_neighbors(values, all_values, nbr_neighbors, algorithm, metric, metric_params:None):
    nn = NearestNeighbors(n_neighbors = nbr_neighbors, algorithm=algorithm, metric = metric, metric_params = metric_params).fit(all_values)  # algo: 'auto', 'ball_tree', 'kd_tree', 'brute'
    dists, idxs = nn.kneighbors(values)

    return dists, idxs


def nearest_neighbors2(values, all_values, nbr_neighbors):
    nn = NearestNeighbors(n_neighbors = nbr_neighbors, algorithm='ball_tree', metric = 'pyfunc', metric_params={"func":mydist}).fit(all_values)  # algo: 'auto', 'ball_tree', 'kd_tree', 'brute'
    dists, idxs = nn.kneighbors(values)
    return dists, idxs

# 1. The result in Table 4 can be obtained by running below code.

In [42]:
# SEPSIS DIST - general (k rate)
name = 'sepsis'
split = 0.5
rate = 0.3
encoding = ['bool', 'aggregate',  'index',  'laststate', 'aggngram']
metric = [ 'cosine', 'euclidean', 'manhattan']

m=7
result = pd.DataFrame(columns=['data', 'encoding_method' , 'distance_metric', 'precision1', 'precision2',
                               'mean', 'std', 'min', 'max', 'sim', 'time'])
def minmax(x):
    return (x-min(x))/(max(x)-min(x))

for e in encoding:
    print(e)
    data = pd.read_csv(dir_home + '/data_trans/' + name + "_" + e + "_" + str(m)  +".csv")
    data = data.fillna(-1)
    
    train = data[~data['Case ID'].astype(str).str.contains('\_', na=False)].reset_index(drop=True)
    test = data[data['Case ID'].astype(str).str.contains('\_', na=False)].reset_index(drop=True)

    train.index = train['Case ID']
    train = train.drop( 'Case ID', axis=1)
    test.index = test['Case ID']
    test = test.drop( 'Case ID', axis=1)
    
    if e == "ngram" or "complexngram" or "aggngram":
        train_trace = train.filter(regex='\|')
        test_trace = test.filter(regex='\|')
    else:
        train_trace = train.filter(regex='Activity_')
        test_trace = test.filter(regex='Activity_')    
    
    train_attr = train.drop(train_trace.columns , axis =1)
    test_attr = test.drop(test_trace.columns, axis = 1)
    

    w_a = split*np.repeat(1, len(train_trace.columns))/len(train_trace.columns)
    w_a = w_a.tolist()

    w_b = pd.DataFrame([str.split(c, "_")[0] for c in train_attr.columns], columns=['key'] )
    w_b['weight'] = 1
    w_b['weight2'] = w_b.groupby('key')['weight'].cumsum()
    w_b['max'] = w_b.groupby('key')['weight2'].transform(max)
    w_b['weight3'] = w_b['weight']/w_b['max']

    w_b = w_b['weight3'].tolist()
    w_b = [(1-split)*w/sum(w_b) for w in w_b]
    
    loc_a = [ train.columns.tolist().index(ttt)  for ttt in train_trace.columns]
    loc_b = [ train.columns.tolist().index(ttt)  for ttt in train_attr.columns]
    customized_weights = np.repeat(0.0, len(train.columns))
    customized_weights[loc_a] = w_a
    customized_weights[loc_b] = w_b
    
    train = train.apply(lambda x: x*customized_weights, axis= 1)
    test = test.apply(lambda x: x*customized_weights, axis= 1)
    
    
    encoded_train = train.values.tolist()
    encoded_test = test.values.tolist()
    
    encoded_train2 = train_trace.values.tolist()
    encoded_test2 = test_trace.values.tolist()

    encoded_test_attr = test_attr.values.tolist()
    for d in metric:
        time_start_align = time.time()
        
        dists, idxs = nearest_neighbors(np.array(encoded_test), 
                                    np.array(encoded_train), 
                                    nbr_neighbors = int(np.floor(len(train)*rate)), 
                                    algorithm = 'auto',
                                    metric = d,
                                    metric_params = None) # 20 , len(sn_encoded_train) 
        

        predict = list()
        dist_mean = list()
        dist_std = list()
        dist_min = list()
        dist_max = list()
        sim = list()
        for i in range(0, len(test_trace)):
            predict.append(  train.index[idxs[i]] )
            dist_mean.append(  np.mean(dists[i]) )
            dist_std.append(  np.std(dists[i]) )
            dist_min.append(  np.min(dists[i]) )
            dist_max.append(  np.max(dists[i]) )
            sim.append( np.mean( 1/(1+dists[i])) )
            
            
        time_finish_align = time.time()


        with open("ref_cocomot" , "rb") as fp:
            ref_predict = pickle.load(fp, encoding='utf-8') 
        with open("ref_leven" , "rb") as fp:
            ref_predict2 = pickle.load(fp, encoding='utf-8') 
        
        recall_sum = 0
        precision_sum = 0
        acc_sum1 = 0 
        acc_sum2 = 0 
        for l in range(0,30):  # change
            i1 = ref_predict[l]
            acc_1 = sum([1 for j in i1 if j[0] in predict[l]])/len(i1)
            i2 = ref_predict2[l]
            acc_2 = sum([1 for j in i2 if j[0] in predict[l]])/len(i2)
            
            acc_sum1 = acc_sum1 + acc_1
            acc_sum2 = acc_sum2 + acc_2
        acc1 = acc_sum1/30
        acc2 = acc_sum2/30

        result.loc[len(result)+1] = [name, e, d, acc1, acc2, np.mean(dist_mean), np.mean(dist_std),
                                np.mean(dist_min), np.mean(dist_max), np.mean(sim),
                                (time_finish_align - time_start_align) ]
        
result.to_csv(dir_home+"/result/result_dist30_" + name + "_" + str(rate) +"_"  + str(split) + ".csv", index= False) 

bool
aggregate
index
laststate
aggngram


# 2. The result in Table 6 can be obtained by running below code.

In [55]:
# SEPSIS DIST - general (k size)
name = 'sepsis'
split = 0.5
size = 10
encoding = ['bool', 'aggregate',  'index',  'laststate', 'aggngram']
metric = [ 'cosine', 'euclidean', 'manhattan']

m=7
result = pd.DataFrame(columns=['data', 'encoding_method' , 'distance_metric', 'precision1', 'precision2',
                               'mean', 'std', 'min', 'max', 'sim1', 'sim2', 'time'])
def minmax(x):
    return (x-min(x))/(max(x)-min(x))

for e in encoding:
    print(e)
    data = pd.read_csv(dir_home + '/data_trans/' + name + "_" + e + "_" + str(m)  +".csv")
    data = data.fillna(-1)
    
    max_l = data.max(axis=0).values 
    ll = 0 
    col_loc =list()
    col_norm = list()
    for ml in max_l:
        if type(ml)==str:
            pass
        else:
            if ml > 1:
                col_loc.append(ll)
                col_norm.append(ml)
        ll = ll+1     
        
    for ll in range(len(col_loc)):
        data[data.columns[col_loc[ll]] ]  = data[data.columns[col_loc[ll]] ]/col_norm[ll]    
    
    train = data[~data['Case ID'].astype(str).str.contains('\_', na=False)].reset_index(drop=True)
    test = data[data['Case ID'].astype(str).str.contains('\_', na=False)].reset_index(drop=True)

    train.index = train['Case ID']
    train = train.drop( 'Case ID', axis=1)
    test.index = test['Case ID']
    test = test.drop( 'Case ID', axis=1)
    
    if e == "ngram" or "complexngram":
        train_trace = train.filter(regex='\|')
        test_trace = test.filter(regex='\|')
    else:
        train_trace = train.filter(regex='Activity_')
        test_trace = test.filter(regex='Activity_')    
    
    train_attr = train.drop(train_trace.columns , axis =1)
    test_attr = test.drop(test_trace.columns, axis = 1)
    

    w_a = split*np.repeat(1, len(train_trace.columns))/len(train_trace.columns)
    w_a = w_a.tolist()

    w_b = pd.DataFrame([str.split(c, "_")[0] for c in train_attr.columns], columns=['key'] )
    w_b['weight'] = 1
    w_b['weight2'] = w_b.groupby('key')['weight'].cumsum()
    w_b['max'] = w_b.groupby('key')['weight2'].transform(max)
    w_b['weight3'] = w_b['weight']/w_b['max']

    w_b = w_b['weight3'].tolist()
    w_b = [(1-split)*w/sum(w_b) for w in w_b]
    
    
    loc_a = [ train.columns.tolist().index(ttt)  for ttt in train_trace.columns]
    loc_b = [ train.columns.tolist().index(ttt)  for ttt in train_attr.columns]
    customized_weights = np.repeat(0.0, len(train.columns))
    customized_weights[loc_a] = w_a
    customized_weights[loc_b] = w_b
    
    train = train.apply(lambda x: x*customized_weights, axis= 1)
    test = test.apply(lambda x: x*customized_weights, axis= 1)
    
    
    encoded_train = train.values.tolist()
    encoded_test = test.values.tolist()
    
    encoded_train2 = train_trace.values.tolist()
    encoded_test2 = test_trace.values.tolist()

    encoded_test_attr = test_attr.values.tolist()
    for d in metric:
        time_start_align = time.time()
        
        dists, idxs = nearest_neighbors(np.array(encoded_test), 
                                    np.array(encoded_train), 
                                    nbr_neighbors = size, 
                                    algorithm = 'auto',
                                    metric = d,
                                    metric_params = None) # 20 , len(sn_encoded_train) 
        
        
        predict = list()
        dist_mean = list()
        dist_std = list()
        dist_min = list()
        dist_max = list()
        sim= list()
        for i in range(0, len(test_trace)):
            predict.append(  train.index[idxs[i]] )
            dist_mean.append(  np.mean(dists[i]) )
            dist_std.append(  np.std(dists[i]) )
            dist_min.append(  np.min(dists[i]) )
            dist_max.append(  np.max(dists[i]) )
            sim.append( np.mean( 1-dists[i]) )
            
            
        time_finish_align = time.time()

        with open("ref_cocomot" , "rb") as fp:
            ref_predict = pickle.load(fp, encoding='utf-8') 
        with open("ref_leven" , "rb") as fp:
            ref_predict2 = pickle.load(fp, encoding='utf-8') 
        
        recall_sum = 0
        precision_sum = 0
        acc_sum1 = 0 
        acc_sum2 = 0 
        dist1 = 0
        dist2 = 0
        sim1_sum = 0
        sim2_sum = 0
        for l in range(0,30):  # change
            i1 = ref_predict[l]
            acc_1 = sum([1 for j in i1 if j[0] in predict[l]])/len(i1)
            i2 = ref_predict2[l]
            acc_2 = sum([1 for j in i2 if j[0] in predict[l]])/len(i2)
            
            opt1 = train.loc[train.index.isin([rp1[0] for rp1 in i1.tolist()])]
            opt2 = train.loc[train.index.isin([rp1[0] for rp1 in i2.tolist()])]
            
            apa = train.loc[train.index.isin([rp1[0] for rp1 in predict[l]])]
            
            sim1 = 0
            sim2 = 0
            
            for apa_i in range(len(apa)):
                sim1_i = sum( [1-distance.euclidean(apa.iloc[apa_i] , opt1.iloc[opt1_i]) for opt1_i in range(len(opt1)) ] )/len(opt1)
                sim2_i = sum( [1-distance.euclidean(apa.iloc[apa_i] , opt2.iloc[opt2_i]) for opt2_i in range(len(opt2)) ] )/len(opt2)
                
                sim1 = sim1+ sim1_i
                sim2 = sim2+ sim2_i
            
            
            acc_sum1 = acc_sum1 + acc_1
            acc_sum2 = acc_sum2 + acc_2
            
            sim1_sum = sim1_sum + sim1/(len(apa)*len(opt1))
            sim2_sum = sim2_sum + sim2/(len(apa)*len(opt2))
        acc1 = acc_sum1/30
        acc2 = acc_sum2/30
        
        sim1_out = sim1_sum/30
        sim2_out = sim2_sum/30
        print(sim1_out)
        result.loc[len(result)+1] = [name, e, d, acc1, acc2, np.mean(dist_mean), np.mean(dist_std),
                                np.mean(dist_min), np.mean(dist_max), np.mean(sim1_out), np.mean(sim2_out),
                                (time_finish_align - time_start_align) ]
        
result.to_csv(dir_home+"/result/result_sim_" + name + "_" + str(size) +"_"  + str(split) + ".csv", index= False) 

bool
0.6062214432521722
0.6065076527028136
0.6062935329900856
aggregate
0.6079201295932517
0.6081753486347631
0.6082110928369853
index
0.6196312613847257
0.6195982285768463
0.6195989747372875
laststate
0.6020542505033757
0.5995266624451614
0.5995264535077536
aggngram
0.606774927940437
0.6065641960112671
0.6067143947371026


==========================================================================================================

# 3. The result in Table 3 can be obtained by running below code.

In [50]:
# Road - general (k rate)
name = 'road'
rate = 0.02
split = 0.5
encoding = ['bool', 'aggregate',  'index',  'laststate', 'aggngram']
metric = [ 'cosine', 'euclidean', 'manhattan']
m=1
result = pd.DataFrame(columns=['data', 'encoding_method' , 'distance_metric', 'precision', 
                               'mean', 'std', 'min', 'max', 'sim', 'time'])
def minmax(x):
    return (x-min(x))/(max(x)-min(x))

for e in encoding:
    print(e)
    data = pd.read_csv(dir_home + '/data_trans/' + name + "_" + e + "_" + str(m)  +".csv")
    data = data.fillna(-1)
    

    train = data[~data['Case ID'].astype(str).str.contains('test', na=False)].reset_index(drop=True)
    test = data[data['Case ID'].astype(str).str.contains('test', na=False)].reset_index(drop=True)

    train.index = train['Case ID']
    train = train.drop( 'Case ID', axis=1)
    test.index = test['Case ID']
    test = test.drop( 'Case ID', axis=1)
    
    if e == "ngram":
        train_trace = train.filter(regex='\|')
        test_trace = test.filter(regex='\|')
    else:
        train_trace = train.filter(regex='Activity_')
        test_trace = test.filter(regex='Activity_')    
    
    train_attr = train.drop(train_trace.columns , axis =1)
    test_attr = test.drop(test_trace.columns, axis = 1)

    w_a = split*np.repeat(1, len(train_trace.columns))/len(train_trace.columns)
    w_a = w_a.tolist()

    w_b = pd.DataFrame([str.split(c, "_")[0] for c in train_attr.columns], columns=['key'] )
    w_b['weight'] = 1
    w_b['weight2'] = w_b.groupby('key')['weight'].cumsum()
    w_b['max'] = w_b.groupby('key')['weight2'].transform(max)
    w_b['weight3'] = w_b['weight']/w_b['max']

    w_b = w_b['weight3'].tolist()
    w_b = [(1-split)*w/sum(w_b) for w in w_b]
    
    
    loc_a = [ train.columns.tolist().index(ttt)  for ttt in train_trace.columns]
    loc_b = [ train.columns.tolist().index(ttt)  for ttt in train_attr.columns]
    customized_weights = np.repeat(0.0, len(train.columns))
    customized_weights[loc_a] = w_a
    customized_weights[loc_b] = w_b

    train = train.apply(lambda x: x*customized_weights, axis= 1)
    test = test.apply(lambda x: x*customized_weights, axis= 1)
    
    encoded_train = train.values.tolist()
    encoded_test = test.values.tolist()

    encoded_test_attr = test_attr.values.tolist()
    for d in metric:
        time_start_align = time.time()
        
        
        dists, idxs = nearest_neighbors(np.array(encoded_test), 
                                    np.array(encoded_train), 
                                    nbr_neighbors = int(np.floor(len(train)*rate)), 
                                    algorithm = 'auto',
                                    metric = d,
                                    metric_params = None) # 20 , len(sn_encoded_train) 
        
        predict = list()
        dist_mean = list()
        dist_std = list()
        dist_min = list()
        dist_max = list()
        sim = list()
        for i in range(0, len(test_trace)):        
            predict.append(  train.index[idxs[i]] )
            dist_mean.append(  np.mean(dists[i]) )
            dist_std.append(  np.std(dists[i]) )
            dist_min.append(  np.min(dists[i]) )
            dist_max.append(  np.max(dists[i]) )
            sim.append( np.mean( 1/(1+dists[i])) )

        time_finish_align = time.time()

        with open("label_"+name , "rb") as fp:
            ref_predict = pickle.load(fp, encoding='utf-8') 
        recall_sum = 0
        precision_sum = 0
        acc_sum1 = 0 
        acc_sum2 = 0 
        count=0
        for l in range(0, len(test_trace)):  # change
            i = ref_predict[l]
            if len(i) >0:
                acc_1 = (i[0] in predict[l])
                acc_sum1 = acc_sum1 + acc_1
                count += 1
            
        acc1 = acc_sum1/count
        print(acc1)
        result.loc[len(result)+1] = [name, e, d, acc1, np.mean(dist_mean), np.mean(dist_std),
                                np.mean(dist_min), np.mean(dist_max), np.mean(sim),
                                (time_finish_align - time_start_align) ]
result.to_csv(dir_home+"/result/result_dist5_" + name + "_" + str(rate) +"_"  + str(split) + ".csv", index= False) 

bool
0.05517241379310345
0.1129973474801061
0.1129973474801061
aggregate
0.07374005305039788
0.047214854111405836
0.04880636604774536
index
0.3310344827586207
0.329973474801061
0.329973474801061
laststate
0.09124668435013263
0.0986737400530504
0.0986737400530504
aggngram
0.5517241379310345
0.5575596816976127
0.5692307692307692


# 4. The result in Table 5 can be obtained by running below code.

In [24]:
# Road - general (k size)
name = 'road'
size = 10
split = 0.5
encoding = ['bool', 'aggregate',  'index',  'laststate', 'aggngram']
metric = [ 'cosine', 'euclidean', 'manhattan']

m=1
result = pd.DataFrame(columns=['data', 'encoding_method' , 'distance_metric', 'precision', 
                               'mean', 'std', 'min', 'max', 'sim1' , 'time'])
def minmax(x):
    return (x-min(x))/(max(x)-min(x))

for e in encoding:
    print(e)
    data = pd.read_csv(dir_home + '/data_trans/' + name + "_" + e + "_" + str(m)  +".csv")
    data = data.fillna(-1)
    
    max_l = data.max(axis=0).values 
    ll = 0 
    col_loc =list()
    col_norm = list()
    for ml in max_l:
        if type(ml)==str:
            pass
        else:
            if ml > 1:
                col_loc.append(ll)
                col_norm.append(ml)
        ll = ll+1     
        
    for ll in range(len(col_loc)):
        data[data.columns[col_loc[ll]] ]  = data[data.columns[col_loc[ll]] ]/col_norm[ll]    
    train = data[~data['Case ID'].astype(str).str.contains('test', na=False)].reset_index(drop=True)
    test = data[data['Case ID'].astype(str).str.contains('test', na=False)].reset_index(drop=True)

    train.index = train['Case ID']
    train = train.drop( 'Case ID', axis=1)
    test.index = test['Case ID']
    test = test.drop( 'Case ID', axis=1)
    
    if e == "ngram":
        train_trace = train.filter(regex='\|')
        test_trace = test.filter(regex='\|')
    else:
        train_trace = train.filter(regex='Activity_')
        test_trace = test.filter(regex='Activity_')    
    
    train_attr = train.drop(train_trace.columns , axis =1)
    test_attr = test.drop(test_trace.columns, axis = 1)

    w_a = split*np.repeat(1, len(train_trace.columns))/len(train_trace.columns)
    w_a = w_a.tolist()

    w_b = pd.DataFrame([str.split(c, "_")[0] for c in train_attr.columns], columns=['key'] )
    w_b['weight'] = 1
    w_b['weight2'] = w_b.groupby('key')['weight'].cumsum()
    w_b['max'] = w_b.groupby('key')['weight2'].transform(max)
    w_b['weight3'] = w_b['weight']/w_b['max']

    w_b = w_b['weight3'].tolist()
    w_b = [(1-split)*w/sum(w_b) for w in w_b]
    
    
    loc_a = [ train.columns.tolist().index(ttt)  for ttt in train_trace.columns]
    loc_b = [ train.columns.tolist().index(ttt)  for ttt in train_attr.columns]
    customized_weights = np.repeat(0.0, len(train.columns))
    customized_weights[loc_a] = w_a
    customized_weights[loc_b] = w_b


    train = train.apply(lambda x: x*customized_weights, axis= 1)
    test = test.apply(lambda x: x*customized_weights, axis= 1)
    
    encoded_train = train.values.tolist()
    encoded_test = test.values.tolist()

    encoded_test_attr = test_attr.values.tolist()
    for d in metric:
        time_start_align = time.time()
        
        
        dists, idxs = nearest_neighbors(np.array(encoded_test), 
                                    np.array(encoded_train), 
                                    nbr_neighbors = size, 
                                    algorithm = 'auto',
                                    metric = d,
                                    metric_params = None) # 20 , len(sn_encoded_train) 
        
        predict = list()
        dist_mean = list()
        dist_std = list()
        dist_min = list()
        dist_max = list()
        sim = list()
        for i in range(0, len(test_trace)):        
            predict.append(  train.index[idxs[i]] )
            dist_mean.append(  np.mean(dists[i]) )
            dist_std.append(  np.std(dists[i]) )
            dist_min.append(  np.min(dists[i]) )
            dist_max.append(  np.max(dists[i]) )
            sim.append( np.mean( 1/(1+dists[i])) )

        time_finish_align = time.time()

        with open("label_"+name , "rb") as fp:
            ref_predict = pickle.load(fp, encoding='utf-8') 
            
        recall_sum = 0
        precision_sum = 0
        acc_sum1 = 0 
        acc_sum2 = 0 
        dist1 = 0
        dist2 = 0
        sim1_sum = 0
        sim2_sum = 0
        for l in range(0,len(test)):  # change
            i1 = ref_predict[l]
            acc_1 = sum([1 for j in i1 if j[0] in predict[l]])/len(i1)
            
            opt1 = train.loc[train.index.isin(i1)]
            
            apa = train.loc[train.index.isin([rp1 for rp1 in predict[l]])]
            
            sim1 = 0
            
            for apa_i in range(len(apa)):
                sim1_i = sum( [1-distance.euclidean(apa.iloc[apa_i] , opt1.iloc[opt1_i]) for opt1_i in range(len(opt1)) ] )/len(opt1)                
                sim1 = sim1+ sim1_i            
            
            acc_sum1 = acc_sum1 + acc_1
            
            sim1_sum = sim1_sum + sim1/(len(apa)*len(opt1))
        acc1 = acc_sum1/len(test)
        acc2 = acc_sum2/len(test)
        
        sim1_out = sim1_sum/len(test)
        print(sim1_out)
        result.loc[len(result)+1] = [name, e, d, acc1, np.mean(dist_mean), np.mean(dist_std),
                                np.mean(dist_min), np.mean(dist_max), np.mean(sim1_out),
                                (time_finish_align - time_start_align) ]
result.to_csv(dir_home+"/result/result_dist5_" + name + "_" + str(size) +"_"  + str(split) + ".csv", index= False) 

bool
0.9772875026236928
0.9773446210708625
0.9774802675376904
aggregate
0.9730420401232429
0.9596797483969746
0.9476288006232388
index
0.9901732410980985
0.9902191191870047
0.9811627326996627
laststate
0.9841393163844594
0.9840657445049604
0.9840617713187639
aggngram
0.9953259401759348
0.9953394155491504
0.9953428445193031
