##############################################

############# TABLE OF CONTENTS #############

##############################################
- 1) Import packages and functions
- 2) Function for preprocessing the data
- 3) Parameters

DON'T FORGET TO ADAPT THE NUMBER OF EPOCHS



```
# Files to upload to your working directory from Google Drive:
- dataset_confs.py
- DatasetManager.py
- your hyperparameter file (from hyperopt, with the arguments)
- dataset (csv file)

In [1]:
# Incompleteness levels to run, written as the percentage strings that appear
# in the PU training/params file names. Add '25' to the list to also run the
# 25% level.
incomplete_levels = ['50', '75']

# Same levels as fractions (e.g. '50' -> 0.5); used later as the flip ratio.
levels = {pct: int(pct) / 100 for pct in incomplete_levels}

In [2]:
import pu_keras as puk



In [3]:
import sys

import dataset_confs

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold


class DatasetManager:
    """Read, split and prefix-expand one event log configured in ``dataset_confs``.

    All column names (case id, activity, timestamp, label, and the
    static/dynamic categorical/numeric feature lists) are looked up in
    ``dataset_confs`` under ``dataset_name``.
    """

    def __init__(self, dataset_name):
        """Store ``dataset_name`` and load its column configuration."""
        self.dataset_name = dataset_name

        self.case_id_col = dataset_confs.case_id_col[self.dataset_name]
        self.activity_col = dataset_confs.activity_col[self.dataset_name]
        self.timestamp_col = dataset_confs.timestamp_col[self.dataset_name]
        self.label_col = dataset_confs.label_col[self.dataset_name]
        self.pos_label = dataset_confs.pos_label[self.dataset_name]

        self.dynamic_cat_cols = dataset_confs.dynamic_cat_cols[self.dataset_name]
        self.static_cat_cols = dataset_confs.static_cat_cols[self.dataset_name]
        self.dynamic_num_cols = dataset_confs.dynamic_num_cols[self.dataset_name]
        self.static_num_cols = dataset_confs.static_num_cols[self.dataset_name]

        # Events are ordered by timestamp first, activity name as tie-breaker.
        self.sorting_cols = [self.timestamp_col, self.activity_col]

    def read_dataset(self, datalocation):
        """Read a ';'-separated CSV and parse its timestamp column(s).

        Categorical, case-id, label and timestamp columns are read as
        ``object``; numeric feature columns as ``float``.

        Returns:
            The loaded ``pandas.DataFrame``.
        """
        dtypes = {col:"object" for col in self.dynamic_cat_cols+self.static_cat_cols+[self.case_id_col, self.label_col, self.timestamp_col]}
        for col in self.dynamic_num_cols + self.static_num_cols:
            dtypes[col] = "float"

        data = pd.read_csv(datalocation, sep=";", dtype=dtypes)
        data[self.timestamp_col] = pd.to_datetime(data[self.timestamp_col])

        # Dataset-specific timestamp columns that are converted explicitly.
        if self.dataset_name in ['bpic2011_f1', 'bpic2011_f2', 'bpic2011_f3', 'bpic2011_f4','bpic2015_1_f2','bpic2015_2_f2','bpic2015_3_f2','bpic2015_4_f2','bpic2015_5_f2','sepsis_cases_1','sepsis_cases_2','sepsis_cases_4']:
            data['time:timestamp'] = pd.to_datetime(data['time:timestamp'])
        if self.dataset_name in ['bpic2012_accepted', 'bpic2012_cancelled', 'bpic2012_declined']:
            data['Complete Timestamp'] = pd.to_datetime(data['Complete Timestamp'])

        return data

    def split_data(self, data, train_ratio, split="temporal", seed=22):
        """Split cases into train/test by case start time or at random.

        The first ``int(train_ratio * n_cases)`` cases (ordered by start
        timestamp for ``split="temporal"``, shuffled with ``seed`` for
        ``split="random"``) form the training set; all other cases form the
        test set. Both frames are returned sorted by timestamp.
        """
        grouped = data.groupby(self.case_id_col)
        start_timestamps = grouped[self.timestamp_col].min().reset_index()
        if split == "temporal":
            start_timestamps = start_timestamps.sort_values(self.timestamp_col, ascending=True, kind="mergesort")
        elif split == "random":
            np.random.seed(seed)
            start_timestamps = start_timestamps.reindex(np.random.permutation(start_timestamps.index))
        train_ids = list(start_timestamps[self.case_id_col])[:int(train_ratio*len(start_timestamps))]
        train = data[data[self.case_id_col].isin(train_ids)].sort_values(self.timestamp_col, ascending=True, kind='mergesort')
        test = data[~data[self.case_id_col].isin(train_ids)].sort_values(self.timestamp_col, ascending=True, kind='mergesort')

        return (train, test)

    def split_data_strict(self, data, train_ratio, split="temporal"):
        """Temporal split that drops training *events* overlapping the test period.

        Training events with a timestamp at or after the earliest test event
        are removed. ``split`` is accepted but not used.
        """
        data = data.sort_values(self.sorting_cols, ascending=True, kind='mergesort')
        grouped = data.groupby(self.case_id_col)
        start_timestamps = grouped[self.timestamp_col].min().reset_index()
        start_timestamps = start_timestamps.sort_values(self.timestamp_col, ascending=True, kind='mergesort')
        train_ids = list(start_timestamps[self.case_id_col])[:int(train_ratio*len(start_timestamps))]
        train = data[data[self.case_id_col].isin(train_ids)].sort_values(self.sorting_cols, ascending=True, kind='mergesort')
        test = data[~data[self.case_id_col].isin(train_ids)].sort_values(self.sorting_cols, ascending=True, kind='mergesort')
        split_ts = test[self.timestamp_col].min()
        train = train[train[self.timestamp_col] < split_ts]
        return (train, test)

    def split_data_discard(self, data, train_ratio, split="temporal"):
        """Temporal split that drops whole training *cases* overlapping the test period.

        Any training case with at least one event at or after the earliest
        test event is removed entirely. ``split`` is accepted but not used.
        """
        data = data.sort_values(self.sorting_cols, ascending=True, kind='mergesort')
        grouped = data.groupby(self.case_id_col)
        start_timestamps = grouped[self.timestamp_col].min().reset_index()
        start_timestamps = start_timestamps.sort_values(self.timestamp_col, ascending=True, kind='mergesort')
        train_ids = list(start_timestamps[self.case_id_col])[:int(train_ratio*len(start_timestamps))]
        train = data[data[self.case_id_col].isin(train_ids)].sort_values(self.sorting_cols, ascending=True, kind='mergesort')
        test = data[~data[self.case_id_col].isin(train_ids)].sort_values(self.sorting_cols, ascending=True, kind='mergesort')
        split_ts = test[self.timestamp_col].min()
        overlapping_cases = train[train[self.timestamp_col] >= split_ts][self.case_id_col].unique()
        train = train[~train[self.case_id_col].isin(overlapping_cases)]
        return (train, test)

    def split_val(self, data, val_ratio, split="random", seed=22):
        """Split cases into train/validation sets.

        The last ``int(val_ratio * n_cases)`` cases (ordered by start
        timestamp for ``split="temporal"``, shuffled with ``seed`` for
        ``split="random"``) form the validation set.

        Returns:
            ``(train, val)``, both sorted by ``self.sorting_cols``.
        """
        grouped = data.groupby(self.case_id_col)
        start_timestamps = grouped[self.timestamp_col].min().reset_index()
        if split == "temporal":
            start_timestamps = start_timestamps.sort_values(self.timestamp_col, ascending=True, kind="mergesort")
        elif split == "random":
            np.random.seed(seed)
            start_timestamps = start_timestamps.reindex(np.random.permutation(start_timestamps.index))
        case_ids = list(start_timestamps[self.case_id_col])
        n_val = int(val_ratio*len(case_ids))
        # A bare ``case_ids[-n_val:]`` returns *every* case when n_val == 0
        # (because -0 == 0), which would empty the training set.
        val_ids = case_ids[-n_val:] if n_val > 0 else []
        val = data[data[self.case_id_col].isin(val_ids)].sort_values(self.sorting_cols, ascending=True, kind="mergesort")
        train = data[~data[self.case_id_col].isin(val_ids)].sort_values(self.sorting_cols, ascending=True, kind="mergesort")
        return (train, val)

    def generate_prefix_data(self, data, min_length, max_length, gap=1):
        """Turn every case into one trace per prefix length.

        For each length from ``min_length`` to ``max_length`` (step ``gap``)
        the first events of every sufficiently long case are emitted as a new
        trace. Prefixes longer than ``min_length`` get the case id
        ``"<case>_<length>"``; the original id is kept in ``orig_case_id`` and
        the prefix length in ``prefix_nr`` (the ``min_length`` batch is
        numbered 1).

        NOTE: a ``case_length`` column is added to ``data`` in place.
        """
        data['case_length'] = data.groupby(self.case_id_col)[self.activity_col].transform(len)

        # .copy() so that the column assignments below never write to a view
        # of ``data``.
        dt_prefixes = data[data['case_length'] >= min_length].groupby(self.case_id_col).head(min_length).copy()
        dt_prefixes["prefix_nr"] = 1
        dt_prefixes["orig_case_id"] = dt_prefixes[self.case_id_col]
        for nr_events in range(min_length+gap, max_length+1, gap):
            tmp = data[data['case_length'] >= nr_events].groupby(self.case_id_col).head(nr_events).copy()
            tmp["orig_case_id"] = tmp[self.case_id_col]
            tmp[self.case_id_col] = tmp[self.case_id_col].apply(lambda x: "%s_%s"%(x, nr_events))
            tmp["prefix_nr"] = nr_events
            dt_prefixes = pd.concat([dt_prefixes, tmp], axis=0)

        # Cap the reported case length at the longest generated prefix.
        dt_prefixes['case_length'] = dt_prefixes['case_length'].apply(lambda x: min(max_length, x))

        return dt_prefixes

    def get_pos_case_length_quantile(self, data, quantile=0.90):
        """Return the ``quantile`` of case lengths among positive-label cases, rounded up."""
        return int(np.ceil(data[data[self.label_col]==self.pos_label].groupby(self.case_id_col).size().quantile(quantile)))

    def get_indexes(self, data):
        """Return the index of distinct case ids in ``data``."""
        return data.groupby(self.case_id_col).first().index

    def get_relevant_data_by_indexes(self, data, indexes):
        """Return the rows of ``data`` whose case id is in ``indexes``."""
        return data[data[self.case_id_col].isin(indexes)]

    def get_label(self, data):
        """Return one label per case (taken from the case's first row)."""
        return data.groupby(self.case_id_col).first()[self.label_col]

    def get_prefix_lengths(self, data):
        """Return ``prefix_nr`` per case (taken from the case's last row)."""
        return data.groupby(self.case_id_col).last()["prefix_nr"]

    def get_case_ids(self, data, nr_events=1):
        """Return case ids; for ``nr_events > 1`` the ``_<length>`` suffix is stripped."""
        case_ids = pd.Series(data.groupby(self.case_id_col).first().index)
        if nr_events > 1:
            case_ids = case_ids.apply(lambda x: "_".join(x.split("_")[:-1]))
        return case_ids

    def get_label_numeric(self, data):
        """Return a list with 1 for positive-label cases and 0 otherwise."""
        y = self.get_label(data) # one row per case
        return [1 if label == self.pos_label else 0 for label in y]

    def get_class_ratio(self, data):
        """Return the fraction of rows in ``data`` carrying the positive label."""
        class_freqs = data[self.label_col].value_counts()
        return class_freqs[self.pos_label] / class_freqs.sum()

    def get_stratified_split_generator(self, data, n_splits=5, shuffle=True, random_state=22):
        """Yield ``(train_chunk, test_chunk)`` frames for label-stratified K folds over cases."""
        grouped_firsts = data.groupby(self.case_id_col, as_index=False).first()
        skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

        for train_index, test_index in skf.split(grouped_firsts, grouped_firsts[self.label_col]):
            current_train_names = grouped_firsts[self.case_id_col][train_index]
            train_chunk = data[data[self.case_id_col].isin(current_train_names)].sort_values(self.timestamp_col, ascending=True, kind='mergesort')
            test_chunk = data[~data[self.case_id_col].isin(current_train_names)].sort_values(self.timestamp_col, ascending=True, kind='mergesort')
            yield (train_chunk, test_chunk)

    def get_idx_split_generator(self, dt_for_splitting, n_splits=5, shuffle=True, random_state=22):
        """Yield ``(train_case_ids, test_case_ids)`` for label-stratified K folds.

        ``dt_for_splitting`` is indexed positionally by the fold indices, so it
        presumably holds one row per case with a default integer index —
        verify against the caller.
        """
        skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

        for train_index, test_index in skf.split(dt_for_splitting, dt_for_splitting[self.label_col]):
            current_train_names = dt_for_splitting[self.case_id_col][train_index]
            current_test_names = dt_for_splitting[self.case_id_col][test_index]
            yield (current_train_names, current_test_names)
            

# **import packages and functions**

In [4]:
# functions and packages
#import EncoderFactory


import dataset_confs
#from DatasetManager import DatasetManager
import pandas as pd
import numpy as np
import os
import pickle
import random
from scipy.stats import spearmanr
from scipy.spatial import distance
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from pandas.api.types import is_string_dtype
from collections import OrderedDict
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

#LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, BatchNormalization,Masking, Dropout, Input, Multiply
from tensorflow.keras.layers import concatenate, Embedding, LSTM, Bidirectional, TimeDistributed, Softmax, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Nadam, Adam, SGD, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import tensorflow.keras.utils as ku
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2
from tensorflow.keras import backend

## Functions with source

In [5]:
# SOURCE: https://towardsdatascience.com/using-neural-networks-with-embedding-layers-to-encode-high-cardinality-categorical-variables-c1b872033ba2
class ColumnEncoder(BaseEstimator, TransformerMixin):
    """Integer-encode every string-typed column of a DataFrame.

    After ``fit``, each distinct value of a column is numbered in sorted
    order. ``transform`` emits that number plus one, reserving 0 for values
    that were not seen during ``fit``.
    """

    def __init__(self):
        self.columns = None   # names of the encoded columns, set by fit()
        self.maps = {}        # column name -> OrderedDict(value -> 0-based rank)

    def transform(self, X):
        """Return a copy of ``X`` with fitted columns replaced by their codes."""
        encoded = X.copy()
        for col in self.columns:
            mapping = self.maps[col]
            # known value -> rank + 1; unknown value -> (-1) + 1 == 0
            encoded.loc[:, col] = encoded[col].apply(lambda value: mapping.get(value, -1) + 1)
        return encoded

    def inverse_transform(self, X):
        """Return a copy of ``X`` with codes mapped back to values (None if out of range)."""
        decoded = X.copy()
        for col in self.columns:
            values = list(self.maps[col].keys())
            restored = []
            for code in decoded[col]:
                if 0 < code <= len(values):
                    restored.append(values[code - 1])
                else:
                    restored.append(None)
            decoded.loc[:, col] = restored
        return decoded

    def fit(self, X, y=None):
        """Learn the value -> rank mapping for every string-typed column of ``X``."""
        self.columns = [col for col in X.columns if is_string_dtype(X[col])]
        for col in self.columns:
            ordered_values = sorted(set(X[col]))
            self.maps[col] = OrderedDict((value, rank) for rank, value in enumerate(ordered_values))
        return self

def prepare_inputs(X_train, X_test, data):
    """Integer-encode the train and test frames with a ColumnEncoder fitted on train.

    Both frames are cast to ``str`` first. The fitted encoder is also published
    as the module-level global ``ce``. ``data`` is accepted but not used.

    Returns:
        ``(X_train_enc, X_test_enc)``.
    """
    global ce
    train_as_str = X_train.astype(str)
    test_as_str = X_test.astype(str)
    ce = ColumnEncoder()
    encoded_train = ce.fit_transform(train_as_str)
    encoded_test = ce.transform(test_as_str)
    return encoded_train, encoded_test

## Functions from Stack Overflow

In [6]:
def numeric_padding(sequences, maxlen=None, value=0):
    """Pad or truncate variable-length numeric sequences to a common length.

    Each sequence is cut to its first ``maxlen`` steps and written to the
    start of its row; remaining positions hold ``value``.

    Args:
        sequences: iterable of array-likes sharing the same trailing shape.
        maxlen: target length; defaults to the longest sequence.
        value: fill value for padded positions.

    Returns:
        Array of shape ``(len(sequences), maxlen) + trailing_shape``.
        (The previous version built this array but never returned it.)
    """
    arrays = [np.asarray(seq) for seq in sequences]
    if maxlen is None:
        maxlen = max((len(arr) for arr in arrays), default=0)
    if not arrays:
        return np.full((0, maxlen), value)

    sample_shape = arrays[0].shape[1:]
    # Pick a dtype wide enough for both the data and the fill value so float
    # sequences are not truncated by an integer fill value.
    dtype = np.result_type(value, *{arr.dtype for arr in arrays})
    x = np.full((len(arrays), maxlen) + sample_shape, value, dtype=dtype)
    for idx, arr in enumerate(arrays):
        trunc = arr[:maxlen]
        # Copy the whole truncated sequence, not just its first element.
        x[idx, :len(trunc)] = trunc
    return x
        
def remove_punctuations(columns_before):
    """Return a new list of the given names with every ':' replaced by '_'."""
    return [name.replace(":", "_") for name in columns_before]

def create_index(log_df, column):
    """Build a value -> integer index for one categorical column.

    Rows whose value is the string 'none' are ignored when collecting values.
    The remaining distinct values are numbered in sorted order:

    * for any column other than 'next_activity', numbering starts at 1 and
      'none' is added with index 0;
    * for 'next_activity', numbering starts at 0 and no 'none' entry is added.

    Args:
        log_df: dataframe.
        column: column name.
    Returns:
        dict mapping value -> index, ordered by index.
    """
    kept_rows = log_df[log_df[column] != 'none'][[column]].values.tolist()
    ordered_values = sorted({row[0] for row in kept_rows})

    if column == 'next_activity':
        alias = {value: position for position, value in enumerate(ordered_values)}
    else:
        alias = {value: position + 1 for position, value in enumerate(ordered_values)}
        alias['none'] = 0

    # Re-order the entries by their index value.
    return dict(sorted(alias.items(), key=lambda pair: pair[1]))

#call this function with the name of the right column
def create_indexes(i, data):
    """Build the index, its inverse and one-hot weights for column ``i`` of ``data``.

    'Start' (index 0) and 'End' (index = number of entries after adding
    'Start') are appended to the index returned by ``create_index``.

    NOTE: 'Start' shares index 0 with whatever key ``create_index`` assigned
    to 0, so in the inverted mapping index 0 resolves to 'Start'.

    Returns:
        ``(cat_weights, index_cat, cat_index)`` where ``cat_weights`` is the
        one-hot matrix of the sorted distinct indices with ``len(cat_index)``
        classes, ``index_cat`` maps index -> value and ``cat_index`` maps
        value -> index.
    """
    cat_index = create_index(data, i)
    cat_index['Start'] = 0
    cat_index['End'] = len(cat_index)

    index_cat = {}
    for name, position in cat_index.items():
        index_cat[position] = name

    distinct_positions = sorted(index_cat)
    cat_weights = ku.to_categorical(distinct_positions, len(cat_index))
    return cat_weights, index_cat, cat_index

## Functions from Alexander

In [7]:
def groupby_caseID(data, cols):
    """Return a list with one DataFrame per 'Case ID' group, restricted to ``cols``.

    ``cols`` must contain 'Case ID'.
    """
    per_case_frames = []
    for _, case_rows in data[cols].groupby('Case ID', as_index=False):
        per_case_frames.append(pd.DataFrame(case_rows))
    return per_case_frames
def pad_cat_data(cols, data_train, data_test, maxlen):
    """Pad the categorical columns of the per-case frames to length ``maxlen``.

    For every column in ``cols`` the per-case value lists are pre-padded with
    0 (and pre-truncated) to ``maxlen``.

    Returns:
        ``(paddings_train, paddings_test)``: two lists holding one padded
        array per column, in the order of ``cols``.
    """
    def _pad_columns(case_frames):
        # One padded array per column; one row per case frame.
        padded_per_column = []
        for col in cols:
            sequences = [list(case_frames[pos][col]) for pos in range(len(case_frames))]
            padded = np.array(pad_sequences(sequences, maxlen=maxlen, padding='pre', truncating='pre', value=0))
            padded_per_column.append(padded)
        return padded_per_column

    paddings_train = _pad_columns(data_train)
    paddings_test = _pad_columns(data_test)
    return paddings_train, paddings_test

def _pad_and_scale_numeric(col, case_frames, maxlen, reference):
    """Pad one numeric column per case to ``maxlen`` and divide by ``reference[col].max()``."""
    sequences = [list(frame[col]) for frame in case_frames]
    # pad_sequences defaults to dtype='int32', which silently truncates float
    # features; request a float dtype explicitly.
    # NOTE(review): NaN values now stay NaN instead of being cast to an
    # arbitrary integer — confirm the inputs contain no missing values.
    padded = np.array(pad_sequences(sequences, maxlen=maxlen, dtype='float64',
                                    padding='pre', truncating='pre', value=0))
    col_max = reference[col].max()
    if col_max != 0:
        padded = padded / col_max
    return padded


def pad_num_data(cols, data_train, data_test, maxlen, dt_train_prefixes, dt_test_prefixes):
    """Pad and max-scale the numeric columns of the per-case frames.

    For every column in ``cols`` the per-case value lists are pre-padded with
    0 (and pre-truncated) to ``maxlen`` and then divided by the column maximum
    of the matching prefix frame, unless that maximum is 0.

    NOTE(review): the test set is scaled by its own maximum
    (``dt_test_prefixes``), not by the training maximum, so train and test
    features may be on different scales — confirm this is intended.

    Returns:
        ``(pad_train, pad_test)``: two lists holding one array per column, in
        the order of ``cols``.
    """
    pad_train = [_pad_and_scale_numeric(col, data_train, maxlen, dt_train_prefixes) for col in cols]
    pad_test = [_pad_and_scale_numeric(col, data_test, maxlen, dt_test_prefixes) for col in cols]
    return pad_train, pad_test

def reshape_num_data(pad_data, cutoff):
    """Reshape ``pad_data`` to ``(len(pad_data), cutoff, 1)``, adding a trailing feature axis."""
    n_samples = len(pad_data)
    return np.asarray(pad_data).reshape((n_samples, cutoff, 1))
def labels_after_grouping(data_train, data_test):
    """Derive one binary label per case frame from the first row's 'label' value.

    A case is labelled 0 when its label is 'regular' and 1 otherwise.

    Returns:
        ``(train_y, test_y)`` as lists of ints.
    """
    def _binary_labels(case_frames):
        first_labels = [frame['label'].iloc[0] for frame in case_frames]
        return [0 if label == 'regular' else 1 for label in first_labels]

    return _binary_labels(data_train), _binary_labels(data_test)


# **Function for preprocessing the data**

In [8]:
def create_data(dt_train_prefixes, dt_test_prefixes, cat_cols, numerical_cols):
    """Encode, group and pad the prefix frames into model-ready arrays.

    Categorical columns are integer-encoded (fitted on train) and shifted by
    one, written back into both prefix frames in place, then grouped by
    'Case ID' and padded. Numeric columns are grouped, padded and max-scaled.

    NOTE: reads the module-level ``maxlen`` (padding length) and ``data``
    (forwarded to ``prepare_inputs``) set by the calling script.

    Args:
        dt_train_prefixes: training prefix frame (modified in place).
        dt_test_prefixes: test prefix frame (modified in place).
        cat_cols: categorical column names (not modified).
        numerical_cols: numeric column names (not modified).

    Returns:
        ``(pad_train, pad_test, paddings_train, paddings_test, train_y, test_y)``.
    """
    dt_train_prefixes[cat_cols], dt_test_prefixes[cat_cols] = prepare_inputs(
        dt_train_prefixes[cat_cols], dt_test_prefixes[cat_cols], data)
    dt_train_prefixes[cat_cols] = dt_train_prefixes[cat_cols] + 1
    dt_test_prefixes[cat_cols] = dt_test_prefixes[cat_cols] + 1

    # Group by case with the id and label columns attached; build a new list
    # instead of appending to / removing from the caller's list.
    cat_group_cols = list(cat_cols) + ['Case ID', 'label']
    ans_train = groupby_caseID(dt_train_prefixes, cat_group_cols)
    ans_test = groupby_caseID(dt_test_prefixes, cat_group_cols)

    # Labels are taken after grouping so they line up with the padded rows.
    train_y, test_y = labels_after_grouping(ans_train, ans_test)

    paddings_train, paddings_test = pad_cat_data(cat_cols, ans_train, ans_test, maxlen)

    # Numeric columns: use the ``numerical_cols`` argument rather than a
    # same-purpose global that only exists in the calling script.
    num_group_cols = list(numerical_cols) + ['Case ID']
    ans_train2 = groupby_caseID(dt_train_prefixes, num_group_cols)
    ans_test2 = groupby_caseID(dt_test_prefixes, num_group_cols)
    pad_train, pad_test = pad_num_data(numerical_cols, ans_train2, ans_test2, maxlen,
                                       dt_train_prefixes, dt_test_prefixes)

    return pad_train, pad_test, paddings_train, paddings_test, train_y, test_y

# Functions to count labels

In [9]:
def count_labels(data_y):
    """Print the size of ``data_y`` and how many entries are 'regular' / 'deviant'."""
    print("total size", len(data_y))
    for label_name in ("regular", "deviant"):
        print(label_name, data_y.count(label_name))

def count_labels_number(data_y):
    """Print the size of ``data_y`` and how many entries are 0 ('regular') / 1 ('deviant')."""
    print("total size", len(data_y))
    for label_name, code in (("regular", 0), ("deviant", 1)):
        print(label_name, data_y.count(code))


# Parameters

In [10]:
# ---------------------------- PARAMETERS ----------------------------
params_dir = 'params'      # directory holding the optimal-hyperparameter pickles
results_dir = 'results'    # directory the performance CSVs are written to
column_selection = 'all'
train_ratio = 0.8
n_splits = 3
random_state = 22
n_iter = 1

encoding = ['embeddings']
cls_method = 'LSTM'

# CSV file stems per dataset family. Uncomment a family here AND its
# counterpart in dataset_ref_to_datasets below to include it in the run.
csv_files = {
    #"bpic2011": ["BPIC11_f%s"%formula for formula in range(3,4)],
    "bpic2015": ["BPIC15_%s_f2"%(municipality) for municipality in range(3,4)],
    #"sepsis_cases": ["sepsis_cases_4"],
    #"bpic2012": ["bpic2012_O_ACCEPTED#COMPLETE","bpic2012_O_CANCELLED-COMPLETE","bpic2012_0_DECLINED-COMPLETE"],
    #"production": ["Production"],
    #"bpic2017": ["BPIC17_O_Accepted","BPIC17_O_Cancelled","BPIC17_0_Refused"],
    #"bpic2017": ["BPIC17_O_Cancelled"],
    #"traffic_fines": ["traffic_fines_%s"%formula for formula in range(1,3)],
    #"hospital_billing": ["hospital_billing_%s"%suffix for suffix in [2,3]]
}

# Dataset names (keys into dataset_confs) per family, in the same order as
# the file stems above.
dataset_ref_to_datasets = {
    #"bpic2011": ["bpic2011_f%s"%formula for formula in range(3,4)],
    "bpic2015": ["bpic2015_%s_f2"%(municipality) for municipality in range(3,4)],
    #"sepsis_cases": ["sepsis_cases_4"]
    #"bpic2012": ["bpic2012_accepted","bpic2012_cancelled","bpic2012_declined"],
    #"production": ["production"],
    #"bpic2017": ["bpic2017_cancelled"],
    #"bpic2017": ["bpic2017_accepted","bpic2017_cancelled","bpic2017_refused"],
    #"traffic_fines": ["traffic_fines_%s"%formula for formula in range(1,3)],
    #"hospital_billing": ["hospital_billing_%s"%suffix for suffix in [2,3]]
}

# Flatten both mappings and pair each dataset name with its file stem by position.
files = [stem for stems in csv_files.values() for stem in stems]
datasets = [name for names in dataset_ref_to_datasets.values() for name in names]
res = {name: files[position] for position, name in enumerate(datasets)}

# **loop over datasets and classifiers**

In [11]:
# Train and evaluate one nnPU-loss bidirectional LSTM per
# (dataset, encoding, incompleteness level) and write per-prefix-length AUCs.
for dataset_name in datasets:
    dataset_name_csv = res[dataset_name]

    dataset_manager = DatasetManager(dataset_name)

    # Full original log: used for the prefix-length cutoff and for the
    # categorical vocabularies of the embedding layers.
    data = dataset_manager.read_dataset('Original_data/'+dataset_name_csv+'.csv')

    for cls_encoding in encoding:
        for level in incomplete_levels:

            flip_ratio_ = levels[level]
            label_freq_ = 1.0 - flip_ratio_  ## P(labeled | y = 1)

            print('Dataset:', dataset_name)
            print('Classifier', cls_method)
            print('Encoding', cls_encoding)
            method_name = "%s_%s"%(column_selection, cls_encoding)

            # optimal parameters (see hyperopt file)
            optimal_params_filename = os.path.join(params_dir, "PU_optimal_params_%s_%s_%s_%s.pickle" % (cls_method, dataset_name, level, method_name))
            if not os.path.isfile(optimal_params_filename) or os.path.getsize(optimal_params_filename) <= 0:
                # Without hyperparameters this configuration cannot be run;
                # skip it instead of crashing on the open() below.
                print('problem: missing or empty parameter file %s, skipping' % optimal_params_filename)
                continue
            # NOTE: pickle.load can execute arbitrary code; only load
            # parameter files you produced yourself.
            with open(optimal_params_filename, "rb") as fin:
                args = pickle.load(fin)
                print(args)

            cls_encoder_args = {'case_id_col': dataset_manager.case_id_col,
                                'static_cat_cols': dataset_manager.static_cat_cols,
                                'static_num_cols': dataset_manager.static_num_cols,
                                'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                                'dynamic_num_cols': dataset_manager.dynamic_num_cols,
                                'fillna': True}

            # determine min and max (truncated) prefix lengths
            min_prefix_length = 1
            if "traffic_fines" in dataset_name:
                max_prefix_length = 10
            elif "bpic2017" in dataset_name:
                max_prefix_length = min(20, dataset_manager.get_pos_case_length_quantile(data, 0.90))
            else:
                max_prefix_length = min(40, dataset_manager.get_pos_case_length_quantile(data, 0.90))
            # ``maxlen`` is also read as a global by create_data().
            maxlen = cutoff = max_prefix_length

            # pre-split training (with flipped labels) and test files
            train = dataset_manager.read_dataset('Data/Train_PU'+level+'_'+dataset_name_csv+'.csv')
            test = dataset_manager.read_dataset('Data/Test_'+dataset_name_csv+'.csv')

            # prefix generation of train and test data
            dt_train_prefixes = dataset_manager.generate_prefix_data(train, min_prefix_length, max_prefix_length)
            dt_test_prefixes = dataset_manager.generate_prefix_data(test, min_prefix_length, max_prefix_length)

            # transform data (padded)
            cat_cols = cls_encoder_args['dynamic_cat_cols']+cls_encoder_args['static_cat_cols']
            numerical_columns = cls_encoder_args['dynamic_num_cols']+cls_encoder_args['static_num_cols']
            pad_train, pad_test, paddings_train, paddings_test, train_y, test_y = create_data(dt_train_prefixes, dt_test_prefixes, cat_cols,numerical_columns)

            #DELETE THIS LATER
            count_labels_number(train_y)
            count_labels_number(test_y)

            # create the input layers and embeddings
            embeddings = []
            input_layers = []
            preds_all = []
            nr_events_all = []
            nr_events = list(dataset_manager.get_prefix_lengths(dt_test_prefixes))
            nr_events_all.extend(nr_events)
            test_y_all = []
            test_y_all.extend(test_y)

            score = 0
            dim = 0

            # categorical columns: one integer input + embedding each,
            # initialised with the one-hot weights from create_indexes()
            for cat_col in cat_cols:
                cat_weights, index_cat, cat_index = create_indexes(cat_col, data)
                layer_name = cat_col.replace('(','_').replace(')','_').replace(' ','_').replace(':','_')
                input_layer = Input(shape=(cutoff,), name=layer_name)
                # input_length is the sequence length (cutoff), not the
                # vocabulary size.
                embedding = Embedding(cat_weights.shape[0],
                                      cat_weights.shape[1],
                                      weights=[cat_weights],
                                      input_length=cutoff,
                                      name='embed_'+layer_name)(input_layer)
                embeddings.append(embedding)
                input_layers.append(input_layer)
                dim += cat_weights.shape[1]

            # numeric columns: fed in directly as (cutoff, 1) sequences
            for num_col in numerical_columns:
                layer_name = num_col.replace('(','_').replace(')','_').replace(' ','_').replace(':','_')
                input_layer = Input(shape=(cutoff,1), name=layer_name)
                input_layers.append(input_layer)
                embeddings.append(input_layer)
                dim += 1

            # model inputs: categorical arrays first, then numeric arrays,
            # matching the order of input_layers
            model_inputs = list(paddings_train) + [reshape_num_data(padded, cutoff) for padded in pad_train]
            model_inputs_test = list(paddings_test) + [reshape_num_data(padded, cutoff) for padded in pad_test]

            full_embs = concatenate(embeddings, name='full_embedding')
            l2reg=0.001
            # train a 2-layer bidirectional LSTM with dropout
            l1 = Bidirectional(LSTM(args['lstm_size'], return_sequences=True,  dropout=args['LSTM_dropout']), name='LSTM1')
            l1_out = l1(full_embs)
            l2 = Bidirectional(LSTM(args['lstm_size'], return_sequences=False, dropout=args['LSTM_dropout']), name='LSTM2')
            l2_out = l2(l1_out)
            output_layer = Dense(1, activation='sigmoid', name='final_output')(l2_out)

            #MODEL
            model = Model(inputs=input_layers, outputs=output_layer)

            optimizer_classes = {'RMSprop': RMSprop, 'Nadam': Nadam, 'Adam': Adam, 'SGD': SGD}
            if args['optimizer'] not in optimizer_classes:
                # Previously an unknown name left ``opt`` undefined (or stale
                # from an earlier iteration).
                raise ValueError('Unsupported optimizer in parameter file: %r' % (args['optimizer'],))
            opt = optimizer_classes[args['optimizer']](learning_rate=args['learning_rate'])

            model.compile(loss={'final_output':puk.nnPU_loss(label_freq_)}, optimizer= opt)

            model.summary()

            early_stopping = EarlyStopping(monitor='val_loss', patience=42)
            # NOTE: model_checkpoint is created but not passed to fit() below,
            # so no checkpoints are written.
            model_checkpoint = ModelCheckpoint('output_files/models/model_{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto')
            lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, verbose=0, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)

            result = model.fit(model_inputs,
                               np.array(train_y),
                               callbacks=[early_stopping, lr_reducer],
                               validation_split = 0.1,
                               verbose=2, batch_size=args['batch_size'],
                               epochs=300)
            pred = model.predict(model_inputs_test)
            # predict() returns shape (n, 1); store plain scalars so the
            # results frame holds one float per row.
            preds_all.extend(np.asarray(pred).ravel())
            auc_total = roc_auc_score(test_y_all, preds_all)

            # file to save results
            os.makedirs(results_dir, exist_ok=True)
            outfile = os.path.join(results_dir, "PU_performance_results_%s_%s_%s_%s.csv" % (cls_method, dataset_name, level, method_name))

            with open(outfile, 'w') as fout:
                # Header and every row carry the same 8 columns; n_iter is
                # always written as -1 and nr_events is -1 for the overall row.
                fout.write("%s;%s;%s;%s;%s;%s;%s;%s\n"%("dataset","level", "method", "cls", "nr_events", "n_iter", "metric", "score"))
                dt_results = pd.DataFrame({"actual": test_y_all, "predicted": preds_all, "nr_events": nr_events_all})
                for prefix_len, group in dt_results.groupby("nr_events"):
                    # AUC is undefined when a prefix length has only one class.
                    if len(set(group.actual)) < 2:
                        group_auc = np.nan
                    else:
                        group_auc = roc_auc_score(group.actual, group.predicted)
                    fout.write("%s;%s;%s;%s;%s;%s;%s;%s\n"%(dataset_name,level, method_name, cls_method, prefix_len,-1, "auc", group_auc))
                fout.write("%s;%s;%s;%s;%s;%s;%s;%s\n"%(dataset_name,level, method_name, cls_method,-1,-1, "auc", auc_total))



Dataset: bpic2015_3_f2
Classifier LSTM
Encoding embeddings
{'LSTM_dropout': 0.13974149171990055, 'batch_size': 192, 'learning_rate': 0.007966382215671723, 'lstm_size': 128, 'optimizer': 'RMSprop'}
total size 37400
regular 33847
deviant 3553
total size 10041
regular 7468
deviant 2573
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Activity (InputLayer)          [(None, 40)]         0           []                               
                                                                                                  
 monitoringResource (InputLayer  [(None, 40)]        0           []                               
 )                                                                                                
                                                                                                  
 questio

Epoch 1/300


  ag__.if_stmt((ag__.ld(label_smoothing) is not 0), if_body, else_body, get_state, set_state, ('y_true',), 1)


176/176 - 20s - loss: 0.3531 - val_loss: 0.3120 - lr: 0.0080 - 20s/epoch - 113ms/step
Epoch 2/300
176/176 - 12s - loss: 0.2531 - val_loss: 0.3244 - lr: 0.0080 - 12s/epoch - 69ms/step
Epoch 3/300
176/176 - 12s - loss: 0.1693 - val_loss: 0.5554 - lr: 0.0080 - 12s/epoch - 68ms/step
Epoch 4/300
176/176 - 12s - loss: 0.1193 - val_loss: 0.5405 - lr: 0.0080 - 12s/epoch - 70ms/step
Epoch 5/300
176/176 - 12s - loss: 0.1016 - val_loss: 0.8965 - lr: 0.0080 - 12s/epoch - 67ms/step
Epoch 6/300
176/176 - 12s - loss: 0.0895 - val_loss: 0.6363 - lr: 0.0080 - 12s/epoch - 68ms/step
Epoch 7/300
176/176 - 12s - loss: 0.0768 - val_loss: 0.9589 - lr: 0.0080 - 12s/epoch - 68ms/step
Epoch 8/300
176/176 - 12s - loss: 0.0737 - val_loss: 0.7712 - lr: 0.0080 - 12s/epoch - 68ms/step
Epoch 9/300
176/176 - 12s - loss: 0.0711 - val_loss: 0.7634 - lr: 0.0080 - 12s/epoch - 68ms/step
Epoch 10/300
176/176 - 12s - loss: 0.0678 - val_loss: 0.9371 - lr: 0.0080 - 12s/epoch - 68ms/step
Epoch 11/300
176/176 - 13s - loss: 0.062

Epoch 1/300
192/192 - 32s - loss: 0.3569 - val_loss: 0.4345 - lr: 0.0071 - 32s/epoch - 168ms/step
Epoch 2/300
192/192 - 26s - loss: 0.2021 - val_loss: 0.3942 - lr: 0.0071 - 26s/epoch - 137ms/step
Epoch 3/300
192/192 - 27s - loss: 0.1198 - val_loss: 0.3177 - lr: 0.0071 - 27s/epoch - 140ms/step
Epoch 4/300
192/192 - 27s - loss: 0.0870 - val_loss: 0.4148 - lr: 0.0071 - 27s/epoch - 140ms/step
Epoch 5/300
192/192 - 26s - loss: 0.0736 - val_loss: 0.2719 - lr: 0.0071 - 26s/epoch - 137ms/step
Epoch 6/300
192/192 - 28s - loss: 0.0667 - val_loss: 0.4551 - lr: 0.0071 - 28s/epoch - 147ms/step
Epoch 7/300
192/192 - 26s - loss: 0.0591 - val_loss: 0.5257 - lr: 0.0071 - 26s/epoch - 138ms/step
Epoch 8/300
192/192 - 26s - loss: 0.0561 - val_loss: 0.4372 - lr: 0.0071 - 26s/epoch - 137ms/step
Epoch 9/300
192/192 - 26s - loss: 0.0493 - val_loss: 0.4717 - lr: 0.0071 - 26s/epoch - 137ms/step
Epoch 10/300
192/192 - 27s - loss: 0.0485 - val_loss: 0.7253 - lr: 0.0071 - 27s/epoch - 139ms/step
Epoch 11/300
192/19