In [1]:
import numpy as np
import pandas as pd
import os
from easydict import EasyDict as edict
from dataset_manager_optimized import DatasetManager
from preprocessing.bucketing import get_bucketer
from preprocessing.encoding import get_encoder
from sklearn.pipeline import FeatureUnion

In [2]:
# Notebook-flavoured tqdm progress bar.
from tqdm.notebook import tqdm
# Register tqdm's pandas integration (per the tqdm docs this adds `progress_*`
# methods to pandas objects). None of the visible cells call those methods.
tqdm.pandas()

# Load data

In [3]:
# Names of the event-log columns referenced by the feature lists and config below.
case_id_col, activity_col = "Case ID", "Activity"
resource_col, timestamp_col = "org:resource", "time:timestamp"

# Outcome column, plus the names used for the positive / negative class.
label_col = "label"
pos_label, neg_label = "deviant", "regular"

In [4]:
# Feature groups for the classifier, split along two axes:
#   static (case attributes, known from the start) vs. dynamic (event attributes),
#   categorical vs. numeric.

# --- static (case-level) features ---
static_cat_cols = ['ApplicationType', 'LoanGoal']
static_num_cols = ['RequestedAmount']

# --- dynamic (event-level) features ---
dynamic_cat_cols = [
    activity_col,
    resource_col,
    'Action',
    'CreditScore',
    'EventOrigin',
    'lifecycle:transition',
    "Accepted",
    "Selected",
]
dynamic_num_cols = [
    'FirstWithdrawalAmount',
    'MonthlyCost',
    'NumberOfTerms',
    'OfferedAmount',
    "timesincelastevent",
    "timesincecasestart",
    "timesincemidnight",
    "event_nr",
    "month",
    "weekday",
    "hour",
    "open_cases",
]

# --- derived groupings ---
# Static columns also carry the case id and the label; dynamic columns also carry the timestamp.
static_cols = [*static_cat_cols, *static_num_cols, case_id_col, label_col]
dynamic_cols = [*dynamic_cat_cols, *dynamic_num_cols, timestamp_col]
cat_cols = [*dynamic_cat_cols, *static_cat_cols]

In [5]:
# Experiment settings wrapped in an EasyDict so they can be read as attributes
# (a later cell reads `config.max_prefix_length`).
config = edict(dict(
    # event-log column names
    case_id_col=case_id_col,
    activity_col=activity_col,
    resource_col=resource_col,
    timestamp_col=timestamp_col,
    label_col=label_col,
    # NOTE: despite the `_col` suffix, this entry stores the positive label value (`pos_label`).
    pos_label_col=pos_label,
    # feature groups
    dynamic_cat_cols=dynamic_cat_cols,
    static_cat_cols=static_cat_cols,
    dynamic_num_cols=dynamic_num_cols,
    static_num_cols=static_num_cols,
    # bounds on the prefix lengths
    min_prefix_length=1,
    max_prefix_length=20,
))

In [6]:
from dataset_column_schema import DatasetSchemas 


# Where the processed benchmark event logs live, and which BPIC 2017 file(s) to use.
data_dir = '/data/leuven/365/vsc36567/xAI-PPM/data/processed_benchmark_event_logs'
ds_name = 'bpic2017'
# Currently disabled: 'BPIC17_O_Cancelled.csv', 'BPIC17_O_Refused.csv'
ds_file_names = ['BPIC17_O_Accepted.csv']
bpic17_column_schema = DatasetSchemas.bpic2017()

# Bucketing / encoding methods chosen for this run.
bucketing_method, encoding_method = 'single', 'agg'

# Encoding method name -> names of the encoders combined for it.
encoding_dict = dict(
    laststate=["static", "last"],
    agg=["static", "agg"],
    index=["static", "index"],
    combined=["static", "last", "agg"],
)

TypeError: DatasetColumnSchema.__init__() got an unexpected keyword argument 'pos_label'

In [5]:
# Path to the 'processed_benchmark_event_logs' folder; adjust it to your environment.
data_dir = r'/data/leuven/365/vsc36567/xAI-PPM/data/processed_benchmark_event_logs'

# Dataset reference -> event-log file stems (without extension).
# The "Cancelled" and "Refused" variants are left out so that the encoded data
# fits into the available memory.
dataset_ref_to_datasets = {"bpic2017": ["BPIC17_O_Accepted"]}

# Bucketing and encoding methods; their names joined by "_" identify the method.
bucketing, encoding = 'single', 'agg'
method_name = f"{bucketing}_{encoding}"

# Encoding method name -> names of the encoders combined for it.
encoding_dict = dict(
    agg=['static', 'agg'],
    index=['static', 'index'],
)

gap = 1
train_ratio = 0.8

## Optimize hyperparameters

In [9]:
from hyperopt import Trials, STATUS_OK, tpe, fmin, hp
import hyperopt
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample
import time 
import sys
import os.path as osp

In [10]:
# Read the dataset(s), split them into train/test, generate prefixes, bucket them
# and encode every bucket into a feature matrix.
# DatasetManager splits the data and extracts the prefixes.

dataset_name = 'bpic2017'
train_prefixes = {}
test_prefixes = {}

encoder_config = bpic17_column_schema.get_encoder_args(fillna=True)
# Explicit alternative kept for reference (presumably what the schema returns -- verify):
# encoder_config = {'case_id_col': case_id_col,
#                 'static_cat_cols': static_cat_cols,
#                 'static_num_cols': static_num_cols,
#                 'dynamic_cat_cols': dynamic_cat_cols,
#                 'dynamic_num_cols': dynamic_num_cols,
#                 'fillna': True}

# The lower prefix-length bound lives in `config`; the bare name was never assigned
# in this notebook, so bind it here before it is used below.
min_prefix_length = config.min_prefix_length

start_time = time.time()
for file_name in dataset_ref_to_datasets[dataset_name]:
    activity = file_name.split('_')[-1]
    dm = DatasetManager(dataset_name, bpic17_column_schema)
    df = dm.read_dataset(osp.join(data_dir, file_name + '.csv'))
    # Cap the prefix length at the configured maximum or the value returned by
    # get_pos_case_length_quantile(df, 0.90), whichever is smaller.
    max_prefix_length = min(config.max_prefix_length, dm.get_pos_case_length_quantile(df, 0.90))

    # Splitting the data into train and test set
    train, test = dm.split_data_strict(df, train_ratio=train_ratio, split='temporal')
    print('Shape of the train set: ', train.shape, '\nShape of the test set: ', test.shape)

    # Generating prefixes
    df_train_prefixes = dm.generate_prefix_data(train, min_prefix_length, max_prefix_length)
    df_test_prefixes = dm.generate_prefix_data(test, min_prefix_length, max_prefix_length)

    # Create one bucket per prefix, or a single bucket that holds all the prefixes
    bucketer = get_bucketer(method=bucketing, case_id_col=case_id_col)
    train_bucket = bucketer.fit_predict(df_train_prefixes)
    test_bucket = bucketer.predict(df_test_prefixes)

    # Iterating over the set of generated buckets.
    # NOTE: the per-bucket results below are overwritten on every iteration, so only
    # the last bucket survives the loop.
    for bucket in set(test_bucket):

        train_bucket_ind = dm.get_indexes(df_train_prefixes)[bucket == train_bucket]
        test_bucket_ind = dm.get_indexes(df_test_prefixes)[bucket == test_bucket]

        # Extracting the train/test data that belongs to this bucket
        df_train_bucket = dm.get_data_by_indexes(df_train_prefixes, train_bucket_ind)
        df_test_bucket = dm.get_data_by_indexes(df_test_prefixes, test_bucket_ind)

        # Presumably get_labels returns a pair whose second item holds the labels -- TODO confirm.
        _, train_y = np.asarray(dm.get_labels(df_train_bucket))
        _, test_y = np.asarray(dm.get_labels(df_test_bucket))

        # Get a set of encoders for preprocessing of static and dynamic features
        featureCombinerExperiment = FeatureUnion(
                [(enc_method, get_encoder(enc_method, **encoder_config)) for enc_method in encoding_dict[encoding]])

        # Fit the encoders on the train bucket only; the test bucket is merely transformed.
        # Refitting on the test data would leak test information into the encoding and
        # could yield a feature layout that differs from the train matrix.
        encoded_train_bucket = featureCombinerExperiment.fit_transform(df_train_bucket)
        encoded_test_bucket = featureCombinerExperiment.transform(df_test_bucket)

        # Collect the feature names, in the order the transformers are listed
        enc_fnames = [
            new_fname
            for _, transformer in featureCombinerExperiment.transformer_list
            for new_fname in transformer.get_feature_names()
        ]
        # enc_fnames.append('encoded_label')

        # Create a dataframe with the encoded training features
        # (optionally with the label appended as a last column):
        # encoded_train_bucket = np.concatenate((encoded_train_bucket, train_y.reshape(-1, 1)), axis=1)
        training_set_df = pd.DataFrame(encoded_train_bucket, columns=enc_fnames)

AttributeError: 'DatasetColumnSchema' object has no attribute 'pos_label_col'

In [10]:
# Encode the train set only; the test-set encoding is commented out to stay within the allocated memory.


