In [1]:
import numpy as np
import pandas as pd
import os
from easydict import EasyDict as edict
from dataset_manager import DatasetManager
from preprocessing.bucketing import get_bucketer
from preprocessing.encoding import get_encoder
from sklearn.pipeline import FeatureUnion

In [2]:
from tqdm.notebook import tqdm
tqdm.pandas()

# Load data

In [3]:
case_id_col = "Case ID"
activity_col = "Activity"
resource_col = 'org:resource'
timestamp_col = 'time:timestamp'
label_col = "label"
pos_label = "deviant"
neg_label = "regular"

# These will be the targets of the classification task
relevant_offer_events = ["O_Cancelled", "O_Accepted", "O_Refused"]

In [4]:
resource_freq_threshold = 10
max_category_levels = 10

In [5]:
# features for classifier
# Categorical features 
dynamic_cat_cols = [activity_col, resource_col, 'Action', 'CreditScore', 'EventOrigin', 'lifecycle:transition',
                   "Accepted", "Selected"] # i.e. event attributes
static_cat_cols = ['ApplicationType', 'LoanGoal'] # i.e. case attributes that are known from the start

# Numeric features
dynamic_num_cols = ['FirstWithdrawalAmount', 'MonthlyCost', 'NumberOfTerms', 'OfferedAmount',
                   "timesincelastevent", "timesincecasestart", "timesincemidnight", "event_nr", "month", "weekday", "hour",
                    "open_cases"]
static_num_cols = ['RequestedAmount']

static_cols = static_cat_cols + static_num_cols + [case_id_col, label_col]
dynamic_cols = dynamic_cat_cols + dynamic_num_cols + [timestamp_col]
cat_cols = dynamic_cat_cols + static_cat_cols

In [6]:
config = edict({'columns': {'case_id_col': case_id_col,
                           'activity_col': activity_col,
                           'resource_col': resource_col,
                           'timestamp_col': timestamp_col,
                           'label_col': label_col,
                           'pos_label_col': pos_label,
                           'dynamic_cat_cols': dynamic_cat_cols,
                           'static_cat_cols': static_cat_cols,
                           'dynamic_num_cols': dynamic_num_cols,
                           'static_num_cols': static_num_cols}
                })

In [7]:
# Provide the path to your 'data' folder 
data_dir = r'/data/leuven/365/vsc36567/xAI-PPM/data' 
# dataset  #"BPIC2017_O_Cancelled", "BPIC2017_O_Refused"] were commented to fit the encoded data into available memory 
dataset_ref_to_datasets = {
    "bpic2017" : ["BPIC17_O_Accepted",] #"BPIC2017_O_Cancelled", "BPIC2017_O_Refused"],
}

bucketing = 'single'
encoding = 'agg'
method_name = ('_').join([bucketing, encoding])

encoding_dict = {
    'agg' : ['static', 'agg'],
    'index' : ['static', 'index']
    }

gap = 1
train_ratio = 0.5
min_cases_for_training = 1

In [9]:
# A folder for encoded datasets should be created in your 'data' folder
encoded_datasets_dir = f'/data/leuven/365/vsc36567/xAI-PPM/data/encoded_datasets_{method_name}'
if not os.path.exists(encoded_datasets_dir):
  os.makedirs(os.path.join(encoded_datasets_dir))

In [10]:
# Read the datasets
# DataManager splits the data and extracts prefixes 

dataset_name = 'bpic2017'
train_prefixes = {}
test_prefixes = {}

for file_name in dataset_ref_to_datasets[dataset_name]:
    activity = file_name.split('_')[-1]
    file_path = os.path.join(data_dir, file_name + '.csv')
    dm = DatasetManager(dataset_name, config)
    df = dm.read_dataset(file_path)

    min_prefix_length_final = 1
    max_prefix_length_final = min(20, dm.get_pos_case_length_quantile(df, 0.90))

    train, test = dm.split_data_strict(df, train_ratio=train_ratio, split='temporal')
    
    df_test_prefixes = dm.generate_prefix_data(test, min_prefix_length_final, max_prefix_length_final)
    df_train_prefixes = dm.generate_prefix_data(train, min_prefix_length_final, max_prefix_length_final)

    train_prefixes[activity] = df_train_prefixes
    test_prefixes[activity] = df_test_prefixes

    bucketer = get_bucketer(method=bucketing, case_id_col=case_id_col)
    train_bucket = bucketer.fit_predict(df_train_prefixes)
    test_bucket = bucketer.predict(df_test_prefixes)
    nr_events_all = []

100%|██████████| 19/19 [00:06<00:00,  2.81it/s]
100%|██████████| 19/19 [00:06<00:00,  2.85it/s]


In [12]:
# The train set encoding, commented for the test set for the sake of allocated memory

nr_events_all = []
current_online_times = []


encoder_config = {'case_id_col': case_id_col,
                'static_cat_cols': static_cat_cols,
                'static_num_cols': static_num_cols,
                'dynamic_cat_cols': dynamic_cat_cols,
                'dynamic_num_cols': dynamic_num_cols,
                'fillna': True}

for bucket in set(test_bucket):
    relevant_train_bucket = dm.get_indexes(df_train_prefixes)[bucket == train_bucket]
    relevant_test_bucket = dm.get_indexes(df_test_prefixes)[bucket == test_bucket]
    
    df_test_bucket = dm.get_data_by_indexes(df_test_prefixes, relevant_test_bucket)
    test_prfx_len = dm.get_prefix_lengths(df_test_bucket)[0]
    test_y = np.array([dm.get_label_numeric(df_test_bucket)])
    nr_events_all.extend(list(dm.get_prefix_lengths(df_test_bucket)))
    
    # checking presence of the train data, if exists, we proceed with the offline training
    if len(relevant_train_bucket) == 0:
        preds = [dm.get_class_ratio(train)] * len(relevant_test_bucket)
        current_online_times.extend([0] * len(preds))
    else:
        # extracting training data for the experiment
        df_train_bucket = dm.get_data_by_indexes(df_train_prefixes, relevant_train_bucket)
        train_y_experiment = np.array([dm.get_label_numeric(df_train_bucket)])
        prfx_len = dm.get_prefix_lengths(df_train_bucket)[0]
        
        encoder = get_encoder(encoding, **encoder_config)
        featureCombinerExperiment = FeatureUnion(
                [(enc_method, encoder) for enc_method in encoding_dict[encoding]])
        
        encoded_training = featureCombinerExperiment.fit_transform(df_train_bucket)
        ffeatures = featureCombinerExperiment.get_feature_names_out()
        feat_num = len(ffeatures)
        ffeatures.append('encoded_label')

        # create a dataframe with the encoded training features and label
        encoded_training = np.concatenate((encoded_training,train_y_experiment.T), axis=1)
        training_set_df = pd.DataFrame(encoded_training, columns=ffeatures)
        bkt_size = training_set_df.shape[0]

        # # create a dataframe with the encoded test features and label
        # encoded_testing_bucket = featureCombinerExperiment.fit_transform(df_test_bucket)
        # encoded_testing_bucket = np.concatenate((encoded_testing_bucket,test_y.T), axis=1)
        # testing_set_df = pd.DataFrame(encoded_testing_bucket, columns=ffeatures)
        # test_bkt_size = testing_set_df.shape[0]
        
        # save the preprocessed data and corresponding metadata into a file 
        outfile_train = 'encoded_training_%s_%s_%s_%s_%s.csv' % (
                        dataset_name, method_name, bkt_size, prfx_len, feat_num)
        training_set_df.to_csv(os.path.join(encoded_datasets_dir, outfile_train), sep=';', columns=ffeatures, index=False)
        print('%s;%s;%s;%s;%s;%s\n' % (dataset_name, method_name, 'training', bkt_size, prfx_len, feat_num))
        
        # outfile_test = 'encoded_testing_%s_%s_%s_%s_%s.csv' % (
        #                 dataset_name, method_name, test_bkt_size, test_prfx_len, feat_num)
        # testing_set_df.to_csv(os.path.join(encoded_datasets_dir, outfile_test), sep=';', columns=ffeatures, index=False)
        # print('%s;%s;%s;%s;%s;%s\n' % (dataset_name, method_name, 'testing', test_bkt_size, test_prfx_len, feat_num))


  test_prfx_len = dm.get_prefix_lengths(df_test_bucket)[0]
  prfx_len = dm.get_prefix_lengths(df_train_bucket)[0]


: 