## Import packages

In [1]:
from dataset_column_schema import DatasetSchemas 
from dataset_manager_optimized import DatasetManager

In [2]:
import os.path as osp
from experiment_runner import MLExperimentRunner

## Set up preprocessing

In [3]:
import os

# Directory holding the processed benchmark event logs. It can be overridden per
# machine through the XAI_PPM_DATA_DIR environment variable so the notebook does
# not have to be edited; the fallback is the original local Windows path.
# Cluster alternative: '/data/leuven/365/vsc36567/xAI-PPM/data/processed_benchmark_event_logs'
data_dir = os.environ.get(
    'XAI_PPM_DATA_DIR',
    r'C:\Users\sahat\OneDrive - KU Leuven\Research\PPM&xAI\data',
)

ds_name = 'bpic2017'
# Only the first entry is read further down; the other BPIC17 variants are kept
# in the trailing comment for reference.
ds_file_names = ['BPIC17_O_Accepted.csv']  # , 'BPIC17_O_Cancelled.csv', 'BPIC17_O_Refused.csv']
bpic17_column_schema = DatasetSchemas.bpic2017()

# Bucketing strategy and the key that selects an encoder combination below.
bucketing_method = 'single'
encoding_method = 'agg'

# Encoding name -> list of encoder identifiers handed to the experiment runner.
encoding_dict = dict(
    laststate=["static", "last"],
    agg=["static", "agg"],
    index=["static", "index"],
    combined=["static", "last", "agg"],
)

Define classifier arguments

TODOs: 

[ ] - try passing categorical features as they are

[ ] - omit bucketing 

[ ] - try XGBoost 

In [4]:
# Encoder arguments are derived from the dataset schema, with fillna enabled.
encoding_args = bpic17_column_schema.get_encoder_args(fillna=True)

# Classifier key and the keyword arguments forwarded for its construction.
cls_method = 'rf'
cls_args = dict(n_estimators=500, max_features='sqrt')

In [5]:
# Settings for splitting and prefix generation, reused by later cells.
random_state = 22
train_ratio = 0.8
gap = 1

# Load the event log through a dataset manager bound to the BPIC17 schema.
dm = DatasetManager(ds_name, ds_column_schema=bpic17_column_schema)
event_log_path = osp.join(data_dir, ds_file_names[0])
data = dm.read_dataset(event_log_path)

# Temporal train/test split of the loaded log.
train, test = dm.split_data_strict(data, train_ratio=train_ratio, split="temporal")
print("Shape of the train set: ", train.shape,
      "\nShape of the test set: ", test.shape)

# Cap the prefix length at 20 or at the value returned by
# get_pos_case_length_quantile(data, 0.90), whichever is smaller.
case_length_q90 = dm.get_pos_case_length_quantile(data, 0.90)
max_prefix_length = min(20, case_length_q90)
print(f"\nGenerating train and test prefixes with the max length {max_prefix_length}")

Shape of the train set:  (927785, 26) 
Shape of the test set:  (239791, 26)

Generating train and test prefixes with the max length 20


In [6]:
# Keyword arguments that are later unpacked into CrossValidationExperimentRunner.
experiment_setup_args = {
    "dataset_name": ds_name,
    "dataset_manager": dm,
    "bucket_method": bucketing_method,
    "encoding_methods": encoding_dict[encoding_method],
    "encoding_args": encoding_args,
    "cls_method": cls_method,
    "cls_args": cls_args,
    "random_state": random_state,
}

# Optimize hyperparameters

In [7]:
from experiment_runner import CrossValidationExperimentRunner


In [8]:
# Cross-validation runner built from the shared setup arguments, with 3 folds.
cv_runner = CrossValidationExperimentRunner(k_folds=3, **experiment_setup_args)

In [None]:
# Run the cross-validation experiment on the training split only. `gap` reuses
# the notebook-level setting (instead of a repeated literal) so this run and the
# final-model preprocessing cannot drift apart when the value is changed.
cv_runner.run_experiment(train, min_prefix_length=1,
                         max_prefix_length=max_prefix_length, gap=gap)

  0%|          | 0/4 [00:00<?, ?trial/s, best loss=?]

Generating prefixes:   0%|          | 0/19 [00:00<?, ?it/s]
Generating prefixes:  11%|#         | 2/19 [00:00<00:01, 13.18it/s]
Generating prefixes:  21%|##1       | 4/19 [00:00<00:01, 12.42it/s]
Generating prefixes:  32%|###1      | 6/19 [00:00<00:01, 11.11it/s]
Generating prefixes:  42%|####2     | 8/19 [00:00<00:01, 10.72it/s]
Generating prefixes:  53%|#####2    | 10/19 [00:00<00:00, 10.19it/s]
Generating prefixes:  63%|######3   | 12/19 [00:01<00:00, 10.17it/s]
Generating prefixes:  74%|#######3  | 14/19 [00:01<00:00,  9.84it/s]
Generating prefixes:  79%|#######8  | 15/19 [00:01<00:00,  9.65it/s]
Generating prefixes:  84%|########4 | 16/19 [00:01<00:00,  9.25it/s]
Generating prefixes:  89%|########9 | 17/19 [00:01<00:00,  9.05it/s]
Generating prefixes:  95%|#########4| 18/19 [00:01<00:00,  8.60it/s]
Generating prefixes: 100%|##########| 19/19 [00:01<00:00,  8.60it/s]
Generating prefixes: 100%|##########| 19/19 [00:01<00:00,  9.74it/s]
Generating prefixes:   0%|          | 0/19 [00:

Started processing the fold 0                        
       Shape of the train prefixes and labels after labels extraction: 
(3412183, 29)                                        
(313126,)                                            
       Shape of the test prefixes and labels after labels extraction: 
(1707151, 29)                                        
(156636,)                                            
                                                     
       Shape of the train bucket after encoding: 
(313126, 746)                                        
       Shape of the test bucket after encoding:      
(156636, 746)                                        
  0%|          | 0/4 [02:01<?, ?trial/s, best loss=?]

In [10]:
# Sanity check: the fold-0 train and test label counts logged by the
# cross-validation run (313126 and 156636) add up to the size of the full train
# bucket (469762) reported by the final-model preprocessing log.
313126 + 156636

469762

# Train the final model

In [6]:
# Runner used to train the final model. The seed comes from the notebook-level
# `random_state` (also used in experiment_setup_args) rather than a repeated
# literal, so it is defined in exactly one place.
exp_runner = MLExperimentRunner(ds_name, dm, bucketing_method, encoding_dict[encoding_method],
                                encoding_args, cls_method, cls_args, random_state=random_state)

In [7]:
# Generate prefixes for both splits, then bucket and encode them.
encoded_bucketed_data = exp_runner.preprocess_event_log(
    train,
    test,
    max_prefix_length=max_prefix_length,
    gap=gap,
)

Generating prefixes: 100%|██████████| 19/19 [00:10<00:00,  1.84it/s]
Generating prefixes: 100%|██████████| 19/19 [00:02<00:00,  7.83it/s]


Length of the train prefixes:  5119334
Length of the test prefixes:  1297206

Creating buckets with the "single" bucket method
    Processing bucket: 1
       Shape of the train bucket and its labels after labels extraction:  (469762, 29) (469762,)
       Shape of the test bucket and its labels after labels extraction:  (118532, 29) (118532,)

       Shape of the train bucket after encoding:  (469762, 183)
       Shape of the test bucket after encoding:  (118532, 183)
    Finished processing bucket: 1


In [8]:
# Run the experiment (fits the classifier) on the encoded train and test data.
train_encoded = encoded_bucketed_data['train']
test_encoded = encoded_bucketed_data['test']
result = exp_runner.run_experiment(train_encoded, test_encoded)

***Fitting the created RandomForestClassifier classifier***


KeyboardInterrupt: 