In [1]:
import os.path as osp
import time

import numpy as np
import pandas as pd

from dataset_column_schema import DatasetSchemas
from dataset_manager_optimized import DatasetManager, CVFoldsManager

In [2]:
from easydict import EasyDict as edict
from model_trainer import ModelTrainingPipeline

In [7]:
# Dataset identifier and the directory holding its event-log CSV files.
ds_name = "bpic2017"
data_dir = "/data/leuven/365/vsc36567/xAI-PPM/data/processed_benchmark_event_logs"

# Only the "Accepted" log is processed for now; 'BPIC17_O_Cancelled.csv' and
# 'BPIC17_O_Refused.csv' were previously listed here and are currently disabled.
ds_file_names = [
    "BPIC17_O_Accepted.csv",
]

# Column schema object for this dataset, obtained from DatasetSchemas.
bpic17_column_schema = DatasetSchemas.bpic2017()

# Bucketing and encoding strategies selected for this run.
bucketing_method = "single"
encoding_method = "agg"

# Maps each encoding strategy name to the list of encoder names it combines.
encoding_dict = dict(
    laststate=["static", "last"],
    agg=["static", "agg"],
    index=["static", "index"],
    combined=["static", "last", "agg"],
)

In [5]:
# Preprocessing settings handed to DatasetManager and read by the experiment
# loop (prefix-length bounds, train ratio, encoder selection).
preprocessing_config = edict({
    'column_schema': bpic17_column_schema,
    # Use the notebook-level selection rather than a second hard-coded literal,
    # so changing `bucketing_method` in the configuration cell takes effect here.
    'bucketing_method': bucketing_method,
    # Encoder names for the selected encoding strategy.
    'encoding_methods': encoding_dict[encoding_method],
    # Encoder keyword arguments derived from the column schema.
    'encoding_args': bpic17_column_schema.get_encoder_args(fillna=True),
    # Bounds on the prefix lengths generated from each case; the upper bound
    # is further capped in the experiment loop.
    'min_prefix_length': 1,
    'max_prefix_length': 20,
    # NOTE(review): `gap` is not read in the visible cells; presumably the
    # step between generated prefix lengths -- confirm against DatasetManager.
    'gap': 1,
    # Share of the data assigned to the training split.
    'train_ratio': 0.8,
})

# Classifier selection and its hyperparameters.
# NOTE(review): 'rf' presumably selects a random forest -- confirm against the
# code that consumes `cls_method`.
cls_method = "rf"
cls_args = dict(n_estimators=500, max_features=10)

In [None]:
# Experiment loop. For every configured event-log file:
#   load -> temporal train/test split -> prefix generation -> bucketing ->
#   per-bucket feature encoding -> DataFrame of encoded training features.
#
# NOTE(review): `FeatureUnion`, `get_bucketer`, `get_encoder` and `case_id_col`
# are not defined in the cells visible here; they must be provided by an
# import/definition cell that runs before this one.
start_time = time.time()
for file_name in ds_file_names:

    # `ds_name` is the dataset identifier set in the configuration cell.
    dm = DatasetManager(ds_name, preprocessing_config)
    df = dm.read_dataset(osp.join(data_dir, file_name))
    min_prefix_length = preprocessing_config.min_prefix_length
    # Use the configured maximum unless the 0.90 case-length quantile reported
    # by the dataset manager is smaller.
    max_prefix_length = min(preprocessing_config.max_prefix_length,
                            dm.get_pos_case_length_quantile(df, 0.90))

    # Splitting the data into train and test set
    train, test = dm.split_data_strict(df,
                                       train_ratio=preprocessing_config.train_ratio,
                                       split='temporal')
    print('Shape of the train set: ', train.shape, '\nShape of the test set: ', test.shape)

    # Generating prefixes: training prefixes come from the train split and
    # test prefixes from the test split (the two must not be swapped).
    df_train_prefixes = dm.generate_prefix_data(train, min_prefix_length, max_prefix_length)
    df_test_prefixes = dm.generate_prefix_data(test, min_prefix_length, max_prefix_length)

    # Create buckets for each prefix, or a single one that fits all the prefixes.
    # The bucketer is fitted on the training prefixes only and then applied to
    # the test prefixes.
    bucketer = get_bucketer(method=bucketing_method, case_id_col=case_id_col)
    train_bucket = bucketer.fit_predict(df_train_prefixes)
    test_bucket = bucketer.predict(df_test_prefixes)

    # Iterating over the set of generated buckets
    for bucket in set(test_bucket):

        # Select the entries assigned to the current bucket
        train_bucket_ind = dm.get_indexes(df_train_prefixes)[bucket == train_bucket]
        test_bucket_ind = dm.get_indexes(df_test_prefixes)[bucket == test_bucket]

        # Extracting the train and test data for the current bucket
        df_train_bucket = dm.get_data_by_indexes(df_train_prefixes, train_bucket_ind)
        df_test_bucket = dm.get_data_by_indexes(df_test_prefixes, test_bucket_ind)

        # NOTE(review): unpacking assumes `get_labels` yields exactly two
        # items, of which the second holds the labels -- confirm.
        _, train_y = np.asarray(dm.get_labels(df_train_bucket))
        _, test_y = np.asarray(dm.get_labels(df_test_bucket))

        # Get a set of encoders for preprocessing of static and dynamic features.
        # NOTE(review): assumes `get_encoder` accepts the keyword arguments
        # stored in `preprocessing_config.encoding_args` -- confirm.
        featureCombinerExperiment = FeatureUnion(
            [(enc_method, get_encoder(enc_method, **preprocessing_config.encoding_args))
             for enc_method in encoding_dict[encoding_method]])

        # Fit the encoders on the training bucket only; the test bucket is
        # transformed with the already-fitted encoders so that no information
        # from the test data leaks into the encoding.
        encoded_train_bucket = featureCombinerExperiment.fit_transform(df_train_bucket)
        encoded_test_bucket = featureCombinerExperiment.transform(df_test_bucket)

        # Collect the encoded feature names in transformer order
        enc_fnames = [new_fname
                      for _, transformer in featureCombinerExperiment.transformer_list
                      for new_fname in transformer.get_feature_names()]

        # Create a dataframe with the encoded training features.
        # The label column is currently not appended; to add it, concatenate
        # train_y.reshape(-1, 1) to the encoded features and append
        # 'encoded_label' to enc_fnames.
        training_set_df = pd.DataFrame(encoded_train_bucket, columns=enc_fnames)
