##############################################

############# TABLE OF CONTENTS #############

##############################################
- 1) Import packages and functions
- 2) Function for preprocessing the data
- 3) Parameters
- 4) Loop over datasets and classifiers

In [2]:
# Levels iterated by the main loop; each value is spliced into the
# 'Train_PU<level>_<dataset>.csv' and 'optimal_params_..._<level>_...' file names.
# The same list is assigned again in the parameters cell.
# NOTE(review): presumably the percentage of incomplete labels in the training set - confirm.
incomplete_levels = ['00', '25', '50', '75']

In [3]:
# Mount Google Drive at /content/drive; the dataset CSVs are read from
# /content/drive/MyDrive/... by the main loop.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Import packages and functions**

In [4]:
# functions and packages
from scipy.spatial import distance
from sklearn.metrics import roc_curve
import random
import time
from sklearn.preprocessing import MinMaxScaler


import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import roc_auc_score, accuracy_score
import numpy as np
import pandas as pd
import pickle
import os
# import packages
# packages from https://github.com/irhete/predictive-monitoring-benchmark/blob/master/experiments/experiments.py

import EncoderFactory
import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None

## Functions from Stack Overflow

In [5]:
def transform_data(dt_train, dt_test, y_train):
    """Encode the train and test prefix logs into named feature DataFrames.

    A ``FeatureUnion`` is built from one ``EncoderFactory`` encoder per entry
    of the module-level ``methods`` list (each configured with the
    module-level ``cls_encoder_args``), fitted on the training data only and
    then applied to both datasets.

    Parameters
    ----------
    dt_train : data passed to ``fit`` and ``transform`` as the training set.
    dt_test : data passed to ``transform`` as the test set.
    y_train : training labels forwarded to ``FeatureUnion.fit``.

    Returns
    -------
    (train_named, test_named) : two ``pandas.DataFrame`` objects whose columns
        carry the names reported by ``feature_combiner.get_feature_names()``.
    """
    # One (name, encoder) pair per encoding method, combined side by side.
    encoder_steps = [
        (method, EncoderFactory.get_encoder(method, **cls_encoder_args))
        for method in methods
    ]
    feature_combiner = FeatureUnion(encoder_steps)
    feature_combiner.fit(dt_train, y_train)

    def _encode(frame):
        # Transform with the fitted union, then restore the column names.
        encoded = pd.DataFrame(feature_combiner.transform(frame))
        encoded.columns = feature_combiner.get_feature_names()
        return encoded

    train_named = _encode(dt_train)
    test_named = _encode(dt_test)
    return train_named, test_named

# Functions to report label counts

In [6]:
def count_labels(data_y):
    """Print the number of entries in ``data_y``.

    Only the total size is reported; the per-class counts for the string
    labels "regular" / "deviant" are currently disabled.
    """
    total = len(data_y)
    print("total size", total)

def count_labels_number(data_y):
    """Print the number of entries in ``data_y``.

    Only the total size is reported; the per-class counts for the numeric
    labels 0 / 1 are currently disabled.
    """
    total = len(data_y)
    print("total size", total)

In [7]:
import sys

import dataset_confs

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold


class DatasetManager:
    """Read, split and prefix-expand one event log configured in ``dataset_confs``.

    The constructor looks up the column roles (case id, activity, timestamp,
    label, positive label value and the static/dynamic categorical/numeric
    column lists) for ``dataset_name`` in the ``dataset_confs`` module. All
    other methods operate on ``pandas.DataFrame`` objects that contain those
    columns, one row per event.
    """

    def __init__(self, dataset_name):
        """Load the column configuration for ``dataset_name`` from ``dataset_confs``."""
        self.dataset_name = dataset_name

        self.case_id_col = dataset_confs.case_id_col[self.dataset_name]
        self.activity_col = dataset_confs.activity_col[self.dataset_name]
        self.timestamp_col = dataset_confs.timestamp_col[self.dataset_name]
        self.label_col = dataset_confs.label_col[self.dataset_name]
        self.pos_label = dataset_confs.pos_label[self.dataset_name]

        self.dynamic_cat_cols = dataset_confs.dynamic_cat_cols[self.dataset_name]
        self.static_cat_cols = dataset_confs.static_cat_cols[self.dataset_name]
        self.dynamic_num_cols = dataset_confs.dynamic_num_cols[self.dataset_name]
        self.static_num_cols = dataset_confs.static_num_cols[self.dataset_name]

        # Column order used whenever events must be sorted deterministically.
        self.sorting_cols = [self.timestamp_col, self.activity_col]

    def read_dataset(self, datalocation):
        """Read a ';'-separated CSV event log and parse its timestamp columns.

        Categorical, case-id, label and timestamp columns are read as
        ``object``; numeric columns as ``float``. The configured timestamp
        column is always converted with ``pd.to_datetime``; for the dataset
        names listed below the hard-coded 'time:timestamp' or
        'Complete Timestamp' column is converted as well.

        Parameters
        ----------
        datalocation : path (or anything ``pd.read_csv`` accepts) of the CSV.

        Returns
        -------
        pandas.DataFrame with one row per CSV row.
        """
        object_cols = (self.dynamic_cat_cols + self.static_cat_cols
                       + [self.case_id_col, self.label_col, self.timestamp_col])
        dtypes = {col: "object" for col in object_cols}
        for col in self.dynamic_num_cols + self.static_num_cols:
            dtypes[col] = "float"

        data = pd.read_csv(datalocation, sep=";", dtype=dtypes)
        data[self.timestamp_col] = pd.to_datetime(data[self.timestamp_col])

        if self.dataset_name in ['bpic2011_f1', 'bpic2011_f2', 'bpic2011_f3', 'bpic2011_f4','bpic2015_1_f2','bpic2015_2_f2','bpic2015_3_f2','bpic2015_4_f2','bpic2015_5_f2','sepsis_cases_1','sepsis_cases_2','sepsis_cases_4']:
            data['time:timestamp'] = pd.to_datetime(data['time:timestamp'])
        if self.dataset_name in ['bpic2012_accepted', 'bpic2012_cancelled', 'bpic2012_declined']:
            data['Complete Timestamp'] = pd.to_datetime(data['Complete Timestamp'])

        return data

    def _case_start_timestamps(self, data, split="temporal", seed=22):
        """Return one row per case (case id, earliest timestamp) in split order.

        ``split == "temporal"`` orders cases by their first timestamp (stable
        sort); ``split == "random"`` seeds NumPy's global RNG with ``seed``
        and permutes the rows; any other value leaves the groupby order.
        """
        start_timestamps = data.groupby(self.case_id_col)[self.timestamp_col].min().reset_index()
        if split == "temporal":
            start_timestamps = start_timestamps.sort_values(self.timestamp_col, ascending=True, kind="mergesort")
        elif split == "random":
            np.random.seed(seed)
            start_timestamps = start_timestamps.reindex(np.random.permutation(start_timestamps.index))
        return start_timestamps

    def split_data(self, data, train_ratio, split="temporal", seed=22):
        """Split the log into train and test sets by whole cases.

        The first ``int(train_ratio * n_cases)`` cases (in the order given by
        ``split``, see ``_case_start_timestamps``) form the training set; all
        remaining cases form the test set. Both outputs are sorted by
        timestamp.

        Returns
        -------
        (train, test) : tuple of pandas.DataFrame.
        """
        start_timestamps = self._case_start_timestamps(data, split, seed)
        case_ids = list(start_timestamps[self.case_id_col])
        train_ids = case_ids[:int(train_ratio * len(case_ids))]
        in_train = data[self.case_id_col].isin(train_ids)
        train = data[in_train].sort_values(self.timestamp_col, ascending=True, kind='mergesort')
        test = data[~in_train].sort_values(self.timestamp_col, ascending=True, kind='mergesort')

        return (train, test)

    def split_data_strict(self, data, train_ratio, split="temporal"):
        """Temporal case split that truncates training events overlapping the test period.

        Cases are ordered by first timestamp (the ``split`` argument is
        accepted but not used). After the case-level split, training events
        whose timestamp is not strictly before the earliest test event are
        dropped.

        Returns
        -------
        (train, test) : tuple of pandas.DataFrame sorted by ``sorting_cols``.
        """
        data = data.sort_values(self.sorting_cols, ascending=True, kind='mergesort')
        start_timestamps = self._case_start_timestamps(data, "temporal")
        case_ids = list(start_timestamps[self.case_id_col])
        train_ids = case_ids[:int(train_ratio * len(case_ids))]
        in_train = data[self.case_id_col].isin(train_ids)
        train = data[in_train].sort_values(self.sorting_cols, ascending=True, kind='mergesort')
        test = data[~in_train].sort_values(self.sorting_cols, ascending=True, kind='mergesort')
        split_ts = test[self.timestamp_col].min()
        train = train[train[self.timestamp_col] < split_ts]
        return (train, test)

    def split_data_discard(self, data, train_ratio, split="temporal"):
        """Temporal case split that discards training cases overlapping the test period.

        Cases are ordered by first timestamp (the ``split`` argument is
        accepted but not used). After the case-level split, every training
        case with at least one event at or after the earliest test event is
        removed entirely.

        Returns
        -------
        (train, test) : tuple of pandas.DataFrame sorted by ``sorting_cols``.
        """
        data = data.sort_values(self.sorting_cols, ascending=True, kind='mergesort')
        start_timestamps = self._case_start_timestamps(data, "temporal")
        case_ids = list(start_timestamps[self.case_id_col])
        train_ids = case_ids[:int(train_ratio * len(case_ids))]
        in_train = data[self.case_id_col].isin(train_ids)
        train = data[in_train].sort_values(self.sorting_cols, ascending=True, kind='mergesort')
        test = data[~in_train].sort_values(self.sorting_cols, ascending=True, kind='mergesort')
        split_ts = test[self.timestamp_col].min()
        overlapping_cases = train[train[self.timestamp_col] >= split_ts][self.case_id_col].unique()
        train = train[~train[self.case_id_col].isin(overlapping_cases)]
        return (train, test)

    def split_val(self, data, val_ratio, split="random", seed=22):
        """Split the log into train and validation sets by whole cases.

        The last ``int(val_ratio * n_cases)`` cases (in the order given by
        ``split``, see ``_case_start_timestamps``) form the validation set.

        Returns
        -------
        (train, val) : tuple of pandas.DataFrame sorted by ``sorting_cols``.
        """
        start_timestamps = self._case_start_timestamps(data, split, seed)
        case_ids = list(start_timestamps[self.case_id_col])
        n_val = int(val_ratio * len(case_ids))
        # A plain ``case_ids[-n_val:]`` would return *every* case when n_val
        # is 0 (``[-0:]`` is the whole list), so guard that case explicitly.
        val_ids = case_ids[-n_val:] if n_val > 0 else []
        in_val = data[self.case_id_col].isin(val_ids)
        val = data[in_val].sort_values(self.sorting_cols, ascending=True, kind="mergesort")
        train = data[~in_val].sort_values(self.sorting_cols, ascending=True, kind="mergesort")
        return (train, val)

    def generate_prefix_data(self, data, min_length, max_length, gap=1):
        """Expand each case into one pseudo-case per prefix length.

        For ``min_length`` the first ``min_length`` events of every
        sufficiently long case are kept under the original case id. For each
        further length ``n`` (stepping by ``gap`` up to ``max_length``) the
        first ``n`` events are added under the case id ``"<id>_<n>"``.

        Side effect: a ``case_length`` column is added to ``data`` in place.

        Returns
        -------
        pandas.DataFrame with the extra columns ``case_length`` (capped at
        ``max_length``), ``prefix_nr`` and ``orig_case_id``.
        NOTE: the first chunk is always labelled ``prefix_nr = 1``, whatever
        ``min_length`` is.
        """
        data['case_length'] = data.groupby(self.case_id_col)[self.activity_col].transform(len)

        # .copy() so the column assignments below act on an independent frame
        # instead of a slice of `data`.
        first_chunk = data[data['case_length'] >= min_length].groupby(self.case_id_col).head(min_length).copy()
        first_chunk["prefix_nr"] = 1
        first_chunk["orig_case_id"] = first_chunk[self.case_id_col]

        # Collect the chunks and concatenate once rather than growing a
        # DataFrame inside the loop.
        chunks = [first_chunk]
        for nr_events in range(min_length+gap, max_length+1, gap):
            tmp = data[data['case_length'] >= nr_events].groupby(self.case_id_col).head(nr_events).copy()
            tmp["orig_case_id"] = tmp[self.case_id_col]
            tmp[self.case_id_col] = tmp[self.case_id_col].apply(lambda x: "%s_%s"%(x, nr_events))
            tmp["prefix_nr"] = nr_events
            chunks.append(tmp)
        dt_prefixes = pd.concat(chunks, axis=0)

        dt_prefixes['case_length'] = dt_prefixes['case_length'].apply(lambda x: min(max_length, x))

        return dt_prefixes

    def get_pos_case_length_quantile(self, data, quantile=0.90):
        """Return ceil of the ``quantile`` of per-case row counts among positive-label rows."""
        positive_rows = data[data[self.label_col] == self.pos_label]
        return int(np.ceil(positive_rows.groupby(self.case_id_col).size().quantile(quantile)))

    def get_indexes(self, data):
        """Return the index of distinct case ids (groupby order)."""
        return data.groupby(self.case_id_col).first().index

    def get_relevant_data_by_indexes(self, data, indexes):
        """Return the rows of ``data`` whose case id is contained in ``indexes``."""
        return data[data[self.case_id_col].isin(indexes)]

    def get_label(self, data):
        """Return one label per case: the first label value of each case, indexed by case id."""
        return data.groupby(self.case_id_col).first()[self.label_col]

    def get_prefix_lengths(self, data):
        """Return one ``prefix_nr`` per case: the last value of each case, indexed by case id."""
        return data.groupby(self.case_id_col).last()["prefix_nr"]

    def get_case_ids(self, data, nr_events=1):
        """Return the case ids as a Series.

        When ``nr_events > 1`` the last ``"_"``-separated token of every id
        is stripped (the suffix added by ``generate_prefix_data``).
        """
        case_ids = pd.Series(data.groupby(self.case_id_col).first().index)
        if nr_events > 1:
            case_ids = case_ids.apply(lambda x: "_".join(x.split("_")[:-1]))
        return case_ids

    def get_label_numeric(self, data):
        """Return a list with 1 for cases labelled ``pos_label`` and 0 otherwise (one entry per case)."""
        y = self.get_label(data) # one row per case
        return [1 if label == self.pos_label else 0 for label in y]

    def get_class_ratio(self, data):
        """Return the fraction of rows whose label equals ``pos_label``.

        Returns 0 instead of raising ``KeyError`` when the positive label
        does not occur in ``data``.
        """
        class_freqs = data[self.label_col].value_counts()
        return class_freqs.get(self.pos_label, 0) / class_freqs.sum()

    def get_stratified_split_generator(self, data, n_splits=5, shuffle=True, random_state=22):
        """Yield ``(train_chunk, test_chunk)`` DataFrames for a stratified k-fold over cases.

        Stratification uses the first label of each case. Both chunks are
        sorted by timestamp.
        """
        grouped_firsts = data.groupby(self.case_id_col, as_index=False).first()
        # scikit-learn rejects a random_state when shuffle is False.
        skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle,
                              random_state=random_state if shuffle else None)

        for train_index, test_index in skf.split(grouped_firsts, grouped_firsts[self.label_col]):
            # skf.split yields positional indices, hence .iloc.
            current_train_names = grouped_firsts[self.case_id_col].iloc[train_index]
            in_train = data[self.case_id_col].isin(current_train_names)
            train_chunk = data[in_train].sort_values(self.timestamp_col, ascending=True, kind='mergesort')
            test_chunk = data[~in_train].sort_values(self.timestamp_col, ascending=True, kind='mergesort')
            yield (train_chunk, test_chunk)

    def get_idx_split_generator(self, dt_for_splitting, n_splits=5, shuffle=True, random_state=22):
        """Yield ``(train_case_ids, test_case_ids)`` Series for a stratified k-fold.

        ``dt_for_splitting`` is stratified on its label column row by row.
        """
        # scikit-learn rejects a random_state when shuffle is False.
        skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle,
                              random_state=random_state if shuffle else None)

        for train_index, test_index in skf.split(dt_for_splitting, dt_for_splitting[self.label_col]):
            # skf.split yields positional indices; .iloc keeps this correct
            # even when dt_for_splitting does not have a default RangeIndex.
            current_train_names = dt_for_splitting[self.case_id_col].iloc[train_index]
            current_test_names = dt_for_splitting[self.case_id_col].iloc[test_index]
            yield (current_train_names, current_test_names)
            

# Parameters

In [8]:
# PARAMETERS
params_dir = './params_dir_ML'
results_dir = './results_dir_ML'
column_selection = 'all'
train_ratio = 0.8
n_splits = 3
random_state = 22
n_iter = 1

# create results directory (exist_ok avoids a separate existence check)
os.makedirs(results_dir, exist_ok=True)

# encoding name -> list of encoder methods combined for that encoding
encoding_dict = {
    "agg": ["static", "agg"],
    # "index": ["static", "index"]
}
encoding = list(encoding_dict)

# dataset family -> CSV base names (as used in the file names on Drive)
csv_files = {
    "bpic2011": ["BPIC11_f%s"%formula for formula in range(4,5)],
    "bpic2015": ["BPIC15_%s_f2"%(municipality) for municipality in range(1,4)],
    #"sepsis_cases": ["sepsis_cases_1", "sepsis_cases_2", "sepsis_cases_4"],
    #"bpic2012": ["bpic2012_O_ACCEPTED#COMPLETE","bpic2012_O_CANCELLED-COMPLETE","bpic2012_0_DECLINED-COMPLETE"],
    #"production": ["Production"],
    #"bpic2017": ["BPIC17_O_Accepted","BPIC17_O_Cancelled","BPIC17_0_Refused"],
    #"bpic2017": ["BPIC17_O_Cancelled"],
    #"traffic_fines": ["traffic_fines_%s"%formula for formula in range(1,3)],
    #"hospital_billing": ["hospital_billing_%s"%suffix for suffix in [2,3]]
}
# dataset family -> dataset names (keys of the dataset_confs dictionaries)
dataset_ref_to_datasets = {
    "bpic2011": ["bpic2011_f%s"%formula for formula in range(4,5)],
    "bpic2015": ["bpic2015_%s_f2"%(municipality) for municipality in range(1,4)],
    #"sepsis_cases": ["sepsis_cases_1", "sepsis_cases_2", "sepsis_cases_4"]
    #"bpic2012": ["bpic2012_accepted","bpic2012_cancelled","bpic2012_declined"],
    #"production": ["production"],
    #"bpic2017": ["bpic2017_cancelled"],
    #"bpic2017": ["bpic2017_accepted","bpic2017_cancelled","bpic2017_refused"],
    #"traffic_fines": ["traffic_fines_%s"%formula for formula in range(1,3)],
    #"hospital_billing": ["hospital_billing_%s"%suffix for suffix in [2,3]]
}

# Flatten both dictionaries in insertion order.
files = [csv_name for csv_names in csv_files.values() for csv_name in csv_names]
datasets = [name for names in dataset_ref_to_datasets.values() for name in names]

# `res` pairs the two lists by position, so they must stay in step: fail
# loudly if one dictionary was edited without the other.
if len(files) != len(datasets):
    raise ValueError("csv_files lists %d files but dataset_ref_to_datasets lists %d datasets"
                     % (len(files), len(datasets)))
# dataset name -> CSV base name
res = dict(zip(datasets, files))

# classifiers dictionary
classifier_ref_to_classifiers = {
     "MLmodels": ["XGB"],
   }
classifiers = [name for names in classifier_ref_to_classifiers.values() for name in names]
incomplete_levels = ['00', '25', '50', '75']

# **Loop over datasets and classifiers**

In [9]:
# Train one XGBoost classifier per (dataset, classifier, encoding, level)
# combination, evaluate it on the shared test set and write the AUC overall
# and per prefix length to a CSV that is then downloaded from Colab.

# Imported under an alias so it does not overwrite the `files` list of CSV
# names defined in the parameters cell.
from google.colab import files as colab_files

# Folder on the mounted Drive holding 'Original_data/' and 'Data/'.
data_root = '/content/drive/MyDrive/PU'

# Every result row has the same seven ';'-separated fields.
row_format = "%s;%s;%s;%s;%s;%s;%s\n"

for dataset_name in datasets:
    # Everything in this section depends on the dataset only, so it is done
    # once per dataset instead of once per level.
    dataset_manager = DatasetManager(dataset_name)
    dataset_name_csv = res[dataset_name]
    data = dataset_manager.read_dataset(data_root + '/Original_data/' + dataset_name_csv + '.csv')

    # Read as a module-level name by transform_data.
    cls_encoder_args = {'case_id_col': dataset_manager.case_id_col,
                        'static_cat_cols': dataset_manager.static_cat_cols,
                        'static_num_cols': dataset_manager.static_num_cols,
                        'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                        'dynamic_num_cols': dataset_manager.dynamic_num_cols,
                        'fillna': True}

    # determine min and max (truncated) prefix lengths
    min_prefix_length = 1
    if "traffic_fines" in dataset_name:
        max_prefix_length = 10
    elif "bpic2017" in dataset_name:
        max_prefix_length = min(20, dataset_manager.get_pos_case_length_quantile(data, 0.90))
    else:
        max_prefix_length = min(40, dataset_manager.get_pos_case_length_quantile(data, 0.90))

    for cls_method in classifiers:
        for cls_encoding in encoding:
            method_name = "%s_%s" % (column_selection, cls_encoding)
            # Read as a module-level name by transform_data.
            methods = encoding_dict[cls_encoding]

            for level in incomplete_levels:
                print('Dataset:', dataset_name)
                print('Classifier', cls_method)
                print('Encoding', cls_encoding)
                print('Level', level)

                # extract the optimal parameters
                optimal_params_filename = "optimal_params_%s_%s_%s_%s.pickle" % (cls_method, dataset_name, level, method_name)
                if not os.path.isfile(optimal_params_filename) or os.path.getsize(optimal_params_filename) <= 0:
                    # Fail with a clear message instead of printing 'problem'
                    # and then crashing inside open()/pickle.load().
                    raise FileNotFoundError("optimal-parameter file %r is missing or empty"
                                            % optimal_params_filename)
                # NOTE: pickle.load can execute arbitrary code; only load
                # parameter files produced by a trusted tuning run.
                with open(optimal_params_filename, "rb") as fin:
                    args = pickle.load(fin)
                print(args)

                # file to save results (written to the working directory)
                outfile = "performance_results_%s_%s_%s_%s.csv" % (cls_method, dataset_name, level, method_name)

                # pre-split training data for this level and the shared test data
                train = dataset_manager.read_dataset(data_root + '/Data/Train_PU' + level + '_' + dataset_name_csv + '.csv')
                test = dataset_manager.read_dataset(data_root + '/Data/Test_' + dataset_name_csv + '.csv')

                # prefix generation of train and test data
                dt_train_prefixes = dataset_manager.generate_prefix_data(train, min_prefix_length, max_prefix_length)
                dt_test_prefixes = dataset_manager.generate_prefix_data(test, min_prefix_length, max_prefix_length)
                test_y = dataset_manager.get_label_numeric(dt_test_prefixes)
                train_y = dataset_manager.get_label_numeric(dt_train_prefixes)
                dt_train_named, dt_test_named = transform_data(dt_train_prefixes, dt_test_prefixes, train_y)

                # DELETE THIS LATER
                count_labels_number(train_y)
                count_labels_number(test_y)

                nr_events_all = list(dataset_manager.get_prefix_lengths(dt_test_prefixes))
                test_y_all = list(test_y)

                # MODEL
                cls = xgb.XGBClassifier(objective='binary:logistic',
                                        n_estimators=500,
                                        learning_rate=args['learning_rate'],
                                        subsample=args['subsample'],
                                        max_depth=int(args['max_depth']),
                                        colsample_bytree=args['colsample_bytree'],
                                        min_child_weight=int(args['min_child_weight']),
                                        seed=random_state)
                cls.fit(dt_train_named, train_y)

                # predictions: probability of the positive class (label 1)
                preds_pos_label_idx = np.where(cls.classes_ == 1)[0][0]
                preds_all = list(cls.predict_proba(dt_test_named)[:, preds_pos_label_idx])

                auc_total = roc_auc_score(test_y_all, preds_all)
                print(auc_total)

                dt_results = pd.DataFrame({"actual": test_y_all, "predicted": preds_all, "nr_events": nr_events_all})
                with open(outfile, 'w') as fout:
                    # The header and the total row now carry the same seven
                    # fields as the per-prefix rows (n_iter is always -1), so
                    # the file parses as a regular CSV.
                    fout.write(row_format % ("dataset", "method", "cls", "nr_events", "n_iter", "metric", "score"))
                    for nr_events, group in dt_results.groupby("nr_events"):
                        # AUC is undefined when a prefix length has one class only.
                        if len(set(group.actual)) < 2:
                            group_auc = np.nan
                        else:
                            group_auc = roc_auc_score(group.actual, group.predicted)
                        fout.write(row_format % (dataset_name, method_name, cls_method, nr_events, -1,
                                                 "auc", group_auc))
                    # nr_events == -1 marks the score over all prefix lengths.
                    fout.write(row_format % (dataset_name, method_name, cls_method, -1, -1,
                                             "auc", auc_total))
                colab_files.download(outfile)


Dataset: bpic2011_f4
Classifier XGB
Encoding agg
{'colsample_bytree': 0.6456099376490885, 'learning_rate': 0.0328658836622191, 'max_depth': 17, 'min_child_weight': 6, 'subsample': 0.9560784753132745}
total size 21902
total size 7491
0.8652857322227787


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Dataset: bpic2011_f4
Classifier XGB
Encoding agg
{'colsample_bytree': 0.9747371891675085, 'learning_rate': 0.011089509060744018, 'max_depth': 22, 'min_child_weight': 5, 'subsample': 0.7768664011411401}
total size 21902
total size 7491
0.8550003085297818


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Dataset: bpic2011_f4
Classifier XGB
Encoding agg
{'colsample_bytree': 0.9224852779417818, 'learning_rate': 0.15536730191157233, 'max_depth': 23, 'min_child_weight': 4, 'subsample': 0.508197961285134}
total size 21902
total size 7491
0.8136121120631707


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Dataset: bpic2011_f4
Classifier XGB
Encoding agg
{'colsample_bytree': 0.5249164443878849, 'learning_rate': 0.14712433401218095, 'max_depth': 19, 'min_child_weight': 4, 'subsample': 0.9322228144336675}
total size 21902
total size 7491
0.7205410968112052


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Dataset: bpic2015_1_f2
Classifier XGB
Encoding agg
{'colsample_bytree': 0.9346091294700718, 'learning_rate': 0.04045200563382756, 'max_depth': 28, 'min_child_weight': 3, 'subsample': 0.6639032370830285}
total size 18345
total size 4876
0.9175938261029121


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Dataset: bpic2015_1_f2
Classifier XGB
Encoding agg
{'colsample_bytree': 0.5033332972450442, 'learning_rate': 0.012333759007818967, 'max_depth': 23, 'min_child_weight': 5, 'subsample': 0.795416567381228}
total size 18345
total size 4876
0.9194024357274774


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Dataset: bpic2015_1_f2
Classifier XGB
Encoding agg
{'colsample_bytree': 0.6853530947220043, 'learning_rate': 0.304677988480315, 'max_depth': 17, 'min_child_weight': 2, 'subsample': 0.7834627111709327}
total size 18345
total size 4876
0.9040597629416344


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Dataset: bpic2015_1_f2
Classifier XGB
Encoding agg
{'colsample_bytree': 0.7606251756319556, 'learning_rate': 0.0011458843445060207, 'max_depth': 22, 'min_child_weight': 3, 'subsample': 0.8667085427407879}
total size 18345
total size 4876
0.7618649156007508


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Dataset: bpic2015_2_f2
Classifier XGB
Encoding agg
{'colsample_bytree': 0.6563490128331088, 'learning_rate': 0.17540886525849986, 'max_depth': 24, 'min_child_weight': 4, 'subsample': 0.6948134652058899}
total size 22221
total size 5789
0.9472852562759042


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Dataset: bpic2015_2_f2
Classifier XGB
Encoding agg
{'colsample_bytree': 0.6662850106503915, 'learning_rate': 0.050503104929724096, 'max_depth': 18, 'min_child_weight': 4, 'subsample': 0.9220500938117628}
total size 22221
total size 5789
0.9529500023551792


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Dataset: bpic2015_2_f2
Classifier XGB
Encoding agg
{'colsample_bytree': 0.6780879169305561, 'learning_rate': 0.03937952355109364, 'max_depth': 15, 'min_child_weight': 4, 'subsample': 0.8439683615191149}
total size 22221
total size 5789
0.9149933730876053


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Dataset: bpic2015_2_f2
Classifier XGB
Encoding agg
{'colsample_bytree': 0.6320642022815781, 'learning_rate': 0.020401364008479916, 'max_depth': 15, 'min_child_weight': 2, 'subsample': 0.5059281587475952}
total size 22221
total size 5789
0.9097082646045958


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Dataset: bpic2015_3_f2
Classifier XGB
Encoding agg
{'colsample_bytree': 0.807530409018419, 'learning_rate': 0.053351866886983124, 'max_depth': 4, 'min_child_weight': 5, 'subsample': 0.8787421775126139}
total size 37400
total size 10041
0.9625274600830885


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Dataset: bpic2015_3_f2
Classifier XGB
Encoding agg
{'colsample_bytree': 0.8211704176073136, 'learning_rate': 0.15463959216079115, 'max_depth': 18, 'min_child_weight': 3, 'subsample': 0.9664162632101461}
total size 37400
total size 10041
0.9410364907632327


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Dataset: bpic2015_3_f2
Classifier XGB
Encoding agg
{'colsample_bytree': 0.8490469694021753, 'learning_rate': 0.041587254117149586, 'max_depth': 16, 'min_child_weight': 3, 'subsample': 0.5737684721404155}
total size 37400
total size 10041
0.9423060349628032


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Dataset: bpic2015_3_f2
Classifier XGB
Encoding agg
{'colsample_bytree': 0.7114591941259321, 'learning_rate': 0.009674548119496529, 'max_depth': 12, 'min_child_weight': 3, 'subsample': 0.7806593413631935}
total size 37400
total size 10041
0.9302066066154835


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
# Display the hyperparameters left in `args` by the last iteration of the loop above.
args

{'colsample_bytree': 0.7114591941259321,
 'learning_rate': 0.009674548119496529,
 'max_depth': 12,
 'min_child_weight': 3,
 'subsample': 0.7806593413631935}