Control-Flow-Prediction: Comparison of Models
--

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from exp.eventlog import *
from exp.dl_utils import *
from exp.control_flow_prediction import *

In [3]:
#export
def _expand_path(fpath): return Path(fpath).expanduser()

class Config():
    "Creates a default config file 'config.yml' in $FASTPM_HOME (default `~/.fastpm/`)"
    DEFAULT_CONFIG_LOCATION = os.path.expanduser(os.getenv('FASTPM_HOME', '~/.fastpm'))
    DEFAULT_CONFIG_PATH = DEFAULT_CONFIG_LOCATION + '/config.yml'
    # Fallback values used by `get_key` when a key is absent from the config file.
    DEFAULT_CONFIG = {
        'data_path': DEFAULT_CONFIG_LOCATION + '/data',
        'model_path': DEFAULT_CONFIG_LOCATION + '/model'
    }

    @classmethod
    def get_key(cls, key):
        "Get the value of `key` from the config file, falling back to `DEFAULT_CONFIG` and then `None`."
        # Guard against a falsy config (e.g. an empty file parsed as `None`).
        cf = cls.get() or {}
        return cf.get(key, cls.DEFAULT_CONFIG.get(key, None))

    @classmethod
    def get_path(cls, path):
        "Get the value stored under the key `path` as an expanded `Path`; raises `TypeError` if it is unset."
        value = cls.get_key(path)
        if value is None:
            # `Path(None)` would raise an opaque TypeError; keep the type but name the missing key.
            raise TypeError(f"Config key '{path}' is not set in the config file and has no default value")
        return _expand_path(value)

    @classmethod
    def data_path(cls):
        "Get the path to data in the config file."
        return cls.get_path('data_path')

    @classmethod
    def data_archive_path(cls):
        "Get the path to data archives in the config file (no default: the key must be set in the file)."
        return cls.get_path('data_archive_path')

    @classmethod
    def get_model(cls, path):
        "Same lookup as `get_path`: get the value stored under the key `path` as an expanded `Path`."
        return cls.get_path(path)

    @classmethod
    def model_path(cls):
        "Get the path to model in the config file."
        return cls.get_path('model_path')

    @classmethod
    def model_archive_path(cls):
        "Get the path to model archives in the config file (no default: the key must be set in the file)."
        return cls.get_path('model_archive_path')

    @classmethod
    def get(cls, fpath=None, create_missing=True):
        "Retrieve the `Config` in `fpath` (default `DEFAULT_CONFIG_PATH`), creating it first if `create_missing`."
        fpath = _expand_path(fpath or cls.DEFAULT_CONFIG_PATH)
        if not fpath.exists() and create_missing: cls.create(fpath)
        assert fpath.exists(), f'Could not find config at: {fpath}. Please create'
        with open(fpath, 'r') as yaml_file:
            # `yaml.safe_load` returns `None` for an empty document; hand callers an empty mapping instead.
            return yaml.safe_load(yaml_file) or {}

    @classmethod
    def create(cls, fpath):
        "Write `DEFAULT_CONFIG` to the '.yml' file `fpath`; does nothing if the file already exists."
        fpath = _expand_path(fpath)
        assert fpath.suffix == '.yml', f'Config file must have a .yml suffix: {fpath}'
        if fpath.exists(): return
        fpath.parent.mkdir(parents=True, exist_ok=True)
        with open(fpath, 'w') as yaml_file:
            yaml.dump(cls.DEFAULT_CONFIG, yaml_file, default_flow_style=False)

### Load and Preprocess Data

In [None]:
# Load both event logs up front via `import_xes(untar_data(...))` and keep only their `.events`.
# NOTE(review): this module-level `data_sets` list is not read by any later cell visible here.
bpi2012, bpi2017 = [
    import_xes(untar_data(url)).events
    for url in (URLs.BPIC_2012, URLs.BPIC_2017)
]
data_sets = [bpi2012, bpi2017]

In [18]:
# Dataset identifiers handed to `load_data`, and the model names handed to `train_model`.
datasets = [URLs.BPIC_2012, URLs.BPIC_2017]
models = ["awd_lstm"]

# Keys read by `process_data_set`.
processing_params = {
    'seed': 42,
    'bs': 64,
    'bptt': 70,
}

# Per-model settings read by `train_model`; "cbs" is forwarded to the learner as `cb_funcs`.
training_params = {
    "awd_lstm": {
        "emb_sz": 300,
        "nh": 300,
        "nl": 2,
        "cbs": [
            partial(AvgStatsCallback, accuracy_flat),
            CudaCallback,
            Recorder,
            partial(GradientClipping, clip=0.1),
            partial(RNNTrainer, α=2., β=1.),
            ProgressBarCallback,
        ],
    },
}

# Keys read by the `measure_*` functions.
evaluation_params = {
    "bs": 64,
    "bptt": 70,
}

In [29]:
def load_data(dataset):
    "Return the `.events` of the log that `import_xes` builds from `untar_data(dataset)`."
    extracted = untar_data(dataset)
    log = import_xes(extracted)
    return log.events

def func(t, data):
    "Sort key: the length of the first element of item `int(t)` in `data`."
    item = data[int(t)]
    first = item[0]
    return len(first)

def process_data_set(data_set, processing_params):
    """Split, tokenize and numericalize an event log and wrap it for language-model training.

    Args:
        data_set: the events table handed to `ControlFlowList.from_df`.
        processing_params: mapping read for the keys "seed", "bs" and "bptt".

    Returns:
        A tuple `(data, test_x, vocab)`: the result of `lm_databunchify(ll, bs, bptt)`,
        `ll.test.x`, and the vocabulary built by the numericalize processor.
    """
    import random

    seed = processing_params.get("seed")
    bs = processing_params.get("bs")
    bptt = processing_params.get("bptt")

    # The configured seed used to be read but never applied, so the split changed on every run.
    # assumes `random_splitter` draws from Python's global `random` module — TODO confirm
    if seed is not None:
        random.seed(seed)

    cfl = ControlFlowList.from_df(data_set)
    proc_tok = TokenizeProcessor()
    proc_num = NumericalizeProcessor()

    # Split exactly once; an earlier duplicate split was computed and then thrown away.
    sd = SplitData.split_by_func(cfl, partial(random_splitter, p_valid=0.1))
    # Every item gets the dummy label 0; only the inputs go through the processors.
    ll = label_by_func(sd, lambda x: 0, proc_x=[proc_tok, proc_num])
    vocab = proc_num.vocab
    data = lm_databunchify(ll, bs, bptt)

    # NOTE(review): relies on the labelled split exposing a `.test` attribute — confirm.
    return data, ll.test.x, vocab

def train_model(model, train, vocab, training_params):
    """Build and fit the learner for the architecture named `model`.

    Args:
        model: model name; only "awd_lstm" is implemented.
        train: data object passed to `Learner` as its data argument.
        vocab: vocabulary list; its length sizes the model and `PAD` is looked up in it.
        training_params: mapping from model name to a settings dict with the keys
            "emb_sz", "nh", "nl", "cbs" and, optionally, "lr" (default 5e-3) and
            "epochs" (default 20).

    Returns:
        The fitted learner.

    Raises:
        NotImplementedError: for "basic", which is declared but not implemented yet.
        ValueError: for any other unknown model name.
    """
    # Fail loudly instead of silently returning None, which only surfaced later
    # as an obscure error inside the evaluation functions.
    if model == "basic":
        raise NotImplementedError("model 'basic' is not implemented yet")
    if model != "awd_lstm":
        raise ValueError(f"unknown model: {model!r}")

    parameters = training_params.get("awd_lstm")
    emb_sz = parameters.get("emb_sz")
    nh = parameters.get("nh")
    nl = parameters.get("nl")
    cbs = parameters.get("cbs")
    lr = parameters.get("lr", 5e-3)
    epochs = parameters.get("epochs", 20)

    # `PAD` is expected to be defined at module level (not visible in this cell).
    tok_pad = vocab.index(PAD)
    awd_lstm = get_language_model(len(vocab), emb_sz, nh, nl, tok_pad, input_p=0.6,
                                  output_p=0.4, weight_p=0.5, embed_p=0.1, hidden_p=0.2)
    learner = Learner(awd_lstm, train, cross_entropy_flat, lr=lr, cb_funcs=cbs, opt_func=adam_opt())
    learner.fit(epochs)
    return learner

def measure_next_step_prediction(model, test, evaluation_params):
    """Evaluate `model` on next-step prediction over `test`.

    Args:
        model: the trained learner handed to `predict_next_step`.
        test: test items handed to `process_data_for_next_step_prediction`.
        evaluation_params: mapping read for the key "bs".

    Returns:
        Whatever `predict_next_step(model, test_dl)` returns.
    """
    bs = evaluation_params.get("bs")

    x, y = process_data_for_next_step_prediction(test)
    pd_data = Dataset(x, y)
    # The sampler is keyed on `func`, i.e. the length of each sample's first element.
    test_sampler = SortSampler(pd_data.x, key=partial(func, data=pd_data))

    # The evaluation batch size is 64 times the configured `bs`.
    test_dl = DataLoader(pd_data, batch_size=bs*64, sampler=test_sampler,
                         collate_fn=partial(pad_collate, pad_first=True))
    return predict_next_step(model, test_dl)

def measure_suffix_prediction(model, test, evaluation_params):
    """Evaluate `model` on suffix prediction over `test`.

    Args:
        model: the trained learner handed to `predict_suffix`.
        test: test items handed to `process_data_for_suffix_prediction`.
        evaluation_params: mapping read for the key "bs".

    Returns:
        Whatever `predict_suffix(model, test_dl)` returns.
    """
    bs = evaluation_params.get("bs")

    x, y = process_data_for_suffix_prediction(test)
    pd_data = Dataset(x, y)
    # The sampler is keyed on `func`, i.e. the length of each sample's first element.
    test_sampler = SortSampler(pd_data.x, key=partial(func, data=pd_data))

    # The evaluation batch size is 8 times the configured `bs`.
    test_dl = DataLoader(pd_data, batch_size=bs*8, sampler=test_sampler, collate_fn=pad_collate_sp)
    return predict_suffix(model, test_dl)

In [30]:
def run(data_sets=datasets, models=models, processing_params=processing_params,
         training_params=training_params, evaluation_params=evaluation_params):
    """Train and evaluate every model on every dataset and print both scores.

    Args:
        data_sets: iterable of dataset identifiers; each one is passed to `load_data`.
        models: iterable of model names understood by `train_model`.
        processing_params: options forwarded to `process_data_set`.
        training_params: options forwarded to `train_model`.
        evaluation_params: options forwarded to both `measure_*` functions.
    """
    # Iterate the `data_sets` argument; the loop used to read the global `datasets`,
    # so a caller-supplied list was silently ignored.
    for dataset in progress_bar(data_sets):
        data_set = load_data(dataset)
        train_valid, test, vocab = process_data_set(data_set, processing_params)
        for model in progress_bar(models):
            # Lower-case local so it cannot be confused with the `Learner` class.
            learner = train_model(model, train_valid, vocab, training_params)
            next_step_score = measure_next_step_prediction(learner, test, evaluation_params)
            suffix_score = measure_suffix_prediction(learner, test, evaluation_params)

            # Report the dataset identifier, not the loaded events object.
            print("For {} and {}:\n--> Next-Step-Prediction value = {}\n--> Suffix-Prediction value = {}".format(model, dataset, next_step_score, suffix_score))

In [31]:
# Run the full comparison with the default datasets, models and parameter dicts.
run()

epoch,train_loss,train_accuracy_flat,valid_loss,valid_accuracy_flat,time
0,1.667903,0.514955,0.819049,0.741109,00:00
1,0.728369,0.768969,0.542085,0.822433,00:00
2,0.548446,0.819465,0.487693,0.835789,00:00
3,0.49428,0.828904,0.480681,0.835975,00:00
4,0.469748,0.831537,0.45579,0.837872,00:00
5,0.455384,0.835833,0.446516,0.845685,00:00
6,0.420722,0.84365,0.44497,0.844085,00:00
7,0.439497,0.836748,0.443246,0.844382,00:00
8,0.409474,0.847636,0.439187,0.845052,00:00
9,0.42291,0.841577,0.442767,0.844717,00:00


For awd_lstm:
--> Next-Step-Prediction value = 0.8392466902732849
-->Suffix-Prediction value = 0.4935105666115253


epoch,train_loss,train_accuracy_flat,valid_loss,valid_accuracy_flat,time
0,0.809045,0.749446,0.446455,0.853208,00:04
1,0.411956,0.860226,0.421665,0.862368,00:04
2,0.380171,0.868087,0.416318,0.864434,00:04
3,0.367261,0.871355,0.413813,0.86517,00:04
4,0.360786,0.872887,0.407908,0.865906,00:04
5,0.350873,0.874865,0.412993,0.865757,00:04
6,0.349431,0.874386,0.40786,0.86751,00:04
7,0.347484,0.874557,0.406775,0.867113,00:04
8,0.343666,0.875631,0.399473,0.868833,00:04
9,0.343626,0.875465,0.404253,0.867642,00:04


For awd_lstm:
--> Next-Step-Prediction value = 0.8818588256835938
-->Suffix-Prediction value = 0.4519359869893541


In [None]:
class Model:
    "Placeholder for a model abstraction; nothing is implemented yet."

    # This used to be a bare call in the class body, which runs at class-definition time
    # against a name that is not defined in this file. Declared as a method stub instead.
    def prepare_training_Databunch(self):
        "Stub: not implemented yet."
        raise NotImplementedError("Model.prepare_training_Databunch is not implemented yet")

In [None]:

def func1():
    "Handler registered under key 'A'; prints its own name."
    print('func1')


def func2():
    "Handler registered under key 'B'; prints its own name."
    print('func2')


def defaultfunc():
    "Fallback handler used when the looked-up key is not registered."
    print('default')


# Dispatch table: key -> zero-argument handler.
dictc = {
    'A': func1,
    'B': func2,
}

# '...' is not a registered key, so the lookup falls back to `defaultfunc`.
s = '...'
dictc.get(s, defaultfunc)()
    

In [None]:
class ProcessPrediction:
    "Skeleton of the prediction pipeline; every stage is an unimplemented stub that returns `None`."

    # The stubs take no `self`, so calling them on an instance raised TypeError.
    # `@staticmethod` keeps class-level calls working and makes instance calls work too.

    @staticmethod
    def precproccessing():
        "Stub for the preprocessing stage (original spelling kept for compatibility)."
        pass

    # Correctly spelled alias for `precproccessing`.
    preprocessing = precproccessing

    @staticmethod
    def training():
        "Stub for the training stage."
        pass

    @staticmethod
    def create_model():
        "Stub for model creation."
        pass

    @staticmethod
    def testing():
        "Stub for the testing stage."
        pass

    @staticmethod
    def suffixPrediction():
        "Stub for suffix prediction."
        pass

    @staticmethod
    def nextStepPrediction():
        "Stub for next-step prediction."
        pass