Control-Flow-Prediction: Comparison of Models
--

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from exp.eventlog import *
from exp.dl_utils import *
from exp.control_flow_prediction import *

In [3]:
#export
def _expand_path(fpath): return Path(fpath).expanduser()

class Config():
    "Creates a default config file 'config.yml' in $FASTPM HOME (default `~/.fastpm/`)"
    DEFAULT_CONFIG_LOCATION = os.path.expanduser(os.getenv('FASTPM_HOME', '~/.fastpm'))
    DEFAULT_CONFIG_PATH = DEFAULT_CONFIG_LOCATION + '/config.yml'
    DEFAULT_CONFIG = {
        'data_path': DEFAULT_CONFIG_LOCATION + '/data',
        'model_path': DEFAULT_CONFIG_LOCATION + '/model'
    }
    
    @classmethod
    def get_key(cls, key):
        cf=cls.get()
        "Get the path to `key` in the config file."
        return cf.get(key, cls.DEFAULT_CONFIG.get(key,None))

    @classmethod
    def get_path(cls, path):
        "Get the `path` in the config file."
        return _expand_path(cls.get_key(path))

    @classmethod
    def data_path(cls):
        "Get the path to data in the config file."
        return cls.get_path('data_path')

    @classmethod
    def data_archive_path(cls):
        "Get the path to data archives in the config file."
        return cls.get_path('data_archive_path')
    
    @classmethod
    def get_model(cls, path):
        "Get the 'path' in the config file."
        return _expand_path(cls.get_key(path))
    
    @classmethod
    def model_path(cls):
        "Get the path to model in the config file."
        return cls.get_path('model_path')
    
    @classmethod
    def model_archive_path(cls):
        "Get the path to data archives in the config file."
        return cls.get_path('model_archive_path')

    @classmethod
    def get(cls, fpath=None, create_missing=True):
        "Retrieve the `Config` in `fpath`."
        fpath = _expand_path(fpath or cls.DEFAULT_CONFIG_PATH)
        if not fpath.exists() and create_missing: cls.create(fpath)
        assert fpath.exists(), f'Could not find config at: {fpath}. Please create'
        with open(fpath, 'r') as yaml_file:
            yfile=yaml.safe_load(yaml_file)
            return yfile

    @classmethod
    def create(cls, fpath):
        "Creates a `Config` from `fpath`."
        fpath = _expand_path(fpath)
        assert(fpath.suffix == '.yml')
        if fpath.exists(): return
        fpath.parent.mkdir(parents=True, exist_ok=True)
        with open(fpath, 'w') as yaml_file:
            yaml.dump(cls.DEFAULT_CONFIG, yaml_file, default_flow_style=False)

### Building Basic Model

In [53]:
class BasicModel(nn.Module):
    def __init__(self, n_in, n_out, emb_sz, nh):
        super().__init__()
        self.nh = nh
        self.emb = nn.Embedding(n_in, 7, padding_idx=1)
        self.lin1 = nn.Linear(7, nh)
        self.relu = nn.ReLU()
        self.lin2 = nn.Linear(nh, n_out)
        
    def __call__(self, x):
        x = x.long()
        x = self.emb(x)
        x = self.lin1(x)
        x = self.relu(x)
        x = self.lin2(x)
        return x.float()
    
    def reset(self):
        "Reset the hidden states."
        pass

In [54]:
sched = combine_scheds([0.3, 0.7], [sched_cos(0.3, 0.6), sched_cos(0.6, 0.2)]) 
opt_func = partial(Optimizer, steppers=[sgd_step])

### Load and Preprocess Data

In [55]:
# bpi2012 = import_xes(untar_data(URLs.BPIC_2012)).events
# bpi2017 = import_xes(untar_data(URLs.BPIC_2017)).events
# data_sets = [bpi2012, bpi2017]

In [56]:
datasets = [URLs.BPIC_2012, URLs.BPIC_2017]
models = ["basic"]

processing_params = {'seed': 42, 'bs': 64, 'bptt': 70}
training_params = {'awd_lstm': {'emb_sz': 300, 'nh': 300, 'nl': 2, 'cbs': [partial(AvgStatsCallback, accuracy_flat),
                                                                           CudaCallback, Recorder,
                                                                           partial(GradientClipping, clip=0.1),
                                                                           partial(RNNTrainer, α=2., β=1.),
                                                                           ProgressBarCallback]},
                   'basic': {'nh': 10, 'bs': 64, 'bptt': 70, 'cbfs': [partial(AvgStatsCallback, accuracy_flat), 
                                                                      CudaCallback, Recorder,
                                                                      partial(ParamScheduler, 'lr', sched),
                                                                      ProgressBarCallback]}
                  }
evaluation_params = {'bs': 64, 'bptt': 70}

### Helper Functions for "run()"

In [57]:
def load_data(dataset):
    return import_xes(untar_data(dataset)).events

def func(t, data):
    return len(data[int(t)][0])

def process_data_set(data_set, processing_params):
    seed = processing_params.get("seed")
    bs = processing_params.get("bs")
    bptt = processing_params.get("bptt")
    
    cfl = ControlFlowList.from_df(data_set)
    split = SplitData.split_by_func(cfl, partial(random_splitter, p_valid=0.1))
    
    UNK, PAD, BOT, EOT = "xxunk xxpad xxbot xxeot ".split()
    proc_tok = TokenizeProcessor()
    proc_num = NumericalizeProcessor()
    sd = SplitData.split_by_func(cfl, partial(random_splitter, p_valid=0.1))
    ll = label_by_func(sd, lambda x: 0, proc_x = [proc_tok, proc_num])
    vocab = proc_num.vocab
    data = lm_databunchify(ll, bs, bptt)
    
    return data, ll.test.x, vocab

def train_model(model, train, vocab, training_params):
    if model == "awd_lstm":
        parameters = training_params.get("awd_lstm")
        emb_sz = parameters.get("emb_sz")
        nh = parameters.get("nh")
        nl = parameters.get("nl")
        cbs = parameters.get("cbs")
        
        tok_pad = vocab.index(PAD)
        awd_lstm = get_language_model(len(vocab), emb_sz, nh, nl, tok_pad, input_p=0.6, 
                                      output_p=0.4, weight_p=0.5, embed_p=0.1, hidden_p=0.2)
        learner = Learner(awd_lstm, train, cross_entropy_flat, lr=5e-3, cb_funcs=cbs, opt_func=adam_opt())
        learner.fit(20)
        return learner
    
    elif model == "basic":
        parameters = training_params.get("basic")
        nh = parameters.get("nh")
        bs = parameters.get("bs")
        bptt = parameters.get("bptt")
        cbfs = parameters.get("cbfs")
        n_in = bs * bptt
        emb_sz = int(len(vocab) / 2)
        
        basic = BasicModel(n_in, len(vocab), emb_sz, nh)
        learner = Learner(basic, train, cross_entropy_flat, cb_funcs=cbfs, opt_func=opt_func)
        learner.fit(20)
        return learner

    else:
        pass

def measure_next_step_prediction(model, test, evaluation_params):
    bs = evaluation_params.get("bs")
    bptt = evaluation_params.get("bptt")
    
    x, y = process_data_for_next_step_prediction(test)
    pd_data = Dataset(x,y)
    test_sampler = SortSampler(pd_data.x, key=partial(func, data=pd_data))
    
    test_dl = DataLoader(pd_data, batch_size=bs*64, sampler=test_sampler, collate_fn=partial(pad_collate, pad_first=True))
    mean = predict_next_step(model, test_dl)
    return mean

def measure_suffix_prediction(model, test, evaluation_params):
    bs = evaluation_params.get("bs")
    bptt = evaluation_params.get("bptt")
    
    x, y = process_data_for_suffix_prediction(test)
    pd_data = Dataset(x,y)
    test_sampler = SortSampler(pd_data.x, key=partial(func, data=pd_data))
    
    test_dl = DataLoader(pd_data, batch_size=bs*8, sampler=test_sampler, collate_fn=pad_collate_sp)
    mean = predict_suffix(model, test_dl)
    return mean

In [58]:
def run(data_sets=datasets, models=models, processing_params=processing_params,
         training_params=training_params, evaluation_params=evaluation_params):
    
    for dataset in progress_bar(datasets):
        data_set = load_data(dataset)
        train_valid, test, vocab = process_data_set(data_set, processing_params)
        for model in progress_bar(models):
            Learner = train_model(model, train_valid, vocab, training_params)
            res_df1 = measure_next_step_prediction(Learner, test, evaluation_params)
            res_df2 = measure_suffix_prediction(Learner, test, evaluation_params)
            
            print("For {} and {}:\n--> Next-Step-Prediction value = {}\n--> Suffix-Prediction value = {}".format(model, data_set, res_df1, res_df2))

In [59]:
run()

epoch,train_loss,train_accuracy_flat,valid_loss,valid_accuracy_flat,time
0,2.4502,0.347353,1.869108,0.488356,00:00
1,1.610901,0.557102,1.392981,0.616109,00:00
2,1.30204,0.62341,1.212588,0.626637,00:00
3,1.18308,0.63151,1.143531,0.637128,00:00
4,1.133691,0.642124,1.12772,0.644755,00:00
5,1.111089,0.643668,1.090726,0.644792,00:00
6,1.089545,0.643417,1.072166,0.644382,00:00
7,1.070159,0.64329,1.057303,0.644382,00:00
8,1.055936,0.643745,1.043343,0.644568,00:00
9,1.043104,0.644442,1.031726,0.644792,00:00


RuntimeError: shape '[4096, 166, -1]' is invalid for input of size 4648

In [None]:
class Model:
    prepare_training_Databunch()

In [None]:

def func1():
    print('func1')

def func2():
    print('func2')
dictc={'A':func1,'B':func2}

def defaultfunc():
    print('default')
s='...'
dictc.get(s,defaultfunc)()
    

In [None]:
class ProcessPrediction:
    
    def precproccessing():
        pass
    def training():
        pass
    def create_model():
        pass
    def testing():
        pass
    def suffixPrediction():
        pass
    def nextStepPrediction():
        pass