In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

from exp.eventlog import *
from exp.dl_utils import *
from collections import OrderedDict

import editdistance as ed
import warnings

# Load Data

In [2]:
def load_data(data, merge=True):
    path = untar_data(data)
    log = import_xes(path, extensions=False, classifiers=False, schema=False, log_attributes=False)
    if merge:
        df = pd.merge(log.events, log.traceAttributes, left_on='trace_id', right_index=True)
    else: 
        df = log.events()
        
    return df

In [3]:
df0 = load_data(URLs.BPIC_2012)
#df1 = load_data(URLs.BPIC_2017)
#df2 = load_data(URLs.BPIC_2018)

failed to parse date: 1970-01-01T00:00:00.000+01:00
failed to parse date: 1970-01-01T00:00:00.000+01:00
failed to parse date: 2012-04-23T00:00:00.000+02:00
failed to parse date: 2011-10-01T00:38:44.546+02:00
failed to parse date: 2012-03-14T16:04:54.681+01:00
failed to parse date: 1970-01-01T00:00:00.000Z
failed to parse date: 2016-01-01T09:51:15.304Z
failed to parse date: 2017-02-01T23:00:00.000Z
failed to parse date: 2017-02-01T14:11:03.499Z


In [4]:
df1.head()

Unnamed: 0,trace_id,event_id,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,LoanGoal,ApplicationType,RequestedAmount
0,Application_652823628,0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 09:51:15.304000+00:00,Existing loan takeover,New credit,20000.0
1,Application_652823628,1,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 09:51:15.352000+00:00,Existing loan takeover,New credit,20000.0
2,Application_652823628,2,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 09:51:15.774000+00:00,Existing loan takeover,New credit,20000.0
3,Application_652823628,3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 09:52:36.392000+00:00,Existing loan takeover,New credit,20000.0
4,Application_652823628,4,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 09:52:36.403000+00:00,Existing loan takeover,New credit,20000.0


# Data Preprocessing

In [13]:
# Splitting Data
def random_split_traces(d, split=0.8, trace_id='trace_id'):
    traces = d[trace_id].drop_duplicates()
    shuffled = traces.iloc[np.random.permutation(len(traces))].values
    split = int(len(traces) * split)
    return shuffled[:split], shuffled[split:]

def get_df(t, df): 
    return df[df[trace_id].isin(t)]

# Processing Data
def normalize_cont_column(x, mean, std, eps=1e-7): 
    return (x-mean)/(eps+std)

#UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split()
#default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]
#
#cat_names = ['event_id', 'org:resource', 'lifecycle:transition', 'concept:name']
##date_names = ['time:timestamp', 'REG_DATE']
#date_names = ['time:timestamp']
#cont_names = ['AMOUNT_REQ']

def uniqueify(x, sort=False):
    res = list(OrderedDict.fromkeys(x).keys())
    if sort: res.sort()
    return res

class Processor():
    def process(self, items): return items

class CategoryProcessor(Processor):
    def __init__(self,default_token=None): 
        self.vocab = None
        self.default_token = default_token

    def __call__(self, items):
        #The vocab is defined on the first use.
        if self.vocab is None:
            self.vocab = uniqueify(items)
            if self.default_token is not None:
                for o in reversed(self.default_token):
                    if o in self.vocab: self.vocab.remove(o)
                    self.vocab.insert(0, o)
            self.otoi = {v: k for k,v in enumerate(self.vocab)}
        return [self.proc1(o) for o in items]
    
    def proc1(self, item): return self.otoi.get(item, 0)

    def deprocess(self, idxs):
        assert self.vocab is not None
        return [self.deproc1(idx) for idx in idxs]
    
    def deproc1(self, idx): return self.vocab[idx]
    
def add_datepart(df, fldname, drop=True, time=False, utc=False):
    "Helper function that adds columns relevant to a date."
    df = df.copy()
    fld = df[fldname]
    fld_dtype = fld.dtype
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, utc=utc,infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    cols=[]
    for n in attr: 
        col_name=targ_pre +"_"+ n
        df[col_name] = getattr(fld.dt, n.lower())
        cols.append(col_name)
    df[targ_pre + '_Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)
    return df, cols, targ_pre + '_Elapsed'

class TraceProcessor(Processor):
    def __init__(self, cat_names, cont_names, date_names, vocabs={}):
        self.vocabs = vocabs
        self.cat_names, self.cont_names, self.date_names = cat_names, cont_names, date_names
        
    def __call__(self, df):
        cat_names, cont_names = self.cat_names[:], self.cont_names[:]
        for d in self.date_names:
            df, cat, cont = add_datepart(df, d, utc=True)
            cat_names += listify(cat)    
            cont_names += listify(cont)

        for c in cat_names:
            if not c in self.vocabs.keys(): 
                self.vocabs[c] = CategoryProcessor(default_spec_tok)
            df[c] = self.vocabs[c](df[c])
            
            
        for c in cont_names:
            df[c] = df[c].astype(float)

            if not c in self.vocabs.keys(): 
                self.vocabs[c] = df[c].mean(), df[c].std()
            df[c] = normalize_cont_column(df[c], *self.vocabs[c])
    
        return df
    
def create_traces(event_df, trace_id='trace_id'):
    ll = []
    trace_ids = []
    cols = list(event_df)
    cols.remove(trace_id)
    for n, g in event_df.groupby(trace_id):
        l = []
        
        for c in cols:
            l.append(list(g[c]))
        ll.append(l)
        trace_ids.append(n)  
        

    df = pd.DataFrame(ll, columns=cols)
    df.index = trace_ids
    return df

# Language Model DataLoader
class LMDataSet():
    def __init__(self, df, bs=64, bptt=70, shuffle=False):
        self.bs, self.bptt, self.shuffle = bs, bptt, shuffle
        self.cols = list(df)

        total_len = sum(df.apply(lambda x: max([len(listify(x[k])) for k in self.cols]),axis=1))
        self.n_batch = total_len // self.bs
        self.batched = self.batchify(df)
    
    def __len__(self): return ((self.n_batch-1) // self.bptt) * self.bs
    
    def __getitem__(self, idx):
        source = self.batched[:, idx % self.bs]
        seq_idx = (idx // self.bs) * self.bptt
        x, y = source[:, seq_idx:seq_idx+self.bptt], source[:, seq_idx+1:seq_idx+self.bptt+1]
        return x,y
    
    def batchify(self,df):
        if self.shuffle: df=df.sample(frac=1)
        
        dd={}
        for c in self.cols:
            dd[c]=[]
        for i, row in df.iterrows():
            l = max([len(listify(row[c])) for c in self.cols])
            for c in self.cols:
                dd[c].append(tensor(row[c]).expand(l))
        for c in self.cols:
            s = torch.cat([torch.cat((tensor([2.0]), t.float(), tensor([3.0]))) for t in dd[c]])
            dd[c] = s[:self.n_batch * self.bs].view(self.bs, self.n_batch)
        return torch.stack([dd[c] for c in self.cols])
    
def get_dls(train_ds, valid_ds,  **kwargs):
    return (DataLoader(LMDataSet(train_ds, shuffle=True,bptt=bptt), batch_size=bs),
            DataLoader(LMDataSet(valid_ds, shuffle=False,bptt=bptt), batch_size=bs))

In [15]:
%%time

trace_id = "trace_id"

UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split()
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]

cat_names = ['event_id', 'org:resource', 'lifecycle:transition', 'concept:name']
date_names = ['time:timestamp', 'REG_DATE']
cont_names = ['AMOUNT_REQ']

# Split Data into train_indices and test_indices
train_valid_ids, test_ids = random_split_traces(df0)

# Get the Dataframe from the train_idices to split it into train and valid
train_valid_df = get_df(train_valid_ids, df0)
train_ids, valid_ids = random_split_traces(train_valid_df, split=0.9)

# Get the Dataframe for train, valid and test
train_df = get_df(train_ids, df0)
valid_df = get_df(valid_ids, df0)
test_df = get_df(test_ids, df0)

# Encode Data
tp = TraceProcessor(cat_names, cont_names, date_names)
train_proc = tp(train_df)
valid_proc = tp(valid_df)
test_proc = tp(test_df)

# Create Traces
train_traces = create_traces(train_proc)
valid_traces = create_traces(valid_proc)
test_traces = create_traces(test_proc)

# Create DataLoader
bs, bptt = 128, 70
data = DataBunch(*get_dls(train_traces, valid_traces))

iter_dl = iter(data.train_dl)
xb, yb = next(iter_dl)

CPU times: user 31.8 s, sys: 36.2 ms, total: 31.8 s
Wall time: 31.8 s


# Load AWD-LSTM Model

In [16]:
def dropout_mask(x, sz, p):
    return x.new(*sz).bernoulli_(1-p).div_(1-p)

class RNNDropout(nn.Module):
    def __init__(self, p=0.5):
        super().__init__()
        self.p=p

    def forward(self, x):
        if not self.training or self.p == 0.: return x
        m = dropout_mask(x.data, (x.size(0), 1, x.size(2)), self.p)
        return x * m

WEIGHT_HH = 'weight_hh_l0'

class WeightDropout(nn.Module):
    def __init__(self, module, weight_p=[0.], layer_names=[WEIGHT_HH]):
        super().__init__()
        self.module,self.weight_p,self.layer_names = module,weight_p,layer_names
        for layer in self.layer_names:
            #Makes a copy of the weights of the selected layers.
            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))
            self.module._parameters[layer] = F.dropout(w, p=self.weight_p, training=False)

    def _setweights(self):
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=self.training)

    def forward(self, *args):
        self._setweights()
        with warnings.catch_warnings():
            #To avoid the warning that comes because the weights aren't flattened.
            warnings.simplefilter("ignore")
            return self.module.forward(*args)
        
class EmbeddingDropout(nn.Module):
    "Applies dropout in the embedding layer by zeroing out some elements of the embedding vector."
    def __init__(self, emb, embed_p):
        super().__init__()
        self.emb,self.embed_p = emb,embed_p
        self.pad_idx = self.emb.padding_idx
        if self.pad_idx is None: self.pad_idx = -1

    def forward(self, words, scale=None):
        if self.training and self.embed_p != 0:
            size = (self.emb.weight.size(0),1)
            mask = dropout_mask(self.emb.weight.data, size, self.embed_p)
            masked_embed = self.emb.weight * mask
        else: masked_embed = self.emb.weight
        if scale: masked_embed.mul_(scale)
        return F.embedding(words, masked_embed, self.pad_idx, self.emb.max_norm,
                           self.emb.norm_type, self.emb.scale_grad_by_freq, self.emb.sparse)
    
def to_detach(h):
    "Detaches `h` from its history."
    return h.detach() if type(h) == torch.Tensor else tuple(to_detach(v) for v in h)

class AWD_LSTM(nn.Module):
    "AWD-LSTM inspired by https://arxiv.org/abs/1708.02182."
    initrange=0.1

    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token,
                 hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5):
        super().__init__()
        self.bs,self.emb_sz,self.n_hid,self.n_layers = 1,emb_sz,n_hid,n_layers
        self.emb = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
        self.emb_dp = EmbeddingDropout(self.emb, embed_p)
        self.rnns = [nn.LSTM(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz), 1,
                             batch_first=True) for l in range(n_layers)]
        self.rnns = nn.ModuleList([WeightDropout(rnn, weight_p) for rnn in self.rnns])
        self.emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.input_dp = RNNDropout(input_p)
        self.hidden_dps = nn.ModuleList([RNNDropout(hidden_p) for l in range(n_layers)])

    def forward(self, input):
        input = input[:, 3].long()
        bs, sl = input.size()
        if bs!=self.bs:
            self.bs=bs
            self.reset()
        raw_output = self.input_dp(self.emb_dp(input))
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l != self.n_layers - 1: raw_output = hid_dp(raw_output)
            outputs.append(raw_output)
        self.hidden = to_detach(new_hidden)
        return raw_outputs, outputs

    def _one_hidden(self, l):
        "Return one hidden state."
        nh = self.n_hid if l != self.n_layers - 1 else self.emb_sz
        return next(self.parameters()).new(1, self.bs, nh).zero_()

    def reset(self):
        "Reset the hidden states."
        self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(self.n_layers)]
        
class LinearDecoder(nn.Module):
    def __init__(self, n_out, n_hid, output_p, tie_encoder=None, bias=True):
        super().__init__()
        self.output_dp = RNNDropout(output_p)
        self.decoder = nn.Linear(n_hid, n_out, bias=bias)
        if bias: self.decoder.bias.data.zero_()
        if tie_encoder: self.decoder.weight = tie_encoder.weight
        else: init.kaiming_uniform_(self.decoder.weight)

    def forward(self, input):
        raw_outputs, outputs = input
        output = self.output_dp(outputs[-1]).contiguous()
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded, raw_outputs, outputs
    
class SequentialRNN(nn.Sequential):
    "A sequential module that passes the reset call to its children."
    def reset(self):
        for c in self.children():
            if hasattr(c, 'reset'): c.reset()
                
def get_language_model(vocab_sz, emb_sz, n_hid, n_layers, pad_token, output_p=0.4, hidden_p=0.2, input_p=0.6, 
                       embed_p=0.1, weight_p=0.5, tie_weights=True, bias=True):
    rnn_enc = AWD_LSTM(vocab_sz, emb_sz, n_hid=n_hid, n_layers=n_layers, pad_token=pad_token,
                       hidden_p=hidden_p, input_p=input_p, embed_p=embed_p, weight_p=weight_p)
    enc = rnn_enc.emb if tie_weights else None
    return SequentialRNN(rnn_enc, LinearDecoder(vocab_sz, emb_sz, output_p, tie_encoder=enc, bias=bias))

def cross_entropy_activity(input, target):
    target = target[:, 3] # magic number for 'concept:name'
    bs, sl = target.size()
    return F.cross_entropy(input.view(bs * sl, -1), target.flatten().long())

def accuracy_activity(input, target): 
    target = target[:, 3] # magic number for 'concept:name'
    bs, sl = target.size()
    return (torch.argmax(input.view(bs * sl, -1), dim=1)==target.flatten().long()).float().mean()

## Training Loop

In [17]:
class GradientClipping(Callback):
    def __init__(self, clip=None): self.clip = clip
    def after_backward(self):
        if self.clip:  nn.utils.clip_grad_norm_(self.run.model.parameters(), self.clip)
            
class RNNTrainer(Callback):
    def __init__(self, α, β): self.α,self.β = α,β
    
    def after_pred(self):
        #Save the extra outputs for later and only returns the true output.
        self.raw_out,self.out = self.pred[1],self.pred[2]
        self.run.pred = self.pred[0]
    
    def after_loss(self):
        #AR and TAR
        if self.α != 0.:  self.run.loss += self.α * self.out[-1].float().pow(2).mean()
        if self.β != 0.:
            h = self.raw_out[-1]
            if len(h)>1: self.run.loss += self.β * (h[:,1:] - h[:,:-1]).float().pow(2).mean()
                
    def begin_epoch(self):
        pass
        #Shuffle the texts at the beginning of the epoch
        #if hasattr(self.dl.dataset, "batchify"): self.dl.dataset.batchify()

In [18]:
# Create Dropout-mask
x = torch.randn(10,10)
mask = dropout_mask(x, (10,10), 0.5) # display((x*mask).std(), x.std())
# dp = RNNDropout(0.3)

padding_index = 1
vocab_size = len(tp.vocabs["concept:name"].vocab)
emb_sz, nh, nl = 300, 300, 2

model = get_language_model(vocab_size, emb_sz, nh, nl, padding_index, input_p=0.6, output_p=0.4, weight_p=0.5, 
                           embed_p=0.1, hidden_p=0.2)

cbs = [partial(AvgStatsCallback,accuracy_activity),
       CudaCallback, Recorder,
       partial(GradientClipping, clip=0.1),
       partial(RNNTrainer, α=2., β=1.),
       ProgressBarCallback]

learn0 = Learner(model, data, cross_entropy_activity, lr=5e-3, cb_funcs=cbs, opt_func=adam_opt())
learn0.fit(20)

awd_lstm = learn0.model

epoch,train_loss,train_accuracy_activity,valid_loss,valid_accuracy_activity,time
0,2.301018,0.381037,1.529319,0.552734,00:00
1,1.287843,0.602466,0.888631,0.747824,00:00
2,0.915237,0.708934,0.635054,0.795871,00:00
3,0.738355,0.771551,0.520768,0.825446,00:00
4,0.620418,0.802641,0.464158,0.843917,00:00
5,0.611407,0.806526,0.454764,0.845592,00:00
6,0.581666,0.81107,0.425296,0.845368,00:00
7,0.515097,0.830671,0.412761,0.853348,00:00
8,0.536649,0.822226,0.405276,0.853516,00:00
9,0.502272,0.83158,0.397009,0.854129,00:00


# Load Basic Model

In [19]:
class BasicModel(nn.Module):
    def __init__(self, n_in, n_out, n_emb,nh):
        super().__init__()
        self.emb = nn.Embedding(n_in, 7, padding_idx=1)
        self.lin1 = nn.Linear(7, nh)
        self.relu = nn.ReLU()
        self.lin2 = nn.Linear(nh, n_out)
        
    def __call__(self, x):
        x = x[:, 3] # magic number for 'concept:name'
        x = x.long()
        x = self.emb(x)
        x = self.lin1(x)
        x = self.relu(x)
        x = self.lin2(x)
        return x.float()
    
def getBasicModel():
    vocab = len((tp.vocabs['concept:name']).vocab) # Stupid 'concept:name' model
    n_emb, nh = int(vocab/2), 10
    model = BasicModel(bs*bptt, vocab, n_emb, nh)
    return model

def cross_entropy_activity(input, target):
    target = target[:, 3] # magic number for 'concept:name'
    bs, sl = target.size()
    return F.cross_entropy(input.view(bs * sl, -1), target.flatten().long())

def accuracy_activity(input, target): 
    target = target[:, 3] # magic number for 'concept:name'
    bs, sl = target.size()
    return (torch.argmax(input.view(bs * sl, -1), dim=1) == target.flatten().long()).float().mean()

class CudaCallback(Callback):
    def begin_fit(self): self.model.cuda()
    def begin_batch(self): self.run.xb,self.run.yb = self.xb.cuda(), self.yb.cuda()

In [20]:
model = getBasicModel()
pred = model(xb)

sched = combine_scheds([0.3, 0.7], [sched_cos(0.3, 0.6), sched_cos(0.6, 0.2)]) 
cbfs = [partial(AvgStatsCallback,accuracy_activity),
        CudaCallback, 
        Recorder,
        partial(ParamScheduler, 'lr', sched),
        ProgressBarCallback]

opt_func = partial(Optimizer, steppers=[sgd_step])
opt = opt_func(model.parameters(), lr=0.5)

learn1 = Learner(model, data, cross_entropy_activity, cb_funcs=cbfs, opt_func=opt_func)
learn1.fit(20)

basic_model = learn1.model

epoch,train_loss,train_accuracy_activity,valid_loss,valid_accuracy_activity,time
0,2.834084,0.256415,2.335463,0.400558,00:00
1,2.123303,0.457701,1.93549,0.497712,00:00
2,1.801295,0.495387,1.658341,0.556083,00:00
3,1.54073,0.614031,1.414832,0.630748,00:00
4,1.339196,0.629969,1.265034,0.638895,00:00
5,1.238882,0.636108,1.217808,0.638895,00:00
6,1.183578,0.636581,1.156898,0.638895,00:00
7,1.150111,0.636442,1.132803,0.639509,00:00
8,1.117328,0.642932,1.099438,0.653571,00:00
9,1.098077,0.650648,1.085022,0.652958,00:00


# Evaluation

In [24]:
learn1.avg_stats.train_stats
learn1.

train: [1.0081724439348494, tensor(0.6602, device='cuda:0')]

In [None]:
def process_data_for_next_step_prediction(test, input_cols=None, output_col=3, startIndex=1):
    xs, ys = [], []
    if input_cols == None: 
        input_cols=list(test)
    
    i = 0
    input_cols = listify(input_cols)
    for trace in test.values:
        for i in range(startIndex, len(listify(trace[0]))):
            x, y = [], []
            for c in range(len(input_cols)):
                x.append(trace[c][:i])
                
            xs.append(x)
            ys.append(trace[output_col][i])
            
    return pd.DataFrame(xs, columns=input_cols), ys

def pad_collate(samples, pad_idx=1, pad_first=True):
    columns = list(samples)
    values = []
    
    for col in columns:
        max_len = max([len(s) for s in samples[col]])
        res = torch.zeros(len(samples[col]), max_len).long() + pad_idx
    
        for i, s in enumerate(samples[col]):
            if pad_first: res[i, -len(s):] = torch.LongTensor(s)
            else:         res[i, :len(s) ] = torch.LongTensor(s)
        values.append(res)

    return values

def predict_next_step(model, df):
    model.eval()
    model.cpu()
    preds = []
    for e in df.values:
        t = torch.stack([tensor(e[c]).float() for c in range(len(list(df)))])
        pred = model(t[None])
        preds.append(pred[0][-1].tolist())
        
    return np.argmax(np.array(preds), axis=1)

def next_step_measure(preds, ys):
    # Simple accuracy measure
    # Do I have to weight it? Check Paper!
    return (np.array(preds) == np.array(ys)).mean()

In [None]:
%%time

x, y = process_data_for_next_step_prediction(test_traces)
xpad = pad_collate(x, pad_first=True)

#preds0 = predict_next_step(awd_lstm, x)
#next_step_measure(preds0, y)

#preds1 = predict_next_step(basic_model, x)
#next_step_measure(preds1, y)