# Imports

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from exp.eventlog import *
from exp.dl_utils import *

import editdistance as ed

# Loading Data

In [3]:
path = untar_data(URLs.BPIC_2012)
log = import_xes(path, extensions=False, classifiers=False, schema=False, log_attributes=False)

failed to parse date: 1970-01-01T00:00:00.000+01:00
failed to parse date: 1970-01-01T00:00:00.000+01:00
failed to parse date: 2012-04-23T00:00:00.000+02:00
failed to parse date: 2011-10-01T00:38:44.546+02:00
failed to parse date: 2012-03-14T16:04:54.681+01:00


In [4]:
# Merge Trace Attributes and Event Attributes first in one df. It is easier to copy over the trace attributes.

df = pd.merge(log.events, log.traceAttributes, left_on='trace_id', right_index=True)
df.head()

Unnamed: 0,trace_id,event_id,org:resource,lifecycle:transition,concept:name,time:timestamp,REG_DATE,AMOUNT_REQ
0,173688,0,112.0,COMPLETE,A_SUBMITTED,2011-09-30 22:38:44.546000+00:00,2011-10-01 00:38:44.546000+02:00,20000
1,173688,1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-09-30 22:38:44.880000+00:00,2011-10-01 00:38:44.546000+02:00,20000
2,173688,2,112.0,COMPLETE,A_PREACCEPTED,2011-09-30 22:39:37.906000+00:00,2011-10-01 00:38:44.546000+02:00,20000
3,173688,3,112.0,SCHEDULE,W_Completeren aanvraag,2011-09-30 22:39:38.875000+00:00,2011-10-01 00:38:44.546000+02:00,20000
4,173688,4,,START,W_Completeren aanvraag,2011-10-01 09:36:46.437000+00:00,2011-10-01 00:38:44.546000+02:00,20000


# Data Preprocessing

## Splitting the Data

In [5]:
trace_id = "trace_id"

def random_split_traces(d, split=0.8, trace_id='trace_id'):
    traces = d[trace_id].drop_duplicates()
    shuffled = traces.iloc[np.random.permutation(len(traces))].values
    split = int(len(traces) * split)
    return shuffled[:split], shuffled[split:]

In [6]:
def get_df(t, df): 
    return df[df[trace_id].isin(t)]

In [7]:
# Split Data into train_indices and test_indices
train_valid_ids, test_ids = random_split_traces(df)

# Get the Dataframe from the train_idices to split it into train and valid
train_valid_df = get_df(train_valid_ids, df)
train_ids, valid_ids = random_split_traces(train_valid_df, split=0.9)

# Get the Dataframe for train, valid and test
train_df = get_df(train_ids, df)
valid_df = get_df(valid_ids, df)
test_df = get_df(test_ids, df)

## Process Data

In [9]:
def normalize_cont_column(x, mean, std, eps=1e-7): 
    return (x-mean)/(eps+std)

In [10]:
UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split()
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]

cat_names = ['event_id', 'org:resource', 'lifecycle:transition', 'concept:name']
date_names = ['time:timestamp', 'REG_DATE']
cont_names = ['AMOUNT_REQ']

In [11]:
from collections import OrderedDict

def uniqueify(x, sort=False):
    res = list(OrderedDict.fromkeys(x).keys())
    if sort: res.sort()
    return res

class Processor():
    def process(self, items): return items

class CategoryProcessor(Processor):
    def __init__(self,default_token=None): 
        self.vocab = None
        self.default_token = default_token

    def __call__(self, items):
        #The vocab is defined on the first use.
        if self.vocab is None:
            self.vocab = uniqueify(items)
            if self.default_token is not None:
                for o in reversed(self.default_token):
                    if o in self.vocab: self.vocab.remove(o)
                    self.vocab.insert(0, o)
            self.otoi = {v: k for k,v in enumerate(self.vocab)}
        return [self.proc1(o) for o in items]
    
    def proc1(self, item): return self.otoi.get(item, 0)

    def deprocess(self, idxs):
        assert self.vocab is not None
        return [self.deproc1(idx) for idx in idxs]
    
    def deproc1(self, idx): return self.vocab[idx]

In [12]:
def add_datepart(df, fldname, drop=True, time=False, utc=False):
    "Helper function that adds columns relevant to a date."
    df = df.copy()
    fld = df[fldname]
    fld_dtype = fld.dtype
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, utc=utc,infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    cols=[]
    for n in attr: 
        col_name=targ_pre +"_"+ n
        df[col_name] = getattr(fld.dt, n.lower())
        cols.append(col_name)
    df[targ_pre + '_Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)
    return df, cols, targ_pre + '_Elapsed'

In [13]:
class TraceProcessor(Processor):
    def __init__(self, cat_names, cont_names, date_names, vocabs={}):
        self.vocabs = vocabs
        self.cat_names, self.cont_names, self.date_names = cat_names, cont_names, date_names
        
    def __call__(self, df):
        cat_names, cont_names = self.cat_names[:], self.cont_names[:]
        for d in self.date_names:
            df, cat, cont = add_datepart(df, d, utc=True)
            cat_names += listify(cat)    
            cont_names += listify(cont)

        for c in cat_names:
            if not c in self.vocabs.keys(): 
                self.vocabs[c] = CategoryProcessor(default_spec_tok)
            df[c] = self.vocabs[c](df[c])
            
            
        for c in cont_names:
            df[c] = df[c].astype(float)

            if not c in self.vocabs.keys(): 
                self.vocabs[c] = df[c].mean(), df[c].std()
            df[c] = normalize_cont_column(df[c], *self.vocabs[c])
    
        return df

In [14]:
# Encode Data

tp = TraceProcessor(cat_names, cont_names, date_names)

train_proc = tp(train_df)
valid_proc = tp(valid_df)
test_proc = tp(test_df)

In [15]:
len(tp.vocabs)

31

In [16]:
def create_traces(event_df, trace_id='trace_id'):
    ll = []
    trace_ids = []
    cols = list(event_df)
    cols.remove(trace_id)
    for n, g in event_df.groupby(trace_id):
        l = []
        
        for c in cols:
            l.append(list(g[c]))
        ll.append(l)
        trace_ids.append(n)  
        

    df = pd.DataFrame(ll, columns=cols)
    df.index = trace_ids
    return df

In [17]:
train_traces = create_traces(train_proc)
valid_traces = create_traces(valid_proc)
test_traces = create_traces(test_proc)

train_traces.head()

Unnamed: 0,event_id,org:resource,lifecycle:transition,concept:name,AMOUNT_REQ,time:timestamp_Year,time:timestamp_Month,time:timestamp_Week,time:timestamp_Day,time:timestamp_Dayofweek,...,REG_DATE_Day,REG_DATE_Dayofweek,REG_DATE_Dayofyear,REG_DATE_Is_month_end,REG_DATE_Is_month_start,REG_DATE_Is_quarter_end,REG_DATE_Is_quarter_start,REG_DATE_Is_year_end,REG_DATE_Is_year_start,REG_DATE_Elapsed
173700,"[8, 9, 10]","[8, 8, 8]","[8, 8, 8]","[8, 9, 10]","[-0.8520994544284267, -0.8520994544284267, -0....","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]",...,"[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[-1.7143697334172892, -1.7143697334172892, -1...."
173709,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]","[8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 8]","[8, 8, 8, 9, 10, 8, 10, 8, 10, 8, 8, 8]","[8, 9, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13]","[-0.37013941909466025, -0.37013941909466025, -...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9]","[8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 10, 10]","[8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 10, 10]",...,"[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[-1.712781992472111, -1.712781992472111, -1.71..."
173712,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...","[8, 8, 8, 9, 9, 9, 9, 11, 11, 12, 12, 12, 12, 12]","[8, 8, 9, 10, 8, 9, 8, 10, 8, 10, 8, 10, 8, 8]","[8, 9, 14, 14, 11, 12, 14, 12, 12, 12, 12, 12,...","[1.1560673594622668, 1.1560673594622668, 1.156...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 11, 11, 11, 11, 11]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9]",...,"[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[-1.71276954570331, -1.71276954570331, -1.7127..."
173715,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...","[8, 8, 8, 8, 13, 13, 9, 9, 9, 9, 9, 9, 9, 9, 1...","[8, 8, 8, 9, 10, 8, 10, 8, 8, 8, 8, 8, 9, 8, 1...","[8, 9, 11, 12, 12, 12, 12, 15, 16, 17, 18, 19,...","[2.3609674477966833, 2.3609674477966833, 2.360...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 11,...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 11,...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...",...,"[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[-1.712759173395976, -1.712759173395976, -1.71..."
173718,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...","[8, 8, 8, 8, 13, 17, 17, 17, 17, 17, 13, 13, 1...","[8, 8, 8, 9, 10, 8, 8, 8, 8, 8, 9, 8, 10, 8, 1...","[8, 9, 11, 12, 12, 15, 17, 16, 18, 19, 20, 12,...","[-0.04883272887214927, -0.04883272887214927, -...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...",...,"[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[-1.712160431955111, -1.712160431955111, -1.71..."


# LanguageModel Dataloader

In [18]:
class LMDataSet():
    def __init__(self, df, bs=64, bptt=70, shuffle=False):
        self.bs, self.bptt, self.shuffle = bs, bptt, shuffle
        self.cols = list(df)

        total_len = sum(df.apply(lambda x: max([len(listify(x[k])) for k in self.cols]),axis=1))
        self.n_batch = total_len // self.bs
        self.batched = self.batchify(df)
    
    def __len__(self): return ((self.n_batch-1) // self.bptt) * self.bs
    
    def __getitem__(self, idx):
        source = self.batched[:, idx % self.bs]
        seq_idx = (idx // self.bs) * self.bptt
        x, y = source[:, seq_idx:seq_idx+self.bptt], source[:, seq_idx+1:seq_idx+self.bptt+1]
        return x,y
    
    def batchify(self,df):
        if self.shuffle: df=df.sample(frac=1)
        
        dd={}
        for c in self.cols:
            dd[c]=[]
        for i, row in df.iterrows():
            l = max([len(listify(row[c])) for c in self.cols])
            for c in self.cols:
                dd[c].append(tensor(row[c]).expand(l))
        for c in self.cols:
            s = torch.cat([torch.cat((tensor([2.0]), t.float(), tensor([3.0]))) for t in dd[c]])
            dd[c] = s[:self.n_batch * self.bs].view(self.bs, self.n_batch)
        return torch.stack([dd[c] for c in self.cols])

In [19]:
bs, bptt = 128, 70

def get_dls(train_ds, valid_ds,  **kwargs):
    return (DataLoader(LMDataSet(train_ds, shuffle=True, bptt=bptt), batch_size=bs),
            DataLoader(LMDataSet(valid_ds, shuffle=False, bptt=bptt), batch_size=bs))

In [20]:
data = DataBunch(*get_dls(train_traces, valid_traces))

In [21]:
iter_dl = iter(data.train_dl)
xb, yb = next(iter_dl)

# xb hat also die folgende Größe: 128 Einträge, die jeweils 31 Einträge mit 70 Werten enthalten.
xb.size()

torch.Size([128, 31, 70])

# AWD-LSTM Model

In [22]:
#export
def dropout_mask(x, sz, p):
    return x.new(*sz).bernoulli_(1-p).div_(1-p)

In [23]:
x = torch.randn(10,10)
mask = dropout_mask(x, (10,10), 0.5); mask

tensor([[2., 0., 2., 2., 2., 0., 2., 0., 2., 0.],
        [2., 2., 0., 2., 0., 0., 0., 0., 2., 2.],
        [0., 2., 0., 2., 2., 2., 0., 0., 0., 0.],
        [0., 2., 0., 0., 2., 2., 2., 0., 2., 0.],
        [2., 2., 2., 2., 2., 0., 0., 2., 0., 0.],
        [0., 0., 2., 2., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 2., 0., 0., 2., 2., 0., 0.],
        [2., 2., 0., 2., 0., 2., 0., 0., 2., 0.],
        [0., 0., 0., 2., 0., 2., 2., 0., 2., 0.],
        [0., 0., 0., 2., 2., 2., 0., 0., 0., 2.]])

In [24]:
(x*mask).std(), x.std()

(tensor(1.3338), tensor(0.8860))

In [25]:
#export
class RNNDropout(nn.Module):
    def __init__(self, p=0.5):
        super().__init__()
        self.p=p

    def forward(self, x):
        if not self.training or self.p == 0.: return x
        m = dropout_mask(x.data, (x.size(0), 1, x.size(2)), self.p)
        return x * m

In [26]:
dp = RNNDropout(0.3)
tst_input = torch.randn(3,3,7)
tst_input.shape, dp(tst_input).shape

(torch.Size([3, 3, 7]), torch.Size([3, 3, 7]))

In [27]:
#export
import warnings

WEIGHT_HH = 'weight_hh_l0'

class WeightDropout(nn.Module):
    def __init__(self, module, weight_p=[0.], layer_names=[WEIGHT_HH]):
        super().__init__()
        self.module,self.weight_p,self.layer_names = module,weight_p,layer_names
        for layer in self.layer_names:
            #Makes a copy of the weights of the selected layers.
            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))
            self.module._parameters[layer] = F.dropout(w, p=self.weight_p, training=False)

    def _setweights(self):
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=self.training)

    def forward(self, *args):
        self._setweights()
        with warnings.catch_warnings():
            #To avoid the warning that comes because the weights aren't flattened.
            warnings.simplefilter("ignore")
            return self.module.forward(*args)

In [28]:
module = nn.LSTM(5, 2)
dp_module = WeightDropout(module, 0.4)
getattr(dp_module.module, WEIGHT_HH)

Parameter containing:
tensor([[-0.2272, -0.0797],
        [ 0.2494,  0.0336],
        [-0.0886,  0.3769],
        [ 0.3528, -0.3193],
        [-0.4407, -0.4356],
        [ 0.4189, -0.0403],
        [-0.3918, -0.6412],
        [-0.3841,  0.1335]], requires_grad=True)

In [29]:
tst_input = torch.randn(4,20,5)
h = (torch.zeros(1,20,2), torch.zeros(1,20,2))
x,h = dp_module(tst_input,h)
getattr(dp_module.module, WEIGHT_HH)

tensor([[-0.0000, -0.1328],
        [ 0.4156,  0.0560],
        [-0.1476,  0.0000],
        [ 0.5879, -0.0000],
        [-0.0000, -0.7260],
        [ 0.6981, -0.0672],
        [-0.0000, -1.0686],
        [-0.6402,  0.2225]], grad_fn=<MulBackward0>)

In [30]:
#export
class EmbeddingDropout(nn.Module):
    "Applies dropout in the embedding layer by zeroing out some elements of the embedding vector."
    def __init__(self, emb, embed_p):
        super().__init__()
        self.emb,self.embed_p = emb,embed_p
        self.pad_idx = self.emb.padding_idx
        if self.pad_idx is None: self.pad_idx = -1

    def forward(self, words, scale=None):
        if self.training and self.embed_p != 0:
            size = (self.emb.weight.size(0),1)
            mask = dropout_mask(self.emb.weight.data, size, self.embed_p)
            masked_embed = self.emb.weight * mask
        else: masked_embed = self.emb.weight
        if scale: masked_embed.mul_(scale)
        return F.embedding(words, masked_embed, self.pad_idx, self.emb.max_norm,
                           self.emb.norm_type, self.emb.scale_grad_by_freq, self.emb.sparse)

In [31]:
enc = nn.Embedding(100, 7, padding_idx=1)
enc_dp = EmbeddingDropout(enc, 0.5)
tst_input = torch.randint(0,100,(8,))
enc_dp(tst_input)

tensor([[-2.4143,  1.6710, -2.0157, -3.7371,  2.4505,  0.8283, -0.5766],
        [-0.0000, -0.0000, -0.0000, -0.0000,  0.0000, -0.0000,  0.0000],
        [ 0.0000, -0.0000,  0.0000,  0.0000, -0.0000, -0.0000,  0.0000],
        [-0.7171,  0.5501, -2.2564, -0.5140,  2.0190,  3.1024,  1.0897],
        [ 0.0000,  0.0000, -0.0000,  0.0000,  0.0000, -0.0000, -0.0000],
        [-1.0049,  0.5598, -3.2516, -3.4682, -1.5463,  2.7949,  2.1786],
        [ 0.0500,  0.5723, -4.5704, -3.0745,  3.5074, -0.8298,  1.3030],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],
       grad_fn=<EmbeddingBackward>)

In [32]:
#export
def to_detach(h):
    "Detaches `h` from its history."
    return h.detach() if type(h) == torch.Tensor else tuple(to_detach(v) for v in h)

In [114]:
#export
class AWD_LSTM(nn.Module):
    "AWD-LSTM inspired by https://arxiv.org/abs/1708.02182."
    initrange=0.1

    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token,
                 hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5):
        super().__init__()
        self.bs,self.emb_sz,self.n_hid,self.n_layers = 1,emb_sz,n_hid,n_layers
        self.emb = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
        self.emb_dp = EmbeddingDropout(self.emb, embed_p)
        self.rnns = [nn.LSTM(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz), 1,
                             batch_first=True) for l in range(n_layers)]
        self.rnns = nn.ModuleList([WeightDropout(rnn, weight_p) for rnn in self.rnns])
        self.emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.input_dp = RNNDropout(input_p)
        self.hidden_dps = nn.ModuleList([RNNDropout(hidden_p) for l in range(n_layers)])

    def forward(self, input):
        #input = input[:, 3].long()
        bs, sl = input.size()
        if bs!=self.bs:
            self.bs=bs
            self.reset()
        raw_output = self.input_dp(self.emb_dp(input))
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l != self.n_layers - 1: raw_output = hid_dp(raw_output)
            outputs.append(raw_output) 
        self.hidden = to_detach(new_hidden)
        return raw_outputs, outputs

    def _one_hidden(self, l):
        "Return one hidden state."
        nh = self.n_hid if l != self.n_layers - 1 else self.emb_sz
        return next(self.parameters()).new(1, self.bs, nh).zero_()

    def reset(self):
        "Reset the hidden states."
        self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(self.n_layers)]

In [34]:
#export
class LinearDecoder(nn.Module):
    def __init__(self, n_out, n_hid, output_p, tie_encoder=None, bias=True):
        super().__init__()
        self.output_dp = RNNDropout(output_p)
        self.decoder = nn.Linear(n_hid, n_out, bias=bias)
        if bias: self.decoder.bias.data.zero_()
        if tie_encoder: self.decoder.weight = tie_encoder.weight
        else: init.kaiming_uniform_(self.decoder.weight)

    def forward(self, input):
        raw_outputs, outputs = input
        output = self.output_dp(outputs[-1]).contiguous()
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded, raw_outputs, outputs

In [35]:
#export
class SequentialRNN(nn.Sequential):
    "A sequential module that passes the reset call to its children."
    def reset(self):
        for c in self.children():
            if hasattr(c, 'reset'): c.reset()

In [36]:
#export
def get_language_model(vocab_sz, emb_sz, n_hid, n_layers, pad_token, output_p=0.4, hidden_p=0.2, input_p=0.6, 
                       embed_p=0.1, weight_p=0.5, tie_weights=True, bias=True):
    rnn_enc = AWD_LSTM(vocab_sz, emb_sz, n_hid=n_hid, n_layers=n_layers, pad_token=pad_token,
                       hidden_p=hidden_p, input_p=input_p, embed_p=embed_p, weight_p=weight_p)
    enc = rnn_enc.emb if tie_weights else None
    return SequentialRNN(rnn_enc, LinearDecoder(vocab_sz, emb_sz, output_p, tie_encoder=enc, bias=bias))

In [37]:
def cross_entropy_activity(input, target):
    target = target[:, 3] # magic number for 'concept:name'
    bs, sl = target.size()
    
    return F.cross_entropy(input.view(bs * sl, -1), target.flatten().long())

In [38]:
def accuracy_activity(input, target): 
    target = target[:, 3] # magic number for 'concept:name'
    bs, sl = target.size()
    
    return (torch.argmax(input.view(bs * sl, -1), dim=1)==target.flatten().long()).float().mean()

# Training Loop with AWD-LSTM

In [39]:
#export
class GradientClipping(Callback):
    def __init__(self, clip=None): self.clip = clip
    def after_backward(self):
        if self.clip:  nn.utils.clip_grad_norm_(self.run.model.parameters(), self.clip)

In [40]:
#export
class RNNTrainer(Callback):
    def __init__(self, α, β): self.α,self.β = α,β
    
    def after_pred(self):
        #Save the extra outputs for later and only returns the true output.
        self.raw_out,self.out = self.pred[1],self.pred[2]
        self.run.pred = self.pred[0]
    
    def after_loss(self):
        #AR and TAR
        if self.α != 0.:  self.run.loss += self.α * self.out[-1].float().pow(2).mean()
        if self.β != 0.:
            h = self.raw_out[-1]
            if len(h)>1: self.run.loss += self.β * (h[:,1:] - h[:,:-1]).float().pow(2).mean()
                
    def begin_epoch(self):
        pass
        #Shuffle the texts at the beginning of the epoch
        #if hasattr(self.dl.dataset, "batchify"): self.dl.dataset.batchify()

In [41]:
padding_index = 1
vocab_size = len(tp.vocabs["concept:name"].vocab)
emb_sz, nh, nl = 300, 300, 2

model = get_language_model(vocab_size, emb_sz, nh, nl, padding_index, input_p=0.6, output_p=0.4, weight_p=0.5, 
                           embed_p=0.1, hidden_p=0.2)

In [42]:
cbs = [partial(AvgStatsCallback,accuracy_activity),
       CudaCallback, Recorder,
       partial(GradientClipping, clip=0.1),
       partial(RNNTrainer, α=2., β=1.),
       ProgressBarCallback]

In [43]:
learn = Learner(model, data, cross_entropy_activity, lr=5e-3, cb_funcs=cbs, opt_func=adam_opt())

In [44]:
learn.fit(20)

epoch,train_loss,train_accuracy_activity,valid_loss,valid_accuracy_activity,time
0,2.327349,0.362165,1.548164,0.52327,00:00
1,1.382998,0.574134,0.923341,0.718638,00:00
2,0.988249,0.685943,0.712049,0.787165,00:00
3,0.806282,0.755272,0.566669,0.824609,00:00
4,0.644157,0.79915,0.501129,0.839397,00:00
5,0.632528,0.803619,0.454039,0.845312,00:00
6,0.542831,0.821466,0.434724,0.846931,00:00
7,0.522649,0.828641,0.421824,0.847712,00:00
8,0.517371,0.83023,0.410881,0.855022,00:00
9,0.515498,0.826456,0.412232,0.847545,00:00


In [45]:
awd_lstm = learn.model
# awd_lstm(xb.cuda()).shape

# Evaluation

In [46]:
test_traces.head()

Unnamed: 0,event_id,org:resource,lifecycle:transition,concept:name,AMOUNT_REQ,time:timestamp_Year,time:timestamp_Month,time:timestamp_Week,time:timestamp_Day,time:timestamp_Dayofweek,...,REG_DATE_Day,REG_DATE_Dayofweek,REG_DATE_Dayofyear,REG_DATE_Is_month_end,REG_DATE_Is_month_start,REG_DATE_Is_quarter_end,REG_DATE_Is_quarter_start,REG_DATE_Is_year_end,REG_DATE_Is_year_start,REG_DATE_Elapsed
173691,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...","[8, 8, 8, 8, 13, 13, 13, 17, 17, 17, 17, 17, 1...","[8, 8, 8, 9, 10, 8, 10, 8, 8, 8, 8, 8, 9, 8, 1...","[8, 9, 11, 12, 12, 12, 12, 15, 17, 16, 18, 19,...","[-0.8520994544284267, -0.8520994544284267, -0....","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...",...,"[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[-1.7144737157983143, -1.7144737157983143, -1...."
173694,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...","[8, 8, 8, 8, 9, 9, 13, 13, 29, 29, 29, 29, 29,...","[8, 8, 8, 9, 10, 8, 10, 8, 10, 8, 10, 8, 8, 8,...","[8, 9, 11, 12, 12, 12, 12, 12, 12, 12, 12, 15,...","[-0.6914461093171712, -0.6914461093171712, -0....","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 11, 11, 11, 11, 11, 11, 11,...","[8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10,...","[8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...",...,"[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[-1.7144498594914457, -1.7144498594914457, -1...."
173724,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...","[8, 8, 8, 9, 9, 25, 25, 25, 25, 9, 9, 13, 13, ...","[8, 8, 9, 10, 8, 10, 8, 9, 8, 10, 8, 10, 8, 10...","[8, 9, 14, 14, 14, 14, 11, 12, 14, 12, 12, 12,...","[-0.04883272887214927, -0.04883272887214927, -...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...",...,"[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[-1.7110000300721004, -1.7110000300721004, -1...."
173730,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...","[8, 8, 8, 8, 9, 9, 38, 38, 38, 38, 38, 38, 38,...","[8, 8, 8, 9, 10, 8, 10, 8, 10, 8, 10, 8, 8, 8,...","[8, 9, 11, 12, 12, 12, 12, 12, 12, 12, 12, 15,...","[-0.04883272887214927, -0.04883272887214927, -...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...",...,"[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[-1.7103170136341452, -1.7103170136341452, -1...."
173733,"[8, 9, 10, 11, 12, 13]","[8, 8, 8, 25, 25, 25]","[8, 8, 9, 10, 8, 8]","[8, 9, 14, 14, 10, 14]","[-0.771772781872799, -0.771772781872799, -0.77...","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]",...,"[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]","[-1.7102635962513744, -1.7102635962513744, -1...."


In [149]:
listify(list(test_traces))

['event_id',
 'org:resource',
 'lifecycle:transition',
 'concept:name',
 'AMOUNT_REQ',
 'time:timestamp_Year',
 'time:timestamp_Month',
 'time:timestamp_Week',
 'time:timestamp_Day',
 'time:timestamp_Dayofweek',
 'time:timestamp_Dayofyear',
 'time:timestamp_Is_month_end',
 'time:timestamp_Is_month_start',
 'time:timestamp_Is_quarter_end',
 'time:timestamp_Is_quarter_start',
 'time:timestamp_Is_year_end',
 'time:timestamp_Is_year_start',
 'time:timestamp_Elapsed',
 'REG_DATE_Year',
 'REG_DATE_Month',
 'REG_DATE_Week',
 'REG_DATE_Day',
 'REG_DATE_Dayofweek',
 'REG_DATE_Dayofyear',
 'REG_DATE_Is_month_end',
 'REG_DATE_Is_month_start',
 'REG_DATE_Is_quarter_end',
 'REG_DATE_Is_quarter_start',
 'REG_DATE_Is_year_end',
 'REG_DATE_Is_year_start',
 'REG_DATE_Elapsed']

## Next Step Prediction

In [205]:
DataLoader??

In [151]:
def process_data_for_next_step_prediction(test, input_cols=None, output_col=3, startIndex=1):
    xs, ys = [], []
    if input_cols == None: 
        input_cols=list(test)
    
    i = 0
    input_cols = listify(input_cols)
    for trace in test.values:
        for i in range(startIndex, len(listify(trace[0]))):
            x, y = [], []
            for c in range(len(input_cols)):
                x.append(trace[c][:i])
                
            xs.append(x)
            ys.append(trace[output_col][i])
            
    return pd.DataFrame(xs, columns=input_cols), ys

In [168]:
def pad_collate(samples, pad_idx=1, pad_first=True):
    columns = list(samples)
    values = []
    
    for col in columns:
        max_len = max([len(s) for s in samples[col]])
        res = torch.zeros(len(samples[col]), max_len).long() + pad_idx
    
        for i, s in enumerate(samples[col]):
            if pad_first: res[i, -len(s):] = torch.LongTensor(s)
            else:         res[i, :len(s) ] = torch.LongTensor(s)
        values.append(res)

    return values

In [245]:
bs = 64
test_dl = DataLoader(test_traces, batch_size=bs, collate_fn=pad_collate)
val_dl = DataLoader(valid_traces, batch_size=bs, collate_fn=pad_collate)

whole_dl = DataBunch(test_dl, val_dl)
iter_test = iter(whole_dl.train_dl)
xb, yb = next(iter_test)


KeyError: 0

In [84]:
def predict_next_step(model, df):
    model.eval()
    model.cpu()
    preds = []
    for e in df.values:
        t = torch.stack([tensor(e[c]).float() for c in range(len(list(df)))])
        pred = model(t[None])
        preds.append(pred[0][-1].tolist())
        
    return np.argmax(np.array(preds), axis=1)

In [85]:
def next_step_measure(preds, ys):
    # Simple accuracy measure
    # Do I have to weight it? Check Paper!
    return (np.array(preds) == np.array(ys)).mean()

In [169]:
%%time

x, y = process_data_for_next_step_prediction(test_traces)
xpad = pad_collate(x, pad_first=True)

CPU times: user 21 s, sys: 2.82 s, total: 23.8 s
Wall time: 15.2 s


In [2]:
xpad[3]

NameError: name 'xpad' is not defined

In [182]:
print([len(s) for s in xpad])

[49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384, 49384]


In [115]:
preds = predict_next_step(awd_lstm, x)

TypeError: 'builtin_function_or_method' object is not iterable

In [None]:
next_step_measure(preds, y)

## Suffix Prediction

In [51]:
def process_data_for_suffix_prediction(test, input_cols=None, output_col=3, startIndex=1):
    xs, ys = [], []
    if input_cols == None: 
        input_cols = list(test)
        
    input_cols = listify(input_cols)
    for trace in test.values:
        for i in range(startIndex, len(listify(trace[0]))):
            x, y = [], []
            for c in range(len(input_cols)):
                x.append(trace[c][:i])
            
            xs.append(x)
            ys.append(trace[output_col][i:])
            
    return pd.DataFrame(xs, columns=input_cols), ys

In [52]:
x, y = process_data_for_suffix_prediction(test_traces)

In [53]:
def predict_suffix(model, df):
    rl = []

    for x in progress_bar(df.values):
        t = torch.stack([tensor(x[c]).float() for c in range(len(list(df)))])
        p = tensor(-1)
        res = []
        
        while p.int() != 3: # 3: eos token
            pred = model(t[None])
            pred = pred[0][-1]
            p = torch.multinomial(torch.softmax(pred, 0), 1).float()
           # p=torch.argmax(pred,0).float()[None]
            if p.int() != 3 or len(res) == 0: 
                res.append(p)
                
            k = torch.cat((t[3],p))
            t = torch.stack([k for c in range(len(list(df)))])


        res = torch.cat(res,0).int().tolist()
        rl.append(res)
        
    return rl

In [54]:
def suffix_measure(preds, ys):
    sim = []
    edits = []
    for p, y in zip(preds, ys):
        l = max(len(p),len(y))
        d = ed.eval(p, y)
        edits.append(abs(d))
        sim.append(1-(abs(d)/l))
    return np.array(edits).mean(), np.array(edits).min(), np.array(edits).max(), np.array(sim).mean()

In [55]:
%%time

#preds = predict_suffix(awd_lstm, x)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


In [56]:
%%time

#mean_edit, min_edit, max_edit, sim = suffix_measure(preds, y)
#mean_edit, min_edit, max_edit, sim

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.53 µs


## Predict Suffix

In [57]:
def predict_suffix(net, test, top_k=None, cuda=False):
    
    x, y = process_data_for_suffix_prediction(test, col='Activities', startIndex=1)
        
    if cuda:
        net.cuda()
    else:
        net.cpu()

    net.eval()
    h = net.init_hidden(1)
    predictions = []
    
    # First off, run through the prime characters
    for i in range(len(x)):
        
        # Get the size of the trace to be predicted
        size = len(y[i])
        value = x[i]
        activities = []
            
        for j in range(len(value)):
            activity, h = net.predict([value[j]], h, cuda=cuda, top_k=top_k)

        activities.append(activity)

        for k in range(size-1):
            activity, h = net.predict([activities[-1]], h, cuda=cuda, top_k=top_k)
            activities.append(activity)

        print(activities)

        predictions.append(activities)
        
    return predictions