
Process Prediction
==
 - Load Data
 - Categorize / Normalize / Fillmissing
 - Create Datastructure for language model

# Imports

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from exp.eventlog import *

In [3]:
from exp.dl_utils import *

In [4]:
import editdistance as ed

# Load Data

In [5]:
log=import_xes(untar_data(URLs.BPIC_2012))

# Data Processing

1. Merge Trace Attributes and Event Attributes first in one df. It is easier to copy over the trace attributes
3. Split into Train and Test
2. Create Traces from DF

In [6]:
df=pd.merge(log.events,log.traceAttributes,left_on='trace_id',right_index=True)
df

Unnamed: 0,trace_id,event_id,org:resource,lifecycle:transition,concept:name,time:timestamp,REG_DATE,AMOUNT_REQ
0,173688,0,112,COMPLETE,A_SUBMITTED,2011-09-30 22:38:44.546000+00:00,2011-10-01 00:38:44.546000+02:00,20000
1,173688,1,112,COMPLETE,A_PARTLYSUBMITTED,2011-09-30 22:38:44.880000+00:00,2011-10-01 00:38:44.546000+02:00,20000
2,173688,2,112,COMPLETE,A_PREACCEPTED,2011-09-30 22:39:37.906000+00:00,2011-10-01 00:38:44.546000+02:00,20000
3,173688,3,112,SCHEDULE,W_Completeren aanvraag,2011-09-30 22:39:38.875000+00:00,2011-10-01 00:38:44.546000+02:00,20000
4,173688,4,,START,W_Completeren aanvraag,2011-10-01 09:36:46.437000+00:00,2011-10-01 00:38:44.546000+02:00,20000
...,...,...,...,...,...,...,...,...
262195,214376,1,112,COMPLETE,A_PARTLYSUBMITTED,2012-02-29 22:51:17.423000+00:00,2012-02-29 23:51:16.799000+01:00,15000
262196,214376,2,112,SCHEDULE,W_Afhandelen leads,2012-02-29 22:52:01.287000+00:00,2012-02-29 23:51:16.799000+01:00,15000
262197,214376,3,11169,START,W_Afhandelen leads,2012-03-01 08:26:46.736000+00:00,2012-02-29 23:51:16.799000+01:00,15000
262198,214376,4,11169,COMPLETE,A_DECLINED,2012-03-01 08:27:37.118000+00:00,2012-02-29 23:51:16.799000+01:00,15000


# Split in Train, Test and Validation

Split first only in train set and test set. The train set is used to train the model. The test set is used to test the model later on. Let the model create the validation set on its own.

In [7]:
trace_id='trace_id'

In [8]:
def random_split_traces(d,split=0.8,trace_id='trace_id'):
    traces=d[trace_id].drop_duplicates()
    shuffled=traces.iloc[np.random.permutation(len(traces))].values
    split=int(len(traces)*split)
    return shuffled[:split],shuffled[split:]

In [9]:
train_trace_ids,test_trace_ids=random_split_traces(df,0.8)

In [10]:
def get_df(t,df): return df[df[trace_id].isin(t)]
train_df=get_df(train_trace_ids,df)

Split train into train and validation
--

In [11]:
train_traces,validation_trace_ids=random_split_traces(train_df,0.9)


In [12]:
train_traces

array(['185506', '198439', '192235', '174328', ..., '208766', '185042', '203880', '209230'], dtype=object)

In [13]:
train_df=get_df(train_traces,df)
test_df=get_df(test_trace_ids,df)
valid_df=get_df(validation_trace_ids,df)

# Process Data

In [14]:
def normalize_cont_column(x, mean, std,eps=1e-7): return (x-mean)/(eps + std)

In [15]:
UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split()
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]

from collections import OrderedDict

def uniqueify(x, sort=False):
    res = list(OrderedDict.fromkeys(x).keys())
    if sort: res.sort()
    return res

class Processor():
    def process(self, items): return items

class CategoryProcessor(Processor):
    def __init__(self,default_token=None): 
        self.vocab=None
        self.default_token=default_token

    def __call__(self, items):
        #The vocab is defined on the first use.
        if self.vocab is None:
            self.vocab = uniqueify(items)
            if self.default_token is not None:
                for o in reversed(self.default_token):
                    if o in self.vocab: self.vocab.remove(o)
                    self.vocab.insert(0, o)
            self.otoi  = {v:k for k,v in enumerate(self.vocab)}
        return [self.proc1(o) for o in items]
    def proc1(self, item):  return self.otoi.get(item,0)

    def deprocess(self, idxs):
        assert self.vocab is not None
        return [self.deproc1(idx) for idx in idxs]
    
    def deproc1(self, idx): return self.vocab[idx]

In [16]:
class TraceProcessor(Processor):
    def __init__(self,cat_names,cont_names,date_names,vocabs={}):
        self.vocabs=vocabs
        self.cat_names,self.cont_names,self.date_names=cat_names,cont_names,date_names
    def __call__(self,df):
        cat_names,cont_names=self.cat_names[:],self.cont_names[:]
        for d in self.date_names:
            df,cat, cont = add_datepart(df,d,utc=True)
            cat_names+=listify(cat)    
            cont_names+=listify(cont)

        for c in cat_names:
            if not c in self.vocabs.keys(): 
                self.vocabs[c] = CategoryProcessor(default_spec_tok)
         

            df[c]=self.vocabs[c](df[c])
            
        for c in cont_names:
            df[c]=df[c].astype(float)

            if not c in self.vocabs.keys(): 
                self.vocabs[c]=df[c].mean(),df[c].std()
            df[c]=normalize_cont_column(df[c], *self.vocabs[c])
    
        return df
        
        
    
    def deprocess(self,items,columns):
        pass

In [17]:
def add_datepart(df, fldname, drop=True, time=False,utc=False):
    "Helper function that adds columns relevant to a date."
    df=df.copy()
    fld = df[fldname]
    fld_dtype = fld.dtype
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, utc=utc,infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    cols=[]
    for n in attr: 
        col_name=targ_pre +"_"+ n
        df[col_name] = getattr(fld.dt, n.lower())
        cols.append(col_name)
    df[targ_pre + '_Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)
    return df,cols,targ_pre + '_Elapsed'

In [18]:
# encode data and create vocab
cat_names=['event_id','org:resource','lifecycle:transition','concept:name',]
date_names=['time:timestamp','REG_DATE']
cont_names=['AMOUNT_REQ']

In [19]:
tp=TraceProcessor(cat_names,cont_names,date_names)

In [20]:
train_df

Unnamed: 0,trace_id,event_id,org:resource,lifecycle:transition,concept:name,time:timestamp,REG_DATE,AMOUNT_REQ
65,173694,0,112,COMPLETE,A_SUBMITTED,2011-10-01 06:10:30.287000+00:00,2011-10-01 08:10:30.287000+02:00,7000
66,173694,1,112,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 06:10:30.591000+00:00,2011-10-01 08:10:30.287000+02:00,7000
67,173694,2,112,COMPLETE,A_PREACCEPTED,2011-10-01 06:11:13.026000+00:00,2011-10-01 08:10:30.287000+02:00,7000
68,173694,3,112,SCHEDULE,W_Completeren aanvraag,2011-10-01 06:11:13.390000+00:00,2011-10-01 08:10:30.287000+02:00,7000
69,173694,4,10912,START,W_Completeren aanvraag,2011-10-01 09:31:25.301000+00:00,2011-10-01 08:10:30.287000+02:00,7000
...,...,...,...,...,...,...,...,...
262189,214373,12,10933,COMPLETE,O_SENT,2012-03-01 19:22:40.043000+00:00,2012-02-29 23:43:09.766000+01:00,8500
262190,214373,13,10933,SCHEDULE,W_Nabellen offertes,2012-03-01 19:22:40.149000+00:00,2012-02-29 23:43:09.766000+01:00,8500
262191,214373,14,10933,COMPLETE,W_Completeren aanvraag,2012-03-01 19:22:41.157000+00:00,2012-02-29 23:43:09.766000+01:00,8500
262192,214373,15,11119,START,W_Nabellen offertes,2012-03-10 11:46:22.700000+00:00,2012-02-29 23:43:09.766000+01:00,8500


In [21]:
train_proc=tp(train_df)

In [22]:
valid_proc=tp(valid_df)

In [23]:
test_proc=tp(test_df) # unknown token einfügen

# Create Traces

In [24]:
def create_traces(event_df,trace_id='trace_id'):
    ll=[]
    trace_ids=[]
    cols=list(event_df)
    cols.remove(trace_id)
    for n, g in event_df.groupby(trace_id):
        l=[]
        
        for c in cols:
            l.append(list(g[c]))
        ll.append(l)
        trace_ids.append(n)  
        

    df=pd.DataFrame(ll,columns=cols)
    df.index=trace_ids
    return df


In [25]:
train_traces=create_traces(train_proc)
valid_traces=create_traces(valid_proc)

# LanguageModel Dataloader

In [26]:
bs,bptt=128,70

In [27]:
class LMDataSet():
    def __init__(self, df, bs=64, bptt=70, shuffle=False):
        self.bs,self.bptt,self.shuffle = bs,bptt,shuffle
        self.cols=list(df)

        total_len = sum(df.apply(lambda x: max([len(listify(x[k])) for k in self.cols]),axis=1))
        self.n_batch = total_len // self.bs

        self.batched=self.batchify(df)
        #print(self.bs,self.bptt,self.shuffle,total_len, self.n_batch)
        #print(self.batched)
    
    def __len__(self): return ((self.n_batch-1) // self.bptt) * self.bs
    
    def __getitem__(self, idx):
        source = self.batched[:,idx % self.bs]
        seq_idx = (idx // self.bs) * self.bptt
        x,y=source[:,seq_idx:seq_idx+self.bptt],source[:,seq_idx+1:seq_idx+self.bptt+1]
        return x,y
    
    def batchify(self,df):
        if self.shuffle: df=df.sample(frac=1)
        
        dd={}
        for c in self.cols:
            dd[c]=[]
        for i, row in df.iterrows():
            l=max([len(listify(row[c])) for c in self.cols])
            for c in self.cols:
                dd[c].append(tensor(row[c]).expand(l))
        for c in self.cols:
            s= torch.cat([torch.cat((tensor([2.0]),t.float(),tensor([3.0]))) for t in dd[c]])
            dd[c]=s[:self.n_batch * self.bs].view(self.bs, self.n_batch)
        return torch.stack([dd[c] for c in self.cols])

In [28]:
def get_dls(train_ds, valid_ds,  **kwargs):
    return (DataLoader(LMDataSet(train_ds, shuffle=True,bptt=bptt), batch_size=bs),
            DataLoader(LMDataSet(valid_ds, shuffle=False,bptt=bptt), batch_size=bs))

In [29]:
valid_traces

Unnamed: 0,event_id,org:resource,lifecycle:transition,concept:name,AMOUNT_REQ,time:timestamp_Year,time:timestamp_Month,time:timestamp_Week,time:timestamp_Day,time:timestamp_Dayofweek,...,REG_DATE_Day,REG_DATE_Dayofweek,REG_DATE_Dayofyear,REG_DATE_Is_month_end,REG_DATE_Is_month_start,REG_DATE_Is_quarter_end,REG_DATE_Is_quarter_start,REG_DATE_Is_year_end,REG_DATE_Is_year_start,REG_DATE_Elapsed
173745,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...","[8, 8, 8, 26, 26, 26, 26, 10, 19, 19, 19, 19, ...","[8, 8, 9, 10, 8, 9, 8, 10, 8, 8, 8, 8, 8, 9, 8...","[8, 9, 27, 27, 10, 11, 27, 11, 12, 14, 13, 15,...","[-1.0950748562047796, -1.0950748562047796, -1....","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...",...,"[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[-1.7183822419108066, -1.7183822419108066, -1...."
173772,"[8, 9, 10]","[8, 8, 8]","[8, 8, 8]","[8, 9, 26]","[-1.0150515319613596, -1.0150515319613596, -1....","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]",...,"[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[-1.716028551178518, -1.716028551178518, -1.71..."
173805,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...","[8, 8, 8, 8, 22, 22, 10, 39, 39, 39, 39, 39, 1...","[8, 8, 8, 9, 10, 8, 10, 8, 8, 8, 8, 8, 9, 8, 1...","[8, 9, 10, 11, 11, 11, 11, 12, 14, 13, 15, 16,...","[-0.8550048834745198, -0.8550048834745198, -0....","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...","[8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...","[8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...",...,"[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[-1.7118693036475292, -1.7118693036475292, -1...."
173889,"[8, 9, 10, 11, 12, 13]","[8, 8, 8, 37, 37, 37]","[8, 8, 9, 10, 8, 8]","[8, 9, 27, 27, 26, 27]","[-0.65494657286597, -0.65494657286597, -0.6549...","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 9, 9, 9]","[16, 16, 16, 9, 9, 9]","[14, 14, 14, 9, 9, 9]",...,"[9, 9, 9, 9, 9, 9]","[9, 9, 9, 9, 9, 9]","[9, 9, 9, 9, 9, 9]","[8, 8, 8, 8, 8, 8]","[9, 9, 9, 9, 9, 9]","[8, 8, 8, 8, 8, 8]","[9, 9, 9, 9, 9, 9]","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]","[-1.695505992476729, -1.695505992476729, -1.69..."
173922,"[8, 9, 10]","[8, 8, 8]","[8, 8, 8]","[8, 9, 26]","[-0.05477164104032068, -0.05477164104032068, -...","[8, 8, 8]","[8, 8, 8]","[8, 8, 8]","[16, 16, 16]","[14, 14, 14]",...,"[9, 9, 9]","[9, 9, 9]","[9, 9, 9]","[8, 8, 8]","[9, 9, 9]","[8, 8, 8]","[9, 9, 9]","[8, 8, 8]","[8, 8, 8]","[-1.692488798422282, -1.692488798422282, -1.69..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214250,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...","[8, 8, 8, 8, 28, 28, 28, 28, 28, 28, 28, 28, 2...","[8, 8, 8, 9, 10, 8, 8, 8, 8, 8, 9, 8, 10, 8, 1...","[8, 9, 10, 11, 11, 12, 13, 14, 15, 16, 17, 11,...","[-0.45488826225742024, -0.45488826225742024, -...","[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 3...","[15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1...","[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1...",...,"[36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 3...","[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1...","[159, 159, 159, 159, 159, 159, 159, 159, 159, ...","[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...","[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[1.6623151470209272, 1.6623151470209272, 1.662..."
214259,"[8, 9, 10, 11, 12, 13]","[8, 8, 8, 28, 28, 28]","[8, 8, 9, 10, 8, 8]","[8, 9, 27, 27, 26, 27]","[-0.65494657286597, -0.65494657286597, -0.6549...","[9, 9, 9, 9, 9, 9]","[10, 10, 10, 10, 10, 10]","[30, 30, 30, 30, 30, 30]","[15, 15, 15, 15, 15, 15]","[12, 12, 12, 12, 12, 12]",...,"[36, 36, 36, 36, 36, 36]","[12, 12, 12, 12, 12, 12]","[159, 159, 159, 159, 159, 159]","[9, 9, 9, 9, 9, 9]","[9, 9, 9, 9, 9, 9]","[8, 8, 8, 8, 8, 8]","[9, 9, 9, 9, 9, 9]","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]","[1.662454055903578, 1.662454055903578, 1.66245..."
214265,"[8, 9, 10, 11, 12, 13]","[8, 8, 8, 28, 28, 28]","[8, 8, 9, 10, 8, 8]","[8, 9, 27, 27, 26, 27]","[-1.1750981804481995, -1.1750981804481995, -1....","[9, 9, 9, 9, 9, 9]","[10, 10, 10, 10, 10, 10]","[30, 30, 30, 30, 30, 30]","[15, 15, 15, 15, 15, 15]","[12, 12, 12, 12, 12, 12]",...,"[36, 36, 36, 36, 36, 36]","[12, 12, 12, 12, 12, 12]","[159, 159, 159, 159, 159, 159]","[9, 9, 9, 9, 9, 9]","[9, 9, 9, 9, 9, 9]","[8, 8, 8, 8, 8, 8]","[9, 9, 9, 9, 9, 9]","[8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8]","[1.6626268513106743, 1.6626268513106743, 1.662..."
214274,"[8, 9, 10, 11, 12, 13, 14]","[8, 8, 8, 8, 12, 12, 12]","[8, 8, 8, 9, 10, 8, 8]","[8, 9, 10, 11, 11, 26, 11]","[-0.8550048834745198, -0.8550048834745198, -0....","[9, 9, 9, 9, 9, 9, 9]","[10, 10, 10, 10, 10, 10, 10]","[30, 30, 30, 30, 30, 30, 30]","[15, 15, 15, 15, 15, 15, 15]","[12, 12, 12, 12, 12, 12, 12]",...,"[36, 36, 36, 36, 36, 36, 36]","[12, 12, 12, 12, 12, 12, 12]","[159, 159, 159, 159, 159, 159, 159]","[9, 9, 9, 9, 9, 9, 9]","[9, 9, 9, 9, 9, 9, 9]","[8, 8, 8, 8, 8, 8, 8]","[9, 9, 9, 9, 9, 9, 9]","[8, 8, 8, 8, 8, 8, 8]","[8, 8, 8, 8, 8, 8, 8]","[1.662831463836143, 1.662831463836143, 1.66283..."


In [30]:
data = DataBunch(*get_dls(train_traces, valid_traces))

In [31]:
iter_dl = iter(data.train_dl)
xb,yb = next(iter_dl)
xb.size()

torch.Size([128, 31, 70])

# Basic Model

In [32]:
class BasicModel(nn.Module):
    def __init__(self, n_in,n_out,n_emb,nh):
        super().__init__()
        self.emb=nn.Embedding(n_in, 7, padding_idx=1)
        self.lin1=nn.Linear(7,nh)
        self.relu=nn.ReLU()
        self.lin2=nn.Linear(nh,n_out)
        
    def __call__(self, x):
        x=x[:,3] # magic number for 'concept:name'
        x=x.long()
        x=self.emb(x)
        x=self.lin1(x)
        x=self.relu(x)
        x=self.lin2(x)
        return x.float()

In [33]:
def getBasicModel():
    vocab=len((tp.vocabs['concept:name']).vocab) # Stupid 'concept:name' model
    n_emb,nh=int(vocab/2),10
    model=BasicModel(bs*bptt,vocab,n_emb,nh)
    return model

In [34]:
len((tp.vocabs['concept:name']).vocab)

32

In [35]:
xb.shape

torch.Size([128, 31, 70])

In [36]:
xb[None,0,:,1:10].shape

torch.Size([1, 31, 9])

In [37]:
model=getBasicModel()
pred = model(xb)
pred.shape,yb[:,0].shape

(torch.Size([128, 70, 32]), torch.Size([128, 70]))

In [38]:
def cross_entropy_activity(input, target):
    target=target[:,3] # magic number for 'concept:name'
    bs,sl =target.size()
    return F.cross_entropy(input.view(bs * sl, -1), target.flatten().long())
cross_entropy_activity(pred,yb)

tensor(3.5108, grad_fn=<NllLossBackward>)

In [39]:
def accuracy_activity(input, target): 
    target=target[:,3] # magic number for 'concept:name'
    bs,sl =target.size()
    return (torch.argmax(input.view(bs * sl, -1), dim=1)==target.flatten().long()).float().mean()
accuracy_activity(pred,yb)

tensor(0.0095)

# Training Loop

**Callbacks**

In [40]:
sched = combine_scheds([0.3, 0.7], [sched_cos(0.3, 0.6), sched_cos(0.6, 0.2)]) 

In [41]:
class CudaCallback(Callback):
    def begin_fit(self): self.model.cuda()
    def begin_batch(self): self.run.xb,self.run.yb = self.xb.cuda(),self.yb.cuda()

In [42]:
cbfs = [partial(AvgStatsCallback,accuracy_activity),
        CudaCallback, 
        Recorder,
        partial(ParamScheduler, 'lr', sched),
        ProgressBarCallback,
       ]

**Model**

In [43]:
opt_func = partial(Optimizer, steppers=[sgd_step])

In [44]:
model=getBasicModel()

In [45]:
opt = opt_func(model.parameters(), lr=0.5)


**Learner**

In [46]:
learn = Learner(model,data,cross_entropy_activity,cb_funcs=cbfs,opt_func=opt_func)


In [47]:
learn.fit(20) 

epoch,train_loss,train_accuracy_activity,valid_loss,valid_accuracy_activity,time
0,2.805393,0.312617,2.338314,0.443415,00:00
1,2.168686,0.445573,1.904574,0.479185,00:00
2,1.781064,0.554608,1.590225,0.593583,00:00
3,1.515376,0.591948,1.394304,0.609766,00:00
4,1.355849,0.617719,1.270987,0.643527,00:00
5,1.259113,0.637521,1.219177,0.652009,00:00
6,1.203855,0.64324,1.163284,0.652288,00:00
7,1.180973,0.643171,1.131654,0.652009,00:00
8,1.123988,0.651605,1.083322,0.667188,00:00
9,1.106155,0.657361,1.078861,0.667188,00:00


In [48]:
basic_model=learn.model
basic_model(xb.cuda()).shape

torch.Size([128, 70, 32])

# Testing

In [49]:
test_proc

Unnamed: 0,trace_id,event_id,org:resource,lifecycle:transition,concept:name,AMOUNT_REQ,time:timestamp_Year,time:timestamp_Month,time:timestamp_Week,time:timestamp_Day,...,REG_DATE_Day,REG_DATE_Dayofweek,REG_DATE_Dayofyear,REG_DATE_Is_month_end,REG_DATE_Is_month_start,REG_DATE_Is_quarter_end,REG_DATE_Is_quarter_start,REG_DATE_Is_year_end,REG_DATE_Is_year_start,REG_DATE_Elapsed
0,173688,8,8,8,8,0.345345,8,0,8,38,...,37,14,0,9,9,9,9,8,8,-1.730947
1,173688,9,8,8,9,0.345345,8,0,8,38,...,37,14,0,9,9,9,9,8,8,-1.730947
2,173688,10,8,8,10,0.345345,8,0,8,38,...,37,14,0,9,9,9,9,8,8,-1.730947
3,173688,11,8,9,11,0.345345,8,0,8,38,...,37,14,0,9,9,9,9,8,8,-1.730947
4,173688,12,10,10,11,0.345345,8,8,8,8,...,37,14,0,9,9,9,9,8,8,-1.730947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262172,214370,9,8,8,9,0.345345,9,10,30,15,...,36,12,159,9,9,8,9,8,8,1.666038
262173,214370,10,8,9,27,0.345345,9,10,30,15,...,36,12,159,9,9,8,9,8,8,1.666038
262174,214370,11,33,10,27,0.345345,9,13,30,8,...,36,12,159,9,9,8,9,8,8,1.666038
262175,214370,12,33,8,26,0.345345,9,13,30,8,...,36,12,159,9,9,8,9,8,8,1.666038


In [50]:
test_traces=create_traces(test_proc)


In [51]:
test_traces.iloc[0:1]

Unnamed: 0,event_id,org:resource,lifecycle:transition,concept:name,AMOUNT_REQ,time:timestamp_Year,time:timestamp_Month,time:timestamp_Week,time:timestamp_Day,time:timestamp_Dayofweek,...,REG_DATE_Day,REG_DATE_Dayofweek,REG_DATE_Dayofyear,REG_DATE_Is_month_end,REG_DATE_Is_month_start,REG_DATE_Is_quarter_end,REG_DATE_Is_quarter_start,REG_DATE_Is_year_end,REG_DATE_Is_year_start,REG_DATE_Elapsed
173688,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...","[8, 8, 8, 8, 10, 58, 58, 58, 58, 58, 10, 10, 1...","[8, 8, 8, 9, 10, 8, 8, 8, 8, 8, 9, 8, 10, 8, 1...","[8, 9, 10, 11, 11, 12, 14, 13, 15, 16, 17, 11,...","[0.3453449801767789, 0.3453449801767789, 0.345...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, ...","[38, 38, 38, 38, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,...","[10, 10, 10, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,...",...,"[37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 3...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...","[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...","[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...","[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[-1.7309466408830803, -1.7309466408830803, -1...."


## Next Step Prediction

In [52]:
def process_data_for_next_step_prediction(test,input_cols=None,output_col=3,startIndex=1):
    xs,ys=[],[]
    if input_cols == None: input_cols=list(test)
    input_cols=listify(input_cols)
    for trace in test.values:
        for i in range(startIndex,len(listify(trace[0]))):
            x,y=[],[]
            for c in range(len(input_cols)):
                x.append(trace[c][:i])
            
            xs.append(x)
            ys.append(trace[output_col][i])
    return pd.DataFrame(xs,columns=input_cols),ys


In [53]:
x

NameError: name 'x' is not defined

In [54]:
def predict_next_step(model,df):
    model.eval()
    model.cpu()
    preds=[]
    for e in df.values:
        t=torch.stack([tensor(e[c]).float() for c in range(len(list(df)))])
        pred=model(t[None])
        preds.append(pred[0][-1].tolist())
    return np.argmax(np.array(preds),axis=1)



In [55]:
def next_step_measure(preds,ys):
    # Simple accuracy measure
    # Do I have to weight it? Check Paper!
    return (np.array(preds)==np.array(ys)).mean()

In [56]:
x,y=process_data_for_next_step_prediction(test_traces)
preds=predict_next_step(basic_model,x)
next_step_measure(preds,y)

0.6466468111327924

## Suffix Prediction

In [70]:
def process_data_for_suffix_prediction(test,input_cols=None,output_col=3,startIndex=1):
    xs,ys=[],[]
    if input_cols == None: input_cols=list(test)
    input_cols=listify(input_cols)
    for trace in test.values:
        for i in range(startIndex,len(listify(trace[0]))):
            x,y=[],[]
            for c in range(len(input_cols)):
                x.append(trace[c][:i])
            
            xs.append(x)
            ys.append(trace[output_col][i:])
    return pd.DataFrame(xs,columns=input_cols),ys

In [71]:
x,y=process_data_for_suffix_prediction(test_traces)

In [63]:
x

Unnamed: 0,event_id,org:resource,lifecycle:transition,concept:name,AMOUNT_REQ,time:timestamp_Year,time:timestamp_Month,time:timestamp_Week,time:timestamp_Day,time:timestamp_Dayofweek,...,REG_DATE_Day,REG_DATE_Dayofweek,REG_DATE_Dayofyear,REG_DATE_Is_month_end,REG_DATE_Is_month_start,REG_DATE_Is_quarter_end,REG_DATE_Is_quarter_start,REG_DATE_Is_year_end,REG_DATE_Is_year_start,REG_DATE_Elapsed
0,[8],[8],[8],[8],[0.3453449801767789],[8],[0],[8],[38],[10],...,[37],[14],[0],[9],[9],[9],[9],[8],[8],[-1.7309466408830803]
1,"[8, 9]","[8, 8]","[8, 8]","[8, 9]","[0.3453449801767789, 0.3453449801767789]","[8, 8]","[0, 0]","[8, 8]","[38, 38]","[10, 10]",...,"[37, 37]","[14, 14]","[0, 0]","[9, 9]","[9, 9]","[9, 9]","[9, 9]","[8, 8]","[8, 8]","[-1.7309466408830803, -1.7309466408830803]"
2,"[8, 9, 10]","[8, 8, 8]","[8, 8, 8]","[8, 9, 10]","[0.3453449801767789, 0.3453449801767789, 0.345...","[8, 8, 8]","[0, 0, 0]","[8, 8, 8]","[38, 38, 38]","[10, 10, 10]",...,"[37, 37, 37]","[14, 14, 14]","[0, 0, 0]","[9, 9, 9]","[9, 9, 9]","[9, 9, 9]","[9, 9, 9]","[8, 8, 8]","[8, 8, 8]","[-1.7309466408830803, -1.7309466408830803, -1...."
3,"[8, 9, 10, 11]","[8, 8, 8, 8]","[8, 8, 8, 9]","[8, 9, 10, 11]","[0.3453449801767789, 0.3453449801767789, 0.345...","[8, 8, 8, 8]","[0, 0, 0, 0]","[8, 8, 8, 8]","[38, 38, 38, 38]","[10, 10, 10, 10]",...,"[37, 37, 37, 37]","[14, 14, 14, 14]","[0, 0, 0, 0]","[9, 9, 9, 9]","[9, 9, 9, 9]","[9, 9, 9, 9]","[9, 9, 9, 9]","[8, 8, 8, 8]","[8, 8, 8, 8]","[-1.7309466408830803, -1.7309466408830803, -1...."
4,"[8, 9, 10, 11, 12]","[8, 8, 8, 8, 10]","[8, 8, 8, 9, 10]","[8, 9, 10, 11, 11]","[0.3453449801767789, 0.3453449801767789, 0.345...","[8, 8, 8, 8, 8]","[0, 0, 0, 0, 8]","[8, 8, 8, 8, 8]","[38, 38, 38, 38, 8]","[10, 10, 10, 10, 8]",...,"[37, 37, 37, 37, 37]","[14, 14, 14, 14, 14]","[0, 0, 0, 0, 0]","[9, 9, 9, 9, 9]","[9, 9, 9, 9, 9]","[9, 9, 9, 9, 9]","[9, 9, 9, 9, 9]","[8, 8, 8, 8, 8]","[8, 8, 8, 8, 8]","[-1.7309466408830803, -1.7309466408830803, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48680,[8],[8],[8],[8],[0.3453449801767789],[9],[10],[30],[15],[12],...,[36],[12],[159],[9],[9],[8],[9],[8],[8],[1.6660380085462703]
48681,"[8, 9]","[8, 8]","[8, 8]","[8, 9]","[0.3453449801767789, 0.3453449801767789]","[9, 9]","[10, 10]","[30, 30]","[15, 15]","[12, 12]",...,"[36, 36]","[12, 12]","[159, 159]","[9, 9]","[9, 9]","[8, 8]","[9, 9]","[8, 8]","[8, 8]","[1.6660380085462703, 1.6660380085462703]"
48682,"[8, 9, 10]","[8, 8, 8]","[8, 8, 9]","[8, 9, 27]","[0.3453449801767789, 0.3453449801767789, 0.345...","[9, 9, 9]","[10, 10, 10]","[30, 30, 30]","[15, 15, 15]","[12, 12, 12]",...,"[36, 36, 36]","[12, 12, 12]","[159, 159, 159]","[9, 9, 9]","[9, 9, 9]","[8, 8, 8]","[9, 9, 9]","[8, 8, 8]","[8, 8, 8]","[1.6660380085462703, 1.6660380085462703, 1.666..."
48683,"[8, 9, 10, 11]","[8, 8, 8, 33]","[8, 8, 9, 10]","[8, 9, 27, 27]","[0.3453449801767789, 0.3453449801767789, 0.345...","[9, 9, 9, 9]","[10, 10, 10, 13]","[30, 30, 30, 30]","[15, 15, 15, 8]","[12, 12, 12, 13]",...,"[36, 36, 36, 36]","[12, 12, 12, 12]","[159, 159, 159, 159]","[9, 9, 9, 9]","[9, 9, 9, 9]","[8, 8, 8, 8]","[9, 9, 9, 9]","[8, 8, 8, 8]","[8, 8, 8, 8]","[1.6660380085462703, 1.6660380085462703, 1.666..."


In [59]:
def predict_suffix(model,df):
    rl=[]

    for x in progress_bar(df.values):
        t=torch.stack([tensor(x[c]).float() for c in range(len(list(df)))])
        p=tensor(-1)
        res=[]
        while p.int()!=3: # 3: eos token
            pred=model(t[None])
            pred=pred[0][-1]
            p=torch.multinomial(torch.softmax(pred,0),1).float()
           # p=torch.argmax(pred,0).float()[None] -> zusätzliche abbruchbedingung
            if p.int()!=3 or len(res)==0: res.append(p)
            k=torch.cat((t[3],p))
            t=torch.stack([k for c in range(len(list(df)))])


        res=torch.cat(res,0).int().tolist()
        rl.append(res)
    return rl

In [60]:
def suffix_measure(preds,ys):
    sim=[]
    edits=[]
    for p,y in zip(preds,ys):
        l=max(len(p),len(y))
        d=ed.eval(p,y)
        edits.append(abs(d))
        sim.append(1-(abs(d)/l))
    return np.array(edits).mean(),np.array(edits).min(),np.array(edits).max(),np.array(sim).mean()


In [61]:
preds=predict_suffix(basic_model,x)

In [62]:
mean_edit,min_edit,max_edit,sim=suffix_measure(preds,y)
mean_edit,min_edit,max_edit,sim

(26.459874704734517, 0, 247, 0.1743588706444003)