Process Prediction
==
 - Load Data
 - Categorize / Normalize / Fill missing values
 - Create data structure for the language model

# Imports

In [None]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [None]:
from exp.eventlog import *

In [None]:
from exp.dl_utils import *

In [None]:
import editdistance as ed

# Load Data

In [None]:
log=import_xes(untar_data(URLs.BPIC_2012))

In [None]:
log.traceAttributes

In [None]:
log.events

# Data Processing

1. First merge the trace attributes and the event attributes into one DataFrame, so that the trace attributes are copied onto every event row.
2. Create Traces from DF

In [None]:
# Attach the trace-level attributes to every event row: `trace_id` in the
# events frame is matched against the index of `log.traceAttributes`.
df=pd.merge(log.events,log.traceAttributes,left_on='trace_id',right_index=True)
df

# Create Traces

In [None]:
def create_traces(event_df,trace_id='trace_id'):
    """Collapse an event-level frame into one row per trace.

    Every column except `trace_id` becomes a list holding that column's
    values for the events of the trace.

    Args:
        event_df: DataFrame with one row per event, including a `trace_id` column.
        trace_id: name of the column identifying the trace an event belongs to.

    Returns:
        DataFrame with one row per group of `event_df.groupby(trace_id)`,
        indexed by the group key, whose cells are Python lists.

    Raises:
        ValueError: if `trace_id` is not a column of `event_df`.
    """
    value_cols=list(event_df)
    value_cols.remove(trace_id)  # the grouping key ends up in the index instead
    ids,rows=[],[]
    for key,group in event_df.groupby(trace_id):
        ids.append(key)
        rows.append([list(group[name]) for name in value_cols])
    result=pd.DataFrame(rows,columns=value_cols)
    result.index=ids
    return result
# Build the per-trace frame from the merged event data and display it.
traces=create_traces(df)
traces

In [None]:
k=[[1,2],[3,4,5]]

# Flatten one level of nesting: [[1,2],[3,4,5]] -> [1,2,3,4,5]
[item for sublist in k for item in sublist]



# Split in Train, Test

First split the data only into a train set and a test set. The train set is used to train the model; the test set is used to test the model later on. The validation set is split off from the train set later by the data bunch.

In [None]:
def random_split_traces(d,split=0.8,trace_id='trace_id'):
    """Randomly partition the distinct trace ids of `d` into two groups.

    Args:
        d: DataFrame containing a `trace_id` column.
        split: fraction of the distinct ids that goes into the first group.
        trace_id: name of the trace id column.

    Returns:
        Tuple `(first, second)` of arrays of trace ids; `first` holds
        `int(n_ids*split)` ids, `second` the remainder.
    """
    unique_ids=d[trace_id].drop_duplicates()
    n_total=len(unique_ids)
    order=np.random.permutation(n_total)  # random positional order
    permuted=unique_ids.iloc[order].values
    cut=int(n_total*split)
    first,second=permuted[:cut],permuted[cut:]
    return first,second

In [None]:
# Randomly assign 80% of the distinct trace ids to `train`, the rest to `test`.
train,test=random_split_traces(df,0.8)

In [None]:
len(train)

# TracesDatabunch 

Create a custom data bunch class for traces. The data bunch includes the following:

- The data bunch can split the data into train set and validation set.
- The data bunch can encode and decode the data. It keeps track of the encoding vocabulary, i.e. it creates the vocabulary while encoding the training set and applies the training vocabulary to the validation set and the test set.
- The data bunch creates the data sets and the pytorch data loaders that are used in the training loop to train the pytorch models. It supports multiple training styles including language model training, suffix prediction training and next step prediction training.

Steps to implement:
--
1. Get files -> Create custom TraceList
2. Split validation set
    - random 
3. Process Data:
    - Dates, Continuous Variables, Categorical Variables
4. Transform to tensor
5. DataLoader
6. DataBunch
7. Add test set (optional)

In [None]:
#export
def compose(x, funcs, *args, order_key='_order', **kwargs):
    """Apply `funcs` to `x` one after another and return the final value.

    The functions are applied in ascending order of their `order_key`
    attribute (functions without that attribute count as 0). Each one is
    called as `f(x, **kwargs)`; the extra positional `*args` are accepted
    but not forwarded.
    """
    def rank(fn): return getattr(fn, order_key, 0)
    ordered = sorted(listify(funcs), key=rank)
    for fn in ordered:
        x = fn(x, **kwargs)
    return x

class ItemList(ListContainer):
    """List of items with a base path and optional transforms applied on access.

    Indexing goes through `get` (identity here, meant to be overridden) and
    then through `compose` with the stored `tfms`.
    """
    def __init__(self, items, path='.', tfms=None):
        super().__init__(items)
        self.path = Path(path)
        self.tfms = tfms

    def __repr__(self):
        base = super().__repr__()
        return f'{base}\nPath: {self.path}'

    def new(self, items, cls=None):
        "Create a list of type `cls` (default: own class) with the same path and transforms."
        target = self.__class__ if cls is None else cls
        return target(items, self.path, tfms=self.tfms)

    def get(self, i):
        "Turn the stored item `i` into the object to return; identity by default."
        return i

    def _get(self, i):
        "`get` followed by the transforms."
        return compose(self.get(i), self.tfms)

    def __getitem__(self, idx):
        picked = super().__getitem__(idx)
        if not isinstance(picked, list): return self._get(picked)
        return [self._get(o) for o in picked]



## Create Trace List

In [None]:
#export
class ListContainer():
    """Thin wrapper around a list that also accepts index lists and boolean masks.

    `items` is passed through `listify` and stored as `self.items`.
    """
    def __init__(self, items): self.items = listify(items)

    def __getitem__(self, idx):
        """Return the item(s) selected by `idx`.

        `idx` may be an int or slice (plain list indexing), a boolean mask with
        one entry per item, or a sequence of integer positions. An empty
        sequence selects nothing and returns `[]`.
        """
        if isinstance(idx, (int,slice)): return self.items[idx]
        # An empty selection has no first element to inspect; it selects nothing.
        if len(idx)==0: return []
        if isinstance(idx[0],bool):
            assert len(idx)==len(self) # bool mask
            return [o for m,o in zip(idx,self.items) if m]
        return [self.items[i] for i in idx]

    def __len__(self): return len(self.items)
    def __iter__(self): return iter(self.items)
    def __setitem__(self, i, o): self.items[i] = o
    def __delitem__(self, i): del(self.items[i])

    def __repr__(self):
        # Show at most the first 10 items; mark the truncation with '...'.
        res = f'{self.__class__.__name__} ({len(self)} items)\n{self.items[:10]}'
        if len(self)>10: res = res[:-1]+ '...]'
        return res

In [None]:
class TraceList(ListContainer):
    """List of trace ids backed by an event DataFrame.

    The items are trace ids; `df` holds the event rows, and `trace_id` names the
    column linking a row to its trace. `cat_names`, `cont_names` and
    `date_names` record which columns are categorical, continuous and dates.
    """
    def __init__(self,items,df,cat_names,cont_names,date_names,trace_id):
        super().__init__(items)
        self.cat_names,self.cont_names,self.date_names = cat_names,cont_names,date_names
        self.df=df
        self.trace_id=trace_id

    @classmethod
    def from_df(cls, df, date_names=None,cat_names=None, cont_names=None,trace_id='trace_id')->'TraceList':
        """Build a TraceList with one item per distinct value of `df[trace_id]`.

        `df` is copied, and so are the name lists, so later changes to the
        caller's objects do not leak into the TraceList (and vice versa).
        Omitted name lists default to empty lists.
        """
        def _own(names): return [] if names is None else list(names)
        return cls(items=list(df[trace_id].drop_duplicates()),df=df.copy(),
                   date_names=_own(date_names),cat_names=_own(cat_names),
                   cont_names=_own(cont_names),trace_id=trace_id)

    def get(self, o):
        "Rows of `df` whose trace id is among `self.items[o]` (as a DataFrame)."
        return self.df[self.df[self.trace_id].isin(listify(self.items[o]))]

    def _get(self, o):
        "Same rows as `get(o)`, returned as the frame's `.values` array."
        return self.get(o).values

    def new(self, items, cls=None,df=None,cat_names=None,cont_names=None,date_names=None,trace_id=None):
        """Create a list for `items`, inheriting every setting that is not overridden.

        When `df` is omitted it is restricted to the rows belonging to `items`.
        Inherited name lists are copied so the new list does not share them
        with this one.
        """
        if cls is None: cls=self.__class__
        if cat_names is None: cat_names=list(self.cat_names)
        if cont_names is None: cont_names=list(self.cont_names)
        if date_names is None: date_names=list(self.date_names)
        if trace_id is None: trace_id=self.trace_id
        if df is None:
            df=self.df[self.df[self.trace_id].isin(items)]
        return cls(items,df,cat_names,cont_names,date_names,trace_id)

    def __getitem__(self, idx):
        # A list of positions yields one array per position; anything else one array.
        if isinstance(idx,list): return [self._get(o) for o in idx]
        return self._get(idx)


In [None]:
# Keep only the events whose trace id is in the train split.
data=df[df['trace_id'].isin(train)]

In [None]:
# encode data and create vocab
# Column groups handed to TraceList.from_df as cat_names / date_names / cont_names.
cat_columns=['event_id','org:resource','lifecycle:transition','concept:name',]
date_columns=['time:timestamp','REG_DATE']
con_columns=['AMOUNT_REQ']

In [None]:
# Wrap the training events in a TraceList (one item per distinct trace id).
il=TraceList.from_df(data,cat_names=cat_columns,cont_names=con_columns,date_names=date_columns)
il

In [None]:
il.get(slice(3,5))

In [None]:
len(il[[1,2]])

## Split in Train Set and Validation Set

In [None]:
import random

In [None]:
#export

def random_splitter(fn, p_valid):
    """Return True with probability `p_valid`; the item `fn` itself is ignored."""
    draw = random.random()
    return draw < p_valid
def split_by_func(items, f):
    """Partition `items` by the result of `f` applied to each item.

    Returns `(falses, trues)`: the items for which `f` returned a value equal
    to False, and those for which it returned a value equal to True. Items
    mapped to anything else (e.g. `None`) appear in neither list.
    """
    flags = [f(o) for o in items]
    falses, trues = [], []
    for item, flag in zip(items, flags):
        if flag==False: falses.append(item)
        if flag==True: trues.append(item)
    return falses, trues

class SplitData():
    """Pair of item lists: `train` and `valid`."""
    def __init__(self, train, valid):
        self.train = train
        self.valid = valid

    @classmethod
    def split_by_func(cls, il, f):
        """Split `il.items` with the module-level `split_by_func` and wrap both parts via `il.new`.

        Items for which `f` is False go to `train`, those for which it is True to `valid`.
        """
        train_items, valid_items = split_by_func(il.items, f)
        return cls(il.new(train_items), il.new(valid_items))

    def __repr__(self):
        name = self.__class__.__name__
        return f'{name}\nTrain: {self.train}\nValid: {self.valid}\n'

In [None]:
# Each trace goes to the validation list with probability 0.1, otherwise to train.
sd = SplitData.split_by_func(il, partial(random_splitter, p_valid=0.1));sd

In [None]:
len(sd.train.df),len(sd.valid.df)

## Pre-Process Data

In [None]:




#events_enc,traceAttributes_enc,cat_columns,num_columns,vocabs=encode_data(log.events,log.traceAttributes,cat_columns,date_columns,num_columns)


In [None]:
def add_datepart(df, fldname, drop=True, time=False,utc=False):
    """Expand the date column `fldname` into calendar feature columns.

    Works on a copy of `df`; the input frame is left untouched.

    Args:
        df: source DataFrame.
        fldname: name of the date column. If it is not already a datetime
            column it is parsed with `pd.to_datetime`.
        drop: drop the original `fldname` column from the result.
        time: additionally create Hour / Minute / Second columns.
        utc: forwarded to `pd.to_datetime` when the column has to be parsed.

    Returns:
        Tuple `(new_df, part_cols, elapsed_col)`: the expanded copy, the names
        of the created calendar columns, and the name of the column holding
        whole seconds since 1970-01-01 (UTC for timezone-aware data).
        New columns are named `<prefix>_<Part>`, where the prefix is `fldname`
        with a trailing 'date'/'Date' removed.
    """
    df=df.copy()
    fld = df[fldname]
    if not pd.api.types.is_datetime64_any_dtype(fld):
        # `infer_datetime_format` is deprecated (format inference is the
        # default since pandas 2.0), so it is no longer passed.
        df[fldname] = fld = pd.to_datetime(fld, utc=utc)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    cols=[]
    for n in attr:
        col_name=targ_pre +"_"+ n
        if n=='Week' and hasattr(fld.dt, 'isocalendar'):
            # `Series.dt.week` was removed in pandas 2.0; the ISO calendar week
            # is its replacement. Cast to the dtype of the other date parts.
            df[col_name] = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
        else:
            df[col_name] = getattr(fld.dt, n.lower())
        cols.append(col_name)
    elapsed_name = targ_pre + '_Elapsed'
    # Seconds since the epoch via timedelta arithmetic: independent of the
    # column's datetime resolution and valid for timezone-aware data.
    epoch = pd.Timestamp(0, tz=fld.dt.tz)
    df[elapsed_name] = (fld - epoch) // pd.Timedelta(seconds=1)
    if drop: df.drop(fldname, axis=1, inplace=True)
    return df,cols,elapsed_name

In [None]:
def normalize_cont_column(x, mean, std,eps=1e-7):
    """Standardise `x`: subtract `mean` and divide by `std`.

    `eps` is added to `std` so a zero standard deviation does not divide by zero.
    """
    centered = x-mean
    scale = eps + std
    return centered/scale

In [None]:
UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split()
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]

from collections import OrderedDict

def uniqueify(x, sort=False):
    """Return the distinct elements of `x` as a list, in first-seen order.

    With `sort=True` the result is sorted instead.
    """
    seen = OrderedDict.fromkeys(x)  # keeps insertion order, drops repeats
    distinct = list(seen)
    return sorted(distinct) if sort else distinct

class Processor():
    """Base class for processors; the default `process` is a no-op."""
    def process(self, items):
        "Return `items` unchanged."
        return items

class CategoryProcessor(Processor):
    """Map category values to integer ids and back.

    The vocabulary is built from the items of the first call and reused for
    every later call. If `default_token` is given, those tokens are placed at
    the front of the vocabulary in the given order. An item that is not in
    the vocabulary raises `KeyError`.
    """
    def __init__(self,default_token=None):
        self.default_token=default_token
        self.vocab=None

    def __call__(self, items):
        "Encode `items` as a list of ids, building the vocabulary on first use."
        if self.vocab is None: self._build_vocab(items)
        return [self.proc1(o) for o in items]

    def _build_vocab(self, items):
        "Create `vocab` (id -> value) and `otoi` (value -> id) from `items`."
        vocab = self.vocab = uniqueify(items)
        if self.default_token is not None:
            # Walk backwards so the tokens end up at the front in their given order.
            for tok in reversed(self.default_token):
                if tok in vocab: vocab.remove(tok)
                vocab.insert(0, tok)
        self.otoi = {value:idx for idx,value in enumerate(vocab)}

    def proc1(self, item):
        "Id of a single value."
        return self.otoi[item]

    def deprocess(self, idxs):
        "Decode a sequence of ids back to values; requires a built vocabulary."
        assert self.vocab is not None
        return [self.deproc1(idx) for idx in idxs]

    def deproc1(self, idx):
        "Value of a single id."
        return self.vocab[idx]



In [None]:
class TraceProcessor(Processor):
    """Encode the event frame of a TraceList for model input.

    Date columns are expanded with `add_datepart`, categorical columns are
    turned into ids by one `CategoryProcessor` per column, and continuous
    columns are standardised with `normalize_cont_column`.

    The per-column state lives in `self.vocabs` (column name ->
    `CategoryProcessor`, or `(mean, std)` for continuous columns). A column is
    fitted the first time it is seen and reused afterwards, so calling the
    processor on the training list first makes later lists use the training
    statistics.
    """
    def __init__(self,vocabs=None):
        # A fresh dict per instance; a shared default dict would leak state
        # between processors.
        self.vocabs={} if vocabs is None else vocabs

    def __call__(self,tl):
        """Return a new list (via `tl.new`) holding the processed frame.

        `tl` itself, its frame and its name lists are not modified.
        """
        df=tl.df.copy()
        # Work on copies: the date expansion below appends to these lists.
        cat_names,cont_names=list(tl.cat_names),list(tl.cont_names)
        for d in tl.date_names:
            df,cat,cont = add_datepart(df,d,utc=True)
            cat_names+=listify(cat)
            cont_names+=listify(cont)
        for c in cat_names:
            if c not in self.vocabs:
                self.vocabs[c] = CategoryProcessor(default_spec_tok)
            df[c]=self.vocabs[c](df[c])

        for c in cont_names:
            # Cast on every call, not only when fitting, so already-fitted
            # columns are numeric before they are normalised.
            df[c]=df[c].astype(float)
            if c not in self.vocabs:
                self.vocabs[c]=df[c].mean(),df[c].std()
            df[c]=normalize_cont_column(df[c], *self.vocabs[c])

        return tl.new(tl.items,df=df,cat_names=cat_names,cont_names=cont_names)

    def deprocess(self,items,columns):
        # Not implemented yet: decoding currently returns None.
        pass

In [None]:
sd.train.cat_names

In [None]:
tp=TraceProcessor()

In [None]:
# Run the same processor over both splits: entries missing from `tp.vocabs`
# are created during a call and reused by later calls.
train_processed=tp(sd.train)
valid_processed=tp(sd.valid)

In [None]:
valid_processed.get(2)

In [None]:
len(valid_processed),len(train_processed)

# Testing

In [None]:
test_data=df[df['trace_id'].isin(test)]

In [None]:
test_tl=TraceList.from_df(test_data,cat_names=cat_columns,cont_names=con_columns,date_names=date_columns)


In [None]:
test_processed=tp(test_tl)

In [None]:
test_processed.df

In [None]:
# Collapse the processed test events into one row per trace (list-valued cells).
test_traces=create_traces(test_processed.df)


In [None]:
def process_data_for_suffix_prediction(test,cols=None,startIndex=1):
    """Build (prefix, suffix) pairs from every trace for suffix prediction.

    Args:
        test: frame whose cells are sequences (one row per trace).
        cols: column name or list of column names to use; all columns of
            `test` when None.
        startIndex: first split position; a trace of length n produces the
            splits `startIndex .. n-1`.

    Returns:
        Tuple `(x, y)` of dicts keyed by column name. `x[col][k]` is a prefix
        `trace[:i]` and `y[col][k]` the matching suffix `trace[i:]`.
    """
    x,y={},{}
    # Identity check: `== None` is evaluated elementwise for array-like `cols`.
    if cols is None: cols=list(test)
    for col in listify(cols):
        x[col],y[col]=[],[]
        for trace in test[col]:
            for i in range(startIndex,len(listify(trace))):
                x[col].append(trace[:i])
                y[col].append(trace[i:])
    return x,y


In [None]:
# Build prefix/suffix pairs for every column of the test traces.
x,y=process_data_for_suffix_prediction(test_traces)

In [None]:
def process_data_for_next_step_prediction(test,col=None,startIndex=1):
    """Build (prefix, next element) pairs from every trace for next-step prediction.

    Args:
        test: frame with one row per trace.
        col: column (or columns) to take the traces from; the whole frame
            (`test.values`) when None.
        startIndex: first position used as prediction target.

    Returns:
        Tuple `(x, y)` of lists: `x[k]` is the flattened prefix `trace[:i]` and
        `y[k]` the flattened element `trace[i]`, both passed through
        `flatten_ir_list(listify(...))`.
    """
    x,y=[],[]
    # Identity check: `!= None` is evaluated elementwise for array-like `col`.
    traces=test.values if col is None else test[col].values
    for trace in traces:
        for i in range(startIndex,len(trace)):
            x.append(flatten_ir_list(listify(trace[:i])))
            y.append(flatten_ir_list(listify(trace[i])))
    return x,y


In [None]:
def suffix_measure(preds,ys,col='concept:name'):
    """Mean normalised edit-distance similarity between predicted and true suffixes.

    For each pair the similarity is `1 - distance / max(len(pred), len(true))`,
    which always lies in [0, 1]. Normalising by the prediction length alone
    could go negative and divided by zero for an empty prediction. Two empty
    sequences count as identical (similarity 1).

    Args:
        preds: dict mapping column name to a list of predicted suffixes.
        ys: dict mapping column name to the list of true suffixes.
        col: column to evaluate.

    Returns:
        The summed similarity divided by `len(preds[col])`.

    Raises:
        ZeroDivisionError: if `preds[col]` is empty.
    """
    total=0.0
    for p,y in zip(preds[col],ys[col]):
        longest=max(len(p),len(y))
        if longest==0:
            total+=1.0
            continue
        total+=1-ed.eval(p,y)/longest
    return total/len(preds[col])


In [None]:
def next_step_measure(preds,ys):
    """Plain accuracy: the fraction of positions where `preds` equals `ys`.

    TODO: check the paper on whether the measure has to be weighted.
    """
    matches = np.array(preds)==np.array(ys)
    return matches.astype(float).mean()
