In [1]:
import datetime
from math import ceil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pm4py

# Data
## Download and read
- Download the `.xes` file (archive) from [here](https://data.4tu.nl/articles/dataset/BPI_Challenge_2012/12689204)
- Read this `.xes`
- Convert to good old `.csv`

In [2]:
# Location of the BPI Challenge 2012 event log, relative to the working directory.
file_path = 'BPI_Challenge_2012.xes'
# Parse the XES file with pm4py (the progress widget below comes from this call).
event_log = pm4py.read_xes(file_path)
# Start / end activities of the log as reported by pm4py.
start_activities = pm4py.get_start_activities(event_log)
end_activities = pm4py.get_end_activities(event_log)

HBox(children=(HTML(value='parsing log, completed traces :: '), FloatProgress(value=0.0, max=13087.0), HTML(va…




In [3]:
# Flatten the pm4py event log into a pandas DataFrame.
df = pm4py.convert_to_dataframe(event_log)
# Keep a CSV copy on disk; with pandas' defaults the index is written as the first column.
df.to_csv('bpi_12.csv')

## Drop data
In the article, only (activity, timestamp) is used. We also keep the trace id.

In [4]:
# Keep only the timestamp, trace id and activity columns, give them short
# names, and cast the trace id to an integer.
# NOTE: this cell consumes the original pm4py column names, so it must run
# exactly once after the conversion cell (a second run raises KeyError).
df = (
    df[['time:timestamp', 'case:concept:name', 'concept:name']]
    .rename(columns={
        'time:timestamp': 'timestamp',
        'case:concept:name': 'trace_id',
        'concept:name': 'activity',
    })
    # Vectorised cast; replaces the per-row `apply(lambda x: int(x))`.
    .astype({'trace_id': 'int64'})
)

## Time-related features

- $t_{w}$ - time passed between Sunday midnight and the event
- $t_e$ - time passed between the completion of the given event and the completion of the previous one
- $t_t$ - time passed between the start of the trace and the given event

### $t_w$

In [5]:
def get_t_w(df):
    _df = df.copy()
    _dt_s_mn = _df['timestamp'].apply(lambda x: (x - x.replace(hour=0, minute=0, second=0, microsecond=0)).total_seconds())
    _dt_s_mn += _df['timestamp'].apply(lambda x: x.weekday() * 24 * 60 * 60)
    return _dt_s_mn.values

In [6]:
# Per-event seconds since the start of the week, one value per row of df.
tw = get_t_w(df)

### $t_e$

In [14]:
def get_t_e(df):
    te = df['timestamp'].copy().diff()
    tr_diff = df['trace_id'].diff().fillna(1)
    te[tr_diff != 0] = 0
    return te.values * 1e-9

In [15]:
# Per-event seconds since the previous event of the same trace (0 at each trace start).
te = get_t_e(df)

### $t_t$

In [18]:
def get_t_t(df):
    traces = list(set(df['trace_id']))
    out = df.copy()[['timestamp', 'trace_id']]
    t_ts = {}
    for t in traces:
        t_ts[t] = df['timestamp'][df['trace_id'] == t].min()
    out['tt'] = out.apply(lambda x: (x['timestamp'] - t_ts[x['trace_id']]).total_seconds(), axis=1)
    return out['tt'].values

In [19]:
# Per-event seconds since the first event of its trace.
tt = get_t_t(df)

In [20]:
# Attach the three time features as columns; each array holds one value per
# row of df, in row order.
df['tt'] = tt
df['te'] = te
df['tw'] = tw

In [21]:
# Show the frame with the new time features (pandas truncates the middle rows).
df

Unnamed: 0,timestamp,trace_id,activity,tt,te,tw
0,2011-10-01 00:38:44.546000+02:00,173688,A_SUBMITTED,0.000,0.000,434324.546
1,2011-10-01 00:38:44.880000+02:00,173688,A_PARTLYSUBMITTED,0.334,0.334,434324.880
2,2011-10-01 00:39:37.906000+02:00,173688,A_PREACCEPTED,53.360,53.026,434377.906
3,2011-10-01 00:39:38.875000+02:00,173688,W_Completeren aanvraag,54.329,0.969,434378.875
4,2011-10-01 11:36:46.437000+02:00,173688,W_Completeren aanvraag,39481.891,39427.562,473806.437
...,...,...,...,...,...,...
262195,2012-02-29 23:51:17.423000+01:00,214376,A_PARTLYSUBMITTED,0.624,0.624,258677.423
262196,2012-02-29 23:52:01.287000+01:00,214376,W_Afhandelen leads,44.488,43.864,258721.287
262197,2012-03-01 09:26:46.736000+01:00,214376,W_Afhandelen leads,34529.937,34485.449,293206.736
262198,2012-03-01 09:27:37.118000+01:00,214376,A_DECLINED,34580.319,50.382,293257.118


## Scaling
later

## Activity:
one hot

In [22]:
# One indicator column per distinct value of 'activity'.
oh = pd.get_dummies(df['activity'])

In [23]:
# Append the indicator columns to the right of the existing columns.
# NOTE: df is overwritten, so re-running this cell appends the columns again.
df = pd.concat([df, oh], axis=1)

## Environment
The given scheme is the following:
- receive a window of events, where $(a_i,\ t_{e,\ i},\ t_{w,\ i},\ t_{t,\ i}) = e_i$, so the input to the model is $\{ e_{i},\ e_{i-1},\ \dots,\ e_{i-ws} \}$
- produce $\hat{e}_{i+1}$
- predict $\hat{e}_{i+2}$ using $\{ \hat{e}_{i+1},\ e_{i},\ \dots,\ e_{i-ws +1} \}$

The metric is calculated by the `environment`: it returns rewards for the time prediction and for the next-step classification. So basically `env` just stores the data of a trace.

In [None]:
class PMEnv:
    """Environment that replays one trace for next-event prediction.

    The state is a numeric window of the most recent events. Each event is the
    vector ``[tw, tt, te, <activity indicator columns>]``; this is the layout
    ``step`` uses when it assembles a predicted event.

    Parameters
    ----------
    trace_df : pandas.DataFrame
        Events of a single trace in chronological order. Must contain the
        columns 'activity', 'tw', 'tt' and 'te'. Every column other than
        those and 'timestamp' / 'trace_id' is treated as an activity indicator
        (as produced by ``pd.get_dummies`` on 'activity').
    dt_bins : sequence of numbers, optional
        Bin edges for elapsed-time bucketing. Defaults to a fresh copy of the
        built-in edges. The unit is not established by this code
        (presumably minutes - TODO confirm).
    window_size : int, default 1
        Number of events in the state window.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1 or larger than the trace.
    """

    # Length of the weekly cycle used for the tw feature, in seconds.
    _WEEK_SECONDS = 7 * 24 * 60 * 60
    # Kept as a tuple so instances never share one mutable default list.
    _DEFAULT_DT_BINS = (0, 1, 10, 60, 120, 240, 480, 1440, 2880, 4320,
                        7200, 10080, 14400, 20160, 30240, 40320, 50400)
    # Order of the time features inside an event vector (see ``step``).
    _TIME_COLUMNS = ('tw', 'tt', 'te')
    # Columns of the trace frame that are not part of an event vector.
    _META_COLUMNS = ('timestamp', 'trace_id', 'activity')

    def __init__(self, trace_df, dt_bins=None, window_size=1):
        if not 1 <= window_size <= len(trace_df):
            raise ValueError(
                f"window_size must be between 1 and {len(trace_df)}, got {window_size}"
            )
        self.trace_df = trace_df
        self.window_size = window_size
        self.dt_array = trace_df['te'].values
        self.activity_array = trace_df['activity'].values
        # Positional row access: plain trace_df[k] would look up a *column* named k.
        self.prew_ev = trace_df.iloc[window_size - 1]
        self.dt_bins = list(self._DEFAULT_DT_BINS) if dt_bins is None else dt_bins

        reserved = set(self._TIME_COLUMNS) | set(self._META_COLUMNS)
        self.activity_columns = [c for c in trace_df.columns if c not in reserved]
        self.feature_columns = list(self._TIME_COLUMNS) + self.activity_columns
        # Numeric (window_size, n_features) window, so predicted events can be appended.
        self.state = (
            trace_df[self.feature_columns].iloc[:window_size].to_numpy(dtype=float)
        )

        self.i = 0

    def step(self, action: np.ndarray):
        """Build the state that follows from a predicted next event.

        Parameters
        ----------
        action : numpy.ndarray
            1-D vector ``[te, <activity indicator values>]`` for the predicted event.

        Returns
        -------
        tuple
            ``(next_state, reward, done, info)``. ``next_state`` is the current
            window with its oldest event dropped and the predicted event
            appended. The other three values come from ``_evaluate``.

        Raises
        ------
        ValueError
            If ``action`` does not have ``1 + len(self.activity_columns)`` entries.
        NotImplementedError
            Until ``_evaluate`` is provided.
        """
        action = np.asarray(action, dtype=float)
        expected = 1 + len(self.activity_columns)
        if action.ndim != 1 or action.shape[0] != expected:
            raise ValueError(
                f"action must be a 1-D vector of length {expected}, got shape {action.shape}"
            )
        next_event = self._next_event(action)
        # The window is 2-D, so the new event is appended as a single row.
        next_state = np.concatenate([self.state[1:], next_event[np.newaxis, :]], axis=0)
        # let's calculate rewards
        reward, done, info = self._evaluate(next_event)
        return next_state, reward, done, info

    def _next_event(self, action):
        """Turn ``[te, activity...]`` into a full ``[tw, tt, te, activity...]`` vector."""
        # next activity is taken just from action
        next_act = action[1:]
        # construct next time features from the most recent event in the window
        next_te = action[0]
        prev_tw, prev_tt = self.state[-1, 0], self.state[-1, 1]
        next_tt = prev_tt + next_te
        # tw advances from the previous tw (not from tt) and wraps at the week
        # boundary; rounded up to whole seconds with ceil.
        next_tw = ceil(prev_tw + next_te) % self._WEEK_SECONDS
        return np.concatenate([[next_tw, next_tt, next_te], next_act])

    def _evaluate(self, next_event):
        """Return ``(reward, done, info)`` for a predicted event.

        The reward definition is not specified anywhere in this class, so it is
        left to be implemented rather than guessed.
        """
        raise NotImplementedError(
            "PMEnv reward computation is not defined yet; implement _evaluate()"
        )

In [None]:
import numpy as np