In [None]:
import pm4py
import pandas as pd
import datetime
from math import ceil
import matplotlib.pyplot as plt
import numpy as np
from importlib import reload
import random
import seaborn as sns
from IPython import display

# What is it about
## Data
We have process event-log data - `bpi12`. It is a sequence of (`time_stamp`, `activity`, ...) records
plus other attributes that the community usually does not use. Basically, here it is `BPI_Challenge_2012.xes`.

In [None]:
# Read the raw XES event log and flatten it into a pandas DataFrame.
file_path = 'datasets/BPI_Challenge_2012.xes'
event_log = pm4py.read_xes(file_path)
# For this first look keep only the rows that have no missing values.
df = pm4py.convert_to_dataframe(event_log).dropna()
df

We work only with activity and time

In [None]:
# Rebuild the frame from the raw log and keep only what the analysis uses:
# completed events with their timestamp, trace id and activity name.
df = (
    pm4py.convert_to_dataframe(event_log)
    .loc[
        lambda frame: frame['lifecycle:transition'] == 'COMPLETE',
        ['time:timestamp', 'case:concept:name', 'concept:name'],
    ]
    .rename(columns={'time:timestamp': 'timestamp',
                     'case:concept:name': 'trace_id',
                     'concept:name': 'activity'})
    # Trace ids arrive as text; turn them into integers.
    .assign(trace_id=lambda frame: frame['trace_id'].apply(int))
    # The previous row labels are kept as an 'index' column.
    .reset_index()
)
df.to_csv('datasets/bpi_12.csv', index=False)
df

Some definitions must be mentioned:
- event ($e_i$) - {`activity` ($a$), `time_stamp` ($t$)}
- trace (t) - $[e_1,\ e_2, \dots]$

## Purpose
Main objective - predict next `time_stamp` and `activity`. 

For activity, the metric used by the community is accuracy, and in the latest works `f1`. For `time_stamp` it is usually MAE, sometimes weighted MAE.

Nevertheless, some other objectives are also explored - whole trace prediction. The metric for activity prediction there is Damerau-Levenshtein similarity.

# Visualization
## Activity transition
First, let's take a look at all activity transitions and try to estimate $P(a_{i+1} | a_i)$. It is a square matrix, which is worth presenting as `sns.heatmap`.

In [None]:
# Sort the labels so the heatmap axes come out in the same order on every run:
# the iteration order of a set of strings changes between interpreter sessions.
# Assumes all activity labels are mutually comparable (e.g. all strings).
activities = sorted(set(df['activity'].values))
n_act = len(activities)
traces = list(set(df['trace_id'].values))

# Pair every event with the event on the row directly above it.
# NOTE: shift(1) pulls values from the *previous* row, so despite their names
# `activity_next` / `trace_id_next` hold the preceding event's activity and
# trace id, while the row's own `activity` is the one that follows it.
_df = df.copy()
_df['activity_next'] = _df['activity'].shift(1)
_df['trace_id_next'] = _df['trace_id'].shift(1)
# Drop rows with missing values (this includes the first row, which has
# nothing above it) and every pair that straddles two different traces.
_df = _df.dropna()
_df = _df[_df['trace_id_next'] == _df['trace_id']]

In [None]:
# Spot-check the paired rows of one trace.
_df.loc[_df['trace_id'] == 214373]

In [None]:

# transitions_counts[r, c] = number of paired rows whose `activity_next` is
# activities[r] and whose `activity` is activities[c].
transitions_counts = np.zeros((n_act, n_act))
# Number of paired rows per `activity_next` value, in the order of `activities`.
nums_of_ac_next = []
for row_idx, row_activity in enumerate(activities):
    rows_for_activity = _df[_df['activity_next'] == row_activity]
    nums_of_ac_next.append(len(rows_for_activity))
    # Tally the `activity` column once, then read each count off the tally.
    counts_by_activity = rows_for_activity['activity'].value_counts()
    for col_idx, col_activity in enumerate(activities):
        transitions_counts[row_idx, col_idx] += counts_by_activity.get(col_activity, 0)

In [None]:
# Blank out transitions that never occur so they render as empty cells.
transitions_counts[transitions_counts == 0] = np.nan
sns.set(rc={'figure.facecolor':'white'})
sns.heatmap(transitions_counts, xticklabels=list(activities), yticklabels=list(activities))
plt.title('Total number of transition  $a_t -> a_{t+1}$')

# Rows of the matrix are keyed by `activity_next`, which is filled with
# shift(1) and therefore holds the earlier event of each pair; columns are
# keyed by the later one. So y is a_i and x is a_{i+1}.
plt.xlabel('$a_{i+1}$')
plt.ylabel('$a_i$')
plt.show()

In [None]:
# NaN cells mark activity pairs that were never observed.
_n_missing = np.count_nonzero(np.isnan(transitions_counts))
print(f'There are {_n_missing} not existing transitions')

In [None]:
# Treat transitions observed fewer than 10 times as missing as well.
_is_rare = transitions_counts < 10
transitions_counts[_is_rare] = np.nan

In [None]:
# Same heatmap as before, now with the rare transitions blanked out.
sns.set(rc={'figure.facecolor':'white'})
sns.heatmap(transitions_counts, xticklabels=list(activities), yticklabels=list(activities))
plt.title('Total number of transition  $a_t -> a_{t+1}$')

# Rows of the matrix are keyed by `activity_next`, which is filled with
# shift(1) and therefore holds the earlier event of each pair; columns are
# keyed by the later one. So y is a_i and x is a_{i+1}.
plt.xlabel('$a_{i+1}$')
plt.ylabel('$a_i$')
plt.show()

In [None]:
# Count the NaN cells again now that rare transitions were blanked as well.
_n_missing_or_rare = np.count_nonzero(np.isnan(transitions_counts))
print(f'There are {_n_missing_or_rare} not existing transitions')

We can't say there are many rare transitions

In [None]:
sns.set(rc={'figure.facecolor':'white'})
# Divide every row by the number of pairs recorded for that row's activity,
# turning the counts into per-row relative frequencies.
sns.heatmap(transitions_counts / np.array(nums_of_ac_next).reshape((-1, 1)),
            xticklabels=list(activities), yticklabels=list(activities))
plt.title('Probability of transition  $a_t -> a_{t+1}$')
# Rows are keyed by `activity_next`, which is filled with shift(1) and
# therefore holds the earlier event of each pair; columns are keyed by the
# later one. So y is a_i and x is a_{i+1}.
plt.xlabel('$a_{i+1}$')
plt.ylabel('$a_i$')
plt.show()

Can't say this dataset has uniform distribution.

## Time features
For leveraging predictive models the following features are created:
- $t_e$ - time since previous event
- $t_w$ - time since the beginning of week
- $t_t$ - time since the beginning of trace

let's visualize all these features

In [None]:
# Look at the event table that is about to be fed to the preprocessor.
df

In [None]:
# Project-local helper module; DfPreprocesser is not defined in this notebook.
import rl4pm_lib.preprocessing as preprocessing


# Fit the preprocessor on the event table, then transform that same table.
# NOTE(review): presumably this is what adds the te / tw / tt time features
# listed above (later cells read a 'te' column) — confirm in rl4pm_lib.
prepro = preprocessing.DfPreprocesser()
prepro.fit(df)
df_preprocessed = prepro.transform(df)
# Saved to the current working directory (bpi_12.csv went to datasets/).
df_preprocessed.to_csv('bpi_12_preprocessed.csv', index=False)

In [None]:
# Inspect the output of the preprocessor.
df_preprocessed

In [None]:
# Histogram of the `te` feature over all events.
_fig, _ax = plt.subplots()
_ax.hist(df_preprocessed['te'].values, bins=50)
_ax.set_title('$t_e$ distribution')
_ax.set_xlabel('$t_e$, s')
_ax.set_ylabel('count')
plt.show()

In [None]:
# Zoom in on the tail: only gaps longer than 24 hours (threshold in seconds).
low_bond = 24 * 60 * 60
_te_tail = df_preprocessed.loc[df_preprocessed['te'] > low_bond, 'te'].values
_fig, _ax = plt.subplots()
_ax.hist(_te_tail, bins=50)
_ax.set_title(f'$t_e >$ {low_bond / 60 / 60: .1f} h distribution')
_ax.set_xlabel('$t_e$, s')
_ax.set_ylabel('count')
plt.show()

This distribution is far from uniform too

In [None]:
# Report upper quantiles of `te`, converted from seconds to hours.
_te_values = df_preprocessed["te"].values
for quant in [0.9, 0.95, 0.97, 0.99]:
    _te_hours = np.quantile(_te_values, quant) / (60 * 60)
    print(f'{quant * 100} % of $te$ are less then {_te_hours: .2f} h')

## What is wrong with this visualization
- transition matrix has a lot of `nan`
- We want to see which transitions lead to high $t_e$
### SberProcessMining
Let's use [this](https://github.com/SberProcessMining/Sber_Process_Mining) repo to get a nice visualization

In [None]:
! pip install sberpm

In [None]:
from sberpm import DataHolder

In [None]:
# Wrap the event table in sberpm's DataHolder, naming the columns that carry
# the trace id, the activity and the event timestamp.
data_holder = DataHolder(data=df, 
                         id_column='trace_id', 
                         activity_column='activity', 
                         start_timestamp_column='timestamp')

Strange warnings. The data is read correctly, and even pandas can parse these datetimes without an explicit format, so it is unclear why one is needed here.

In [None]:
# Peek at the first rows of the table held by the DataHolder.
data_holder.data.head(3)

In [None]:
from sberpm.autoinsights import AutoInsights
from sberpm.miners import SimpleMiner
from sberpm.visual import GraphvizPainter

the longest-lasting transitions are colored red

In [None]:
! pip install graphviz

In [None]:
# Build a process graph with SimpleMiner and annotate it with AutoInsights in
# 'time' mode, then draw it with GraphvizPainter.
# NOTE(review): what each mode computes is sberpm behaviour not visible in
# this notebook — confirm against the sberpm documentation.
auto_i = AutoInsights(data_holder, time_unit='day')
simple_miner = SimpleMiner(data_holder)
simple_miner.apply()
auto_i.apply(miner=simple_miner, mode='time')
graph = auto_i.get_graph()

# Render the annotated graph inline.
painter = GraphvizPainter()
painter.apply_insights(graph)
painter.show()

all transitions are given a status:
- Optimal (positive insights)
- (negative insights)

In [None]:
# Same pipeline as the previous insights cell, but with mode='overall'.
# The miner and the insights object are re-created from scratch here.
# NOTE(review): what 'overall' mode computes is sberpm behaviour not visible
# in this notebook — confirm against the sberpm documentation.
auto_i = AutoInsights(data_holder, time_unit='day')
simple_miner = SimpleMiner(data_holder)
simple_miner.apply()
auto_i.apply(miner=simple_miner, mode='overall')
graph = auto_i.get_graph()

# Render the annotated graph inline.
painter = GraphvizPainter()
painter.apply_insights(graph)
painter.show()

## Max $t_e$ & trace len

In [None]:
# Re-display the preprocessed table used for the per-trace statistics below.
df_preprocessed

In [None]:
# Per-trace statistics computed in a single grouped pass.
# The earlier version looped over every *row* of the frame and re-filtered the
# whole frame each time, which is quadratic in the number of events.
# sort=False keeps the traces in order of first appearance, i.e. the same key
# order the row-by-row loop produced.
# Assumes `trace_id` has no missing values (groupby skips NaN keys).
_by_trace = df_preprocessed.groupby('trace_id', sort=False)
# trace id -> number of events in the trace
traces_len = _by_trace.size().to_dict()
# trace id -> largest `te` value within the trace
traces_te_max = _by_trace['te'].max().to_dict()

In [None]:
# One point per trace: largest gap between events (seconds -> days) against
# the number of events in the trace.
plt.scatter(np.array(list(traces_te_max.values())) / 3600 / 24, list(traces_len.values()), s=0.1)
plt.xlabel('max $t_e$, days')
# The y value is each trace's own length, not a maximum.
plt.ylabel('trace length')
plt.title('Max te vs trace length')
plt.show()

Можно убрать все `trace`, в которых есть $t_e$ длиннее `33` дней и которые насчитывают более `100` событий

Ожидаются следующие улучшения:
- улучшится масштабирование времени и улучшится качество
- меньше ресурсов в нейросетевых подходах

In [None]:
# Thresholds for dropping outlier traces.
# 33 days written out as 33 * 24 h * 60 min * 60 s, i.e. in seconds.
MAX_TE = 33 * 24 * 60 * 60
# Upper bound on trace length (number of rows per trace_id).
MAX_TRACE_LEN = 100

# Loops
На рисунке были видны циклы.
Это плохо. Если мы используем для обучения окна длиной 2, то возникает проблема:

$A \rightarrow B \rightarrow A \rightarrow B \rightarrow A \dots A \rightarrow B \rightarrow C$

сформируют признаки

$$[A,\ B],\ [A]$$
$$[B,\ A],\ [B]$$
$$[A,\ B],\ [A]$$
$$\dots$$

и не получится обучить что после $[A,\ B]$ идёт $[C]$

## Какие циклы?
Чтобы сильно не париться, будем искать только короткие циклы: $A \rightarrow B \rightarrow A$ и $A \rightarrow A$. Отметим `trace_id` с циклами

In [None]:
def is_there_cycle(x) -> bool:
    """Tell whether a sequence contains a short loop.

    A short loop is either an immediate repeat (``A -> A``) or a return after
    one step (``A -> B -> A``).

    Args:
        x: Indexable, sized sequence of activities (list, tuple, 1-D array).

    Returns:
        True if some element equals the element one or two positions after
        it, otherwise False. Sequences shorter than two elements never
        contain a loop.
    """
    n = len(x)
    for i in range(n - 1):
        # Immediate repeat. The scan runs up to the last adjacent pair, so a
        # repeat at the very end of the sequence is detected as well.
        if x[i] == x[i + 1]:
            return True
        # Return after one step; only checked while position i + 2 exists.
        if i + 2 < n and x[i] == x[i + 2]:
            return True
    return False

In [None]:
def number_of_cycles(x) -> int:
    """Count the positions at which a short loop starts.

    A position ``i`` is counted once if ``x[i]`` equals the next element
    (``A -> A``) or the element two steps ahead (``A -> B -> A``).

    Args:
        x: Indexable, sized sequence of activities (list, tuple, 1-D array).

    Returns:
        Number of such positions; 0 for sequences shorter than two elements.
    """
    n = len(x)
    count = 0
    # Scan up to the last adjacent pair so a repeat at the very end of the
    # sequence is counted too; the two-step check needs position i + 2.
    for i in range(n - 1):
        if x[i] == x[i + 1] or (i + 2 < n and x[i] == x[i + 2]):
            count += 1
    return count

In [None]:
# Sanity checks for the loop detector.
assert is_there_cycle([1, 2, 3, 2])            # 2 -> 3 -> 2
assert not is_there_cycle([1, 2, 3])
assert is_there_cycle([1, 2, 3, 3, 5, 6])      # 3 -> 3
assert not is_there_cycle([1, 2, 3, 4, 5, 6])

In [None]:
# Collect the activity sequence of every trace in one grouped pass over `df`.
# The earlier version re-filtered `df` once per *row* of df_preprocessed,
# which is quadratic in the number of events.
_activities_by_trace = {
    t_id: acts.values
    for t_id, acts in df.groupby('trace_id', sort=False)['activity']
}
# trace id -> whether the trace contains a short loop.
# unique() keeps ids in order of first appearance, matching the key order of
# the other per-trace dictionaries; an id missing from `df` yields an empty
# sequence, as the filter did.
traces_is_cycle = {
    t_id: is_there_cycle(_activities_by_trace.get(t_id, ()))
    for t_id in df_preprocessed['trace_id'].unique()
}

In [None]:
# Share of traces flagged as containing a short loop, in percent.
_cycle_flags = np.array(list(traces_is_cycle.values()))
_cycle_share_pct = _cycle_flags.sum() / len(traces_is_cycle) * 100
print(f'{_cycle_share_pct: .2f}% of all traces got cycles')

Хотелось бы как-то это исправить

In [None]:
# Collect the activity sequence of every trace in one grouped pass over `df`.
# The earlier version re-filtered `df` once per *row* of df_preprocessed,
# which is quadratic in the number of events.
_activities_by_trace = {
    t_id: acts.values
    for t_id, acts in df.groupby('trace_id', sort=False)['activity']
}
# trace id -> number of short loops found in the trace.
# unique() keeps ids in order of first appearance, matching the key order of
# the other per-trace dictionaries; an id missing from `df` yields an empty
# sequence, as the filter did.
traces_num_cycles = {
    t_id: number_of_cycles(_activities_by_trace.get(t_id, ()))
    for t_id in df_preprocessed['trace_id'].unique()
}

In [None]:
# One point per trace: largest gap between events (seconds -> days) against
# the number of short loops in the trace.
_max_te_days = np.array(list(traces_te_max.values())) / 3600 / 24
_cycle_counts = list(traces_num_cycles.values())
_fig, _ax = plt.subplots()
_ax.scatter(_max_te_days, _cycle_counts, s=0.1)
_ax.set_xlabel('max $t_e, days$')
_ax.set_ylabel('number of cycles len 2 or 1 ')
_ax.set_title('Max te vs number of cycles')
plt.show()

In [None]:
# One point per trace: number of events against number of short loops.
_trace_lengths = list(traces_len.values())
_cycle_counts = list(traces_num_cycles.values())
_fig, _ax = plt.subplots()
_ax.scatter(_trace_lengths, _cycle_counts, s=0.2)
_ax.set_xlabel('length of trace')
_ax.set_ylabel('number of cycles, len 2 or 1 ')
_ax.set_title('Trace length vs number of cycles')
plt.show()

То есть если есть цикл, то он занимает полпроцесса. Ужас