In [21]:
import pandas as pd
import numpy as np
import datetime

In [2]:
def _load_events(csv_path):
    """Read one event-log CSV and parse its ``timestamp`` column.

    The ``Unnamed: 0`` column is dropped (presumably an index written out by
    an earlier ``to_csv`` -- confirm against the code that produced the file),
    and every ``timestamp`` string is parsed with
    ``datetime.datetime.fromisoformat``.
    """
    events = pd.read_csv(csv_path).drop(columns='Unnamed: 0')
    events['timestamp'] = events['timestamp'].apply(datetime.datetime.fromisoformat)
    return events


# Both splits go through exactly the same loading steps.
test_df = _load_events('datasets/test_df.csv')

train_df = _load_events('datasets/train_df.csv')

## Preprocessing
To leverage predictive models, the following features are created:
- $t_e$ - time since the previous event
- $t_w$ - time since the beginning of the week
- $t_t$ - time since the beginning of the trace
- one-hot encoded labels

In [3]:
import rl4pm_lib.preprocessing as preprocessing

# Maps each time-feature name to an integer index.
# NOTE(review): not referenced in the cells visible here -- confirm where it is consumed.
column_feature = dict(te=0, tt=1, tw=2)

# Fit the preprocesser on the training split only, then transform both splits
# (train first, then test) with the fitted instance.
prepro = preprocessing.DfPreprocesser()
prepro.fit(train_df)
train_df_pr, test_df_pr = prepro.transform(train_df), prepro.transform(test_df)

Let's construct the features using a moving window.

In [4]:
# win_len: number of consecutive events per moving window (passed to
#          make_window_features below).
# max_window_len: NOTE(review): not used in the cells visible here --
#          presumably an upper bound on the window size; confirm.
win_len, max_window_len = 2, 5

In [5]:
# Columns that hold time features / the trace identifier; every other column
# is treated as a label indicator (one-hot, per the preprocessing notes above).
_non_label_cols = ['te', 'tt', 'tw', 'trace_id']

# Label of an event = position of the largest value among the remaining columns.
test_labels = np.argmax(test_df_pr.drop(columns=_non_label_cols).values, axis=1)
train_labels = np.argmax(train_df_pr.drop(columns=_non_label_cols).values, axis=1)

In [24]:
def make_window_features(df, win_len):
    """Build moving-window features and next-event labels for every trace.

    Rows sharing a ``trace_id`` form one trace. Each trace is windowed on its
    own with ``make_window_features_for_trace``; the last window of a trace is
    discarded because no event follows it, so there is nothing to predict.
    The label of a window is the argmax over the non-feature columns of the
    event that comes right after the window.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``te``, ``tt``, ``tw`` and ``trace_id``; all
        remaining columns are treated as label indicators.
    win_len : int
        Number of consecutive events in one window.

    Returns
    -------
    tuple of (pandas.DataFrame, numpy.ndarray)
        The window features of all traces stacked row-wise, and the matching
        labels. Traces appear in order of first occurrence in ``df``. The
        per-trace row index is kept, so the resulting index is not unique.
        If ``df`` contains no traces, an empty frame (with the windowed
        columns) and an empty label array are returned.
    """
    feature_parts = []
    label_parts = []
    # groupby(sort=False) visits traces in order of first appearance. This is
    # deterministic (iterating a set is not, e.g. for string ids) and avoids
    # re-scanning the whole frame once per trace.
    for _, trace_df in df.groupby('trace_id', sort=False):
        # Drop the last window: one event must be left over to predict.
        feature_parts.append(make_window_features_for_trace(trace_df, win_len)[:-1])
        trace_labels = trace_df.drop(columns=['te', 'tt', 'tw', 'trace_id']).values.argmax(axis=1)
        # The first `win_len` events only ever serve as inputs, never as targets.
        label_parts.append(trace_labels[win_len:])

    if not feature_parts:
        # pd.concat / np.concatenate raise on an empty list; return empty
        # results that still carry the windowed column layout.
        empty_features = make_window_features_for_trace(df.iloc[:0], win_len)
        return empty_features, np.empty(0, dtype=np.intp)

    return pd.concat(feature_parts, axis=0), np.concatenate(label_parts)

In [25]:
def make_window_features_for_trace(df, win_len):
    """Turn the events of a single trace into moving-window feature rows.

    Output row ``k`` describes the window of events ``k .. k + win_len - 1``
    (positions within ``df``). The newest event of the window keeps the
    original column names; the older events are appended column-wise with the
    suffixes ``__1`` (oldest event in the window) up to ``__{win_len - 1}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Events of one trace, in order. Not modified.
    win_len : int
        Number of consecutive events per window; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        One row per complete window, indexed ``0..n-1``. A trace with fewer
        than ``win_len`` events yields an empty frame with the same columns
        and the original dtypes. Rows containing missing values are dropped.

    Raises
    ------
    ValueError
        If ``win_len`` is smaller than 1.
    """
    if win_len < 1:
        raise ValueError(f'win_len must be >= 1, got {win_len}')

    n_lagged = win_len - 1
    # Clamp at zero: a negative count would turn into a negative slice stop
    # below and select misaligned rows for traces shorter than the window.
    n_windows = max(df.shape[0] - n_lagged, 0)

    # Newest event of each window, under the original column names.
    parts = [df.iloc[n_lagged:].reset_index(drop=True)]
    for offset in range(n_lagged):
        lagged = df.iloc[offset:offset + n_windows].reset_index(drop=True)
        parts.append(lagged.add_suffix(f'__{offset + 1}'))

    out = pd.concat(parts, axis=1)
    # NOTE(review): this also removes windows whose source data contain NaN;
    # make_window_features slices its labels by position and would then be
    # out of step with the returned rows -- confirm the inputs are NaN-free.
    return out.dropna()

In [27]:
# Rows in the preprocessed test split minus the number of distinct trace ids.
n_test_events = test_df_pr.shape[0]
n_test_traces = len(set(test_df_pr['trace_id'].values))
n_test_events - n_test_traces

58110

In [28]:
# Windowed features and aligned next-event labels for the test split;
# this rebinds `test_labels`.
test_df_pr_win, test_labels = make_window_features(df=test_df_pr, win_len=win_len)

In [33]:
# Windowed features and aligned next-event labels for the training split;
# this rebinds `train_labels`.
train_df_pr_win, train_labels = make_window_features(df=train_df_pr, win_len=win_len)

Also, the labels cannot be obtained in a single line.