In [1]:
import pandas as pd
import numpy as np
import datetime
from IPython import display
from dateutil.parser import parse

from rl4pm_lib.utils_supervised import make_window_features

import seaborn as sns

from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import xgboost as xgb

import matplotlib.pyplot as plt

In [2]:
test_df = pd.read_csv('datasets/test_df_nr.csv')
test_df['timestamp'] = test_df['timestamp'].apply(lambda x: parse(x))
test_df['trace_id'] = test_df['trace_id'].apply(lambda x: int(x))

train_df = pd.read_csv('datasets/train_df_nr.csv')
train_df['timestamp'] = train_df['timestamp'].apply(lambda x: parse(x))
train_df['trace+id'] = train_df['trace_id'].apply(lambda x: int(x))

## Proprocessing:
### 1. Make features
For leveraging predictive models the following features are created:
- $t_e$ - time since previous event
- $t_w$ - time since the beginning of week
- $t_t$ - time since the beginning of trace
- one hot encoded labels

### 2. Make window with lags

### 3. Scale it

In [3]:
import rl4pm_lib.preprocessing as preprocessing
# make features
column_feature = {'tt': 0, 'te': 1, 'tw': 2}
prepro = preprocessing.DfPreprocesser()
prepro.fit(train_df)
train_df_pr = prepro.transform(train_df)
test_df_pr = prepro.transform(test_df)

In [4]:
test_df_pr.columns

Index(['tt', 'te', 'tw', 'trace_id', 'timestamp', 1, 2, 3, 4, 5, 6], dtype='object')

In [5]:
win_len = 2

test_df_pr = test_df_pr.rename(columns={i+1: str(i+1) for i in range(6)})
train_df_pr = train_df_pr.rename(columns={i+1: str(i+1) for i in range(6)})

test_df_pr_win, test_labels, test_tes = make_window_features(train_df_pr, win_len)

train_df_pr_win, train_labels, train_tes = make_window_features(train_df_pr, win_len)

The data above can be just writed to disk so it is fixed

In [6]:
to_write_win_test = test_df_pr_win
to_write_win_test['labels'] = test_labels
to_write_win_test['te_true'] = test_tes

to_write_win_train = train_df_pr_win
to_write_win_train['labels'] = train_labels
to_write_win_train['te_true'] = train_tes

to_write_win_test.to_csv(f'datasets/test_features_win_{win_len}_nr.csv', index=False)
to_write_win_train.to_csv(f'datasets/train_features_win_{win_len}_nr.csv', index=False)

In [7]:
test_df_pr_win = pd.read_csv(f'datasets/test_features_win_{win_len}_nr.csv')
train_df_pr_win = pd.read_csv(f'datasets/train_features_win_{win_len}_nr.csv')

if 'timestamp' in test_df_pr_win:
    test_df_pr_win['timestamp'] = test_df_pr_win['timestamp'].apply(lambda x: parse(x))
if 'timestamp' in train_df_pr_win:
    train_df_pr_win['timestamp'] = train_df_pr_win['timestamp'].apply(lambda x: parse(x))
    
test_df_pr_win.sort_values(by=['timestamp'], inplace=True)
train_df_pr_win.sort_values(by=['timestamp'], inplace=True)

test_labels, test_tes = test_df_pr_win['labels'], test_df_pr_win['te_true']
test_df_pr_win = test_df_pr_win.drop(columns=['labels', 'te_true'])

train_labels, train_tes = train_df_pr_win['labels'], train_df_pr_win['te_true']
train_df_pr_win = train_df_pr_win.drop(columns=['labels', 'te_true'])

In [8]:
activities = []
_d = train_df_pr_win.copy()
for _c in _d.columns:
    if (_c[:2] == 'W_') and ('__' not in _c):
        activities.append(_c)
activities

[]

In [9]:
if ('timestamp' in train_df_pr_win) and (type(train_df_pr_win['timestamp'].values[0])==str):
    train_df_pr_win['timestamp'] = train_df_pr_win['timestamp'].apply(lambda x: parse(x))
if ('timestamp' in test_df_pr_win) and (type(test_df_pr_win['timestamp'].values[0])==str):
    test_df_pr_win['timestamp'] = test_df_pr_win['timestamp'].apply(lambda x: parse(x))

Also must scale continious (time related features)

In [10]:
train_df_pr_win.shape, train_labels.shape

((9176, 22), (9176,))

In [11]:
from rl4pm_lib.preprocessing import PaperScalerPd as PaperScaler

Let's construct features as moving window

Ok, features and targets are ready, let's do `.fit()`, `.predict())))))`

What methods will I use? Of course all, which were listed [here](https://github.com/lemikhovalex/DA_CV)

## PipeLine

In [12]:

pipe = Pipeline([('scaler', PaperScaler(column_feature)),
                 ('rfc', RandomForestClassifier(n_estimators=30, max_depth=4, random_state=42, n_jobs=-1))
                ])

In [13]:
pipe.fit(train_df_pr_win.drop(columns=['timestamp']), train_labels)

Pipeline(steps=[('scaler',
                 <rl4pm_lib.preprocessing.PaperScalerPd object at 0x00000185A3723FD0>),
                ('rfc',
                 RandomForestClassifier(max_depth=4, n_estimators=30, n_jobs=-1,
                                        random_state=42))])

# Classification
## Random Forest Classifier

In [14]:
splitter = TimeSeriesSplit(n_splits=7)
xs = splitter.split(train_df_pr_win, train_labels)

In [None]:
%%time
param_grid = { 
    'rfc__n_estimators': np.linspace(140, 250, num=15, dtype=int),
    'rfc__max_depth' : np.linspace(10, 20, num=10, dtype=int),
}

model_gs = Pipeline([('scaler', PaperScaler(column_feature)),
                     ('rfc', RandomForestClassifier(random_state=42, class_weight='balanced'))
                    ])
CV_rfc = GridSearchCV(estimator=model_gs, param_grid=param_grid, scoring='accuracy',
                      cv=TimeSeriesSplit(n_splits=5), n_jobs=3, verbose=1)
CV_rfc.fit(train_df_pr_win, train_labels)

Fitting 5 folds for each of 150 candidates, totalling 750 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   29.8s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:  2.1min
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  4.7min


In [None]:
CV_rfc.best_params_

In [None]:
rfc_activ_best = CV_rfc.best_estimator_

In [None]:
rfc_activ_best = Pipeline([('scaler', PaperScaler(column_feature)),
                            ('rfc', RandomForestClassifier(random_state=42, 
                                                           max_depth=18,
                                                           n_estimators=218
                                                          ))
                           ])

In [None]:
rfc_activ_best = CV_rfc.best_estimator_

In [None]:
%%time

rfc_activ_best.fit(train_df_pr_win, train_labels)

train_labels_hat = rfc_activ_best.predict(train_df_pr_win)
test_labels_hat = rfc_activ_best.predict(test_df_pr_win)

test_acc_rfc = accuracy_score(test_labels_hat, test_labels)
train_acc_rfc = accuracy_score(train_labels_hat, train_labels)

print(f'Random Forest Classifier, after grid search')
print(f'test  accuracy = {test_acc_rfc: .2f}\ntrain accuracy = {train_acc_rfc: .2f}')
print(f'test  f1 = {f1_score(test_labels, test_labels_hat, average="weighted"): .2f}')
print(f'train f1 = {f1_score(train_labels, train_labels_hat, average="weighted"): .2f}')

In [None]:

cm = confusion_matrix(test_labels, test_labels_hat)
cm_prob = cm / np.nan_to_num(cm, 0).sum(axis=1).reshape(-1, 1)

df_cm = pd.DataFrame(cm, index = activities,
                         columns = activities)
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True)
plt.title('RFC\nConfusion matrix')
plt.xlabel('predicted')
plt.ylabel('true')
plt.show()

In [None]:

cm = confusion_matrix(test_labels, test_labels_hat)
cm_prob = cm / np.nan_to_num(cm, 0).sum(axis=1).reshape(-1, 1)

df_cm = pd.DataFrame(cm_prob, index = activities,
                         columns = activities)
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True)
plt.title('RFC\nConfusion matrix as probabilities')
plt.xlabel('predicted')
plt.ylabel('true')
plt.show()

Resuls some how are beter then in article on LSTM approach

## RFC not balanced

In [None]:
%%time
param_grid_rfcb = { 
    'rfc__n_estimators': np.linspace(140, 250, num=15, dtype=int),
    'rfc__max_depth' : np.linspace(10, 20, num=10, dtype=int),
}

model_gs_rfcnb = Pipeline([('scaler', PaperScaler(column_feature)),
                           ('rfc', RandomForestClassifier(random_state=42))
                          ])
CV_rfcub = GridSearchCV(estimator=model_gs_rfcnb, param_grid=param_grid, scoring='accuracy',
                      cv=TimeSeriesSplit(n_splits=5), n_jobs=3, verbose=1)
CV_rfcub.fit(train_df_pr_win, train_labels)

In [None]:
CV_rfcub.best_params_

In [None]:
rfc_ub_activ_best = CV_rfc.best_estimator_

In [None]:
%%time

rfc_ub_activ_best.fit(train_df_pr_win, train_labels)

train_labels_hat_rfcub = rfc_ub_activ_best.predict(train_df_pr_win)
test_labels_hat_rfcub = rfc_ub_activ_best.predict(test_df_pr_win)

test_acc_rfcub = accuracy_score(test_labels_hat_rfcub, test_labels)
train_acc_rfcub = accuracy_score(train_labels_hat_rfcub, train_labels)

print(f'Random Forest Classifier, after grid search')
print(f'test  accuracy = {test_acc_rfcub: .2f}\ntrain accuracy = {train_acc_rfcub: .2f}')
print(f'test  f1 = {f1_score(test_labels, test_labels_hat_rfcub, average="weighted"): .2f}')
print(f'train f1 = {f1_score(train_labels, train_labels_hat_rfcub, average="weighted"): .2f}')

In [None]:

cm = confusion_matrix(test_labels, train_labels_hat_rfcub)
cm_prob = cm / np.nan_to_num(cm, 0).sum(axis=1).reshape(-1, 1)

df_cm = pd.DataFrame(cm, index = activities,
                         columns = activities)
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True)
plt.title('RFC unbalanced\nConfusion matrix')
plt.xlabel('predicted')
plt.ylabel('true')
plt.show()

In [None]:

cm = confusion_matrix(test_labels, test_labels_hat)
cm_prob = cm / np.nan_to_num(cm, 0).sum(axis=1).reshape(-1, 1)

df_cm = pd.DataFrame(cm_prob, index = activities,
                         columns = activities)
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True)
plt.title('RFC unbalanced\nConfusion matrix as probabilities')
plt.xlabel('predicted')
plt.ylabel('true')
plt.show()

## Grad Boost

In [None]:
%%time
param = {'objective': 'multi:softprob',  # error evaluation for multiclass training
         'num_class': 6,
         'tree_method': 'gpu_hist',

        }

grid_xgb = {'xgb__max_depth': np.linspace(5, 10, num=5, dtype=int),
            'xgb__n_estimators': np.linspace(3, 10, num=7, dtype=int),
            'xgb__eta': np.linspace(0.001, 0.3, num=15, dtype=float)
           }

model_gs_xgb = Pipeline([('scaler', PaperScaler(column_feature)),
                         ('xgb', xgb.XGBClassifier(**param))
                        ])
CV_xgb = GridSearchCV(estimator=model_gs_xgb, param_grid=grid_xgb, scoring='accuracy',
                      cv=TimeSeriesSplit(n_splits=5), n_jobs=3, verbose=1)
CV_xgb.fit(train_df_pr_win, train_labels)

print(CV_xgb.best_score_)
print(CV_xgb.best_params_)

In [None]:
xgb_clf_best = CV_xgb.best_estimator_

In [None]:
xgb_clf_best = Pipeline([('scaler', PaperScaler(column_feature)),
                         ('xgb', xgb.XGBClassifier(objective='multi:softprob',
                                                   num_class=6,
                                                   tree_method='gpu_hist',
                                                   max_depth=8,
                                                   n_estimators=10
                                                  )
                         )]
                       )

In [None]:
xgb_clf_best.fit(train_df_pr_win, train_labels)

train_labels_hat = xgb_clf_best.predict(train_df_pr_win)
test_labels_hat = xgb_clf_best.predict(test_df_pr_win)

test_acc_xgb = accuracy_score(test_labels_hat, test_labels)
train_acc_xgb = accuracy_score(train_labels_hat, train_labels)

print(f'XG boost Classifier')
print(f'test  accuracy = {test_acc_xgb: .2f}\ntrain accuracy = {train_acc_xgb: .2f}')

print(f'test  f1 = {f1_score(test_labels, test_labels_hat, average="weighted"): .2f}')
print(f'train f1 = {f1_score(train_labels, train_labels_hat, average="weighted"): .2f}')

In [None]:
cm = confusion_matrix(test_labels, test_labels_hat)
cm_prob = cm / np.nan_to_num(cm, 0).sum(axis=1).reshape(-1, 1)

df_cm = pd.DataFrame(cm, index = activities,
                         columns = activities)
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True)
plt.title('XG boost Classifier\nConfusion matrix XGB')
plt.xlabel('predicted')
plt.ylabel('true')
plt.show()

In [None]:
cm = confusion_matrix(test_labels, test_labels_hat)
cm_prob = cm / np.nan_to_num(cm, 0).sum(axis=1).reshape(-1, 1)

df_cm = pd.DataFrame(cm_prob, index = activities,
                         columns = activities)
plt.figure(figsize = (10,7))
sns.heatmap(df_cm, annot=True)
plt.title('XG boost Classifier\nConfusion matrix XGB')
plt.xlabel('predicted')
plt.ylabel('true')
plt.show()

## Classification results
Classique approaches povides results, which outperforms result, given in article as baseline, but...

- As been shown in visualization, there are a lot of loops, model that doesnt consern information on all the trail seems to be useless for end time prediction
- Accuracy is what been used for benchmarking in papers. Maybe it is not the best metric? Who knows, nevertheless this is a bench

# Regression

In [None]:
%%time
param_grid_rfr = { 
    'rfr__n_estimators': np.linspace(140, 250, num=15, dtype=int),
    'rfr__max_depth' : np.linspace(10, 20, num=10, dtype=int),
}

model_gs_reg = Pipeline([('scaler', PaperScaler(column_feature)),
                         ('rfr', RandomForestRegressor(random_state=42))
                        ])
CV_rfr = GridSearchCV(estimator=model_gs_reg, param_grid=param_grid_rfr, scoring='neg_mean_absolute_error',
                      cv=TimeSeriesSplit(n_splits=5), n_jobs=3, verbose=1)
CV_rfr.fit(train_df_pr_win, train_tes)

In [None]:
CV_rfr.best_params_

In [None]:
rfr =  Pipeline([('scaler', PaperScaler(column_feature)),
                 ('rfr', RandomForestRegressor(random_state=42,
                                               n_estimators=163,
                                               max_depth=10
                                              ))
                        ])

In [None]:
rfr = CV_rfr.best_estimator_
rfr.fit(train_df_pr_win, train_tes)
train_tes_hat_rfr = rfr.predict(train_df_pr_win)
test_tes_hat_rfr = rfr.predict(test_df_pr_win)

test_mae_xgb = MAE(test_tes_hat_rfr, test_tes) / 3600. / 24
train_mae_xgb = MAE(train_tes_hat_rfr, train_tes) / 3600. / 24

print(f'Random Forest Regression')
print(f'MAE  accuracy = {test_mae_xgb: .2f}, days\ntrain MAE = {test_mae_xgb: .2f}, days')

In [None]:
plt.scatter(test_tes / 3600. / 24, test_tes_hat_rfr / 3600. / 24, s=0.1)
plt.title('Random Forest Regression $t_e$ prediction')
plt.xlabel('true, days')
plt.ylabel('predicted, days')
plt.show()

In [None]:
plt.hist((test_tes - test_tes_hat_rfr) / 3600. / 24, bins=50)
plt.xlabel('$t_e$ diff, days')
plt.title('Random Forest Regressor\nTrue $t_e$ - predicted $t_3$')
plt.plot()

## Dull

In [None]:
te_dull = train_tes.mean()
test_dull = te_dull * np.ones(test_tes.shape)
train_dull = te_dull * np.ones(train_tes.shape)

test_mae_dull = MAE(test_dull, test_tes) / 3600. / 24
train_mae_dull = MAE(train_dull, train_tes) / 3600. / 24

print(f'Random Forest Regression')
print(f'MAE  accuracy = {test_mae_dull: .2f}, days\ntrain MAE = {test_mae_dull: .2f}, days')

Again better then papers)

In [None]:
param = {'tree_method': 'gpu_hist'
        }

grid_xgbr = {'xgbr__max_depth': np.linspace(5, 10, num=5, dtype=int),
            'xgbr__n_estimators': np.linspace(3, 10, num=7, dtype=int),
            'xgbr__eta': np.linspace(0.001, 0.3, num=15, dtype=float)
           }

model_gs_xgbr = Pipeline([('scaler', PaperScaler(column_feature)),
                         ('xgbr', xgb.XGBClassifier(**param))
                        ])
CV_xgbr = GridSearchCV(estimator=model_gs_xgbr, param_grid=grid_xgbr, scoring='neg_mean_absolute_error',
                      cv=TimeSeriesSplit(n_splits=5), n_jobs=3, verbose=1)
CV_xgbr.fit(train_df_pr_win, train_tes)

In [None]:
CV_xgbr.best_params_

In [None]:
xgbr = CV_xgbr.best_estimator_
xgbr.fit(train_df_pr_win, train_tes)
train_tes_hat_xgbr = xgbr.predict(train_df_pr_win)
test_tes_hat_xgbr = xgbr.predict(test_df_pr_win)

test_mae_xgbr = MAE(test_tes_hat_xgbr, test_tes) / 3600. / 24
train_mae_xgbr = MAE(train_tes_hat_xgbr, train_tes) / 3600. / 24

print(f'Random Forest Regression')
print(f'MAE  accuracy = {test_mae_xgbr: .2f}, days\ntrain MAE = {test_mae_xgbr: .2f}, days')

In [None]:
plt.scatter(test_tes / 3600. / 24, test_tes_hat_xgbr / 3600. / 24, s=0.1)
plt.title('Random Forest Regressor $t_e$ prediction')
plt.xlabel('true, days')
plt.ylabel('predicted, days')
plt.show()

XGB is not that cool

In [None]:
grid_ridge = {'ridge__alpha': np.linspace(1., 30., num=100, dtype=float)}

model_gs_ridge = Pipeline([('scaler', PaperScaler(column_feature)),
                         ('ridge', Ridge())
                        ])
CV_ridge = GridSearchCV(estimator=model_gs_ridge, param_grid=grid_ridge, scoring='neg_mean_absolute_error',
                      cv=TimeSeriesSplit(n_splits=5), n_jobs=6, verbose=1)
CV_ridge.fit(train_df_pr_win, train_tes)

In [None]:
CV_ridge.best_params_

In [None]:
ridge = CV_ridge.best_estimator_
ridge.fit(train_df_pr_win, train_tes)
train_tes_hat_ridge = ridge.predict(train_df_pr_win)
test_tes_hat_ridge = ridge.predict(test_df_pr_win)

test_mae_ridge = MAE(test_tes_hat_ridge, test_tes) / 3600. / 24
train_mae_ridge = MAE(train_tes_hat_ridge, train_tes) / 3600. / 24

print(f'Random Forest Regression')
print(f'MAE  accuracy = {test_mae_ridge: .2f}, days\ntrain MAE = {test_mae_ridge: .2f}, days')

In [None]:
plt.scatter(test_tes / 3600. / 24, test_tes_hat_ridge / 3600. / 24, s=0.1)
plt.title('Ridge $t_e$ prediction')
plt.xlabel('true, days')
plt.ylabel('predicted, days')
plt.show()

In [None]:
plt.hist((test_tes - test_tes_hat_ridge) / 3600. / 24, bins=50)
plt.xlabel('$t_e$ diff, days')
plt.title('Ridge\nTrue $t_e$ - predicted $t_3$')
plt.plot()