In [27]:
import pandas as pd
import numpy as np
import datetime
from IPython import display
from dateutil.parser import parse

import matplotlib.pyplot as plt

In [29]:
# Load the train / test event logs and run every `timestamp` value through
# dateutil's `parse`.
train_df = pd.read_csv('datasets/train_df.csv')
train_df['timestamp'] = train_df['timestamp'].apply(parse)

test_df = pd.read_csv('datasets/test_df.csv')
test_df['timestamp'] = test_df['timestamp'].apply(parse)

## Preprocessing:
To leverage predictive models, the following features are created:
- $t_e$ - time since previous event
- $t_w$ - time since the beginning of week
- $t_t$ - time since the beginning of trace
- one hot encoded labels

In [30]:
import rl4pm_lib.preprocessing as preprocessing

# Mapping from time-feature name to an integer; it is handed to the scaler later
# (presumably a column position -- TODO confirm against PaperScaler).
column_feature = {'tt': 0, 'te': 1, 'tw': 2}

# Fit the preprocessor on the training log only, then transform both splits with it.
prepro = preprocessing.DfPreprocesser()
prepro.fit(train_df)
train_df_pr, test_df_pr = (prepro.transform(frame) for frame in (train_df, test_df))

The continuous (time-related) features must also be scaled.

In [31]:
# The scaler is fitted on the preprocessed training frame only and then applied
# to the train and test frames alike.
scaler = preprocessing.PaperScaler(column_feature)
scaler.fit(train_df_pr)
train_df_pr_sc, test_df_pr_sc = map(scaler.transform, (train_df_pr, test_df_pr))

Let's construct the features using a moving window.

In [32]:
# Window length handed to make_window_features when the moving-window features
# are built (presumably the number of events per window -- TODO confirm).
win_len = 2

In [33]:
from rl4pm_lib.utils_supervised import make_window_features

In [34]:
# Display the scaled test frame that the window features are built from.
test_df_pr_sc

Unnamed: 0,tt,te,tw,trace_id,W_Afhandelen leads,W_Beoordelen fraude,W_Completeren aanvraag,W_Nabellen incomplete dossiers,W_Nabellen offertes,W_Valideren aanvraag
0,0.000000,0.000000,5.447731,173718,0,0,1,0,0,0
1,0.000002,0.000006,5.447905,173718,0,0,0,0,1,0
2,0.000017,0.000044,5.449267,173718,0,0,0,0,1,0
3,0.000043,0.000076,5.451627,173718,0,0,0,0,1,0
4,0.000045,0.000005,5.451791,173718,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
13603,0.000042,0.000046,2.738036,214208,0,0,0,0,1,0
13604,0.000097,0.000163,2.743060,214208,0,0,0,0,1,0
13605,0.078414,0.230624,2.873128,214208,0,0,0,0,1,0
13606,0.078422,0.000023,2.873825,214208,0,0,0,0,1,0


In [35]:
# Window features for the test split, plus the label and `te` targets used by the
# classifiers / regressors below. `trace_id` is removed from the feature frame.
test_df_pr_sc_win, test_labels, test_tes = make_window_features(test_df_pr_sc, win_len)
test_df_pr_sc_win = test_df_pr_sc_win.drop(columns=['trace_id'])

In [36]:
# Window features for the train split, plus the label and `te` targets used by the
# classifiers / regressors below. `trace_id` is removed from the feature frame.
train_df_pr_sc_win, train_labels, train_tes = make_window_features(train_df_pr_sc, win_len)
train_df_pr_sc_win = train_df_pr_sc_win.drop(columns=['trace_id'])

OK, the features and targets are ready; let's do `.fit()` and `.predict()`.

Which methods will I use? All of those listed [here](https://github.com/lemikhovalex/DA_CV), of course.

# Classification
## Random Forest Classifier

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [38]:
%%time
rfc_activ = RandomForestClassifier(n_estimators=30, max_depth=4, random_state=42, n_jobs=-1)

rfc_activ.fit(train_df_pr_sc_win, train_labels)
train_labels_hat = rfc_activ.predict(train_df_pr_sc_win)
test_labels_hat = rfc_activ.predict(test_df_pr_sc_win)

test_acc_rfc = accuracy_score(test_labels_hat, test_labels)
train_acc_rfc = accuracy_score(train_labels_hat, train_labels)
print(f'Random Forest Classifier\ntest  accuracy = {test_acc_rfc: .2f}\ntrain accuracy = {train_acc_rfc: .2f}')

Random Forest Classifier
test  accuracy =  0.82
train accuracy =  0.80
CPU times: user 1.12 s, sys: 98.5 ms, total: 1.22 s
Wall time: 453 ms


In [39]:
from sklearn.model_selection import GridSearchCV

In [40]:
%%time
param_grid = { 
    'n_estimators': np.linspace(140, 250, num=15, dtype=int),
    'max_depth' : np.linspace(10, 20, num=10, dtype=int),
}

model_gs = RandomForestClassifier(random_state=42)
CV_rfc = GridSearchCV(estimator=model_gs, param_grid=param_grid, scoring='accuracy', cv=7, n_jobs=-1, verbose=3)
CV_rfc.fit(train_df_pr_sc_win, train_labels)

Fitting 7 folds for each of 150 candidates, totalling 1050 fits
CPU times: user 17.6 s, sys: 788 ms, total: 18.4 s
Wall time: 28min 25s


GridSearchCV(cv=7, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 20]),
                         'n_estimators': array([140, 147, 155, 163, 171, 179, 187, 195, 202, 210, 218, 226, 234,
       242, 250])},
             scoring='accuracy', verbose=3)

In [41]:
# Hyper-parameter combination with the best cross-validated accuracy.
CV_rfc.best_params_

{'max_depth': 11, 'n_estimators': 202}

In [45]:
%%time
rfc_activ_best = RandomForestClassifier(random_state=42, max_depth=11, n_estimators=202)
rfc_activ_best.fit(train_df_pr_sc_win, train_labels)

train_labels_hat = rfc_activ_best.predict(train_df_pr_sc_win)
test_labels_hat = rfc_activ_best.predict(test_df_pr_sc_win)

test_acc_rfc = accuracy_score(test_labels_hat, test_labels)
train_acc_rfc = accuracy_score(train_labels_hat, train_labels)

print(f'Random Forest Classifier, after grid search')
print(f'test  accuracy = {test_acc_rfc: .2f}\ntrain accuracy = {train_acc_rfc: .2f}')

Random Forest Classifier, after grid search
test  accuracy =  0.84
train accuracy =  0.91
CPU times: user 15.3 s, sys: 0 ns, total: 15.3 s
Wall time: 15.5 s


The results are somehow better than those reported in the article on the LSTM approach.

## K Neighbors Classifier

In [46]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

In [50]:
# KNN grid search, left disabled: when it was run with n_jobs=-1 the joblib workers
# were terminated with SIGKILL (the error message points to a segfault or excessive
# memory usage as possible causes).
# knn_clf = KNeighborsClassifier()
# grid_knn = GridSearchCV(knn_clf, {'n_neighbors': np.linspace(3, 30, 6, dtype=int),
#                                   'weights': ['distance', 'uniform']
#                                  },
#                         n_jobs=-1, scoring='accuracy', cv=7, verbose=3)
# grid_knn.fit(train_df_pr_sc_win, train_labels)
# print(grid_knn.best_score_)
# print(grid_knn.best_params_)

Fitting 7 folds for each of 12 candidates, totalling 84 fits


exception calling callback for <Future at 0x7f254cf05a58 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 792, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 777, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/usr/local/lib/python3.6/dist-packages/joblib/_parallel_backends.py", line 531, in apply_async
    future = self._workers.submit(SafeFunction(func))
  File "/usr/local/lib/

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

Ok, next

## Grad Boost

In [51]:
import xgboost as xgb

In [69]:
%%time
param = {'max_depth': 17,  # the maximum depth of each tree
         'eta': 0.3,  # the training step for each iteration
         'objective': 'multi:softprob',  # error evaluation for multiclass training
         'num_class': 6,
         'n_jobs': -1,
         'n_estimators': 7  ,
         'tree_method': 'gpu_hist',
         'use_label_encoder': False,
         'eval_metric': 'mlogloss'
        }

xgb_clf = xgb.XGBClassifier(**param)
xgb_clf.fit(train_df_pr_sc_win, train_labels)

train_labels_hat = xgb_clf.predict(train_df_pr_sc_win)
test_labels_hat = xgb_clf.predict(test_df_pr_sc_win)

test_acc_xgb = accuracy_score(test_labels_hat, test_labels)
train_acc_xgb = accuracy_score(train_labels_hat, train_labels)

print(f'XG boost Classifier')
print(f'test  accuracy = {test_acc_xgb: .2f}\ntrain accuracy = {train_acc_xgb: .2f}')

XG boost Classifier
test  accuracy =  0.84
train accuracy =  0.95
CPU times: user 8.07 s, sys: 1.31 s, total: 9.38 s
Wall time: 7.68 s


In [67]:
%%time

param = {'objective': 'multi:softprob',  # error evaluation for multiclass training
         'num_class': 6,
         'tree_method': 'gpu_hist',
         'use_label_encoder': False,
         'eval_metric': 'mlogloss'
        }

xgb_clf = xgb.XGBClassifier(**param)
grid_xgb = GridSearchCV(xgb_clf, {'max_depth': np.linspace(3, 20, num=2, dtype=int),
                                  'n_estimators': np.linspace(3, 15, num=2, dtype=int),
                                  'eta': np.linspace(1e-2, 0.5, num=2, dtype=float)
                                 },
                        n_jobs=-1, scoring='accuracy', cv=7, verbose=3)
grid_xgb.fit(train_df_pr_sc_win, train_labels)
print(grid_xgb.best_score_)
print(grid_xgb.best_params_)

Fitting 7 folds for each of 8 candidates, totalling 56 fits


KeyboardInterrupt: 

In [68]:
%%time

param = {'objective': 'multi:softprob',  # error evaluation for multiclass training
         'num_class': 6,
         'tree_method': 'gpu_hist',
         'n_jobs': -1,
         'use_label_encoder': False,
         'eval_metric': 'mlogloss'
        }

xgb_clf = xgb.XGBClassifier(**param)
grid_xgb = GridSearchCV(xgb_clf, {'max_depth': np.linspace(3, 20, num=2, dtype=int),
                                  'n_estimators': np.linspace(3, 15, num=2, dtype=int),
                                  'eta': np.linspace(1e-2, 0.5, num=2, dtype=float)
                                 },
                        n_jobs=-1, scoring='accuracy', cv=7, verbose=3)
grid_xgb.fit(train_df_pr_sc_win, train_labels)
print(grid_xgb.best_score_)
print(grid_xgb.best_params_)

Fitting 7 folds for each of 8 candidates, totalling 56 fits


KeyboardInterrupt: 

In [None]:
# Settings shared by every candidate; 'use_label_encoder' and 'eval_metric' are set
# explicitly, as in the other XGBClassifier cells of this notebook.
param = {'objective': 'multi:softprob',  # multiclass objective producing class probabilities
         'num_class': 6,
         'tree_method': 'gpu_hist',
         'use_label_encoder': False,
         'eval_metric': 'mlogloss'
        }

xgb_clf = xgb.XGBClassifier(**param)
# Finer grid (5 depths x 7 forest sizes x 20 learning rates) with 2-fold CV.
# dtype=int truncates the linspace values, so the integer grids are not evenly spaced
# (depths 5, 6, 7, 8, 10; trees 3..8 and 10).
# n_jobs=1: every candidate trains on the GPU ('gpu_hist'), so the candidates are
# evaluated sequentially instead of by parallel workers sharing the same device.
grid_xgb = GridSearchCV(xgb_clf, {'max_depth': np.linspace(5, 10, num=5, dtype=int),
                                  'n_estimators': np.linspace(3, 10, num=7, dtype=int),
                                  'eta': np.linspace(0.001, 0.3, num=20, dtype=float)
                                 },
                        n_jobs=1, scoring='accuracy', cv=2, verbose=3)
grid_xgb.fit(train_df_pr_sc_win, train_labels)
print(grid_xgb.best_score_)
print(grid_xgb.best_params_)

In [None]:
# Estimator from the grid search with the best cross-validated score.
xgb_clf_best = grid_xgb.best_estimator_

In [None]:
# Accuracy of the grid-search winner on both splits.
train_labels_hat = xgb_clf_best.predict(train_df_pr_sc_win)
train_acc_xgb = accuracy_score(train_labels, train_labels_hat)

test_labels_hat = xgb_clf_best.predict(test_df_pr_sc_win)
test_acc_xgb = accuracy_score(test_labels, test_labels_hat)

print(f'XG boost Classifier')
print(f'test  accuracy = {test_acc_xgb: .2f}\ntrain accuracy = {train_acc_xgb: .2f}')

## Classification results
Classical approaches provide results that outperform the baseline reported in the article, but...

- As shown in the visualization, there are a lot of loops; a model that does not take information about the whole trace into account seems to be useless for end-time prediction.
- Accuracy is what has been used for benchmarking in the papers. Maybe it is not the best metric? Who knows; nevertheless, this is the benchmark.

# Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as MAE

In [None]:
%%time
rfc_te = RandomForestRegressor(n_estimators=150, max_depth=17, random_state=42, n_jobs=-1)

rfc_te.fit(train_df_pr_sc_win, train_tes)
train_tes_hat = rfc_te.predict(train_df_pr_sc_win)
test_tes_hat = rfc_te.predict(test_df_pr_sc_win)

test_mae_rfr = MAE(test_tes_hat * scaler.scales['te'], test_tes * scaler.scales['te'])
train_mae_rfr = MAE(train_tes_hat * scaler.scales['te'], train_tes * scaler.scales['te'])
print(f'Random Forest Classifier\ntest  MAE = {test_mae_rfr / 60 / 60: .2f}h\ntrain MAE = {train_mae_rfr / 60 / 60: .2f}h')

Again, better than the papers.

In [None]:
import xgboost as xgb
# XGBoost regressor for the `te` targets, with hand-picked hyper-parameters.
param = {'max_depth': 17,  # the maximum depth of each tree
         'eta': 0.3,  # learning rate applied at every boosting step
         'n_jobs': -1,
         'n_estimators': 7
        }

xgb_reg = xgb.XGBRegressor(**param)
xgb_reg.fit(train_df_pr_sc_win, train_tes)

train_tes_hat = xgb_reg.predict(train_df_pr_sc_win)
test_tes_hat = xgb_reg.predict(test_df_pr_sc_win)

# Predictions and targets are multiplied by the scaler's `te` scale before the error
# is computed (presumably undoing the scaling -- TODO confirm).
test_mae_xgb = MAE(test_tes * scaler.scales['te'], test_tes_hat * scaler.scales['te'])
# Bug fix: the train MAE was previously computed from the *test* predictions and
# targets, so it always equalled the test MAE.
train_mae_xgb = MAE(train_tes * scaler.scales['te'], train_tes_hat * scaler.scales['te'])

# Reported in days (/ 3600 / 24), whereas the random-forest regressor cell reports
# hours. The stray "Random Forest Classifier" label was removed from the result line.
print('XG boost Regressor')
print(f'test  MAE = {test_mae_xgb / 3600 / 24: .2f}d\ntrain MAE = {train_mae_xgb / 3600 / 24: .2f}d')

XGB is not that cool

# No long activity!