In [1]:
import pandas as pd
import numpy as np
import datetime
from IPython import display
from dateutil.parser import parse
from rl4pm_lib.utils_supervised import make_window_features

import matplotlib.pyplot as plt

In [2]:
# Load both event-log splits and turn the `timestamp` strings into datetime
# objects with dateutil's parser.
train_df = pd.read_csv('datasets/train_df.csv')
train_df['timestamp'] = train_df['timestamp'].apply(parse)

test_df = pd.read_csv('datasets/test_df.csv')
test_df['timestamp'] = test_df['timestamp'].apply(parse)

## Preprocessing:
### 1. Make features
For leveraging predictive models the following features are created:
- $t_e$ - time since previous event
- $t_w$ - time since the beginning of week
- $t_t$ - time since the beginning of trace
- one hot encoded labels

### 2. Make window with lags

### 3. Scale it

In [3]:
import rl4pm_lib.preprocessing as preprocessing

# Time-feature name -> integer id; this mapping is handed to the scalers below.
column_feature = dict(tt=0, te=1, tw=2)

# Raw-event preprocessing, disabled because the windowed features are read
# back from disk further down:
# prepro = preprocessing.DfPreprocesser()
# prepro.fit(train_df)
# train_df_pr = prepro.transform(train_df)
# test_df_pr = prepro.transform(test_df)

In [4]:
# Window length for make_window_features; it is also part of the cached CSV file names.
win_len = 2

# Disabled: build the windowed features and targets from the preprocessed frames.
# The test windows must come from test_df_pr (an earlier draft passed train_df_pr here).
# test_df_pr_win, test_labels, test_tes = make_window_features(test_df_pr, win_len)

# train_df_pr_win, train_labels, train_tes = make_window_features(train_df_pr, win_len)

The data above can simply be written to disk, so that it stays fixed between runs.

In [5]:
# Disabled one-off export: attaches the targets (`labels`, `te_true`) to the
# windowed features and writes them to the CSV files that are read back below.
# NOTE(review): `to_write_win_test = test_df_pr_win` does not copy, so adding the
# target columns would also add them to `test_df_pr_win` (same for train).
# to_write_win_test = test_df_pr_win
# to_write_win_test['labels'] = test_labels
# to_write_win_test['te_true'] = test_tes

# to_write_win_train = train_df_pr_win
# to_write_win_train['labels'] = train_labels
# to_write_win_train['te_true'] = train_tes

# to_write_win_test.to_csv(f'datasets/test_features_win_{win_len}.csv', index=False)
# to_write_win_train.to_csv(f'datasets/train_features_win_{win_len}.csv', index=False)

In [6]:
# Reload the cached window features. `labels` (classification target) and
# `te_true` (regression target) are split off; every other column is a feature.
to_write_win_train = pd.read_csv(f'datasets/train_features_win_{win_len}.csv')
train_labels = to_write_win_train['labels']
train_tes = to_write_win_train['te_true']
train_df_pr_win = to_write_win_train.drop(columns=['labels', 'te_true'])

to_write_win_test = pd.read_csv(f'datasets/test_features_win_{win_len}.csv')
test_labels = to_write_win_test['labels']
test_tes = to_write_win_test['te_true']
test_df_pr_win = to_write_win_test.drop(columns=['labels', 'te_true'])

The continuous (time-related) features must also be scaled.

In [7]:
# Sanity check: the feature frame and the label vector should have the same number of rows.
train_df_pr_win.shape, train_labels.shape

((43574, 20), (43574,))

In [8]:
from rl4pm_lib.preprocessing import PaperScalerPd as PaperScaler

# Fit the scaler on the training windows only, then apply it to both splits.
# NOTE(review): this instantiates `preprocessing.PaperScaler`, not the
# `PaperScaler` alias imported above (which is `PaperScalerPd`) - confirm
# that using two different scaler classes is intended.
scaler = preprocessing.PaperScaler(column_feature)
scaler.fit(train_df_pr_win)
train_df_pr_sc, test_df_pr_sc = (
    scaler.transform(split) for split in (train_df_pr_win, test_df_pr_win)
)

Let's construct features as moving window

OK, the features and targets are ready, so let's do `.fit()` and `.predict()`.

Which methods will I use? All of those listed [here](https://github.com/lemikhovalex/DA_CV), of course.

## PipeLine

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Two-step pipeline: scale the time features, then classify the next activity
# with a small random forest.
pipe = Pipeline(steps=[
    ('scaler', PaperScaler(column_feature)),
    ('rfc', RandomForestClassifier(max_depth=4, n_estimators=30, n_jobs=-1, random_state=42)),
])

In [10]:
# Fit the scaler + random-forest pipeline on the unscaled training windows;
# scaling happens inside the pipeline.
pipe.fit(train_df_pr_win, train_labels)

Pipeline(steps=[('scaler',
                 <rl4pm_lib.preprocessing.PaperScalerPd object at 0x0000015C6B816080>),
                ('rfc',
                 RandomForestClassifier(max_depth=4, n_estimators=30, n_jobs=-1,
                                        random_state=42))])

# Classification
## Random Forest Classifier

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [12]:
%%time
pipe = Pipeline([('scaler', PaperScaler(column_feature)),
                 ('rfc', RandomForestClassifier(n_estimators=30, max_depth=4, random_state=42, n_jobs=-1))
                ])

pipe.fit(train_df_pr_win, train_labels)
train_labels_hat = pipe.predict(train_df_pr_win)
test_labels_hat = pipe.predict(test_df_pr_win)

test_acc_rfc = accuracy_score(test_labels_hat, test_labels)
train_acc_rfc = accuracy_score(train_labels_hat, train_labels)
print(f'Random Forest Classifier\ntest  accuracy = {test_acc_rfc: .2f}\ntrain accuracy = {train_acc_rfc: .2f}')

Random Forest Classifier
test  accuracy =  0.80
train accuracy =  0.80
Wall time: 18.9 s


In [13]:
from sklearn.model_selection import GridSearchCV

In [None]:
%%time
param_grid = { 
    'rfc__n_estimators': np.linspace(140, 250, num=15, dtype=int),
    'rfc__max_depth' : np.linspace(10, 20, num=10, dtype=int),
}

model_gs = pipe = Pipeline([('scaler', PaperScaler(column_feature)),
                            ('rfc', RandomForestClassifier(random_state=42))
                           ])
CV_rfc = GridSearchCV(estimator=model_gs, param_grid=param_grid, scoring='accuracy', cv=7, n_jobs=-1, verbose=3)
CV_rfc.fit(train_df_pr_win, train_labels)

Fitting 7 folds for each of 150 candidates, totalling 1050 fits


In [None]:
# Parameter combination with the best cross-validated accuracy in the search above.
CV_rfc.best_params_

In [None]:
%%time
rfc_activ_best = RandomForestClassifier(random_state=42, max_depth=11, n_estimators=202)
rfc_activ_best.fit(train_df_pr_sc_win, train_labels)

train_labels_hat = rfc_activ_best.predict(train_df_pr_win)
test_labels_hat = rfc_activ_best.predict(test_df_pr_win)

test_acc_rfc = accuracy_score(test_labels_hat, test_labels)
train_acc_rfc = accuracy_score(train_labels_hat, train_labels)

print(f'Random Forest Classifier, after grid search')
print(f'test  accuracy = {test_acc_rfc: .2f}\ntrain accuracy = {train_acc_rfc: .2f}')

The results are somehow better than those of the LSTM approach in the article.

## K Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

In [None]:
# Disabled KNN grid search, kept for reference. It fits on the scaled training
# frame and reports the KNN search's own best score and parameters.
# knn_clf = KNeighborsClassifier()
# grid_knn = GridSearchCV(knn_clf, {'n_neighbors': np.linspace(3, 30, 6, dtype=int),
#                                   'weights': ['distance', 'uniform']
#                                  },
#                         n_jobs=-1, scoring='accuracy', cv=7, verbose=3)
# grid_knn.fit(train_df_pr_sc, train_labels)
# print(grid_knn.best_score_)
# print(grid_knn.best_params_)

Ok, next

## Grad Boost

In [None]:
import xgboost as xgb

In [None]:
%%time
param = {'max_depth': 17,  # the maximum depth of each tree
         'eta': 0.3,  # the training step for each iteration
         'objective': 'multi:softprob',  # error evaluation for multiclass training
         'num_class': 6,
         'n_jobs': -1,
         'n_estimators': 7  ,
         'tree_method': 'gpu_hist',
         'use_label_encoder': False,
         'eval_metric': 'mlogloss'
        }

xgb_clf = xgb.XGBClassifier(**param)
xgb_clf.fit(train_df_pr_sc_win, train_labels)

train_labels_hat = xgb_clf.predict(train_df_pr_sc_win)
test_labels_hat = xgb_clf.predict(test_df_pr_sc_win)

test_acc_xgb = accuracy_score(test_labels_hat, test_labels)
train_acc_xgb = accuracy_score(train_labels_hat, train_labels)

print(f'XG boost Classifier')
print(f'test  accuracy = {test_acc_xgb: .2f}\ntrain accuracy = {train_acc_xgb: .2f}')

In [None]:
%%time

param = {'objective': 'multi:softprob',  # error evaluation for multiclass training
         'num_class': 6,
         'tree_method': 'gpu_hist',
         'use_label_encoder': False,
         'eval_metric': 'mlogloss'
        }

xgb_clf = xgb.XGBClassifier(**param)
grid_xgb = GridSearchCV(xgb_clf, {'max_depth': np.linspace(3, 20, num=2, dtype=int),
                                  'n_estimators': np.linspace(3, 15, num=2, dtype=int),
                                  'eta': np.linspace(1e-2, 0.5, num=2, dtype=float)
                                 },
                        n_jobs=-1, scoring='accuracy', cv=7, verbose=3)
grid_xgb.fit(train_df_pr_sc_win, train_labels)
print(grid_xgb.best_score_)
print(grid_xgb.best_params_)

In [None]:
%%time

param = {'objective': 'multi:softprob',  # error evaluation for multiclass training
         'num_class': 6,
         'tree_method': 'gpu_hist',
         'n_jobs': -1,
         'use_label_encoder': False,
         'eval_metric': 'mlogloss'
        }

xgb_clf = xgb.XGBClassifier(**param)
grid_xgb = GridSearchCV(xgb_clf, {'max_depth': np.linspace(3, 20, num=2, dtype=int),
                                  'n_estimators': np.linspace(3, 15, num=2, dtype=int),
                                  'eta': np.linspace(1e-2, 0.5, num=2, dtype=float)
                                 },
                        n_jobs=-1, scoring='accuracy', cv=7, verbose=3)
grid_xgb.fit(train_df_pr_sc_win, train_labels)
print(grid_xgb.best_score_)
print(grid_xgb.best_params_)

In [None]:
# Finer XGBoost grid search: 5 x 7 x 20 candidates with 2-fold cross-validation.
param = {'objective': 'multi:softprob',  # error evaluation for multiclass training
         'num_class': 6,
         'tree_method': 'gpu_hist'
        }

xgb_clf = xgb.XGBClassifier(**param)
grid_xgb = GridSearchCV(xgb_clf, {'max_depth': np.linspace(5, 10, num=5, dtype=int),
                                  'n_estimators': np.linspace(3, 10, num=7, dtype=int),
                                  'eta': np.linspace(0.001, 0.3, num=20, dtype=float)
                                 },
                        n_jobs=-1, scoring='accuracy', cv=2, verbose=3)
# Fit on the scaled training frame produced by `scaler` earlier in the notebook.
grid_xgb.fit(train_df_pr_sc, train_labels)
print(grid_xgb.best_score_)
print(grid_xgb.best_params_)

In [None]:
# Best estimator from the most recently run XGBoost grid search.
xgb_clf_best = grid_xgb.best_estimator_

In [None]:
# Score the best grid-search estimator on the scaled frames it was trained on
# (`train_df_pr_sc` / `test_df_pr_sc`, produced by `scaler` earlier in the notebook).
train_labels_hat = xgb_clf_best.predict(train_df_pr_sc)
test_labels_hat = xgb_clf_best.predict(test_df_pr_sc)

test_acc_xgb = accuracy_score(test_labels, test_labels_hat)
train_acc_xgb = accuracy_score(train_labels, train_labels_hat)

print('XG boost Classifier')
print(f'test  accuracy = {test_acc_xgb: .2f}\ntrain accuracy = {train_acc_xgb: .2f}')

## Classification results
Classical approaches provide results that outperform the baseline given in the article, but...

- As shown in the visualization, there are a lot of loops; a model that does not consider information from the whole trace seems to be useless for end-time prediction.
- Accuracy is what has been used for benchmarking in the papers. Maybe it is not the best metric? Who knows; nevertheless, this is the benchmark.

# Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as MAE

In [None]:
%%time
rfc_te = RandomForestRegressor(n_estimators=150, max_depth=17, random_state=42, n_jobs=-1)

rfc_te.fit(train_df_pr_sc_win, train_tes)
train_tes_hat = rfc_te.predict(train_df_pr_sc_win)
test_tes_hat = rfc_te.predict(test_df_pr_sc_win)

test_mae_rfr = MAE(test_tes_hat * scaler.scales['te'], test_tes * scaler.scales['te'])
train_mae_rfr = MAE(train_tes_hat * scaler.scales['te'], train_tes * scaler.scales['te'])
print(f'Random Forest Classifier\ntest  MAE = {test_mae_rfr / 60 / 60: .2f}h\ntrain MAE = {train_mae_rfr / 60 / 60: .2f}h')

Again, better than in the papers.

In [None]:
import xgboost as xgb
# XGBoost regressor for the `te_true` target.
param = {'max_depth': 17,  # the maximum depth of each tree
         'eta': 0.3,  # the training step for each iteration
         'n_jobs': -1,
         'n_estimators': 7
        }

xgb_reg = xgb.XGBRegressor(**param)
# `train_df_pr_sc` / `test_df_pr_sc` are the scaled frames produced by `scaler`
# earlier in the notebook.
xgb_reg.fit(train_df_pr_sc, train_tes)

train_tes_hat = xgb_reg.predict(train_df_pr_sc)
test_tes_hat = xgb_reg.predict(test_df_pr_sc)

# Both targets and predictions are multiplied by the scaler's 'te' scale before
# the error is computed - presumably this converts back to seconds, which the
# `/ 3600 / 24` below then reports as days (TODO confirm units).
te_scale = scaler.scales['te']
test_mae_xgb = MAE(test_tes * te_scale, test_tes_hat * te_scale)
# The train error must be computed from the train split, not the test split.
train_mae_xgb = MAE(train_tes * te_scale, train_tes_hat * te_scale)

print('XG boost Regressor')
print(f'test  MAE = {test_mae_xgb / 3600 / 24: .2f}d\ntrain MAE = {train_mae_xgb / 3600 / 24: .2f}d')

XGB is not that cool

# No long activity!