In [1]:
import pandas as pd
import numpy as np
import datetime
from IPython import display

import matplotlib.pyplot as plt

In [2]:
test_df = pd.read_csv('datasets/test_df.csv').drop(columns='Unnamed: 0')
test_df['timestamp'] = test_df['timestamp'].apply(lambda x: datetime.datetime.fromisoformat(x))

train_df = pd.read_csv('datasets/train_df.csv').drop(columns='Unnamed: 0')
train_df['timestamp'] = train_df['timestamp'].apply(lambda x: datetime.datetime.fromisoformat(x))

## Proprocessing:
For leveraging predictive models the following features are created:
- $t_e$ - time since previous event
- $t_w$ - time since the beginning of week
- $t_t$ - time since the beginning of trace
- one hot encoded labels

In [3]:
import rl4pm_lib.preprocessing as preprocessing

column_feature = {'tt': 0, 'te': 1, 'tw': 2}
prepro = preprocessing.DfPreprocesser()
prepro.fit(train_df)
train_df_pr = prepro.transform(train_df)
test_df_pr = prepro.transform(test_df)

Also must scale continious (time related features)

In [4]:
scaler = preprocessing.PaperScaler(column_feature)
scaler.fit(train_df_pr)
train_df_pr_sc = scaler.transform(train_df_pr)
test_df_pr_sc = scaler.transform(test_df_pr)

Let's construct features as moving window

In [5]:
win_len = 2
max_window_len = 5

In [6]:
def make_window_features_for_trace(df, win_len):
    _win_len = win_len-1
    out = df[_win_len:].copy()
    out.reset_index(drop=True, inplace=True)
    sh = df.shape[0]
    for _i in range(win_len-1):
        df_to_app = df[_i:sh-_win_len + _i].copy()
        shape_miss = df.shape[0] - df_to_app.shape[0]
        
        rename_dict = {col: col + f'__{_i+1}' for col in df_to_app.columns}
        df_to_app.rename(columns=rename_dict, inplace=True)
        df_to_app.reset_index(drop=True, inplace=True)
        
        out = pd.concat([out, df_to_app], axis=1)
    out.dropna(inplace=True)
    shape_miss = df.shape[0] - out.shape[0]
    return out

In [7]:
def make_window_features(df, win_len):
    traces = list(set(df['trace_id'].values))
    outs = []
    labels = []
    tes = []
    for _i, trace in enumerate(traces):
        _df = df[df['trace_id'] == trace]
        outs.append(make_window_features_for_trace(_df, win_len)[:-1])  # one must left 4 prediction
        labels.append(_df.drop(columns=['te', 'tt', 'tw', 'trace_id']).values.argmax(axis=1)[win_len:])
        tes.append(_df['te'].values[win_len:])
    return pd.concat(outs, axis=0), np.concatenate(labels), np.concatenate(tes)

In [8]:
test_df_pr_sc_win, test_labels, test_tes = make_window_features(test_df_pr_sc, win_len)
test_df_pr_sc_win.drop(columns=['trace_id'], inplace=True)

In [9]:
train_df_pr_sc_win, train_labels, train_tes = make_window_features(train_df_pr_sc, win_len)
train_df_pr_sc_win.drop(columns=['trace_id'], inplace=True)

Ok, features and targets are ready, let's do `.fit()`, `.predict())))))`

What methods will I use? Of course all, which were listed [here](https://github.com/lemikhovalex/DA_CV)

# Classification
## Random Forest Classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [11]:
%%time
rfc_activ = RandomForestClassifier(n_estimators=130, max_depth=17, random_state=42, n_jobs=-1)

rfc_activ.fit(train_df_pr_sc_win, train_labels)
train_labels_hat = rfc_activ.predict(train_df_pr_sc_win)
test_labels_hat = rfc_activ.predict(test_df_pr_sc_win)

test_acc_rfc = accuracy_score(test_labels_hat, test_labels)
train_acc_rfc = accuracy_score(train_labels_hat, train_labels)
print(f'Random Forest Classifier\ntest  accuracy = {test_acc_rfc: .2f}\ntrain accuracy = {train_acc_rfc: .2f}')

Random Forest Classifier
test  accuracy =  0.84
train accuracy =  0.85
Wall time: 8.18 s


In [18]:
from sklearn.model_selection import GridSearchCV

In [13]:
%%time
param_grid = { 
    'n_estimators': np.linspace(100, 200, num=40, dtype=int),
    'max_depth' : np.linspace(7, 25, num=6, dtype=int),
}

model_gs = RandomForestClassifier(random_state=42)
CV_rfc = GridSearchCV(estimator=model_gs, param_grid=param_grid, scoring='accuracy', cv=2, n_jobs=-1)
CV_rfc.fit(train_df_pr_sc_win, train_labels)

Wall time: 23min 54s


GridSearchCV(cv=2, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': array([ 7, 10, 14, 17, 21, 25]),
                         'n_estimators': array([100, 102, 105, 107, 110, 112, 115, 117, 120, 123, 125, 128, 130,
       133, 135, 138, 141, 143, 146, 148, 151, 153, 156, 158, 161, 164,
       166, 169, 171, 174, 176, 179, 182, 184, 187, 189, 192, 194, 197,
       200])},
             scoring='accuracy')

In [14]:
CV_rfc.best_params_

{'max_depth': 17, 'n_estimators': 115}

In [15]:
%%time
rfc_activ_best = CV_rfc.best_estimator_
rfc_activ_best.fit(train_df_pr_sc_win, train_labels)

train_labels_hat = rfc_activ_best.predict(train_df_pr_sc_win)
test_labels_hat = rfc_activ_best.predict(test_df_pr_sc_win)

test_acc_rfc = accuracy_score(test_labels_hat, test_labels)
train_acc_rfc = accuracy_score(train_labels_hat, train_labels)

print(f'Random Forest Classifier, after grid search')
print(f'test  accuracy = {test_acc_rfc: .2f}\ntrain accuracy = {train_acc_rfc: .2f}')

Random Forest Classifier, after grid search
test  accuracy =  0.84
train accuracy =  0.85
Wall time: 32.9 s


Resuls some how are beter then in article on LSTM approach

## K Neighbors Classifier

In [31]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

In [34]:
xgb_clf = KNeighborsClassifier()
grid_xgb = GridSearchCV(xgb_clf, {'n_neighbors': np.linspace(3, 30, 6, dtype=int),
                                  'weights': ['distance', 'uniform']
                                 },
                        n_jobs=-1, scoring='accuracy', cv=2, verbose=3)
grid_xgb.fit(train_df_pr_sc_win, train_labels)
print(grid_xgb.best_score_)
print(grid_xgb.best_params_)

Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  24 | elapsed:   10.5s remaining:   14.8s
[Parallel(n_jobs=-1)]: Done  19 out of  24 | elapsed:   15.4s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   17.1s finished


0.1542316506107193
{'n_neighbors': 30, 'weights': 'distance'}


Ok, next

## Grad Boost

In [19]:
import xgboost as xgb

In [20]:
param = {'max_depth': 17,  # the maximum depth of each tree
         'eta': 0.3,  # the training step for each iteration
         'objective': 'multi:softprob',  # error evaluation for multiclass training
         'num_class': 24,
         'n_jobs': -1,
         'n_estimators': 7  ,
         'tree_method': 'gpu_hist'       
        }

xgb_clf = xgb.XGBClassifier(**param)
xgb_clf.fit(train_df_pr_sc_win, train_labels)

train_labels_hat = xgb_clf.predict(train_df_pr_sc_win)
test_labels_hat = xgb_clf.predict(test_df_pr_sc_win)

test_acc_xgb = accuracy_score(test_labels_hat, test_labels)
train_acc_xgb = accuracy_score(train_labels_hat, train_labels)

print(f'XG boost Classifier')
print(f'test  accuracy = {test_acc_xgb: .2f}\ntrain accuracy = {train_acc_xgb: .2f}')

XG boost Classifier
test  accuracy =  0.85
train accuracy =  0.89


In [24]:
param = {'objective': 'multi:softprob',  # error evaluation for multiclass training
         'num_class': 24,
         'tree_method': 'gpu_hist'
        }

xgb_clf = xgb.XGBClassifier(**param)
grid_xgb = GridSearchCV(xgb_clf, {'max_depth': np.linspace(3, 20, num=15, dtype=int),
                                  'n_estimators': np.linspace(3, 15, num=6, dtype=int),
                                  'eta': np.linspace(1e-2, 0.5, num=10, dtype=float)
                                 },
                        n_jobs=-1, scoring='accuracy', cv=2, verbose=3)
grid_xgb.fit(train_df_pr_sc_win, train_labels)
print(grid_xgb.best_score_)
print(grid_xgb.best_params_)

Fitting 2 folds for each of 900 candidates, totalling 1800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 488 tasks      | elapsed: 28.2min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 48.2min
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed: 71.7min
[Parallel(n_jobs=-1)]: Done 1544 tasks      | elapsed: 97.0min
[Parallel(n_jobs=-1)]: Done 1800 out of 1800 | elapsed: 115.2min finished


0.8247602732106704
{'eta': 0.01, 'max_depth': 7, 'n_estimators': 7}


In [28]:
param = {'objective': 'multi:softprob',  # error evaluation for multiclass training
         'num_class': 24,
         'tree_method': 'gpu_hist'
        }

xgb_clf = xgb.XGBClassifier(**param)
grid_xgb = GridSearchCV(xgb_clf, {'max_depth': np.linspace(5, 10, num=5, dtype=int),
                                  'n_estimators': np.linspace(3, 10, num=7, dtype=int),
                                  'eta': np.linspace(0.001, 0.3, num=20, dtype=float)
                                 },
                        n_jobs=-1, scoring='accuracy', cv=2, verbose=3)
grid_xgb.fit(train_df_pr_sc_win, train_labels)
print(grid_xgb.best_score_)
print(grid_xgb.best_params_)

Fitting 2 folds for each of 700 candidates, totalling 1400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 488 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 15.8min
[Parallel(n_jobs=-1)]: Done 1128 tasks      | elapsed: 23.2min
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed: 28.9min finished


0.8249605090439851
{'eta': 0.07968421052631579, 'max_depth': 6, 'n_estimators': 8}


In [29]:
xgb_clf_best = grid_xgb.best_estimator_

In [30]:
train_labels_hat = xgb_clf_best.predict(train_df_pr_sc_win)
test_labels_hat = xgb_clf_best.predict(test_df_pr_sc_win)

test_acc_xgb = accuracy_score(test_labels_hat, test_labels)
train_acc_xgb = accuracy_score(train_labels_hat, train_labels)

print(f'XG boost Classifier')
print(f'test  accuracy = {test_acc_xgb: .2f}\ntrain accuracy = {train_acc_xgb: .2f}')

XG boost Classifier
test  accuracy =  0.84
train accuracy =  0.83


## Classification results
Classique approaches povides results, which outperforms result, given in article as baseline, but...

- As been shown in visualization, there are a lot of loops, model that doesnt consern information on all the trail seems to be useless for end time prediction
- Accuracy is what been used for benchmarking in papers. Maybe it is not the best metric? Who knows, nevertheless this is a bench

# Regression

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as MAE

In [41]:
%%time
rfc_te = RandomForestRegressor(n_estimators=150, max_depth=17, random_state=42, n_jobs=-1)

rfc_te.fit(train_df_pr_sc_win, train_tes)
train_tes_hat = rfc_te.predict(train_df_pr_sc_win)
test_tes_hat = rfc_te.predict(test_df_pr_sc_win)

test_mae_rfr = MAE(test_tes_hat * scaler.scales['te'], test_tes * scaler.scales['te'])
train_mae_rfr = MAE(train_tes_hat * scaler.scales['te'], train_tes * scaler.scales['te'])
print(f'Random Forest Classifier\ntest  MAE = {test_mae_rfr / 60 / 60: .2f}h\ntrain MAE = {train_mae_rfr / 60 / 60: .2f}h')

Random Forest Classifier
test  MAE =  10.82h
train MAE =  7.58h
Wall time: 27.3 s


Again better then papers)

In [54]:
import xgboost as xgb
param = {'max_depth': 17,  # the maximum depth of each tree
         'eta': 0.3,  # the training step for each iteration
         'n_jobs': -1,
         'n_estimators': 7      
        }

xgb_reg = xgb.XGBRegressor(**param)
xgb_reg.fit(train_df_pr_sc_win, train_tes)

train_tes_hat = xgb_reg.predict(train_df_pr_sc_win)
test_tes_hat = xgb_reg.predict(test_df_pr_sc_win)

test_mae_xgb = MAE(test_tes_hat * scaler.scales['te'], test_tes * scaler.scales['te'])
train_mae_xgb = MAE(test_tes_hat * scaler.scales['te'], test_tes * scaler.scales['te'])

print(f'XG boost Regressor')
print(f'Random Forest Classifier\ntest  MAE = {test_mae_xgb / 3600 / 24: .2f}d\ntrain MAE = {train_mae_xgb / 3600 / 24: .2f}d')

XG boost Regressor
Random Forest Classifier
test  MAE =  4.32d
train MAE =  4.32d


XGB is not that cool

# No long activity!