# Evaluate benchmark based on PMLB datasets

## Read artifacts

In [17]:
import os
import zipfile
import pandas as pd
import io

# Collect every result CSV stored inside the .zip archives found in `adir`,
# combine them into one DataFrame and save the combined table to disk.
adir = '../wf_artifacts'
tables = []
artifacts_count = 0

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the combined table (and of the CSV written below) the same on every run.
for f in sorted(os.listdir(adir)):
    if not f.endswith('.zip'):
        continue
    path = os.path.join(adir, f)

    with zipfile.ZipFile(path, 'r') as zip_ref:
        artifacts_count += 1
        # Members are read in archive order; only the .csv entries hold results.
        for member in zip_ref.namelist():
            if member.endswith('.csv'):
                with zip_ref.open(member) as csv_file:
                    tables.append(pd.read_csv(io.TextIOWrapper(csv_file)))

# Number of CSV tables actually read (previously this stayed at 0).
csv_count = len(tables)
if not tables:
    # pd.concat([]) would fail with an opaque "No objects to concatenate".
    raise ValueError(f'no CSV results found in any .zip archive under {adir!r}')

# ignore_index=True renumbers the rows 0..n-1 across all concatenated tables.
df_results = pd.concat(tables, ignore_index=True)
df_results.to_csv('../results/pmlb_results.csv')
print(f'procesed {artifacts_count} artifacts with {csv_count} results.')

procesed 7 artifacts with 980 results.


In [18]:
# Preview the first five rows of the combined results table.
df_results.head(n=5)

Unnamed: 0,dataset,estimator,time,model_string,random_seed,train_log_loss,train_roc_auc_score,train_accuracy_score,test_log_loss,test_roc_auc_score,test_accuracy_score,est_params
0,adult,CatBoost,48.27582,,57302,0.245185,0.945539,0.885701,0.270222,0.931962,0.879517,"{""learning_rate"": 0.027444514393716094, ""depth..."
1,adult,CatBoost,14.672965,,92067,0.24769,0.943395,0.886008,0.284197,0.927506,0.871635,"{""learning_rate"": 0.06489778555799723, ""depth""..."
2,adult,CatBoost,14.076289,,33585,0.25401,0.94052,0.883858,0.274802,0.930852,0.875525,"{""learning_rate"": 0.04705587211479468, ""depth""..."
3,adult,CatBoost,14.730036,,41729,0.25443,0.940453,0.882784,0.275142,0.928517,0.876651,"{""learning_rate"": 0.03821966920492691, ""depth""..."
4,adult,CatBoost,19.638265,,66580,0.24657,0.944759,0.886546,0.269056,0.932059,0.878186,"{""learning_rate"": 0.03894519547731018, ""depth""..."


In [19]:
# Distinct estimator names, in order of first appearance in the results.
estimator_column = df_results['estimator']
estimators = estimator_column.unique()
estimators

array(['CatBoost', 'DecisionTree', 'KNeighbors', 'LGBM',
       'LogisticRegression', 'RandomForest', 'XGB'], dtype=object)

In [20]:
# Print, for each estimator, the mean of every test metric over all its rows.
metric_columns = ('test_log_loss', 'test_roc_auc_score', 'test_accuracy_score')
for est in estimators:
    rows = df_results.loc[df_results['estimator'] == est]
    # One Series.mean() per metric column, in the order listed above.
    log_loss_mean, roc_auc_mean, accuracy_mean = (
        rows[column].mean() for column in metric_columns
    )
    print(f'{est} {log_loss_mean} {roc_auc_mean} {accuracy_mean}')

CatBoost 0.2799517491768882 0.8655151257668129 0.8554791452441571
DecisionTree 0.40246649514397026 0.7567860731215051 0.7676632244629715
KNeighbors 0.4253110663912993 0.7806517650517587 0.7766611497479942
LGBM 0.29747967685378846 0.8535790478177493 0.8442971565457359
LogisticRegression 0.4050428244224626 0.787851548966885 0.7953859643589313
RandomForest 0.33643924767540445 0.8342741796636507 0.8248544960519376
XGB 0.29022756115696335 0.8571114472113562 0.8471373743469862


## TODO: Remove this 
Results from estimators that were not tuned.

In [21]:
# Load the previously saved results of the not-tuned estimators and list
# the distinct estimator names they contain.
not_tuned_path = '../results/pmlb_results_not_tuned.csv'
df_resuts_not_tuned = pd.read_csv(not_tuned_path)
not_tuned_estimator_column = df_resuts_not_tuned['estimator']
not_tuned_estimators = not_tuned_estimator_column.unique()
not_tuned_estimators

array(['RILS-ROLS', 'HROCH', 'CatBoost', 'GradientBoosting', 'LGBM',
       'XGB', 'DecisionTree', 'LogisticRegression', 'KNeighbors',
       'RandomForest', 'SVC'], dtype=object)

In [23]:
# Print, for each not-tuned estimator, the mean of every test metric over all its rows.
metric_columns = ('test_log_loss', 'test_roc_auc_score', 'test_accuracy_score')
for est in not_tuned_estimators:
    rows = df_resuts_not_tuned.loc[df_resuts_not_tuned['estimator'] == est]
    # One Series.mean() per metric column, in the order listed above.
    log_loss_mean, roc_auc_mean, accuracy_mean = (
        rows[column].mean() for column in metric_columns
    )
    print(f'{est} {log_loss_mean} {roc_auc_mean} {accuracy_mean}')

RILS-ROLS 0.5749358822023897 0.7883113606820921 0.8504784648330119
HROCH 0.29211639416661506 0.8798953087212932 0.8637305122727116
CatBoost 0.30405095366919416 0.8589860391296925 0.8473960362575694
GradientBoosting 0.35367304251436066 0.8056949805403602 0.8018813394469697
LGBM 0.32189060740532 0.8532837855967486 0.8422880513038286
XGB 0.3703251985011235 0.8436586151985429 0.8386334213627732
DecisionTree 7.431414500519904 0.7480900426762037 0.7904900581073104
LogisticRegression 0.4700994127252017 0.775308712498848 0.7909953147052841
KNeighbors 1.3434050552780068 0.7432619332829737 0.763191336086232
RandomForest 0.3778169318182072 0.8290807953615884 0.8188584612106503
SVC 0.3861992731830962 0.7651071056760607 0.7716504647502572


## Evaluate
TODO: implement this