In [13]:
# %pip install git+https://github.com/FBruzzesi/scikit-lego.git@ordinal-classification scikit-learn pandas xlrd

In [14]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import classification_report, balanced_accuracy_score, f1_score, make_scorer

from sklearn import clone

from sklego.meta import OrdinalClassifier

In [15]:
# Location of the Stata file used throughout this notebook.
url = "https://stats.idre.ucla.edu/stat/data/ologit.dta"

# Load the data and swap the categorical `apply` column for its integer category codes.
df = pd.read_stata(url).assign(
    apply=lambda frame: frame["apply"].cat.codes,
)

# `apply` is what we predict; every remaining column is used as a feature.
target = "apply"
features = [column for column in df.columns if column != target]

# Plain NumPy arrays for the estimators.
X = df[features].to_numpy()
y = df[target].to_numpy()

In [16]:
def score_estimator(estimator, X, y, scoring, cv=10, n_jobs=-1) -> pd.DataFrame:
    """Cross-validate ``estimator`` and return its per-fold test scores.

    Parameters
    ----------
    estimator
        Estimator handed to ``cross_validate``.
    X, y
        Features and target handed to ``cross_validate``.
    scoring : dict
        Mapping of metric name to scorer. The names become the column
        names of the returned frame, in the mapping's iteration order.
    cv : default 10
        Forwarded to ``cross_validate`` as ``cv``.
    n_jobs : default -1
        Forwarded to ``cross_validate`` as ``n_jobs``.

    Returns
    -------
    pd.DataFrame
        One column per entry of ``scoring`` holding the ``test_<name>``
        values reported by ``cross_validate``; timing columns are dropped.
    """
    cv_results = cross_validate(estimator, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs)

    # cross_validate reports each metric under "test_<name>"; keep only those
    # entries and strip the prefix so columns carry the bare metric name.
    return pd.DataFrame({name: cv_results[f"test_{name}"] for name in scoring})

def compare_meta_models(base_estimator, X, y, scoring) -> pd.DataFrame:
    """Score ``base_estimator`` wrapped as ordinal and as one-vs-rest classifier.

    A fresh clone of ``base_estimator`` is wrapped in ``OrdinalClassifier``
    (with calibration enabled) and another clone in ``OneVsRestClassifier``.
    Both wrappers are evaluated with ``score_estimator`` on the same data and
    scoring mapping.

    Returns
    -------
    pd.DataFrame
        The two score frames joined on their index. Columns shared by both
        frames get the suffix ``_oc`` (ordinal) or ``_ovr`` (one-vs-rest),
        and all columns are ordered alphabetically.
    """
    ordinal_model = OrdinalClassifier(clone(base_estimator), use_calibration=True, n_jobs=-1)
    ordinal_scores = score_estimator(ordinal_model, X, y, scoring)

    one_vs_rest_model = OneVsRestClassifier(clone(base_estimator), n_jobs=-1)
    one_vs_rest_scores = score_estimator(one_vs_rest_model, X, y, scoring)

    combined = ordinal_scores.merge(
        one_vs_rest_scores,
        left_index=True,
        right_index=True,
        suffixes=("_oc", "_ovr"),
    )

    # Alphabetical column order puts the two variants of each metric side by side.
    ordered_columns = sorted(combined.columns)
    return combined.loc[:, ordered_columns]

In [17]:
# Metrics computed on every cross-validation fold; the keys become column names.
# NOTE: the 'accuracy' entry is backed by balanced_accuracy_score, not plain accuracy.
scoring = {
    'accuracy': make_scorer(balanced_accuracy_score),
    "f1": make_scorer(f1_score, average="weighted"),
}

In [18]:
# Fixed seed so the randomized tree ensembles produce the same scores on every run.
RANDOM_STATE = 42

# Base learners to compare under both meta-models.
# LogisticRegression is left unseeded: per the scikit-learn docs its
# random_state only matters for the sag / saga / liblinear solvers.
estimators = [
    LogisticRegression(),
    ExtraTreesClassifier(max_depth=5, random_state=RANDOM_STATE),
    HistGradientBoostingClassifier(max_depth=5, random_state=RANDOM_STATE),
]

for base_estimator in estimators:

    # Header line, then summary statistics of the per-fold scores
    # (count, mean, std, min, median, max).
    print(base_estimator.__class__.__name__)
    scores = compare_meta_models(base_estimator, X, y, scoring)
    print(scores.describe(percentiles=[0.5]))
    print()

LogisticRegression
       accuracy_oc  accuracy_ovr      f1_oc     f1_ovr
count    10.000000     10.000000  10.000000  10.000000
mean      0.379545      0.365584   0.496891   0.480228
std       0.027217      0.012178   0.030147   0.024780
min       0.350649      0.350649   0.447083   0.447083
50%       0.377706      0.366883   0.505263   0.486443
max       0.442641      0.389610   0.530615   0.517500

ExtraTreesClassifier
       accuracy_oc  accuracy_ovr      f1_oc     f1_ovr
count    10.000000     10.000000  10.000000  10.000000
mean      0.376082      0.366342   0.482819   0.479117
std       0.052527      0.040000   0.049658   0.042576
min       0.296537      0.296537   0.405556   0.398889
50%       0.358225      0.358225   0.488406   0.480000
max       0.451299      0.449134   0.554097   0.532857

HistGradientBoostingClassifier
       accuracy_oc  accuracy_ovr      f1_oc     f1_ovr
count    10.000000     10.000000  10.000000  10.000000
mean      0.319481      0.319264   0.410795   0