In [9]:
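# Prerequisite: `OrdinalClassifier` is imported from `sklego.meta` below; it is presumably only
# available from this development branch (verify against the installed scikit-lego release).
# Uncomment to install into the kernel's environment:
# %pip install git+https://github.com/FBruzzesi/scikit-lego.git@ordinal-classification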

In [10]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import classification_report, balanced_accuracy_score, f1_score, make_scorer

from sklearn import clone

from sklego.meta import OrdinalClassifier

In [11]:
# Stata file read directly over HTTP from the UCLA IDRE statistics site.
url = "https://stats.idre.ucla.edu/stat/data/ologit.dta"

# The categorical "apply" column is replaced by its integer category codes
# so that it can be used as a numeric target.
df = pd.read_stata(url).assign(apply=lambda frame: frame["apply"].cat.codes)

target = "apply"
# Every column other than the target is a feature.
features = [column for column in df.columns if column != target]

X = df[features].to_numpy()
y = df[target].to_numpy()

In [12]:
def score_estimator(estimator, X, y, scoring) -> pd.DataFrame:
    """Cross-validate ``estimator`` and return its test scores as a DataFrame.

    Parameters
    ----------
    estimator
        Estimator handed to ``cross_validate``.
    X, y
        Features and target handed to ``cross_validate``.
    scoring : dict
        Mapping of metric name to scorer. Its keys select the ``test_<name>``
        entries of the cross-validation result and become the column names.

    Returns
    -------
    pd.DataFrame
        One column per key of ``scoring``, holding the ``test_<name>`` values
        produced by ``cross_validate`` (run with ``cv=5`` and ``n_jobs=-1``).
    """
    cv_results = cross_validate(estimator, X, y, cv=5, scoring=scoring, n_jobs=-1)

    # "test_<metric>" -> "<metric>"; timing entries are deliberately left out.
    test_to_metric = {f"test_{metric}": metric for metric in scoring.keys()}

    test_scores = pd.DataFrame(cv_results).loc[:, list(test_to_metric)]
    return test_scores.rename(columns=test_to_metric)

def compare_meta_models(base_estimator, X, y, scoring) -> pd.DataFrame:
    """Score ``base_estimator`` wrapped as ordinal and as one-vs-rest classifier.

    A fresh clone of ``base_estimator`` is wrapped first in an
    ``OrdinalClassifier`` (with ``use_calibration=True``) and then in a
    ``OneVsRestClassifier``; each wrapper is scored with ``score_estimator``.

    Parameters
    ----------
    base_estimator
        Estimator that is cloned and wrapped by both meta-models.
    X, y
        Features and target forwarded to ``score_estimator``.
    scoring : dict
        Mapping of metric name to scorer, forwarded to ``score_estimator``.

    Returns
    -------
    pd.DataFrame
        Both score tables joined on their index. Ordinal-classifier columns
        carry the ``_oc`` suffix, one-vs-rest columns the ``_ovr`` suffix, and
        the columns are sorted by name.
    """
    ordinal_model = OrdinalClassifier(clone(base_estimator), use_calibration=True, n_jobs=-1)
    ordinal_scores = score_estimator(ordinal_model, X, y, scoring)

    one_vs_rest_model = OneVsRestClassifier(clone(base_estimator), n_jobs=-1)
    one_vs_rest_scores = score_estimator(one_vs_rest_model, X, y, scoring)

    combined = ordinal_scores.merge(
        one_vs_rest_scores,
        left_index=True,
        right_index=True,
        suffixes=["_oc", "_ovr"],
    )

    # Sorting by name places each metric's "_oc" and "_ovr" columns side by side.
    column_order = sorted(combined.columns)
    return combined.reindex(columns=column_order)

def fit_predict_meta_models(base_estimator, X, y):
    """Fit three meta-models on ``(X, y)`` and return their probabilities on ``X``.

    Each meta-model wraps its own clone of ``base_estimator`` and is evaluated
    on the same data it was fitted on.

    Parameters
    ----------
    base_estimator
        Estimator that is cloned once per meta-model.
    X, y
        Data used both for fitting and for ``predict_proba``.

    Returns
    -------
    tuple
        ``(oc_cal_preds, oc_no_cal_preds, ovr_preds)``: the ``predict_proba``
        output of the ``OrdinalClassifier`` with calibration, the
        ``OrdinalClassifier`` without calibration, and the
        ``OneVsRestClassifier``, in that order.
    """

    def _fit_then_predict_proba(meta_model):
        # Fit and predict on the same data.
        return meta_model.fit(X, y).predict_proba(X)

    calibrated_ordinal = _fit_then_predict_proba(
        OrdinalClassifier(clone(base_estimator), use_calibration=True, n_jobs=-1)
    )
    uncalibrated_ordinal = _fit_then_predict_proba(
        OrdinalClassifier(clone(base_estimator), use_calibration=False, n_jobs=-1)
    )
    one_vs_rest = _fit_then_predict_proba(
        OneVsRestClassifier(clone(base_estimator), n_jobs=-1)
    )

    return calibrated_ordinal, uncalibrated_ordinal, one_vs_rest

def is_monotonic(row, split_idx, strict=True):
    """Check that ``row`` rises before ``split_idx`` and falls from it onwards.

    ``row`` is cut into ``row[:split_idx]`` and ``row[split_idx:]``; the first
    part must be increasing and the second part decreasing. Parts with fewer
    than two elements satisfy their condition trivially.

    Parameters
    ----------
    row : 1-D array-like
        Values to test.
    split_idx : int
        Position at which ``row`` is cut (the peak belongs to the second part).
    strict : bool, default True
        When True (the original behaviour) equal neighbouring values break
        monotonicity. When False ties are accepted, so plateaus such as
        ``[0, 0, 1]`` or ``[0.5, 0.5, 0]`` count as monotonic.

    Returns
    -------
    numpy.bool_
        True when both parts satisfy their monotonicity condition.
    """
    left, right = np.split(row, [split_idx])
    left_steps, right_steps = np.diff(left), np.diff(right)

    if strict:
        rising, falling = left_steps > 0, right_steps < 0
    else:
        rising, falling = left_steps >= 0, right_steps <= 0

    # The result gets its own name instead of shadowing the function.
    both_hold = rising.all() & falling.all()
    return both_hold


def check_monotonicity(arr, strict=True):
    """Apply ``is_monotonic`` to every row of ``arr``, split at the row maximum.

    Parameters
    ----------
    arr : 2-D array-like
        One sequence of values per row.
    strict : bool, default True
        Forwarded to ``is_monotonic``; False accepts ties between neighbours.

    Returns
    -------
    numpy.ndarray of bool
        One flag per row, in row order. An input without rows yields an empty
        boolean array.
    """
    # np.argmax reports the first occurrence of each row's maximum.
    peak_positions = np.argmax(arr, axis=1)

    flags = [
        is_monotonic(row, split_idx, strict=strict)
        for row, split_idx in zip(arr, peak_positions)
    ]
    # An explicit dtype keeps the result boolean even when there are no rows.
    return np.array(flags, dtype=bool)


In [13]:
# Hand-made rows exercising check_monotonicity: rising, falling, a single
# interior peak, and two rows that dip and rise again.
a = np.array(
    [
        [0.1, 0.2, 0.3],
        [0.3, 0.2, 0.1],
        [0.1, 0.2, 0.1],
        [0.3, 0.1, 0.2],
        [0.2, 0.1, 0.3],
    ]
)
check_monotonicity(a)

array([ True,  True,  True, False, False])

In [14]:
# Metric name -> scorer. Note that the "accuracy" entry wraps
# balanced_accuracy_score, not plain accuracy; "f1" is the weighted-average F1.
scoring = dict(
    accuracy=make_scorer(balanced_accuracy_score),
    f1=make_scorer(f1_score, average="weighted"),
)

In [15]:
# The tree ensembles get a fixed random_state so that the scores printed below
# are repeatable across kernel restarts (ExtraTreesClassifier is randomised).
estimators = [
    LogisticRegression(),
    ExtraTreesClassifier(max_depth=5, random_state=0),
    HistGradientBoostingClassifier(max_depth=5, random_state=0),
]

for base_estimator in estimators:

    print(base_estimator.__class__.__name__)
    scores = compare_meta_models(base_estimator, X, y, scoring)
    # Summary statistics of the cross-validation scores; the median is the
    # only percentile requested.
    print(scores.describe(percentiles=[0.5]))
    print()

LogisticRegression


       accuracy_oc  accuracy_ovr     f1_oc    f1_ovr
count     5.000000      5.000000  5.000000  5.000000
mean      0.368831      0.364069  0.487874  0.481174
std       0.010307      0.007482  0.022214  0.016987
min       0.353896      0.353896  0.453290  0.453290
50%       0.374459      0.362554  0.500000  0.483333
max       0.378788      0.374459  0.506667  0.498835

ExtraTreesClassifier
       accuracy_oc  accuracy_ovr     f1_oc    f1_ovr
count     5.000000      5.000000  5.000000  5.000000
mean      0.353680      0.377056  0.462485  0.490132
std       0.014867      0.038643  0.025648  0.041356
min       0.331169      0.327922  0.433073  0.445189
50%       0.353896      0.374459  0.459792  0.501344
max       0.371212      0.424784  0.502998  0.546011

HistGradientBoostingClassifier
       accuracy_oc  accuracy_ovr     f1_oc    f1_ovr
count     5.000000      5.000000  5.000000  5.000000
mean      0.332900      0.346861  0.399589  0.453681
std       0.004630      0.074274  0.020394  0

In [21]:
# The tree ensembles get a fixed random_state so that the percentages printed
# below are repeatable across kernel restarts (ExtraTreesClassifier is randomised).
estimators = [
    LogisticRegression(),
    ExtraTreesClassifier(max_depth=5, random_state=0),
    HistGradientBoostingClassifier(max_depth=5, random_state=0),
]

for base_estimator in estimators:
    print(base_estimator.__class__.__name__)
    oc_cal_preds, oc_no_cal_preds, ovr_preds = fit_predict_meta_models(base_estimator, X, y)

    # check_monotonicity returns one boolean per row, so the mean of the flags
    # times 100 is the percentage of rows that pass.
    print("Percentage of monotonic constraint respect:\n")
    print(
        f"\tOrdinalClassifier with calibration: {100*check_monotonicity(oc_cal_preds).mean():.2f}%",
        f"\tOrdinalClassifier without calibration: {100*check_monotonicity(oc_no_cal_preds).mean():.2f}%",
        f"\tOVR Classifier: {100*check_monotonicity(ovr_preds).mean():.2f}%",
        sep="\n"
    )

    print()

LogisticRegression
Percentage of monotonic constraint respect:

	OrdinalClassifier with calibration: 100.00%
	OrdinalClassifier without calibration: 100.00%
	OVR Classifier: 100.00%

ExtraTreesClassifier
Percentage of monotonic constraint respect:

	OrdinalClassifier with calibration: 100.00%
	OrdinalClassifier without calibration: 94.50%
	OVR Classifier: 94.75%

HistGradientBoostingClassifier
Percentage of monotonic constraint respect:

	OrdinalClassifier with calibration: 100.00%
	OrdinalClassifier without calibration: 89.25%
	OVR Classifier: 89.25%

