In [1]:
# Development convenience: with autoreload in mode 2, imported modules are
# reloaded before each cell runs, so edits to library code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Print the installed versions of the packages this notebook exercises,
# so the recorded outputs can be tied to a specific environment.
%load_ext watermark
%watermark --packages numpy,scikit-learn,scikit-lego

numpy       : 1.26.1
scikit-learn: 1.3.2
scikit-lego : 0.7.4



In [3]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_regression
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

from sklego.datasets import load_hearts
from sklego.meta._grouped_utils import relative_shrinkage, min_n_obs_shrinkage
from sklego.meta.grouped_predictor import GroupedPredictor


# Regression

In [4]:
# Synthetic regression data: within each (g1, g2) group the target is an exact
# linear function of "a" and "b", with the slope on "a" given by g1 + g2.
size = 1000
rng = np.random.default_rng(42)  # fixed seed so a fresh "Run All" reproduces the scores
df = (pd.DataFrame(rng.standard_normal((size, 4)), columns=list("abcd"))
    .assign(
        g1 = rng.integers(0, 2, size),
        g2 = rng.integers(0, 4, size),
        target = lambda t: (t["g1"] + t["g2"]) * t["a"] + t["b"]
    ))

target = "target"
groups = ["g1", "g2"]

X, y = df.drop(columns=target), df[target]

# Prediction frame: identical to X except for the last row, whose "g2" value (4)
# is not in the training data, hence the fallback method is used for that row.
# It does not depend on the loop variables, so it is built only once.
X_pred = X.copy()
X_pred.loc[size - 1, "g2"] = 4

# Try every supported (use_global_model, fallback_method) pairing, with and
# without shrinkage, and report the score and the keys of the fitted estimators.
for shrinkage in (None, "relative"):
    for use_global, fallback in [(True, "global"), (True, "next"), (False, "next"), (False, "raise")]:

        print(f"shrinkage: {shrinkage}, use_global: {use_global}, fallback: {fallback}")

        try:
            model = GroupedPredictor(
                LinearRegression(),
                groups=groups,
                use_global_model=use_global,
                shrinkage=shrinkage,
                fallback_method=fallback
            )
            model.fit(X, y)

            preds = model.predict(X_pred)
            print(r2_score(y, preds))
            print(model.estimators_.keys())

        except KeyError as e:
            # The exception message alone is just the offending key, so also
            # print the exception type to make the failure recognisable.
            print(f"{type(e).__name__}: {e}")

        print()
# The case (shrinkage: None, use_global: False, fallback: raise) breaks with a KeyError,
# but the error message itself is unclear.


shrinkage: None, use_global: True, fallback: global
0.9999977050654287
dict_keys([(1,), (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3), (1, 1, 0), (1, 1, 1), (1, 1, 2), (1, 1, 3)])

shrinkage: None, use_global: True, fallback: next
0.9999959205842953
dict_keys([(1,), (1, 0), (1, 1), (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3), (1, 1, 0), (1, 1, 1), (1, 1, 2), (1, 1, 3)])

shrinkage: None, use_global: False, fallback: next
0.9999959205842953
dict_keys([(0,), (1,), (0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 1), (1, 2), (1, 3)])

shrinkage: None, use_global: False, fallback: raise
2

shrinkage: relative, use_global: True, fallback: global
0.8219989950988353
dict_keys([(1,), (1, 0), (1, 1), (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3), (1, 1, 0), (1, 1, 1), (1, 1, 2), (1, 1, 3)])

shrinkage: relative, use_global: True, fallback: next
0.8219984523950433
dict_keys([(1,), (1, 0), (1, 1), (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3), (1, 1, 0), (1, 1, 1), (1, 1, 2), (1, 1, 3)])

shrinkage: relative, u

# Classification

This first case is the [one reported](https://github.com/koaning/scikit-lego/issues/616) that "triggered" this whole rework. On top of the original setup, I am adding an extra grouping column called `random`.

In [5]:
# Hearts dataset (without the "thal" column) plus an extra grouping column of
# random integers in {0, 1, 2}; the model is then grouped on "sex" and "random".
rng = np.random.default_rng(42)  # fixed seed: the random group assignment is reproducible
df = (
    load_hearts(as_frame=True)
    .drop(columns=['thal'])
    # Size the random column from the frame itself instead of a hard-coded row count.
    .assign(random = lambda d: rng.integers(0, 3, len(d)))
    )
size = len(df)  # still exposed, since the original cell defined `size`

target = "target"
groups = ["sex", "random"]

X, y = df.drop(columns=target), df[target]

# max_iter is raised above the LogisticRegression default for this fit.
model = GroupedPredictor(
    LogisticRegression(max_iter=1000), groups=groups,
    use_global_model=True, shrinkage="relative",
).fit(X, y)

In [6]:
# Every predicted label for the training frame must be one of the two classes.
preds = model.predict(X)
assert np.isin(preds, [0, 1]).all()

In [7]:
# Each row of predicted class probabilities has to add up to one.
probas = model.predict_proba(X)
assert np.allclose(np.sum(probas, axis=1), 1.0)

## Another Classification Task 

This one follows from the case raised in [#579](https://github.com/koaning/scikit-lego/issues/579), where subgroups can have different labels.
The example is taken from the unit tests.

In [8]:
group_size = 1000
rng = np.random.default_rng(42)  # fixed seed so the displayed probabilities are reproducible

# Labels each group can take. Other configurations worth trying:
# y_choices_grpa, y_choices_grpb = [0, 1, 2], [0, 1, 2, 4]
# y_choices_grpa, y_choices_grpb = [0, 2], [0, 2]
# y_choices_grpa, y_choices_grpb = [0, 1, 2, 3], [0, 4]
y_choices_grpa, y_choices_grpb = [0, 1, 2], [0,  3]

# The first `group_size` rows belong to group "A", the remaining ones to group "B".
group_col = np.repeat(["A", "B"], group_size)
x_col = rng.normal(size=group_size * 2)
y_col = np.hstack(
[
    rng.choice(y_choices_grpa, size=group_size),
    rng.choice(y_choices_grpb, size=group_size),
]
)
df = pd.DataFrame({"group": group_col, "x": x_col, "y": y_col})

model = GroupedPredictor(
    estimator=LogisticRegression(),
    groups="group",
    use_global_model=True,
    # shrinkage="relative",
    )
X, y = (df[["group", "x"]], df["y"])
model.fit(X, y)
y_proba = model.predict_proba(X)

# Each row of probabilities must sum to one.
assert np.allclose(y_proba.sum(axis=1), 1.)

# A group must get zero probability for every label it never saw in training.
# NOTE(review): assumes the columns of y_proba follow the sorted unique labels,
# which matches the recorded output for this configuration - confirm.
labels = np.unique(y_col)
unseen_by_a = ~np.isin(labels, y_choices_grpa)
unseen_by_b = ~np.isin(labels, y_choices_grpb)
assert np.allclose(y_proba[:group_size][:, unseen_by_a], 0.)
assert np.allclose(y_proba[group_size:][:, unseen_by_b], 0.)

y_proba

array([[0.3418046 , 0.32449327, 0.33370213, 0.        ],
       [0.35561696, 0.32769173, 0.31669131, 0.        ],
       [0.34193341, 0.32452521, 0.33354138, 0.        ],
       ...,
       [0.50098164, 0.        , 0.        , 0.49901836],
       [0.51933877, 0.        , 0.        , 0.48066123],
       [0.4972803 , 0.        , 0.        , 0.5027197 ]])

As one can see, the last column (label 3) is always 0 for the first group, and the columns for labels 1 and 2 are always 0 for the second group, as expected.