In [2]:
import sys
sys.path.append("../MMPF")
sys.path.append("../scripts")

import data
import utils
import experiments
from MinimaxParetoFair.MMPF_trainer import APSTAR, SKLearn_Weighted_LLR


from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import numpy as np

%load_ext autoreload
%autoreload 2

In [25]:
# Load one train/validation/test split of the chosen dataset.
# NOTE(review): the positional arguments (5, 10, 0) are forwarded to
# data.get_fold as-is; their meaning is not visible in this notebook --
# confirm against data.get_fold before changing them.
dataset = "german"
X_train, Y_train, X_val, Y_val, X_test, Y_test = data.get_fold(dataset, 5, 10, 0)

# Subgroup labels are computed here from the raw feature frames, i.e. before
# the preprocessing below overwrites X_train / X_val / X_test.
A_train, A_val, A_test = experiments.get_subgroup_feature(
    dataset, X_train, X_val, X_test, 4
)

# Standardise the numeric columns and one-hot encode the categorical ones.
numeric_step = ("numeric", StandardScaler(), data.NUM_FEATURES[dataset])
categorical_step = (
    "categorical",
    OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="ignore"),
    data.CAT_FEATURES[dataset],
)
# set_output returns the transformer itself, so it can be chained; the
# transformed splits come back as pandas DataFrames.
col_trans = ColumnTransformer(
    [numeric_step, categorical_step],
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Fit on the training split only, then apply the same fitted transform to
# every split.
preprocess = Pipeline([("preprocess", col_trans)])
preprocess.fit(X_train)
X_train, X_val, X_test = (
    preprocess.transform(split) for split in (X_train, X_val, X_test)
)

{'Female_False': 0, 'Male_False': 1, 'Male_True': 2, 'Female_True': 3}


In [26]:
# Hand the learner plain arrays (features, labels, subgroup ids) for the
# training and validation splits, in that order.
train_arrays = (X_train.values, Y_train.values, A_train.values)
val_arrays = (X_val.values, Y_val.values, A_val.values)

model = SKLearn_Weighted_LLR(*train_arrays, *val_arrays, C_reg=1e-2)

In [27]:
# Start from uniform weights: one equal share per distinct value in A_train.
n_groups = len(A_train.unique())
mu_ini = np.ones(n_groups) / n_groups

# NOTE(review): max_patience equals niter here, so the patience limit is not
# reached before the iteration limit (see the recorded output) -- confirm
# this is intended.
apstar_kwargs = dict(
    niter=200, max_patience=200, Kini=1, Kmin=20, alpha=0.5, verbose=False
)
results = APSTAR(model, mu_ini, **apstar_kwargs)

# Refit once more using the last entry of the 'mu_best_list' APSTAR returned.
mu_best = results["mu_best_list"][-1]
model.weighted_fit(X_train, Y_train, A_train, mu_best)

patience counter: 199 total iterations: 201
-----------------------------------------


In [29]:
# Take column 1 of predict_proba (presumably the positive class -- confirm
# against the model) and threshold it at 0.5 to get hard 0/1 predictions.
pos_proba = model.predict_proba(X_test)[:, 1]
Y_test_pred = (pos_proba > 0.5).astype(int)

In [30]:
# NOTE(review): the recorded result for this cell is 0.5. If
# utils.min_balanced_accuracy is the usual per-group balanced accuracy, that
# is chance level for the worst subgroup -- worth confirming and commenting on.
min_bal_acc = utils.min_balanced_accuracy(Y_test, Y_test_pred, A_test)
min_bal_acc

0.5

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

In [None]:
class MinimaxPareto(BaseEstimator, ClassifierMixin):
    """Scikit-learn style wrapper around ``SKLearn_Weighted_LLR`` + ``APSTAR``.

    ``fit`` builds a ``SKLearn_Weighted_LLR`` model, runs ``APSTAR`` starting
    from uniform subgroup weights, and refits the model with the last entry of
    the ``'mu_best_list'`` that ``APSTAR`` returns.

    Parameters
    ----------
    n_iterations : int, default=100
        Passed to ``APSTAR`` as ``niter``.
    C : float, default=1.0
        Passed to ``SKLearn_Weighted_LLR`` as ``C_reg``.
    max_patience : int, default=200
        Passed to ``APSTAR`` as ``max_patience``. With the defaults it is
        larger than ``n_iterations``, so lower it if the patience limit is
        meant to be reachable.

    Attributes
    ----------
    classes_ : ndarray
        Sorted unique labels seen in ``fit``.
    model : SKLearn_Weighted_LLR
        The fitted underlying model.
    """

    def __init__(
        self,
        n_iterations=100,
        C=1.0,
        max_patience=200,
    ):
        self.n_iterations = n_iterations
        self.C = C
        self.max_patience = max_patience

    def fit(self, X, y, sensitive_attribute):
        """Fit the model on ``X``/``y`` with one subgroup label per sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
        sensitive_attribute : array-like of shape (n_samples,)
            Subgroup label of every sample (pandas Series, list or ndarray).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``sensitive_attribute`` does not have one entry per row of ``X``
            (in addition to whatever ``check_X_y`` raises).
        """
        # check_X_y returns validated NumPy arrays, so from here on X and y
        # are ndarrays and must not be accessed through pandas attributes.
        X, y = check_X_y(X, y)
        groups = np.asarray(sensitive_attribute).ravel()
        if groups.shape[0] != X.shape[0]:
            raise ValueError(
                "sensitive_attribute has %d entries but X has %d samples"
                % (groups.shape[0], X.shape[0])
            )
        self.classes_ = np.unique(y)

        # The same data is supplied as both the training and the validation
        # split of the underlying model.
        model = SKLearn_Weighted_LLR(
            X,
            y,
            groups,
            X,
            y,
            groups,
            C_reg=self.C,
        )

        # Uniform starting weights, one per distinct subgroup value.
        mu_ini = np.ones(len(np.unique(groups)))
        mu_ini /= mu_ini.sum()
        results = APSTAR(
            model,
            mu_ini,
            niter=self.n_iterations,
            max_patience=self.max_patience,
            Kini=1,
            Kmin=20,
            alpha=0.5,
            verbose=False,
        )

        mu_best = results["mu_best_list"][-1]
        model.weighted_fit(X, y, groups, mu_best)
        self.model = model
        return self

    def predict(self, X):
        """Return hard 0/1 predictions by thresholding column 1 of
        ``predict_proba`` at 0.5."""
        check_is_fitted(self)
        X = check_array(X)
        predictions = self.predict_proba(X)[:, 1]
        return (predictions > 0.5).astype(int)

    def predict_proba(self, X):
        """Return the underlying model's ``predict_proba`` output for ``X``."""
        check_is_fitted(self)
        X = check_array(X)
        return self.model.predict_proba(X)