In [1]:
from collections import Counter
from itertools import product
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from mlresearch.utils import set_matplotlib_style, parallel_loop
from mlresearch.utils._check_pipelines import check_random_states
from recgame.environments import BaseEnvironment
from recgame.recourse import DiCE

# Apply the shared mlresearch matplotlib style to every figure in this notebook.
# NOTE(review): the argument 16 is presumably the base font size — confirm against mlresearch.
set_matplotlib_style(16)



# Define functions

In [2]:
def get_scaler(n_agents=10_000, n_continuous=2, bias_factor=0, random_state=None):
    """
    Fit a ``MinMaxScaler`` on a freshly sampled two-group reference population.

    Each agent is assigned to group 0 or 1 with probability 0.5. The
    continuous features are drawn from a normal distribution with standard
    deviation 0.2, centred at 0 for group 0 and at ``bias_factor`` for
    group 1. Only the continuous columns (``f0``, ``f1``, ...) are used to
    fit the scaler; the group column is not part of the fitted data.

    Parameters
    ----------
    n_agents : int
        Size of the reference population.
    n_continuous : int
        Number of continuous features.
    bias_factor : float
        Mean of the continuous features for group 1.
    random_state : None, int or numpy Generator
        Forwarded to ``np.random.default_rng``.

    Returns
    -------
    MinMaxScaler
        Scaler fitted on the sampled continuous features.
    """
    rng = np.random.default_rng(random_state)
    feature_names = [f"f{i}" for i in range(n_continuous)]

    # Group membership is drawn first so the random stream is consumed in a
    # fixed order: membership, group-0 features, group-1 features.
    membership = pd.Series(rng.binomial(1, .5, n_agents), name="groups")

    per_group_frames = []
    for label, centre in ((0, 0), (1, bias_factor)):
        member_index = membership[membership == label].index
        samples = rng.normal(
            loc=centre, scale=1/5, size=(len(member_index), n_continuous)
        )
        per_group_frames.append(
            pd.DataFrame(samples, index=member_index, columns=feature_names)
        )

    # Restore the original agent ordering before fitting.
    population = pd.concat(per_group_frames).sort_index()
    return MinMaxScaler().fit(population)


In [3]:
def biased_data_generator(n_agents, n_continuous=2, bias_factor=0, scaler=None, random_state=None):
    """
    Sample a two-group population together with a group-dependent binary target.

    groups feature:
    - 0 -> Disadvantaged group
    - 1 -> Advantaged group

    ``bias_factor`` varies between [0, 1], 0 is completely unbiased, 1 is fully biased.

    Parameters
    ----------
    n_agents : int
        Number of agents to generate. ``0`` yields empty outputs.
    n_continuous : int
        Number of continuous features (columns ``f0``, ``f1``, ...).
    bias_factor : float
        Mean of the continuous features of group 1 (group 0 is centred at 0).
        It also sets the probability of a positive target:
        ``0.5 * bias_factor + 0.5`` for group 1 and the complement for group 0.
    scaler : object with a ``transform`` method, optional
        When given, it is applied to the continuous features before clipping.
    random_state : None, int or numpy Generator
        Forwarded to ``np.random.default_rng``.

    Returns
    -------
    X : pandas.DataFrame
        Continuous features plus the ``groups`` column, clipped to [0, 1].
    y : pandas.Series
        Binary target aligned with ``X.index``.
    """
    rng = np.random.default_rng(random_state)
    groups = pd.Series(rng.binomial(1, .5, n_agents), name="groups")
    counts = Counter(groups)
    continuous_cols = [f"f{i}" for i in range(n_continuous)]

    # Generate the input dataset (group 0 first, then group 1, so the random
    # stream is consumed in a fixed order).
    X_0 = pd.DataFrame(
        rng.normal(loc=0, scale=1/5, size=(counts[0], n_continuous)),
        index=groups[groups == 0].index,
        columns=continuous_cols,
    )

    X_1 = pd.DataFrame(
        rng.normal(loc=bias_factor, scale=1/5, size=(counts[1], n_continuous)),
        index=groups[groups == 1].index,
        columns=continuous_cols,
    )

    X = pd.concat([X_0, X_1]).sort_index()

    # Scale continuous features; skipped for an empty population because
    # there is nothing to transform.
    if scaler is not None and len(X) > 0:
        X.loc[:, :] = scaler.transform(X)

    X = pd.concat([X, groups], axis=1)
    X = X.clip(lower=0, upper=1)

    # Generate the target
    p1 = 0.5 * bias_factor + 0.5
    p0 = 1 - p1

    y0 = rng.binomial(1, p0, counts[0])
    y1 = rng.binomial(1, p1, counts[1])

    # Place each group's draws at that group's positions. Unlike concatenating
    # per-group pieces, this also works when the population (or a group) is empty.
    is_advantaged = groups.to_numpy() == 1
    target = np.empty(n_agents, dtype=np.int64)
    target[~is_advantaged] = y0
    target[is_advantaged] = y1
    y = pd.Series(target, index=X.index)

    return X, y

In [4]:
class IgnoreGroupLR(LogisticRegression):
    """
    Logistic regression that removes ``ignore_feature`` from the input frame
    before fitting or predicting.

    NOTE: inputs must be pandas dataframes, since columns are dropped by name.
    """

    def __init__(self, ignore_feature=None, **kwargs):
        # Remaining keyword arguments are passed straight to LogisticRegression.
        self.ignore_feature = ignore_feature
        super().__init__(**kwargs)

    def _get_X(self, X):
        """Return a copy of ``X`` without the ignored column(s)."""
        if self.ignore_feature is None:
            return X.copy()
        return X.drop(columns=self.ignore_feature)

    def fit(self, X, y):
        """NOTE: X must be a pandas dataframe."""
        features = self._get_X(X)
        super().fit(features, y)
        return self

    def predict(self, X):
        """Predict class labels using only the non-ignored columns."""
        features = self._get_X(X)
        return super().predict(features)

    def predict_proba(self, X):
        """Predict class probabilities using only the non-ignored columns."""
        features = self._get_X(X)
        return super().predict_proba(features)

In [5]:
def fairness_metrics(environment, filter_feature="groups", bins=10, advantaged_pop=1):
    """
    Compute group-disparity metrics for a simulated environment.

    Parameters
    ----------
    environment : object
        Must expose ``metadata_`` (mapping step -> dict with an ``"X"``
        DataFrame and an ``"effort"`` Series, both indexed by agent) and
        ``analysis`` with ``agents_info()`` and
        ``success_rate(filter_feature=...)``.
    filter_feature : str
        Column of ``X`` holding the group label. It is used consistently for
        every grouping in this function.
    bins : int or sequence
        Passed to ``pd.cut`` to bin the average effort of each agent.
    advantaged_pop : scalar
        Label of the reference (advantaged) group.

    Returns
    -------
    pandas.DataFrame
        One row per non-advantaged group with the columns
        ``avg_recourse_reliability_disparity`` (ratio to the advantaged group),
        ``time_for_recourse_diff`` (difference to the advantaged group) and
        ``EO`` (ratio of effort-weighted outcome rates).
    """
    steps = list(environment.metadata_.keys())

    # Get groups: keep the most recent record of every agent.
    groups = pd.concat([environment.metadata_[step]["X"][filter_feature] for step in steps])
    groups = groups[~groups.index.duplicated(keep="last")].sort_index()

    # Get time for recourse
    agents_info = environment.analysis.agents_info()
    agents_info = pd.concat([agents_info, groups], axis=1)
    agents_info["time_for_recourse"] = agents_info["favorable_step"] - agents_info["entered_step"]

    # Get fairness analysis (agents with any missing value are excluded)
    fairness_analysis = agents_info.dropna().groupby(filter_feature).mean()
    success_rates = environment.analysis.success_rate(filter_feature=filter_feature)
    fairness_analysis["avg_recourse_reliability"] = success_rates.mean()

    # Get disparity metrics relative to the advantaged group. The result frame
    # is built directly so no column slice is mutated afterwards.
    not_advantaged = fairness_analysis.index[fairness_analysis.index != advantaged_pop]
    reliability = fairness_analysis["avg_recourse_reliability"]
    recourse_time = fairness_analysis["time_for_recourse"]
    disparates = pd.DataFrame(
        {
            "avg_recourse_reliability_disparity": (
                reliability.loc[not_advantaged] / reliability.loc[advantaged_pop]
            ),
            "time_for_recourse_diff": (
                recourse_time.loc[not_advantaged] - recourse_time.loc[advantaged_pop]
            ),
        }
    )

    # Get Equality of Opportunity
    efforts = pd.concat([environment.metadata_[step]["effort"] for step in steps])
    efforts = efforts.groupby(efforts.index).mean().rename("effort")
    agents_info = pd.concat([agents_info, efforts], axis=1)
    agents_info["effort_bins"] = pd.cut(agents_info["effort"], bins)

    # Per (group, effort bin): mean effort and share of agents with a final
    # score. ``observed=True`` restricts the result to bins that contain
    # agents; empty bins contribute nothing to the sums below.
    binned = agents_info.groupby([filter_feature, "effort_bins"], observed=True)
    avg_effort = binned["effort"].mean()
    outcome_rate = binned["final_score"].count() / binned.size()

    # Effort-weighted outcome rate of every group.
    eo = (outcome_rate * avg_effort).groupby(level=0).sum() / avg_effort.groupby(level=0).sum()
    disparates["EO"] = eo.loc[not_advantaged] / eo.loc[advantaged_pop]
    return disparates

In [6]:
def _meta_simulation(params):
    """
    Run one simulation for a single hyperparameter combination.

    Parameters
    ----------
    params : sequence
        Seven values, in this order:

        N_AGENTS
        N_CONTINUOUS
        N_LOANS
        BIAS_FACTOR
        ADAPTATION
        NEW_AGENTS
        RNG_SEED

    Returns
    -------
    tuple
        ``(hyperparameters, environment)`` where ``hyperparameters`` is a dict
        keyed by the names above (``RNG_SEED`` is the seed of the run that
        succeeded) and ``environment`` is the simulated environment.

    Notes
    -----
    If the simulation raises, the run is repeated with ``RNG_SEED + 1``.
    There is no upper bound on the number of retries.
    """
    N_AGENTS, N_CONTINUOUS, N_LOANS, BIAS_FACTOR, ADAPTATION, NEW_AGENTS, RNG_SEED = params

    # A single generator is shared by the scaler and every data draw below.
    rng = np.random.default_rng(RNG_SEED)

    scaler = get_scaler(
        n_agents=10_000,
        n_continuous=N_CONTINUOUS,
        bias_factor=BIAS_FACTOR,
        random_state=rng
    )

    def env_biased_data_generator(n_agents):
        # The environment only needs the features of new agents, not the target.
        return biased_data_generator(n_agents, N_CONTINUOUS, BIAS_FACTOR, scaler, rng)[0]

    df, y = biased_data_generator(N_AGENTS, N_CONTINUOUS, BIAS_FACTOR, scaler, rng)
    categorical = ["groups"]

    model = IgnoreGroupLR(categorical, random_state=RNG_SEED).fit(df, y)

    # The fitted parameters are overwritten with fixed values.
    # NOTE(review): the coefficient shape and the action bounds below are
    # hard-coded for two continuous features — confirm before using
    # N_CONTINUOUS != 2.
    model.coef_ = np.array([[2, 2]])
    model.intercept_ = np.array([-1])

    # Define the necessary components to run simulation
    recourse = DiCE(model, categorical=["groups"], immutable=["groups"], random_state=RNG_SEED)
    recourse.set_actions(df)
    recourse.action_set_.lb = [-0.1, -0.1, 0]
    recourse.action_set_.ub = [1.1, 1.1, 1]

    environment = BaseEnvironment(
        X=df,
        recourse=recourse,
        data_source_func=env_biased_data_generator,
        threshold=N_LOANS,
        threshold_type="absolute",
        adaptation=ADAPTATION,
        behavior_function="continuous_constant",
        growth_rate=NEW_AGENTS,
        growth_rate_type="absolute",
        random_state=RNG_SEED,
    )

    try:
        environment.simulate(20)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt and
        # SystemExit still stop the run.
        print(f"Experiment failed with params {params}\nRetrying with a new random seed.")
        # ``params`` is positional (a tuple from ``itertools.product``), so the
        # retry builds a new sequence with only the seed, the last entry, bumped.
        return _meta_simulation((*params[:-1], RNG_SEED + 1))

    return (
        {
            "N_AGENTS": N_AGENTS,
            "N_CONTINUOUS": N_CONTINUOUS,
            "N_LOANS": N_LOANS,
            "BIAS_FACTOR": BIAS_FACTOR,
            "ADAPTATION": ADAPTATION,
            "NEW_AGENTS": NEW_AGENTS,
            "RNG_SEED": RNG_SEED,
        },
        environment,
    )
    


# Experiments

In [7]:
# Hyperparameter grid: every name holds the list of values explored for that
# setting. The experiments are the Cartesian product of these lists.
N_AGENTS = [100]
N_CONTINUOUS = [2]
N_LOANS = [10]
BIAS_FACTOR = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
ADAPTATION = [0.5]
NEW_AGENTS = [10]
# NOTE(review): presumably 5 seeds derived from the base seed 42 — confirm
# against mlresearch's ``check_random_states``.
RNG_SEED = check_random_states(42, 5)

In [None]:
# Run ``_meta_simulation`` once per hyperparameter combination. Each combination
# is a tuple whose order matches the unpacking at the top of ``_meta_simulation``.
results = parallel_loop(
    _meta_simulation,
    list(
        product(
            N_AGENTS, N_CONTINUOUS, N_LOANS, BIAS_FACTOR, ADAPTATION, NEW_AGENTS, RNG_SEED
        )
    ),
    n_jobs=-1,
    progress_bar=True,
)

Output()

100%|██████████| 90/90 [00:18<00:00,  4.83it/s]
100%|██████████| 90/90 [00:18<00:00,  4.79it/s]
100%|██████████| 90/90 [00:19<00:00,  4.73it/s]
100%|██████████| 90/90 [00:19<00:00,  4.69it/s]
100%|██████████| 90/90 [00:19<00:00,  4.67it/s]
100%|██████████| 90/90 [00:19<00:00,  4.66it/s]
100%|██████████| 90/90 [00:18<00:00,  4.74it/s]
100%|██████████| 90/90 [00:19<00:00,  4.60it/s]
100%|██████████| 90/90 [00:21<00:00,  4.26it/s]
100%|██████████| 90/90 [00:20<00:00,  4.30it/s]
100%|██████████| 90/90 [00:20<00:00,  4.32it/s]
100%|██████████| 90/90 [00:21<00:00,  4.15it/s]
100%|██████████| 90/90 [00:21<00:00,  4.17it/s]
100%|██████████| 90/90 [00:21<00:00,  4.15it/s]
100%|██████████| 90/90 [00:22<00:00,  4.05it/s]
100%|██████████| 90/90 [00:22<00:00,  4.08it/s]
100%|██████████| 90/90 [00:20<00:00,  4.46it/s]
100%|██████████| 90/90 [00:20<00:00,  4.38it/s]
100%|██████████| 90/90 [00:20<00:00,  4.37it/s]
100%|██████████| 90/90 [00:21<00:00,  4.18it/s]
100%|██████████| 90/90 [00:20<00:00,  4.

Only 3 (required 5)  Diverse Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
Only 3 (required 5)  Diverse Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
Only 3 (required 5)  Diverse Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
Only 3 (required 5)  Diverse Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
Only 4 (required 5)  Diverse Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
Only 4 (required 5)  Diverse Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
Only 3 (required 5)  Diverse Counterfactuals found for the given confi

100%|██████████| 90/90 [00:24<00:00,  3.67it/s]
100%|██████████| 90/90 [00:25<00:00,  3.57it/s]
 87%|████████▋ | 78/90 [00:23<00:04,  2.93it/s]

; total time taken: 00 min 00 sec
Only 4 (required 5)  Diverse Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
Only 4 (required 5)  Diverse Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
Only 3 (required 5)  Diverse Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
Only 3 (required 5)  Diverse Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
Only 3 (required 5)  Diverse Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
Only 3 (required 5)  Diverse Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
Only 3 (required 5)  Diverse Counter

100%|██████████| 90/90 [00:25<00:00,  3.55it/s]
100%|██████████| 90/90 [00:26<00:00,  3.39it/s]
100%|██████████| 90/90 [00:27<00:00,  3.32it/s]
100%|██████████| 90/90 [00:26<00:00,  3.41it/s]
100%|██████████| 90/90 [00:24<00:00,  3.60it/s]
100%|██████████| 90/90 [00:25<00:00,  3.55it/s]
100%|██████████| 90/90 [00:21<00:00,  4.18it/s]
100%|██████████| 90/90 [00:21<00:00,  4.23it/s]
100%|██████████| 90/90 [00:21<00:00,  4.10it/s]
100%|██████████| 90/90 [00:21<00:00,  4.11it/s]
100%|██████████| 90/90 [00:21<00:00,  4.26it/s]
100%|██████████| 90/90 [00:21<00:00,  4.19it/s]
100%|██████████| 90/90 [00:22<00:00,  4.07it/s]
100%|██████████| 90/90 [00:22<00:00,  4.09it/s]
100%|██████████| 90/90 [00:21<00:00,  4.19it/s]
100%|██████████| 90/90 [00:21<00:00,  4.23it/s]
100%|██████████| 90/90 [00:21<00:00,  4.21it/s]
100%|██████████| 90/90 [00:21<00:00,  4.23it/s]
100%|██████████| 90/90 [00:21<00:00,  4.26it/s]
100%|██████████| 90/90 [00:21<00:00,  4.27it/s]
100%|██████████| 90/90 [00:22<00:00,  4.

 87%|████████▋ | 78/90 [00:24<00:03,  3.90it/s]

100%|██████████| 90/90 [00:26<00:00,  3.44it/s]
100%|██████████| 90/90 [00:29<00:00,  3.03it/s]
100%|██████████| 90/90 [00:27<00:00,  3.29it/s]
100%|██████████| 90/90 [00:28<00:00,  3.20it/s]
100%|██████████| 90/90 [00:24<00:00,  3.60it/s]
100%|██████████| 90/90 [00:25<00:00,  3.55it/s]
100%|██████████| 90/90 [00:24<00:00,  3.64it/s]
100%|██████████| 90/90 [00:23<00:00,  3.77it/s]
100%|██████████| 90/90 [00:23<00:00,  3.87it/s]
100%|██████████| 90/90 [00:23<00:00,  3.83it/s]
100%|██████████| 90/90 [00:23<00:00,  3.88it/s]
100%|██████████| 90/90 [00:23<00:00,  3.78it/s]
100%|██████████| 90/90 [00:23<00:00,  3.90it/s]
100%|██████████| 90/90 [00:23<00:00,  3.90it/s]
100%|██████████| 90/90 [00:22<00:00,  3.98it/s]
100%|██████████| 90/90 [00:22<00:00,  4.01it/s]
100%|██████████| 90/90 [00:23<00:00,  3.77it/s]
100%|██████████| 90/90 [00:22<00:00,  3.97it/s]
100%|██████████| 90/90 [00:23<00:00,  3.84it/s]
100%|██████████| 90/90 [00:25<00:00,  3.58it/s]
100%|██████████| 90/90 [00:22<00:00,  3.

In [None]:

# One fairness table per simulation, tagged with the bias level that produced it.
fair_res = []
for hyperparams, environment in results:
    run_metrics = fairness_metrics(
        environment, filter_feature="groups", bins=10, advantaged_pop=1
    )
    fair_res.append(run_metrics.assign(Merit=hyperparams["BIAS_FACTOR"]))
    

In [None]:
# Stack the per-run fairness tables and average each metric per bias level
# (the "Merit" column holds the BIAS_FACTOR of the run).
pd.concat(fair_res).groupby("Merit").mean()