In [1]:
# sys path
import sys
sys.path.append('../../')

In [2]:
from holisticai.datasets.synthetic.recruitment import generate_rankings
from holisticai.bias.mitigation.postprocessing.fair_topk.transformer import FairTopK
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Top-K Ranking problem

### Generating M synthetic ranking

In [3]:
M = 1
top_n = 20
p = 0.1
ranking = generate_rankings(M, top_n, p)

example of unfair ranking:

In [4]:
def create_unfair_example():
    """
    Setting an unfair ranking where protected group examples are only the last 6 results.
    """
    ranking['protected']=False
    ranking['protected'].iloc[-6:]=True
    return ranking
    
unfair_ranking = create_unfair_example()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ranking['protected'].iloc[-6:]=True


You can use FairTopK  and pass ranking and p_attr (protected groups) separated or in the same dataframe.

In [5]:
import numpy as np
from holisticai.bias.mitigation.postprocessing.debiasing_exposure.algorithm_utils import exposure_metric
from holisticai.bias.mitigation import DisparateImpactRemoverRS

dir = DisparateImpactRemoverRS(query_col='X', group_col='protected', score_col='score', repair_level=1)
new_ranking = dir.transform_features(unfair_ranking)
m_org1 = exposure_metric(unfair_ranking, group_col='protected', query_col='X', score_col='score')
m_org2 = exposure_metric(new_ranking, group_col='protected', query_col='X', score_col='score')
pd.concat([m_org1,m_org2],axis=1)

Unnamed: 0,Value,Value.1
exposure_ratio,516682.109793,0.962027
exposure difference,0.103049,0.0


In [6]:
# Synthetic data
M = 1000
top_n = 20
p = 0.25
rankings = generate_rankings(M, top_n, p, return_p_attr=False)
#rankings , p_attr = generate_rankings(M, top_n, p, return_p_attr=True)

# Bias Mitigation Post-processing
top_n = 20
p = 0.5
alpha = 0.1
fs = FairTopK(top_n=top_n, 
              p=p, 
              alpha=alpha, 
              query_col='X', 
              doc_col='Y', 
              score_col='score', 
              group_col='protected')

# Transform passing all info in the same dataframe
re_rankings = fs.transform(rankings)
#re_rankings = fs.transform(rankings, p_attr=p_attr)

Testing and unfair ranking

In [7]:
re_ranking = fs.transform(unfair_ranking)

def compare_results(old , new):
    old = old.copy()
    new = new.copy()
    old.columns = pd.MultiIndex.from_tuples([['Old Rank',col] for col in old.columns])
    new.columns = pd.MultiIndex.from_tuples([['New Rank',col] for col in new.columns])
    return pd.concat([old.reset_index(drop=True),new.reset_index(drop=True)], axis=1)

compare_results(unfair_ranking , re_ranking)

Unnamed: 0_level_0,Old Rank,Old Rank,Old Rank,Old Rank,New Rank,New Rank,New Rank,New Rank
Unnamed: 0_level_1,X,Y,score,protected,X,Y,score,protected
0,0,20,20,False,0,20,20,False
1,0,19,19,False,0,19,19,False
2,0,18,18,False,0,18,18,False
3,0,17,17,False,0,17,17,False
4,0,16,16,False,0,6,6,True
5,0,15,15,False,0,16,16,False
6,0,14,14,False,0,15,15,False
7,0,13,13,False,0,5,5,True
8,0,12,12,False,0,14,14,False
9,0,11,11,False,0,13,13,False


In [8]:
from holisticai.bias.mitigation.postprocessing import DebiasingExposure
import warnings
warnings.filterwarnings('ignore')

In [9]:
M = 1000
top_n = 20
p = 0.25
rankings = generate_rankings(M, top_n, p, return_p_attr=False)

In [10]:
# create the DebiasingExposure class
dtr = DebiasingExposure(group_col="protected",
                        query_col = 'X',
                        doc_col = 'Y',
                        feature_cols = ['score', 'protected'],
                        score_col = 'score',
                        gamma=2, 
                        number_of_iterations=100, 
                        standardize=True,
                        verbose=1)

# train the model
dtr.fit(rankings)

100%|██████████| 100/100 [00:20<00:00,  4.83it/s]


<holisticai.bias.mitigation.postprocessing.debiasing_exposure.transformer.DebiasingExposure at 0x24b752e1850>

In [11]:
dtr._omega # (model weights for 'score' and 'protected' columns)

array([0.71328943, 0.35384514])

Testing an unfair ranking

In [12]:
re_ranking = dtr.transform(unfair_ranking)
compare_results(unfair_ranking , re_ranking)

Unnamed: 0_level_0,Old Rank,Old Rank,Old Rank,Old Rank,New Rank,New Rank,New Rank,New Rank
Unnamed: 0_level_1,X,Y,score,protected,X,Y,protected,score
0,0,20,20,False,0,20,False,1.591147
1,0,19,19,False,0,19,False,1.482349
2,0,18,18,False,0,18,False,1.373551
3,0,17,17,False,0,17,False,1.264753
4,0,16,16,False,0,16,False,1.155955
5,0,15,15,False,0,15,False,1.047158
6,0,14,14,False,0,14,False,0.93836
7,0,13,13,False,0,13,False,0.829562
8,0,12,12,False,0,12,False,0.720764
9,0,11,11,False,0,11,False,0.611966


### Comparing Fairness (Original Ranking - FairTopK - FairSearch)

In [13]:
# Synthetic data (Initial Rank)
M = 1000
top_n = 20
p = 0.25
rankings = generate_rankings(M, top_n, p, return_p_attr=False)

In [14]:
from holisticai.bias.mitigation.postprocessing.debiasing_exposure.algorithm_utils import exposure_metric

# Bias Mitigation Post-processing
top_n = 20
p = 0.5
alpha = 0.1

fs = FairTopK(top_n=top_n, 
              p=p, 
              alpha=alpha, 
              query_col='X', 
              doc_col='Y', 
              score_col='score', 
              group_col='protected')

dtr = DebiasingExposure(query_col = 'X', 
            doc_col = 'Y', 
            score_col = 'score', 
            group_col="protected", 
            feature_cols = ['score', 'protected'],
            gamma=10, 
            number_of_iterations=100, 
            standardize=True)

dtr.fit(rankings)

                                                 

<holisticai.bias.mitigation.postprocessing.debiasing_exposure.transformer.DebiasingExposure at 0x24b292a5100>

In [20]:
from holisticai.bias.mitigation import DisparateImpactRemoverRS
dir = DisparateImpactRemoverRS(query_col='X', group_col='protected', score_col='score', repair_level=1)

In [16]:
def compare_results(vars):
    new_vars = []
    for i,var in enumerate(vars):
        var = var.copy()
        var.columns = pd.MultiIndex.from_tuples([[f'Rank {i}',col] for col in var.columns])
        var = var.reset_index(drop=True)
        new_vars.append(var)
    return pd.concat(new_vars, axis=1)

In [17]:
re_ranking_fs = fs.transform(unfair_ranking)
re_ranking_dtr = dtr.transform(unfair_ranking)
re_ranking_dir = dir.transform(unfair_ranking)
compare_results([unfair_ranking, re_ranking_fs, re_ranking_dtr, re_ranking_dir])

Unnamed: 0_level_0,Rank 0,Rank 0,Rank 0,Rank 0,Rank 1,Rank 1,Rank 1,Rank 1,Rank 2,Rank 2,Rank 2,Rank 2,Rank 3,Rank 3,Rank 3,Rank 3
Unnamed: 0_level_1,X,Y,score,protected,X,Y,score,protected,X,Y,protected,score,X,Y,score,protected
0,0,20,20,False,0,20,20,False,0,20,False,1.249909,0,20,6,False
1,0,19,19,False,0,19,19,False,0,19,False,1.164436,0,6,6,True
2,0,18,18,False,0,18,18,False,0,18,False,1.078963,0,19,6,False
3,0,17,17,False,0,17,17,False,0,17,False,0.99349,0,18,5,False
4,0,16,16,False,0,6,6,True,0,16,False,0.908017,0,17,5,False
5,0,15,15,False,0,16,16,False,0,15,False,0.822544,0,16,5,False
6,0,14,14,False,0,15,15,False,0,14,False,0.737071,0,5,5,True
7,0,13,13,False,0,5,5,True,0,13,False,0.651597,0,15,4,False
8,0,12,12,False,0,14,14,False,0,12,False,0.566124,0,14,4,False
9,0,11,11,False,0,13,13,False,0,6,True,0.519221,0,4,4,True


In [18]:
re_ranking_fs = fs.transform(unfair_ranking)
re_ranking_dtr = dtr.transform(unfair_ranking)
re_ranking_dir = dir.transform(unfair_ranking)
m_org = exposure_metric(unfair_ranking, group_col='protected', query_col='X', score_col='score')
m_fs = exposure_metric(re_ranking_fs, group_col='protected', query_col='X', score_col='score')
m_dtr = exposure_metric(re_ranking_dtr, group_col='protected', query_col='X', score_col='score')
m_dir = exposure_metric(re_ranking_dir, group_col='protected', query_col='X', score_col='score')
df_metrics = pd.concat([m_org, m_fs, m_dtr, m_dir],axis=1)
df_metrics.columns = ['Original', 'FairTopK', 'FairSearch','DisparateImpactRemoverRS']
df_metrics

Unnamed: 0,Original,FairTopK,FairSearch,DisparateImpactRemoverRS
exposure_ratio,516682.109793,516682.073493,1.54779,0.962027
exposure difference,0.103049,0.103049,0.028562,0.0


In [19]:
re_rankings_fs = fs.transform(rankings)
re_rankings_dtr = dtr.transform(rankings)
re_rankings_dir = dir.transform(rankings)
m_org = exposure_metric(rankings, group_col='protected', query_col='X', score_col='score')
m_fs = exposure_metric(re_rankings_fs, group_col='protected', query_col='X', score_col='score')
m_dtr = exposure_metric(re_rankings_dtr, group_col='protected', query_col='X', score_col='score')
m_dir = exposure_metric(re_rankings_dir, group_col='protected', query_col='X', score_col='score')
df_metrics = pd.concat([m_org, m_fs, m_dtr, m_dir],axis=1)
df_metrics.columns = ['Original', 'FairTopK', 'FairSearch','DisparateImpactRemoverRS']
df_metrics

Unnamed: 0,Original,FairTopK,FairSearch,DisparateImpactRemoverRS
exposure_ratio,27783.387885,27783.389234,0.673929,1.005313
exposure difference,0.048699,0.048699,0.00076,0.002083
