In [1]:
# sys path
import sys
sys.path.append('../../')

In [2]:
from holisticai.datasets.synthetic.recruitment import generate_rankings
from holisticai.bias.mitigation.postprocessing.fair_topk.transformer import FairTopK
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Top-K Ranking problem

### Generating M synthetic ranking

In [3]:
M = 1
top_n = 20
p = 0.25
ranking = generate_rankings(M, top_n, p)

In [4]:
ranking

Unnamed: 0,X,Y,score,protected
0,0,20,20,True
1,0,19,19,False
2,0,18,18,False
3,0,17,17,False
4,0,16,16,False
5,0,15,15,True
6,0,14,14,False
7,0,13,13,False
8,0,12,12,False
9,0,11,11,False


example of unfair ranking:

In [5]:
def create_unfair_example():
    """
    Setting an unfair ranking where protected group examples are only the last 6 results.
    """
    ranking['protected']=False
    ranking['protected'].iloc[-6:]=True
    return ranking
    
unfair_ranking = create_unfair_example()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ranking['protected'].iloc[-6:]=True


You can use FairTopK  and pass ranking and p_attr (protected groups) separated or in the same dataframe.

In [6]:
# Synthetic data
M = 1000
top_n = 20
p = 0.25
rankings = generate_rankings(M, top_n, p, return_p_attr=False)
#rankings , p_attr = generate_rankings(M, top_n, p, return_p_attr=True)

# Bias Mitigation Post-processing
top_n = 20
p = 0.5
alpha = 0.1
fs = FairTopK(top_n=top_n, 
              p=p, 
              alpha=alpha, 
              query_col='X', 
              doc_col='Y', 
              score_col='score', 
              group_col='protected')

# Transform passing all info in the same dataframe
re_rankings = fs.transform(rankings)
#re_rankings = fs.transform(rankings, p_attr=p_attr)

Testing and unfair ranking

In [7]:
re_ranking = fs.transform(unfair_ranking)

def compare_results(old , new):
    old = old.copy()
    new = new.copy()
    old.columns = pd.MultiIndex.from_tuples([['Old Rank',col] for col in old.columns])
    new.columns = pd.MultiIndex.from_tuples([['New Rank',col] for col in new.columns])
    return pd.concat([old.reset_index(drop=True),new.reset_index(drop=True)], axis=1)

compare_results(unfair_ranking , re_ranking)

Unnamed: 0_level_0,Old Rank,Old Rank,Old Rank,Old Rank,New Rank,New Rank,New Rank,New Rank
Unnamed: 0_level_1,X,Y,score,protected,X,Y,score,protected
0,0,20,20,False,0,20,20,False
1,0,19,19,False,0,19,19,False
2,0,18,18,False,0,18,18,False
3,0,17,17,False,0,17,17,False
4,0,16,16,False,0,6,6,True
5,0,15,15,False,0,16,16,False
6,0,14,14,False,0,15,15,False
7,0,13,13,False,0,5,5,True
8,0,12,12,False,0,14,14,False
9,0,11,11,False,0,13,13,False


In [8]:
from holisticai.bias.mitigation.postprocessing.debiasing_exposure.transformer import DebiasingExposure
import warnings
warnings.filterwarnings('ignore')

In [9]:
M = 1000
top_n = 20
p = 0.25
rankings = generate_rankings(M, top_n, p, return_p_attr=False)

In [10]:
# create the DebiasingExposure class
dtr = DebiasingExposure(group_col="protected",
                        query_col = 'X',
                        doc_col = 'Y',
                        feature_cols = ['score', 'protected'],
                        score_col = 'score',
                        gamma=2, 
                        number_of_iterations=100, 
                        standardize=True,
                        verbose=1)

# train the model
dtr.fit(rankings)

100%|██████████| 100/100 [00:20<00:00,  4.78it/s]


<holisticai.bias.mitigation.postprocessing.debiasing_exposure.transformer.DebiasingExposure at 0x25420c9bd00>

In [11]:
dtr._omega # (model weights for 'score' and 'protected' columns)

array([0.75391664, 0.36180839])

Testing an unfair ranking

In [12]:
re_ranking = dtr.transform(unfair_ranking)
compare_results(unfair_ranking , re_ranking)

Unnamed: 0_level_0,Old Rank,Old Rank,Old Rank,Old Rank,New Rank,New Rank,New Rank,New Rank
Unnamed: 0_level_1,X,Y,score,protected,X,Y,protected,score
0,0,20,20,False,0,20,False,1.681892
1,0,19,19,False,0,19,False,1.566877
2,0,18,18,False,0,18,False,1.451862
3,0,17,17,False,0,17,False,1.336848
4,0,16,16,False,0,16,False,1.221833
5,0,15,15,False,0,15,False,1.106818
6,0,14,14,False,0,14,False,0.991803
7,0,13,13,False,0,13,False,0.876788
8,0,12,12,False,0,12,False,0.761773
9,0,11,11,False,0,11,False,0.646758


### Comparing Fairness (Original Ranking - FairTopK - FairSearch)

In [17]:
# Synthetic data (Initial Rank)
M = 1000
top_n = 20
p = 0.25
rankings = generate_rankings(M, top_n, p, return_p_attr=False)

In [28]:
from holisticai.bias.mitigation.postprocessing.debiasing_exposure.algorithm_utils import exposure_metric

# Bias Mitigation Post-processing
top_n = 20
p = 0.5
alpha = 0.1

fs = FairTopK(top_n=top_n, 
              p=p, 
              alpha=alpha, 
              query_col='X', 
              doc_col='Y', 
              score_col='score', 
              group_col='protected')

dtr = DebiasingExposure(query_col = 'X', 
            doc_col = 'Y', 
            score_col = 'score', 
            group_col="protected", 
            feature_cols = ['score', 'protected'],
            gamma=10, 
            number_of_iterations=100, 
            standardize=False)

dtr.fit(rankings)

re_rankings_fs = fs.transform(ranking)
re_rankings_dtr = dtr.transform(ranking)

                                               

Exception: Error!!

In [29]:
def compare_results(vars):
    new_vars = []
    for i,var in enumerate(vars):
        var = var.copy()
        var.columns = pd.MultiIndex.from_tuples([[f'Rank {i}',col] for col in var.columns])
        var = var.reset_index(drop=True)
        new_vars.append(var)
    return pd.concat(new_vars, axis=1)

In [24]:
re_ranking_fs = fs.transform(unfair_ranking)
re_ranking_dtr = dtr.transform(unfair_ranking)
compare_results([unfair_ranking, re_ranking_fs, re_ranking_dtr])

Unnamed: 0_level_0,Rank 0,Rank 0,Rank 0,Rank 0,Rank 1,Rank 1,Rank 1,Rank 1,Rank 2,Rank 2,Rank 2,Rank 2
Unnamed: 0_level_1,X,Y,score,protected,X,Y,score,protected,X,Y,protected,score
0,0,20,20,False,0,20,20,False,0,20,False,1.349534
1,0,19,19,False,0,19,19,False,0,19,False,1.257263
2,0,18,18,False,0,18,18,False,0,18,False,1.164993
3,0,17,17,False,0,17,17,False,0,17,False,1.072723
4,0,16,16,False,0,6,6,True,0,16,False,0.980452
5,0,15,15,False,0,16,16,False,0,15,False,0.888182
6,0,14,14,False,0,15,15,False,0,14,False,0.795912
7,0,13,13,False,0,5,5,True,0,13,False,0.703642
8,0,12,12,False,0,14,14,False,0,12,False,0.611371
9,0,11,11,False,0,13,13,False,0,6,True,0.545712


In [26]:
m_org = exposure_metric(rankings, group_col='protected', query_col='X', score_col='score')
m_fs = exposure_metric(re_rankings_fs, group_col='protected', query_col='X', score_col='score')
m_dtr = exposure_metric(re_rankings_dtr, group_col='protected', query_col='X', score_col='score')

In [27]:
df_metrics = pd.concat([m_org, m_fs, m_dtr],axis=1)
df_metrics.columns = ['Original', 'FairTopK', 'FairSearch']
df_metrics

Unnamed: 0,Original,FairTopK,FairSearch
exposure_ratio,17252.551263,516682.073493,1.633279
exposure difference,0.046761,0.103049,0.031651
