In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Reweighing

Reweighing is a preprocessing method. The algorithm assigns a weight to every sample so that, in the weighted dataset, the rate of positive (favorable) outcomes is the same for the privileged and unprivileged groups of the protected attribute.

The example below is adapted from the credit scoring tutorial of the aif360 package (https://nbviewer.jupyter.org/github/IBM/AIF360/blob/master/examples/tutorial_credit_scoring.ipynb).

# 1) Load German credit dataset

In [110]:


# Column labels for the headerless German credit file, listed in file order.
column_names = [
    'status',
    'month',
    'credit_history',
    'purpose',
    'credit_amount',
    'savings',
    'employment',
    'investment_as_income_percentage',
    'personal_status',
    'other_debtors',
    'residence_since',
    'property',
    'age',
    'installment_plans',
    'housing',
    'number_of_credits',
    'skill_level',
    'people_liable_for',
    'telephone',
    'foreign_worker',
    'credit',
]
# NOTE(review): machine-specific absolute path -- the notebook only re-runs on a
# machine where this exact location exists; consider a configurable data directory.
filepath = '/Users/hkromer/01_Projects/27.Fairness_Bias/AIF360/aif360/data/raw/german/german.data'
# The raw file is space-separated and has no header row, so names are supplied here.
df = pd.read_csv(filepath, names=column_names, header=None, sep=' ')


In [111]:
# Separate the target column ('credit') from the feature columns.
y = df['credit']
X = df.drop(columns='credit')

In [112]:
# Hold out 1% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.01,
    random_state=42,
)

# 2) Consider protected attribute: age 

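We will consider age to be the protected attribute, with a threshold of 25 years: samples with age >= 25 form the privileged group and samples with age < 25 form the unprivileged group.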

In [113]:
# Rejoin the training features with their labels (aligned on the row index)
# into a single new frame; X_train itself is left untouched.
df_train = X_train.assign(credit=y_train)

df_train

Unnamed: 0,status,month,credit_history,purpose,credit_amount,savings,employment,investment_as_income_percentage,personal_status,other_debtors,...,property,age,installment_plans,housing,number_of_credits,skill_level,people_liable_for,telephone,foreign_worker,credit
811,A12,6,A32,A43,484,A61,A74,3,A94,A103,...,A121,28,A141,A152,1,A172,1,A191,A201,1
76,A11,42,A32,A43,3965,A61,A72,4,A93,A101,...,A123,34,A143,A152,1,A173,1,A191,A201,2
636,A14,24,A32,A43,1376,A63,A74,4,A92,A101,...,A123,28,A143,A152,1,A173,1,A191,A201,1
973,A11,60,A32,A49,7297,A61,A75,4,A93,A102,...,A124,36,A143,A151,1,A173,1,A191,A201,2
938,A12,60,A32,A46,6288,A61,A73,4,A93,A101,...,A124,42,A143,A153,1,A173,1,A191,A201,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,A14,18,A31,A40,6458,A61,A75,2,A93,A101,...,A124,39,A141,A152,2,A174,2,A192,A201,2
270,A14,18,A32,A40,2662,A65,A74,4,A93,A101,...,A122,32,A143,A152,1,A173,1,A191,A202,1
860,A14,24,A34,A41,5804,A64,A73,4,A93,A101,...,A121,27,A143,A152,2,A173,1,A191,A201,1
435,A12,12,A32,A43,1484,A65,A73,2,A94,A101,...,A121,25,A143,A152,1,A173,1,A192,A201,2


In [114]:
# Boolean mask over the training rows: age >= 25 marks the priviledged group.
m = df_train['age'].ge(25)
priviledged = df_train.loc[m]
unpriviledged = df_train.loc[~m]

Compute the Statistical Parity Difference before any preprocessing.

In [115]:
def _weighted_favorable_rate(df, fav_class):
    """Return the weighted share of rows in ``df`` that carry the favorable outcome."""
    column, value = fav_class[0], fav_class[1]

    # Use the frame's own weights when present; otherwise weight every row 1.0.
    # The default lives in a local Series so the caller's frame is never modified.
    if 'instance_weights' in df.columns:
        weights = df['instance_weights']
    else:
        weights = pd.Series(1.0, index=df.index)

    total_weight = weights.sum()
    if total_weight == 0:
        # Empty group (or all-zero weights): the rate is undefined.
        return np.float64('nan')

    is_favorable = df[column] == value
    return weights[is_favorable].sum() / total_weight


def statistical_parity_difference(df_p, df_unp, fav_class):
    r"""
    Compute the (weighted) statistical parity difference between two groups.

    Neither input dataframe is modified. Each dataframe is inspected on its own
    for an 'instance_weights' column; a dataframe without that column is treated
    as if every row had weight 1.0.

    Inputs
    ------------
    df_p : df
        Dataframe containing the dataset of the priviledged group.
    df_unp : df
        Dataframe containing the dataset of the unpriviledged group.
    fav_class : tuple
        Tuple with index 0: name of the column of the dataframe that contains the favorable target class
        (i.e., the positive prediction)
        1: value for the target class (i.e., the value of the positive prediction)

    Returns
    ------------
    float
        Weighted favorable rate of the unpriviledged group minus the weighted
        favorable rate of the priviledged group. NaN if either group has a
        total weight of zero (e.g. an empty dataframe).

    .. math::
       Pr(Y = 1 | D = \text{unprivileged})
       - Pr(Y = 1 | D = \text{privileged})
    """
    ratio_p = _weighted_favorable_rate(df_p, fav_class)
    ratio_unp = _weighted_favorable_rate(df_unp, fav_class)

    return ratio_unp - ratio_p


In [116]:
# Favorable outcome as (target column, value that counts as the positive outcome).
favorable_class = ('credit', 1.0)
statistical_parity_difference(
    df_p=priviledged,
    df_unp=unpriviledged,
    fav_class=favorable_class,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


-0.12778354865935204

# 3) Mitigate bias by Reweighing

<cite>[1] F. Kamiran and T. Calders, "Data Preprocessing Techniques for Classification without Discrimination," Knowledge and Information Systems, 2012.</cite>


In [118]:
# Reweighing: every sample in a (group, outcome) cell receives the weight
#     n_outcome * n_group / (n * n_group_outcome)
# i.e. the count the cell would have if group and outcome were independent,
# divided by the count that is actually observed.
df = df_train.copy()
priviledged_class = ('age', 25) # df column, threshold value
favorable_class = ('credit', 1.0) # df column, value


def _weight_sum(frame, mask=None):
    """Sum the 'instance_weights' of `frame`, optionally restricted to the rows selected by `mask`."""
    weights = frame['instance_weights'] if mask is None else frame.loc[mask, 'instance_weights']
    return np.sum(weights.values, dtype=np.float64)


# Initialize weights to 1 (one float per row, as a 1-D array)
df['instance_weights'] = np.ones(df.shape[0], dtype=np.float64)

# Group and outcome masks are built once from `priviledged_class` / `favorable_class`
# and reused below, so every count and every weight update sees the same partition.
# The columns they read are not modified by the reweighing, so the masks stay valid.
is_p = df[priviledged_class[0]] >= priviledged_class[1]
is_fav = df[favorable_class[0]] == favorable_class[1]

# Total number of samples (as a sum of weights)
n = _weight_sum(df)

# Number of priviledged / unpriviledged instances
n_p = _weight_sum(df, is_p)
n_up = _weight_sum(df, ~is_p)

# Number of favorable / unfavorable instances
n_fav = _weight_sum(df, is_fav)
n_unfav = _weight_sum(df, ~is_fav)

# Number of instances in each (group, outcome) cell
n_p_fav = _weight_sum(df, is_p & is_fav)
n_p_unfav = _weight_sum(df, is_p & (~is_fav))
n_up_fav = _weight_sum(df, (~is_p) & is_fav)
n_up_unfav = _weight_sum(df, (~is_p) & (~is_fav))


# performing the reweighing
# reweighing weights
# NOTE: an empty (group, outcome) cell makes the corresponding denominator zero.

# priviledged and favorable
w_p_fav = n_fav*n_p / (n*n_p_fav)

# priviledged and unfavorable
w_p_unfav = n_unfav*n_p / (n*n_p_unfav)

# unpriviledged and favorable
w_up_fav = n_fav*n_up / (n*n_up_fav)

# unpriviledged and unfavorable
w_up_unfav = n_unfav*n_up / (n*n_up_unfav)


# apply reweighing: scale the current weights of each cell by that cell's factor
for cell_mask, cell_weight in (
    (is_p & is_fav, w_p_fav),            # priviledged and favorable
    (is_p & (~is_fav), w_p_unfav),       # priviledged and unfavorable
    ((~is_p) & is_fav, w_up_fav),        # unpriviledged and favorable
    ((~is_p) & (~is_fav), w_up_unfav),   # unpriviledged and unfavorable
):
    df.loc[cell_mask, 'instance_weights'] = df.loc[cell_mask, 'instance_weights'].values * cell_weight

# Split the reweighed frame by the same group definition and re-check the
# statistical parity difference (expected to be ~0 after reweighing).
m = is_p # filter mask: rows at or above the threshold are the priviledged group
p = df[ m ]
unp = df[ ~m ]

statistical_parity_difference(p, unp, favorable_class)

4.440892098500626e-16

In [121]:
# Sanity check: total reweighed weight per group, as (unpriviledged, priviledged).
tuple(group['instance_weights'].sum() for group in (unp, p))

(146.00000000000003, 844.0000000000002)

In [124]:
# Row counts per group, as (priviledged, unpriviledged), for comparison with the weight sums.
len(p), len(unp)

(844, 146)