In [3]:
from aif360.datasets import CompasDataset

In [4]:
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_compas

In [5]:
from aif360.datasets import BinaryLabelDataset

In [19]:
# Load and preprocess the COMPAS dataset
dataset = load_preproc_data_compas(['race'])

In [20]:
privileged_groups = [{'race': 1}]
unprivileged_groups = [{'race': 0}]

In [21]:
protected_attribute_names=['race']

In [22]:
train, test = dataset.split([0.7], shuffle=True)

<h3>Define Protected Attributes and Measure Initial Bias</h3>

In [23]:
from aif360.metrics import BinaryLabelDatasetMetric
metric_orig_compas_race = BinaryLabelDatasetMetric(dataset, 
                                                   unprivileged_groups=[{'race': 0}], 
                                                   privileged_groups=[{'race': 1}])
print("Original Disparate Impact (Race):", metric_orig_compas_race.disparate_impact())
print("Original Statistical Parity Difference (Race):", metric_orig_compas_race.statistical_parity_difference())

Original Disparate Impact (Race): 0.7828387025392317
Original Statistical Parity Difference (Race): -0.13227942084985456


<h3>Apply Reweighing Technique</h3>

In [35]:
from aif360.algorithms.preprocessing import Reweighing
RW = Reweighing(unprivileged_groups=[{'race': 0}], privileged_groups=[{'race': 1}])
compas_dataset_rw = RW.fit_transform(train)

In [36]:
print(compas_dataset_rw)

               instance weights features                                       \
                                         protected attribute                    
                                     sex                race age_cat=25 to 45   
instance names                                                                  
9049                   1.106623      0.0                 0.0              1.0   
9440                   1.106623      0.0                 0.0              0.0   
1631                   1.195783      0.0                 1.0              0.0   
9269                   1.106623      1.0                 0.0              1.0   
5130                   1.106623      0.0                 0.0              1.0   
...                         ...      ...                 ...              ...   
8954                   0.875626      0.0                 1.0              0.0   
4262                   0.900041      0.0                 0.0              1.0   
10584                  1.195

h3>Train on the training dataset</h3>

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np

In [42]:
# Logistic regression classifier and predictions
scale_orig = StandardScaler()
X_train = scale_orig.fit_transform(train.features)
y_train = train.labels.ravel()
w_train = train.instance_weights.ravel()

lmod = LogisticRegression()
lmod.fit(X_train, y_train, 
         sample_weight=train.instance_weights)
y_train_pred = lmod.predict(X_train)

# positive class index
pos_ind = np.where(lmod.classes_ == train.favorable_label)[0][0]

train_pred = train.copy()
train_pred.labels = y_train_pred

In [None]:
dataset_orig_valid_pred = dataset_orig_valid.copy(deepcopy=True)
X_valid = scale_orig.transform(dataset_orig_valid_pred.features)
y_valid = dataset_orig_valid_pred.labels
dataset_orig_valid_pred.scores = lmod.predict_proba(X_valid)[:,pos_ind].reshape(-1,1)
print(dataset_orig_valid_pred.scores[:10])

dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)
X_test = scale_orig.transform(dataset_orig_test_pred.features)
y_test = dataset_orig_test_pred.labels
dataset_orig_test_pred.scores = lmod.predict_proba(X_test)[:,pos_ind].reshape(-1,1)

In [26]:
from IPython.display import Markdown, display

In [27]:
# print out some labels, names, etc.
display(Markdown("#### Training Dataset shape"))
print(dataset.features.shape)
display(Markdown("#### Favorable and unfavorable labels"))
print(dataset.favorable_label, dataset.unfavorable_label)
display(Markdown("#### Protected attribute names"))
print(dataset.protected_attribute_names)
display(Markdown("#### Privileged and unprivileged protected attribute values"))
print(dataset.privileged_protected_attributes, 
      dataset.unprivileged_protected_attributes)
display(Markdown("#### Dataset feature names"))
print(dataset.feature_names)

#### Training Dataset shape

(5278, 10)


#### Favorable and unfavorable labels

0.0 1.0


#### Protected attribute names

['race']


#### Privileged and unprivileged protected attribute values

[array([1.])] [array([0.])]


#### Dataset feature names

['sex', 'race', 'age_cat=25 to 45', 'age_cat=Greater than 45', 'age_cat=Less than 25', 'priors_count=0', 'priors_count=1 to 3', 'priors_count=More than 3', 'c_charge_degree=F', 'c_charge_degree=M']


In [28]:
metric_rw_compas_sex = BinaryLabelDatasetMetric(compas_dataset_rw, 
                                                unprivileged_groups=[{'sex': 0}], 
                                                privileged_groups=[{'sex': 1}])
print("Disparate Impact After Reweighing (Sex):", metric_rw_compas_sex.disparate_impact())
print("Statistical Parity Difference After Reweighing (Sex):", metric_rw_compas_sex.statistical_parity_difference())

ValueError: 'sex' is not in list

In [29]:
dataset_sex = load_preproc_data_compas()

In [30]:
print(dataset)

               instance weights features                                       \
                                         protected attribute                    
                                     sex                race age_cat=25 to 45   
instance names                                                                  
3                           1.0      0.0                 0.0              1.0   
4                           1.0      0.0                 0.0              0.0   
8                           1.0      0.0                 1.0              1.0   
10                          1.0      1.0                 1.0              1.0   
14                          1.0      0.0                 1.0              1.0   
...                         ...      ...                 ...              ...   
10994                       1.0      0.0                 0.0              1.0   
10995                       1.0      0.0                 0.0              0.0   
10996                       

In [31]:
privileged_groups = [{'sex': 1}]
unprivileged_groups = [{'sex': 0}]

In [32]:
protected_attribute_names=['sex']

In [33]:
train, test = dataset_sex.split([0.7], shuffle=True)

In [34]:
metric_orig_compas_sex = BinaryLabelDatasetMetric(dataset_sex, 
                                                   unprivileged_groups=[{'sex': 0}], 
                                                   privileged_groups=[{'sex': 1}])
print("Original Disparate Impact (Sex):", metric_orig_compas_sex.disparate_impact())
print("Original Statistical Parity Difference (Sex):", metric_orig_compas_sex.statistical_parity_difference())

Original Disparate Impact (Sex): 0.788415280444698
Original Statistical Parity Difference (Sex): -0.1350366105406292


In [None]:
RW = Reweighing(unprivileged_groups=[{'sex': 0}], privileged_groups=[{'sex': 1}])
compas_dataset_rw = RW.fit_transform(dataset_sex)