In [91]:
# Load all necessary packages
import sys
sys.path.insert(1, "../")  

import numpy as np
np.random.seed(0)

from aif360.datasets import CompasDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.algorithms.preprocessing import Reweighing
from aif360.algorithms.preprocessing import DisparateImpactRemover

from IPython.display import Markdown, display

# Fairness metrics
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric

# Explainers
from aif360.explainers import MetricTextExplainer

# Scalers
from sklearn.preprocessing import StandardScaler

# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Execrise 5

In [32]:
# TODO #1
# I pick the sex attribute as the protected attribute
# so accrodingly I drop the feature about "race" to avoid bias
cd = CompasDataset(protected_attribute_names=['sex'],privileged_classes=[['Female']],
                   features_to_keep=['sex','age', 'age_cat', 'juv_fel_count', 'juv_misd_count', 
                                     'juv_other_count', 'priors_count', 'c_charge_degree', 'c_charge_desc', 
                                     'two_year_recid'],metadata={'label_maps': [{1.0: 'Did recid.', 0.0: 'No recid.'}],
                                                                 'protected_attribute_maps': [{0.0: 'Male', 1.0: 'Female'}]})



In [38]:
# I set female as the priviledged groups and male as the unprivledged group
privileged_groups = [{'sex': 1}] # female
unprivileged_groups = [{'sex': 0}] # male
metric_orig_train = BinaryLabelDatasetMetric(cd, 
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)
# use the diapate impact metric to calculate
display(Markdown("#### Original training dataset"))
print("Diparate impact between unprivileged and privileged groups = %f" % metric_orig_train.disparate_impact())

#### Original training dataset

Diparate impact between unprivileged and privileged groups = 0.802925


# Exercise 6

In [105]:
# I split the cd dataframe I have created above into train set and test set
(dataset_orig_panel19_train,
 dataset_orig_panel19_val,
 dataset_orig_panel19_test) = cd.split([0.5, 0.8], shuffle=True)

sens_ind = 0
sens_attr = dataset_orig_panel19_train.protected_attribute_names[sens_ind]

# accordingkly set priviledged and unprivileged groups
unprivileged_groups = [{sens_attr: v} for v in
                       dataset_orig_panel19_train.unprivileged_protected_attributes[sens_ind]]
privileged_groups = [{sens_attr: v} for v in
                     dataset_orig_panel19_train.privileged_protected_attributes[sens_ind]]


In [106]:
# fuction to describe the feature of the dataset
def describe(train=None, val=None, test=None):
    if train is not None:
        display(Markdown("#### Training Dataset shape"))
        print(train.features.shape)
    if val is not None:
        display(Markdown("#### Validation Dataset shape"))
        print(val.features.shape)
    display(Markdown("#### Test Dataset shape"))
    print(test.features.shape)
    display(Markdown("#### Favorable and unfavorable labels"))
    print(test.favorable_label, test.unfavorable_label)
    display(Markdown("#### Protected attribute names"))
    print(test.protected_attribute_names)
    display(Markdown("#### Privileged and unprivileged protected attribute values"))
    print(test.privileged_protected_attributes, 
          test.unprivileged_protected_attributes)
    display(Markdown("#### Dataset feature names"))
    print(test.feature_names)

describe(dataset_orig_panel19_train, dataset_orig_panel19_val, dataset_orig_panel19_test)

#### Training Dataset shape

(3083, 400)


#### Validation Dataset shape

(1850, 400)


#### Test Dataset shape

(1234, 400)


#### Favorable and unfavorable labels

0.0 1.0


#### Protected attribute names

['sex']


#### Privileged and unprivileged protected attribute values

[array([1.])] [array([0.])]


#### Dataset feature names

['sex', 'age', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count', 'age_cat=25 - 45', 'age_cat=Greater than 45', 'age_cat=Less than 25', 'c_charge_degree=F', 'c_charge_degree=M', 'c_charge_desc=Abuse Without Great Harm', 'c_charge_desc=Agg Abuse Elderlly/Disabled Adult', 'c_charge_desc=Agg Assault W/int Com Fel Dome', 'c_charge_desc=Agg Battery Grt/Bod/Harm', 'c_charge_desc=Agg Fleeing and Eluding', 'c_charge_desc=Agg Fleeing/Eluding High Speed', 'c_charge_desc=Aggr Child Abuse-Torture,Punish', 'c_charge_desc=Aggrav Battery w/Deadly Weapon', 'c_charge_desc=Aggrav Child Abuse-Agg Battery', 'c_charge_desc=Aggrav Child Abuse-Causes Harm', 'c_charge_desc=Aggrav Stalking After Injunctn', 'c_charge_desc=Aggravated Assault', 'c_charge_desc=Aggravated Assault W/Dead Weap', 'c_charge_desc=Aggravated Assault W/dead Weap', 'c_charge_desc=Aggravated Assault W/o Firearm', 'c_charge_desc=Aggravated Assault w/Firearm', 'c_charge_desc=Aggravated Battery', 'c_charge_desc=Aggravated Ba

In [107]:
# analyse the disparate impact on the train set
metric_orig_panel19_train = BinaryLabelDatasetMetric(
        dataset_orig_panel19_train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)

explainer_orig_panel19_train = MetricTextExplainer(metric_orig_panel19_train)

print(explainer_orig_panel19_train.disparate_impact())

Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 0.8227534408479164


In [108]:
# set the train dataset
dataset = dataset_orig_panel19_train
# bulid up the Logistic Regression
model = make_pipeline(StandardScaler(),
                      LogisticRegression(solver='liblinear', random_state=1))

fit_params = {'logisticregression__sample_weight': dataset.instance_weights}
# TODO #2 
# train the model
lr_orig_panel19 = model.fit(dataset.features, dataset.labels.ravel(), **fit_params)

In [109]:
from collections import defaultdict

def test(dataset, model, thresh_arr):
    try:
        # sklearn classifier
        # TODO #2
        # predict if a defendant is likely to re-offend
        y_val_pred_prob = model.predict_proba(dataset.features)
        pos_ind = np.where(model.classes_ == dataset.favorable_label)[0][0]
    except AttributeError:
        # aif360 inprocessing algorithm
        y_val_pred_prob = model.predict(dataset).scores
        pos_ind = 0
    
    metric_arrs = defaultdict(list)
    y_val_pred = thresh_arr

    dataset_pred = dataset.copy()
#     dataset_pred.labels = y_val_pred
    metric = ClassificationMetric(
            dataset, dataset_pred,
            unprivileged_groups=unprivileged_groups,
            privileged_groups=privileged_groups)

    # TODO #3
    # for the classifier, calculate the fairness of the classification
    metric_arrs['disp_imp'].append(metric.disparate_impact())
    metric_arrs['eq_opp_diff'].append(metric.equal_opportunity_difference())
    
    return metric_arrs

In [114]:
# set the threshold to 0.5 
thresh_arr = 0.5

val_metrics = test(dataset=dataset_orig_panel19_val,
                   model=lr_orig_panel19,
                   thresh_arr=thresh_arr)
# lr_orig_best_ind = np.argmax(val_metrics['bal_acc'])
display(Markdown("#### Fairness of the classifier"))
num1 = val_metrics['disp_imp']
num2 = val_metrics['eq_opp_diff']

# TODO #3
# print the fairness metric
print("Diparate impact between unprivileged and privileged groups =", num1)
print("Equal opportunity difference between unprivileged and privileged groups = ", num2)

#### Fairness of the classifier

Diparate impact between unprivileged and privileged groups = [0.794094794094794]
Equal opportunity difference between unprivileged and privileged groups =  [0.0]


# Exercise 7a

In [94]:
# TODO #4
# set up the prepossessing function to remove the disparate impact
R = DisparateImpactRemover(repair_level=1.0, sensitive_attribute='')
dataset_transf_trainr = R.fit_transform(cd)

In [95]:
# calculate the fairness metric on the transformed dataset
metric_transf_trainr = BinaryLabelDatasetMetric(dataset_transf_trainr, 
                                               unprivileged_groups=unprivileged_groups,
                                               privileged_groups=privileged_groups)
display(Markdown("#### Transformed training dataset"))
print("Diparate impact between unprivileged and privileged groups = %f" % metric_transf_trainr.disparate_impact())

#### Transformed training dataset

Diparate impact between unprivileged and privileged groups = 0.802925


# Exercise 7b

In [84]:
# TODO #5
# set up the prepossessing function to reweigh
RW = Reweighing(unprivileged_groups=unprivileged_groups,
                privileged_groups=privileged_groups)
dataset_transf_train = RW.fit_transform(cd)

In [89]:
# calculate the fairness metric on the transformed dataset
metric_transf_train = BinaryLabelDatasetMetric(dataset_transf_train, 
                                               unprivileged_groups=unprivileged_groups,
                                               privileged_groups=privileged_groups)
display(Markdown("#### Transformed training dataset"))
print("Diparate impact between unprivileged and privileged groups = %f" % metric_transf_train.disparate_impact())

#### Transformed training dataset

Diparate impact between unprivileged and privileged groups = 1.000000


# Exercise 8

In [97]:
# the dataset_transf_train is generated from 7b, I use the transformed data from 7b
# split train set and test set
(train, val, test) = dataset_transf_train.split([0.5, 0.8], shuffle=True)

sens_ind = 0
sens_attr = train.protected_attribute_names[sens_ind]

unprivileged_groups = [{sens_attr: v} for v in
                       train.unprivileged_protected_attributes[sens_ind]]
privileged_groups = [{sens_attr: v} for v in
                     train.privileged_protected_attributes[sens_ind]]

In [99]:
# calculate the original fairness metric on the original data
metric_train = BinaryLabelDatasetMetric(
        train,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups)

explainer_train = MetricTextExplainer(metric_train)

print(explainer_train.disparate_impact())

Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 1.011948759577447


In [116]:
# TODO #6
# the model lr_orig_panel19 is generated from above
thresh_arr = 0.5
val_metrics = test(dataset=val,
                   model=lr_orig_panel19,
                   thresh_arr=thresh_arr)

display(Markdown("#### Fairness metric of classifier on transformed dataset"))
num1 = val_metrics['disp_imp']
num2 = val_metrics['eq_opp_diff']
print("Diparate impact between unprivileged and privileged groups =", num1)
print("Equal opportunity difference between unprivileged and privileged groups = ", num2)

#### Fairness metric of classifier on transformed dataset

Diparate impact between unprivileged and privileged groups = [0.9277138415567062]
Equal opportunity difference between unprivileged and privileged groups =  [0.0]
