# Exercises in Fairness in Machine Learning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import xgboost as xgb
import setuptools.dist
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


## Exercise 1

For this exercise, we will use the `adult` dataset (available on moodle or from the [UCI Machine Learning repository](https://archive.ics.uci.edu/dataset/2/adult)). Do the following:

1. Load in the dataset and correct the error in the income column (replace the "." with the empty string such that there are only two categories).
2. Create an X dataset using the variables "age", "workclass", "education", "occupation", "race", "sex", "hours-per-week". For the categorical variables with missing values, replace the missing values with a new category "Unknown". Also replace any values that are "?" with the value "Unknown (using `str.replace`, for instance)
3. Turn the five categorical variables in X into dummy variables and remove the original five variables (This will probably give you around 44 columns in X)
4. Create the response variable y, such that it is 1 if the `income` variable in the adult dataset is `>50K` and 0 if the value is `<=50K`.
5. Do a train-test split with 30% of the data for test (using `random_state=123`) and train a `XGBoost` classification model on the training data.
6. Evaluate your models using various evaluation metrics and look at the confusion matrix of your model.
7. To be able to calculate the various fairness metrics in regard to the variable `sex`, we need to construct two separate confusion matrices for the test dataset, one for `female` and one for `male`. First, create separate test sets for `female` and `male` as well as the predicted values for each gender. That is, create `X_test_female`, `X_test_male`, `y_test_female`, `y_test_male`, `y_pred_female`, and `y_pred_male`. (Hint: You can create `X_test_female` by `X_test_female = X_test[X_test["sex_Male"] == 0]` and `y_test_male` by `y_test_male = y_test[X_test["sex_Male"] == 1]`, for instance.)
8. Calculate the accuracy for female and male for the XGBoost model and comment on the results.
9. We can now create the True Positive (TP), True Negative (TN), False Positive (FP), and False Negative (FN) for each gender. That is, calculate the eight values `TP_f`, `TN_f`, `FP_f`, `FN_f`, `TP_m`, `TN_m`, `FP_m`, and `FN_m`. (Hint: You can calculate the False Positive for female (FP_f) by `FP_f = sum((y_test_female == 0) & (y_pred_female == 1))`.)
10. Is there error rate balance across different genders, i.e. are the false positive rate (FPR) and false negative rate (FNR) the same across the two genders?
11. Is there predictive parity across different genders?
12. Is there Statistical parity across different genders?
13. [Discussion question] Can your any of your models be used to make fair salary predictions?
14. [Discussion question] In what sense is the `adult` dataset biased (unfair)?
15. [Discussion question] If the dataset is biased, where could the bias potentially come from?
16. [Optional] If you balance the number of males and females in the dataset (like we balanced the response variable in the Churn example), will you model become more fair?

#### 1. Load in the dataset and correct the error in the income column (replace the "." with the empty string such that there are only two categories).

In [133]:
df = pd.read_csv("../Notebooks and data-18/adult.csv")
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [43]:
df["education"].value_counts().sum()

np.int64(48842)

#### 2. Create an X dataset using the variables "age", "workclass", "education", "occupation", "race", "sex", "hours-per-week". For the categorical variables with missing values, replace the missing values with a new category "Unknown". Also replace any values that are "?" with the value "Unknown (using `str.replace`, for instance)

In [50]:
df = df.replace({"<=50K.": "<=50K", ">50K.": ">50K"})

In [164]:
X = df[["age", "workclass", "education", "occupation", "race", "sex", "hours-per-week"]].copy()
X

Unnamed: 0,age,workclass,education,occupation,race,sex,hours-per-week
0,39,State-gov,Bachelors,Adm-clerical,White,Male,40
1,50,Self-emp-not-inc,Bachelors,Exec-managerial,White,Male,13
2,38,Private,HS-grad,Handlers-cleaners,White,Male,40
3,53,Private,11th,Handlers-cleaners,Black,Male,40
4,28,Private,Bachelors,Prof-specialty,Black,Female,40
...,...,...,...,...,...,...,...
48837,39,Private,Bachelors,Prof-specialty,White,Female,36
48838,64,,HS-grad,,Black,Male,40
48839,38,Private,Bachelors,Prof-specialty,White,Male,50
48840,44,Private,Bachelors,Adm-clerical,Asian-Pac-Islander,Male,40


In [165]:
X["sex"].value_counts()

sex
Male      32650
Female    16192
Name: count, dtype: int64

In [166]:
print("workclass ? value counts", X[X["workclass"] == "?"].value_counts())
X[X["workclass"] == "?"].workclass = "Unknown"
print("workclass ? value counts", X[X["workclass"] == "?"].value_counts())

print("")

print("occupation ? value counts", X[X["occupation"] == "?"].value_counts())
X[X["occupation"] == "?"].occupation = "Unknown"
print("occupation ? value counts", X[X["occupation"] == "?"].value_counts())

workclass ? value counts age  workclass  education     occupation  race   sex     hours-per-week
20   ?          Some-college  ?           White  Female  40                15
22   ?          Some-college  ?           White  Male    40                15
19   ?          Some-college  ?           White  Male    40                14
21   ?          Some-college  ?           White  Female  40                12
                                                 Male    40                10
                                                                           ..
80   ?          HS-grad       ?           White  Male    25                 1
                                                         24                 1
19   ?          10th          ?           White  Male    30                 1
80   ?          Assoc-acdm    ?           White  Male    4                  1
82   ?          10th          ?           White  Male    20                 1
Name: count, Length: 1471, dtype: int64
workc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[X["workclass"] == "?"].workclass = "Unknown"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[X["occupation"] == "?"].occupation = "Unknown"


In [167]:
X["sex"].value_counts()

sex
Male      32650
Female    16192
Name: count, dtype: int64

In [151]:
X

Unnamed: 0,age,workclass,education,occupation,race,sex,hours-per-week
0,39,State-gov,Bachelors,Adm-clerical,White,Male,40
1,50,Self-emp-not-inc,Bachelors,Exec-managerial,White,Male,13
2,38,Private,HS-grad,Handlers-cleaners,White,Male,40
3,53,Private,11th,Handlers-cleaners,Black,Male,40
4,28,Private,Bachelors,Prof-specialty,Black,Female,40
...,...,...,...,...,...,...,...
48837,39,Private,Bachelors,Prof-specialty,White,Female,36
48838,64,,HS-grad,,Black,Male,40
48839,38,Private,Bachelors,Prof-specialty,White,Male,50
48840,44,Private,Bachelors,Adm-clerical,Asian-Pac-Islander,Male,40


In [168]:
print(X.isna().sum())

X = X.fillna("Unknown") 

print(X.isna().sum())

age                 0
workclass         963
education           0
occupation        966
race                0
sex                 0
hours-per-week      0
dtype: int64
age               0
workclass         0
education         0
occupation        0
race              0
sex               0
hours-per-week    0
dtype: int64


In [170]:
X

Unnamed: 0,age,workclass,education,occupation,race,sex,hours-per-week
0,39,State-gov,Bachelors,Adm-clerical,White,Male,40
1,50,Self-emp-not-inc,Bachelors,Exec-managerial,White,Male,13
2,38,Private,HS-grad,Handlers-cleaners,White,Male,40
3,53,Private,11th,Handlers-cleaners,Black,Male,40
4,28,Private,Bachelors,Prof-specialty,Black,Female,40
...,...,...,...,...,...,...,...
48837,39,Private,Bachelors,Prof-specialty,White,Female,36
48838,64,Unknown,HS-grad,Unknown,Black,Male,40
48839,38,Private,Bachelors,Prof-specialty,White,Male,50
48840,44,Private,Bachelors,Adm-clerical,Asian-Pac-Islander,Male,40


#### 3. Turn the five categorical variables in X into dummy variables and remove the original five variables (This will probably give you around 44 columns in X)

In [142]:
X = X.join(pd.get_dummies(X["workclass"], drop_first=True, dtype=int, prefix="wc"))
X = X.join(pd.get_dummies(X["education"], drop_first=True, dtype=int, prefix="ed"))
X = X.join(pd.get_dummies(X["occupation"], drop_first=True, dtype=int, prefix="oc"))
X = X.join(pd.get_dummies(X["race"], drop_first=True, dtype=int))
X = X.join(pd.get_dummies(X["sex"], drop_first=True, dtype=int, prefix="sex"))

X = X.drop(columns = ["workclass", "education", "occupation", "race", "sex"])
X

Unnamed: 0,age,hours-per-week,wc_Local-gov,wc_Never-worked,wc_Private,wc_Self-emp-inc,wc_Self-emp-not-inc,wc_State-gov,wc_Unknown,wc_Without-pay,...,oc_Tech-support,oc_Transport-moving,oc_Unknown,Asian-Pac-Islander,Black,Other,Unknown,White,sex_Male,sex_Unknown
0,39,40,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,0
1,50,13,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,38,40,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
3,53,40,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,28,40,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,36,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
48838,64,40,0,0,0,0,0,0,1,0,...,0,0,1,0,1,0,0,0,1,0
48839,38,50,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
48840,44,40,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


#### 4. Create the response variable y, such that it is 1 if the `income` variable in the adult dataset is `>50K` and 0 if the value is `<=50K`.

In [70]:
y = df["income"]

In [93]:
y = y.replace({"<=50K": 0, ">50K": 1})
y

  y = y.replace({"<=50K": 0, ">50K": 1})


0        0
1        0
2        0
3        0
4        0
        ..
48837    0
48838    0
48839    0
48840    0
48841    1
Name: income, Length: 48842, dtype: int64