The data from this exercise comes from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/Car+Evaluation For more details on the data set see the included documentation.

In [None]:
# Ensure that Aequitas dependency is installed
!pip install aequitas==0.42

In [49]:
# Imports
from aequitas.plotting import Plot
ap = Plot()
import pandas as pd

from aequitas.group import Group
from aequitas.bias import Bias 
from aequitas.fairness import Fairness

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, label_binarize, LabelBinarizer
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [50]:
# I manually added the headers to the data set.
df = pd.read_csv("./car.csv")
df.info()
df['car'].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   car       1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


array(['unacc', 'acc', 'vgood', 'good'], dtype=object)

In [51]:
# We'll modify the data to make it a binary problem of acceptable or unacceptable car.
df = df.where(df != 'good', 'acc')
df = df.where(df != 'vgood', 'acc')

y = df.pop('car')
X = df

df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high
3,vhigh,vhigh,2,2,med,low
4,vhigh,vhigh,2,2,med,med


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)

# Use this later to construct the DataFrame Aequitas requires.
df_aq = X_test.copy()

ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
X_train = ohe.fit_transform(X_train.values)
X_test = ohe.transform(X_test.values)

lb = LabelBinarizer()
y_train = label_binarize(y_train.values, classes=['unacc', 'acc']).ravel()
y_test = label_binarize(y_test.values, classes=['unacc', 'acc']).ravel()

X_train shape:  (1296, 6)
X_test shape:  (432, 6)


In [19]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

scores = lr.predict_proba(X_test)
pred = lr.predict(X_test)

f1 = f1_score(y_test, pred)

print(f"F1 score: {f1:.4f}")

F1 score: 0.8960


In [None]:
# Construct the dataframe that Aequitas will use.
# You can draw inspiration from examples present here: https://github.com/dssg/aequitas/blob/master/docs/source/examples/compas_demo.ipynb 

In [24]:
df_aq['score'] = pred
df_aq['label_value'] = y_test
df_aq.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 432 entries, 1278 to 212
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   buying       432 non-null    object
 1   maint        432 non-null    object
 2   doors        432 non-null    object
 3   persons      432 non-null    object
 4   lug_boot     432 non-null    object
 5   safety       432 non-null    object
 6   score        432 non-null    int64 
 7   label_value  432 non-null    int64 
dtypes: int64(2), object(6)
memory usage: 30.4+ KB


In [28]:
pd.crosstab(df_aq.score, df_aq.persons)

persons,2,4,more
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,143,79,79
1,0,70,61


In [29]:
pd.crosstab(df_aq.label_value, df_aq.persons)

persons,2,4,more
label_value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,143,87,83
1,0,62,57


In [52]:
pd.crosstab(df_aq.score, df_aq.doors)

doors,2,3,4,5more
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,79,78,76,68
1,22,40,34,35


In [53]:
pd.crosstab(df_aq.label_value, df_aq.doors)

doors,2,3,4,5more
label_value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,81,80,81,71
1,20,38,29,32


In [48]:
df_aq.reset_index(drop=True, inline=True)
df_aq.loc[df_aq.persons == "2"].index

Int64Index([ 650,  922,  493, 1410, 1003,  789, 1244,  520,  893,  710,
            ...
            1141, 1385,  326,  164,  303,  112,  547,  406,  196,  434],
           dtype='int64', length=143)

In [46]:
# select predictions where 
y_test_person_eq_2 = y_test[df_aq.loc[df_aq.persons == "2"].index]
y_pred_person_eq_2 = pred[df_aq.loc[df_aq.persons == "2"].index]

IndexError: index 650 is out of bounds for axis 0 with size 432

In [None]:
# Run Aequitas.
# Summarize: Aequitas classes provides a few functions that provide a high level summary of fairness and disparity, such as 
# plot_fairness_group_all()
# plot_fairness_disparity_all()
# plot_disparity_all()
