# Simple FairGBM example on the UCI Adult dataset

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# Fixed seed, later passed as the model's `random_state`, so re-runs are repeatable
SEED = 42

### Load data

In [3]:
from utils import load_uci_adult
# Project-local helper; its result is unpacked into a (train, test) pair
train_set, test_set = load_uci_adult()
# Preview the first rows of the training split
train_set.head()

100% [................................................................................] 5229 / 5229

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


Split features (X), labels (Y), and sensitive attributes (S).

**NOTE**: the labels (Y) and sensitive attributes (S) must be in numeric format!

In [4]:
UCI_ADULT_TARGET_COL = "target"
UCI_ADULT_SENSITIVE_COL = "sex"

def split_X_Y_S_uci_adult(data) -> tuple:
    """Splits the given UCI Adult data into features, target, and sensitive attributes.

    Parameters
    ----------
    data : pd.DataFrame
        UCI Adult data holding (at least) the target column and the
        sensitive-attribute column named by the module-level constants.

    Returns
    -------
    X, Y, S : tuple[pd.DataFrame, pd.Series, pd.Series]
        A tuple containing the dataset's features, labels, and sensitive attributes.
        All three share the index of `data`, so they stay row-aligned even
        when `data` does not have a default 0..n-1 index.
    """
    # "fnlwgt" is dropped from the features together with the label and the
    # sensitive attribute.
    ignored_cols = [UCI_ADULT_TARGET_COL, UCI_ADULT_SENSITIVE_COL, "fnlwgt"]
    feature_cols = [col for col in data.columns if col not in ignored_cols]

    X = data[feature_cols]

    # Convert label to numeric (0. for "<=50K", 1. otherwise).
    # A substring match is used so that surrounding whitespace or a trailing
    # "." in the raw label does not change the encoding.
    Y = pd.Series(
        data=[
            0. if "<=50K" in val else 1.
            for val in data[UCI_ADULT_TARGET_COL]
        ],
        index=data.index,   # keep Y aligned with X
        dtype=float)

    # Convert sensitive attributes to numeric (1. for "Female", 0. otherwise)
    # NOTE! There may be two or more sensitive groups
    # Values are stripped before comparing so " Female" is not silently
    # mapped to 0.; str() keeps non-string entries (e.g. NaN) mapping to 0.
    S = pd.Series(
        data=[
            1. if str(val).strip() == "Female" else 0.
            for val in data[UCI_ADULT_SENSITIVE_COL]
        ],
        index=data.index,   # keep S aligned with X
        dtype=float)

    return X, Y, S

In this example the sensitive attribute has only two groups (sex="Female" or sex="Male").

However, _FairGBM is **not** restricted to binary sensitive attributes_: you can use any number of groups, as long as they're encoded in a single numeric column.

In [5]:
# Apply the same features / labels / sensitive-attribute split to both partitions
X_train, Y_train, S_train = split_X_Y_S_uci_adult(train_set)
X_test, Y_test, S_test = split_X_Y_S_uci_adult(test_set)

###  Construct FairGBM model

In [6]:
from fairgbm import FairGBMClassifier

# Hyperparameters that are forwarded unchanged (via **) to the classifier below
core_lgbm_params = {
    "n_estimators": 200,
    "random_state": SEED,
    "n_jobs": -2,
}

# Instantiate
# NOTE(review): `multiplier_learning_rate` and `constraint_fnr_slack` are
# FairGBM-specific settings; confirm their exact meaning in the FairGBM docs.
fairgbm_clf = FairGBMClassifier(
    constraint_type="FNR",    # constraint on group-wise FNR; as FNR = 1 - TPR, this targets equal group-wise TPR (equal opportunity)
    multiplier_learning_rate=0.2,
    constraint_fnr_slack=0.05,
    **core_lgbm_params,
)

### Train model

In [7]:
%%time
# Train using features (X), labels (Y), and sensitive attributes (S);
# the sensitive attributes are passed through the `constraint_group` keyword
fairgbm_clf.fit(X_train, Y_train, constraint_group=S_train)

CPU times: user 35.3 s, sys: 159 ms, total: 35.4 s
Wall time: 5.8 s


### Compute binary predictions
Note that the sensitive attributes are only used in training and not for predicting!

In [8]:
# Only the features are passed here: no sensitive-attribute column is given at prediction time
Y_test_pred = fairgbm_clf.predict(X_test)

### Compute Performance and Fairness metrics

In [9]:
from sklearn.metrics import accuracy_score
from utils import compute_fairness_ratio

# Fraction of test rows whose predicted label matches the true label
acc_val = accuracy_score(Y_test, Y_test_pred)
# NOTE(review): presumably a ratio of group-wise TPRs across the groups in
# S_test (equal opportunity) — confirm in utils.compute_fairness_ratio
eq_op_val = compute_fairness_ratio(Y_test, Y_test_pred, S_test, "TPR")

In [10]:
# Assemble the report first, then emit it with a single print call;
# both metrics are rendered as percentages with one decimal place.
report_lines = [
    "FairGBM:",
    f"\tAccuracy: {acc_val:.1%}",
    f"\tFairness: {eq_op_val:.1%}",
]
print("\n".join(report_lines))

FairGBM:
	Accuracy: 87.2%
	Fairness: 96.1%
