# Imports

In [7]:
from folktables import folktables
from folktables import ACSDataSource
import numpy as np
from custom_functions import *
from aif360.datasets import StandardDataset
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from aif360.metrics import ClassificationMetric
import pandas as pd
import joblib
from aif360.algorithms.preprocessing import Reweighing

# Data Setup Code (from Spec Sheet)

In [2]:
# (Age) must be greater than 16 and less than 90,
# and (Person weight) must be greater than or equal to 1
def employment_filter(data):
    """Filters for the employment prediction task"""
    df = data
    df = df[df['AGEP'] > 16]
    df = df[df['AGEP'] < 90]
    df = df[df['PWGTP'] >= 1]
    return df

ACSEmployment = folktables.BasicProblem(
    features=[
        'AGEP',  # age; for range of values of features please check Appendix B.4 of Retiring Adult: New Datasets for Fair Machine Learning NeurIPS 2021 paper
        'SCHL',  # educational attainment
        'MAR',   # marital status
        'RELP',  # relationship
        'DIS',   # disability recode
        'ESP',   # employment status of parents
        'CIT',   # citizenship status
        'MIG',   # mobility status (lived here 1 year ago)
        'MIL',   # military service
        'ANC',   # ancestry recode
        'NATIVITY',  # nativity
        'DEAR',   # hearing difficulty
        'DEYE',   # vision difficulty
        'DREM',   # cognitive difficulty
        'SEX',    # sex
        'RAC1P',  # recoded detailed race code
        'GCL',    # grandparents living with grandchildren
    ],
    target='ESR',  # employment status recode
    target_transform=lambda x: x == 1,
    group='DIS',
    preprocess=employment_filter,
    postprocess=lambda x: np.nan_to_num(x, -1),
)

data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=["FL"], download=True)  # data for Florida state

features, label, group = ACSEmployment.df_to_numpy(acs_data)

data = pd.DataFrame(features, columns=ACSEmployment.features)
data['label'] = label

favorable_classes = [True]
protected_attribute_names = [ACSEmployment.group]
privileged_classes = np.array([[1]])

data_for_aif = StandardDataset(
    data,
    label_name='label',
    favorable_classes=favorable_classes,
    protected_attribute_names=protected_attribute_names,
    privileged_classes=privileged_classes
)

privileged_groups = [{'DIS': 1}]
unprivileged_groups = [{'DIS': 2}]

  df.loc[pos, label_name] = favorable_label


# Task 1

In [3]:
# Split the dataset into train-val and test sets
train_and_val_data, test_data = data_for_aif.split([0.7], shuffle=True, seed=0)

results = grid_search_models(train_and_val_data)

Training model with C = 1e-08 and solver = newton-cg


  results = pd.concat([results, new_result], ignore_index=True)


Training model with C = 1e-08 and solver = lbfgs
Training model with C = 1e-08 and solver = liblinear
Training model with C = 1e-08 and solver = sag
Training model with C = 1e-08 and solver = saga
Training model with C = 1e-07 and solver = newton-cg
Training model with C = 1e-07 and solver = lbfgs
Training model with C = 1e-07 and solver = liblinear
Training model with C = 1e-07 and solver = sag
Training model with C = 1e-07 and solver = saga
Training model with C = 1e-06 and solver = newton-cg
Training model with C = 1e-06 and solver = lbfgs
Training model with C = 1e-06 and solver = liblinear
Training model with C = 1e-06 and solver = sag
Training model with C = 1e-06 and solver = saga
Training model with C = 1e-05 and solver = newton-cg
Training model with C = 1e-05 and solver = lbfgs
Training model with C = 1e-05 and solver = liblinear
Training model with C = 1e-05 and solver = sag
Training model with C = 1e-05 and solver = saga
Training model with C = 0.0001 and solver = newton-cg

### Training and Validation Results

In [4]:
# Find highest accuracy and lowest EOD
highest_accuracy = results['Mean accuracy'].max()
lowest_eod = results['Mean EOD'].min() 

# Find lowest EOD that is not 0
lowest_nonzero_eod = results.loc[results['Mean EOD'] != 0]['Mean EOD'].abs().min()

# Find the corresponding C and solver values
best_accuracy = results.loc[results['Mean accuracy'] == highest_accuracy]
best_eod = results.loc[results['Mean EOD'] == lowest_eod]
best_nonzero_eod = results.loc[(results['Mean EOD'] == lowest_nonzero_eod) | (results['Mean EOD'] == -lowest_nonzero_eod)]

print(f"Best accuracy:\n", best_accuracy, f'\n')
print(f"Best EOD:\n", best_eod, f'\n')
print(f"Lowest non-zero EOD:\n", best_nonzero_eod, f'\n')

Best accuracy:
        C Solver  Mean accuracy  Mean EOD
31  0.01  lbfgs       0.751253  0.615774 

Best EOD:
               C     Solver  Mean accuracy  Mean EOD
0  1.000000e-08  newton-cg       0.526081       0.0
1  1.000000e-08      lbfgs       0.526081       0.0
3  1.000000e-08        sag       0.526081       0.0
4  1.000000e-08       saga       0.526081       0.0
5  1.000000e-07  newton-cg       0.526081       0.0
6  1.000000e-07      lbfgs       0.526081       0.0
8  1.000000e-07        sag       0.526081       0.0
9  1.000000e-07       saga       0.526081       0.0 

Lowest non-zero EOD:
            C     Solver  Mean accuracy  Mean EOD
10  0.000001  newton-cg       0.542484  0.019263
11  0.000001      lbfgs       0.542262  0.019263
14  0.000001       saga       0.542390  0.019263 



### Test Results

In [5]:
# Load the best model
model_accuracy = joblib.load('std_model_accuracy.joblib')
model_eod = joblib.load('std_model_eod.joblib')

# Normalize the test dataset
scale_orig = StandardScaler()
x_test = scale_orig.fit_transform(test_data.features)
y_test = test_data.labels.ravel()

# Model prediction
predictions_accuracy = model_accuracy.predict(x_test)
predictions_eod = model_eod.predict(x_test)

# Create test dataset with predictions
test_pred_accuracy = test_data.copy()
test_pred_accuracy.labels = predictions_accuracy
test_pred_eod = test_data.copy()
test_pred_eod.labels = predictions_eod

# Metrics
metrics_best_accuracy = ClassificationMetric(test_data, test_pred_accuracy, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
best_accuracy_accuracy = metrics_best_accuracy.accuracy()
best_accuracy_eod = metrics_best_accuracy.equal_opportunity_difference()

metrics_best_eod = ClassificationMetric(test_data, test_pred_eod, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
best_eod_accuracy = metrics_best_eod.accuracy()
best_eod_eod = metrics_best_eod.equal_opportunity_difference()

# Print results
print("Most accurate model:")
print(f"Accuracy: {best_accuracy_accuracy}")
print(f"EOD: {best_accuracy_eod}", f'\n')
print("Best EOD model:")
print(f"Accuracy: {best_eod_accuracy}")
print(f"EOD: {best_eod_eod}")


Most accurate model:
Accuracy: 0.7523310487727127
EOD: 0.6736529561484942 

Best EOD model:
Accuracy: 0.525880618425247
EOD: 0.0


# Task 2

In [8]:
# Split the dataset into train-val and test sets
train_and_val_data, test_data = data_for_aif.split([0.7], shuffle=True, seed=0)

results = grid_search_models(train_and_val_data, reweight=True)

Training model with C = 1e-08 and solver = newton-cg


  results = pd.concat([results, new_result], ignore_index=True)


Training model with C = 1e-08 and solver = lbfgs
Training model with C = 1e-08 and solver = liblinear
Training model with C = 1e-08 and solver = sag
Training model with C = 1e-08 and solver = saga
Training model with C = 1e-07 and solver = newton-cg
Training model with C = 1e-07 and solver = lbfgs
Training model with C = 1e-07 and solver = liblinear
Training model with C = 1e-07 and solver = sag
Training model with C = 1e-07 and solver = saga
Training model with C = 1e-06 and solver = newton-cg
Training model with C = 1e-06 and solver = lbfgs
Training model with C = 1e-06 and solver = liblinear
Training model with C = 1e-06 and solver = sag
Training model with C = 1e-06 and solver = saga
Training model with C = 1e-05 and solver = newton-cg
Training model with C = 1e-05 and solver = lbfgs
Training model with C = 1e-05 and solver = liblinear
Training model with C = 1e-05 and solver = sag
Training model with C = 1e-05 and solver = saga
Training model with C = 0.0001 and solver = newton-cg

### Results Analysis

In [None]:
# find highest accuracy and lowest EOD
highest_accuracy = results['Mean accuracy'].max()
lowest_eod = results['Mean EOD'].min() 

# find lowest EOD that is not 0
lowest_nonzero_eod = results.loc[results['Mean EOD'] != 0]['Mean EOD'].abs().min()

# find the corresponding C and solver values
best_accuracy = results.loc[results['Mean accuracy'] == highest_accuracy]
best_eod = results.loc[results['Mean EOD'] == lowest_eod]
best_nonzero_eod = results.loc[(results['Mean EOD'] == lowest_nonzero_eod) | (results['Mean EOD'] == -lowest_nonzero_eod)]

print(f"Best accuracy:\n", best_accuracy, f'\n')
print(f"Best EOD:\n", best_eod, f'\n')
print(f"Lowest non-zero EOD:\n", best_nonzero_eod, f'\n')

Best accuracy:
         C     Solver  Mean accuracy  Mean EOD
25  0.001  newton-cg        0.72007 -0.018474
29  0.001       saga        0.72007 -0.018457 

Best EOD:
        C Solver  Mean accuracy  Mean EOD
34  0.01   saga       0.719925 -0.022143 

Lowest non-zero EOD:
         C Solver  Mean accuracy  Mean EOD
28  0.001    sag       0.720036 -0.018224 



### Test Results

In [None]:
# Load the best model
model_accuracy = joblib.load('fair_model_accuracy.joblib')
model_eod = joblib.load('fair_model_eod.joblib')

# Normalize the test dataset
scale_orig = StandardScaler()
x_test = scale_orig.fit_transform(test_data.features)
y_test = test_data.labels.ravel()

# Model prediction
predictions_accuracy = model_accuracy.predict(x_test)
predictions_eod = model_eod.predict(x_test)

# Create test dataset with predictions
test_pred_accuracy = test_data.copy()
test_pred_accuracy.labels = predictions_accuracy
test_pred_eod = test_data.copy()
test_pred_eod.labels = predictions_eod

# Metrics
metrics_best_accuracy = ClassificationMetric(test_data, test_pred_accuracy, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
best_accuracy_accuracy = metrics_best_accuracy.accuracy()
best_accuracy_eod = metrics_best_accuracy.equal_opportunity_difference()

metrics_best_eod = ClassificationMetric(test_data, test_pred_eod, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
best_eod_accuracy = metrics_best_eod.accuracy()
best_eod_eod = metrics_best_eod.equal_opportunity_difference()

# Print results
print("Most accurate model:")
print(f"Accuracy: {best_accuracy_accuracy}")
print(f"EOD: {best_accuracy_eod}", f'\n')
print("Best EOD model:")
print(f"Accuracy: {best_eod_accuracy}")
print(f"EOD: {best_eod_eod}")

Most accurate model:
Accuracy: 0.7197760599298693
EOD: 0.004663503874752228 

Best EOD model:
Accuracy: 0.525880618425247
EOD: 0.0


# Task 3