# Imports and Data Setup Code (from Spec Sheet)

In [11]:
from folktables import folktables
from folktables import ACSDataSource
import numpy as np

# (Age) must be greater than 16 and less than 90,
# and (Person weight) must be greater than or equal to 1
def employment_filter(data):
    """Filters for the employment prediction task"""
    df = data
    df = df[df['AGEP'] > 16]
    df = df[df['AGEP'] < 90]
    df = df[df['PWGTP'] >= 1]
    return df

ACSEmployment = folktables.BasicProblem(
    features=[
        'AGEP',  # age; for range of values of features please check Appendix B.4 of Retiring Adult: New Datasets for Fair Machine Learning NeurIPS 2021 paper
        'SCHL',  # educational attainment
        'MAR',   # marital status
        'RELP',  # relationship
        'DIS',   # disability recode
        'ESP',   # employment status of parents
        'CIT',   # citizenship status
        'MIG',   # mobility status (lived here 1 year ago)
        'MIL',   # military service
        'ANC',   # ancestry recode
        'NATIVITY',  # nativity
        'DEAR',   # hearing difficulty
        'DEYE',   # vision difficulty
        'DREM',   # cognitive difficulty
        'SEX',    # sex
        'RAC1P',  # recoded detailed race code
        'GCL',    # grandparents living with grandchildren
    ],
    target='ESR',  # employment status recode
    target_transform=lambda x: x == 1,
    group='DIS',
    preprocess=employment_filter,
    postprocess=lambda x: np.nan_to_num(x, -1),
)

data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=["FL"], download=True)  # data for Florida state

features, label, group = ACSEmployment.df_to_numpy(acs_data)

from aif360.datasets import StandardDataset
import pandas as pd
import numpy as np

data = pd.DataFrame(features, columns=ACSEmployment.features)
data['label'] = label

favorable_classes = [True]
protected_attribute_names = [ACSEmployment.group]
privileged_classes = np.array([[1]])

data_for_aif = StandardDataset(
    data,
    label_name='label',
    favorable_classes=favorable_classes,
    protected_attribute_names=protected_attribute_names,
    privileged_classes=privileged_classes
)

privileged_groups = [{'DIS': 1}]
unprivileged_groups = [{'DIS': 2}]

  df.loc[pos, label_name] = favorable_label


# Train test split

In [77]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from aif360.metrics import ClassificationMetric

# Split the dataset into train-val and test sets
train_and_val_data, test_data = data_for_aif.split([0.7], shuffle=True, seed=0)

# Split train-val set into train and val sets
train_data, val_data = train_and_val_data.split([0.8], shuffle=True, seed=0)

# Normalize the train and val datasets
scale_orig = StandardScaler()
x_train = scale_orig.fit_transform(train_data.features)
y_train = train_data.labels.ravel()
x_val = scale_orig.fit_transform(val_data.features)
y_val = val_data.labels.ravel()

# Model
learner = LogisticRegression(solver='liblinear', random_state=0)
learner.fit(x_train,y_train)
predictions = learner.predict(x_val)

val_pred = val_data.copy()
val_pred.labels = predictions
print("Accuracy", sum(predictions==y_val)/len(y_val))

metric = ClassificationMetric(val_data, val_pred, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
metric_arrs = {}
#Statistical Parity Difference measures the difference of the above values instead of ratios, hence we
#would like it to be close to 0.
metric_arrs['stat_par_diff']=(metric.statistical_parity_difference())
#Equal opportunity difference measures the ability of the classifier to accurately classify a datapoint as positive
#regardless of the presence of the unpriviliged feature. We would like it to be close to 0. A negative value signals bias
#towards priviliged.
metric_arrs['eq_opp_diff']=(metric.equal_opportunity_difference())
#Average of difference in FPR and TPR for unprivileged and privileged groups. A value of 0 indicates equality of odds.
metric_arrs['avg_odds_diff']=(metric.average_odds_difference())
#Balanced accuracy is a general metric, not dependent on bias. We would like to have it close to 1, meaning
#that the classifier can equally detect positive and negative classes.
metric_arrs['bal_acc']=((metric.true_positive_rate() + metric.true_negative_rate()) / 2)
#We would like Disparate Impact to be close to 1. It measures the ratio between the likelihood of the class being
#predicted as positive if we have the unpriviliged feature and the the same likelihood with the priviliged feature.
#Values close to 0 indicate strong bias.
metric_arrs['disp_imp']=(metric.disparate_impact())
print(metric_arrs)





Accuracy 0.7481962173931606
{'stat_par_diff': 0.5933665124477165, 'eq_opp_diff': 0.6025930026276827, 'avg_odds_diff': 0.49263644954306124, 'bal_acc': 0.7457300442774388, 'disp_imp': 9.355185504501474}
