# Auditing Allocative Bias

In [17]:
from folktables import ACSDataSource, ACSEmployment, BasicProblem, adult_filter
import numpy as np

STATE = "MA"

data_source = ACSDataSource(survey_year='2018', 
                            horizon='1-Year', 
                            survey='person')

acs_data = data_source.get_data(states=[STATE], download=True)

acs_data.head()

Downloading data for 2018 1-Year person survey for MA...


Unnamed: 0,RT,SERIALNO,DIVISION,SPORDER,PUMA,REGION,ST,ADJINC,PWGTP,AGEP,...,PWGTP71,PWGTP72,PWGTP73,PWGTP74,PWGTP75,PWGTP76,PWGTP77,PWGTP78,PWGTP79,PWGTP80
0,P,2018GQ0000024,1,1,3301,1,25,1013097,47,77,...,47,46,4,92,46,50,49,4,89,4
1,P,2018GQ0000063,1,1,1600,1,25,1013097,16,18,...,33,30,16,16,18,2,18,31,16,15
2,P,2018GQ0000075,1,1,703,1,25,1013097,60,28,...,110,116,57,8,60,107,60,62,109,110
3,P,2018GQ0000088,1,1,3301,1,25,1013097,72,22,...,71,74,10,10,129,128,10,73,128,70
4,P,2018GQ0000098,1,1,701,1,25,1013097,21,50,...,37,18,24,0,39,19,20,39,19,36


In [18]:
possible_features=['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL', 'ANC', 'NATIVITY', 'DEAR', 'DEYE', 'DREM', 'SEX', 'RAC1P', 'ESR']
acs_data[possible_features].head()


Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR
0,77,19.0,3,16,2,,1,3.0,4.0,1,1,2,2,2.0,2,1,6.0
1,18,18.0,5,17,2,,1,1.0,4.0,2,1,2,2,2.0,2,9,1.0
2,28,21.0,5,17,2,,1,1.0,4.0,2,1,2,2,2.0,1,1,1.0
3,22,19.0,5,17,2,,1,1.0,4.0,1,1,2,2,2.0,1,1,6.0
4,50,1.0,5,17,1,,1,1.0,4.0,1,1,2,1,1.0,2,1,6.0


In [19]:
features_to_use = [f for f in possible_features if f not in ["ESR", "RAC1P"]]

In [20]:
EmploymentProblem = BasicProblem(
    features=features_to_use,
    target='ESR',
    target_transform=lambda x: x == 1,
    group='RAC1P',
    preprocess=lambda x: x,
    postprocess=lambda x: np.nan_to_num(x, -1),
)

features, label, group = EmploymentProblem.df_to_numpy(acs_data)

In [21]:
for obj in [features, label, group]:
  print(obj.shape)

(70131, 15)
(70131,)
(70131,)


In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    features, label, group, test_size=0.2, random_state=0)

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

In [24]:
y_hat = model.predict(X_test)

In [25]:
(y_hat == y_test).mean()

0.7803521779425394

In [26]:
(y_hat == y_test)[group_test == 1].mean()

0.7833114897335081

In [27]:
(y_hat == y_test)[group_test == 2].mean()

0.7806122448979592