Script to create the feature files for the case study on ACS Public Coverage,
used for Figure 5b and the retraining analysis in Section 6 of the manuscript.

In [29]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score

In [None]:
!pip install folktables whyshift

In [30]:
from whyshift import get_data
from folktables import ACSDataSource

def prepare_data_acs(outcome, state, year=2018):
    """Download ACS data for one state and cache the selected feature table.

    The 1-Year person-level ACS survey for ``state`` and ``year`` is downloaded
    through ``ACSDataSource``; the feature matrix and labels are then obtained
    from ``whyshift.get_data``, combined into one table with a trailing
    ``"target"`` column, and written to ``"<outcome>_<state>_<year>.csv"`` in
    the working directory.

    Parameters
    ----------
    outcome : str
        Task name forwarded to ``get_data`` (e.g. ``"pubcov"``).
    state : str
        State abbreviation forwarded to both the downloader and ``get_data``.
    year : int, default 2018
        Survey year. Used for the download, for ``get_data``, and in the
        output filename.

    Returns
    -------
    pandas.DataFrame
        The table restricted to columns whose names contain none of
        ``'relp'``, ``'occp'`` or ``'cow'``.
    """
    # Download the survey for the *requested* year. This was hard-coded to
    # '2018', so any other `year` downloaded the wrong files.
    data_source = ACSDataSource(survey_year=str(year), horizon='1-Year', survey='person')
    data_source.get_data(states=[state], download=True)

    # NOTE(review): the positional `True` is presumably whyshift's
    # preprocessing flag and './data/' the folktables download directory --
    # confirm against the whyshift API.
    X, y, feature_names = get_data(outcome, state, True, './data/', year)

    # Append the labels as the last column so features and target travel together.
    columns = feature_names + ["target"]
    Xy = np.concatenate([X, y[:, np.newaxis]], axis=1)
    df = pd.DataFrame(Xy, columns=columns)

    # Drop every column whose name mentions one of these substrings.
    excluded_substrings = ('relp', 'occp', 'cow')
    features = [
        name for name in columns
        if not any(fragment in name for fragment in excluded_substrings)
    ]
    print(len(features))

    selected = df[features]
    selected.to_csv("%s_%s_%s.csv" % (outcome, state, year))
    return selected

In [31]:
# Build (and cache to CSV) the 2018 "pubcov" tables: NE is the source state,
# LA is the target state.
sourceXWY_full = prepare_data_acs(outcome="pubcov", state="NE", year=2018)
targetXWY_full = prepare_data_acs(outcome="pubcov", state="LA", year=2018)

# Seed NumPy's global random number generator.
np.random.seed(0)

43
43


In [19]:
# Reload the per-state tables from their CSV caches; the first CSV column holds
# the saved row index.
sourceXWY_full, targetXWY_full = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("pubcov_NE_2018.csv", "pubcov_LA_2018.csv")
)

In [20]:
# Remove these race indicator columns from both tables. The list is defined
# once so the source and target tables cannot drift apart.
RACE_COLUMNS_TO_DROP = [
    'race_black',
    'race_am_ind',
    'race_alaska',
    'race_am_alaska',
    'race_asian',
    'race_hawaiian',
    'race_other',
    'race_two_or_more',
]
targetXWY_full = targetXWY_full.drop(columns=RACE_COLUMNS_TO_DROP)
sourceXWY_full = sourceXWY_full.drop(columns=RACE_COLUMNS_TO_DROP)

In [21]:
# The source column names are reused later to label the target array as well,
# so both tables must have the same columns in the same order before the
# labels are discarded by the conversion to NumPy.
assert list(sourceXWY_full.columns) == list(targetXWY_full.columns), (
    "source and target tables have different columns"
)

feature_names_XWY = sourceXWY_full.columns
sourceXWY_full = sourceXWY_full.to_numpy()
targetXWY_full = targetXWY_full.to_numpy()

In [22]:
# Show each column name next to its positional index in the NumPy arrays.
[*enumerate(feature_names_XWY)]

[(0, 'SEX'),
 (1, 'AGEP'),
 (2, 'DIS'),
 (3, 'ESP'),
 (4, 'MIG'),
 (5, 'MIL'),
 (6, 'ANC'),
 (7, 'NATIVITY'),
 (8, 'DEAR'),
 (9, 'DEYE'),
 (10, 'DREM'),
 (11, 'PINCP'),
 (12, 'FER'),
 (13, 'married'),
 (14, 'widowed'),
 (15, 'divorced'),
 (16, 'separated'),
 (17, 'never'),
 (18, 'race_white'),
 (19, 'SCHL'),
 (20, 'schl_at_least_bachelor'),
 (21, 'schl_at_least_high_school_or_ged'),
 (22, 'schl_postgrad'),
 (23, 'CIT_us'),
 (24, 'CIT_pr'),
 (25, 'CIT_abroad'),
 (26, 'CIT_citizen'),
 (27, 'CIT_not'),
 (28, 'ESR_employed'),
 (29, 'ESR_partial_employed'),
 (30, 'ESR_unemployed'),
 (31, 'ESR_armed'),
 (32, 'ESR_partial_armed'),
 (33, 'ESR_no'),
 (34, 'target')]

In [23]:
# Table of variable names: every column except SEX, AGEP and race_white, each
# tagged with the constant axis label "Variable".
feature_df = (
    pd.DataFrame(feature_names_XWY, columns=["vars_name"])
    .loc[lambda names: ~names["vars_name"].isin(['SEX', 'AGEP', 'race_white'])]
    .assign(y_axis_name="Variable")
)

# Short identifiers X1, X2, ... in row order, placed as the first column.
feature_df.insert(
    loc=0,
    column="vars",
    value=["X%d" % position for position in range(1, len(feature_df) + 1)],
)

# The export leaves out the final row (the last column name, 'target' in the
# listing above).
feature_df.iloc[:-1, :].to_csv("acs_pubcov_feature_names.csv", index=False)

In [24]:
# Display the variable-name table. The in-memory frame still contains the final
# row that the CSV export leaves out.
feature_df

Unnamed: 0,vars,vars_name,y_axis_name
2,X1,DIS,Variable
3,X2,ESP,Variable
4,X3,MIG,Variable
5,X4,MIL,Variable
6,X5,ANC,Variable
7,X6,NATIVITY,Variable
8,X7,DEAR,Variable
9,X8,DEYE,Variable
10,X9,DREM,Variable
11,X10,PINCP,Variable


In [34]:
# (rows, columns) of the source and target arrays.
# NOTE(review): the recorded output reports 43 columns although only 35 column
# names are listed after the race columns are dropped -- it looks like this
# output comes from a run without that drop; re-run all cells to refresh it.
tuple(array.shape for array in (sourceXWY_full, targetXWY_full))

((6332, 43), (16879, 43))

In [35]:
# Fixed sample sizes for the three exported splits.
source_val_size = 3000
target_size = 6000
source_train_size = 3300

# Disjoint training / validation subsets of the source rows; any rows beyond
# train_size + test_size are left unused.
sourceXWY_train, sourceXWY_val = train_test_split(
    sourceXWY_full,
    train_size=source_train_size,
    test_size=source_val_size,
    random_state=0,
)

# Only the held-out part of the target split is used. It is indexed directly
# instead of unpacking into `_`: rebinding `_` at notebook top level stops
# IPython from updating its last-output variables and keeps the discarded
# split alive in memory.
targetXWY_val = train_test_split(
    targetXWY_full,
    test_size=target_size,
    random_state=0,
)[1]

In [36]:
# Export each split as a float-valued CSV labelled with the shared column
# names. One loop replaces three copy-pasted blocks so the export settings
# cannot diverge between files; the write order is unchanged.
for split_array, csv_path in [
    (targetXWY_val, 'acs_pubcov_target.csv'),
    (sourceXWY_val, 'acs_pubcov_source_val.csv'),
    (sourceXWY_train, 'acs_pubcov_source_train.csv'),
]:
    pd.DataFrame(
        split_array.astype(float),
        columns=feature_names_XWY,
    ).to_csv(csv_path, index=False)