# 2022 American Community Survey (ACS)
### Here we create the ACS Income and ACS Travel datasets
#### https://en.wikipedia.org/wiki/American_Community_Survey
#### See also the data dictionary: https://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMS_Data_Dictionary_2022.pdf

In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

import folktables
from folktables import ACSDataSource

In [2]:
# Person-level records of the 2022 1-Year ACS, restricted to California.
data_source = ACSDataSource(
    survey_year='2022',
    horizon='1-Year',
    survey='person',
)
ca_data = data_source.get_data(
    states=["CA"],
    download=True,
)

In [165]:
# Income prediction task on the 2022 data.
# The variable 'RELP' is not available in the 2022 1-Year ACS, so it is not
# part of the feature list.
ACSIncomeNew = folktables.BasicProblem(
    features=[
        'AGEP',
        'COW',
        'SCHL',
        'MAR',
        'OCCP',
        'POBP',
        'WKHP',
        'SEX',
        'RAC1P',
    ],
    target='PINCP',
    # Binary target: PINCP strictly above 50,000.
    target_transform=lambda x: x > 50000,
    group='RAC1P',
    preprocess=folktables.adult_filter,
    # Missing values are encoded as 0.0. This used to be written as
    # `np.nan_to_num(x, -1)`, which looks like "replace NaN with -1" but is
    # not: the second positional parameter of `nan_to_num` is `copy`, so NaNs
    # were already replaced by the default 0.0. The keyword form keeps that
    # exact behavior and makes it visible.
    postprocess=lambda x: np.nan_to_num(x, nan=0.0),
)
X_data, y_data, _ = ACSIncomeNew.df_to_pandas(ca_data)

In [67]:
# Human-readable column names for the income dataset, listed in the same
# order as the features of ACSIncomeNew (ACS variable code on the right).
feature_names = [
    "Age",                                         # AGEP
    "Class of worker",                             # COW
    "Educational attainment",                      # SCHL
    "Marital status",                              # MAR
    "Occupation",                                  # OCCP
    "Place of birth",                              # POBP
    "Usual hours worked per week past 12 months",  # WKHP
    "Sex",                                         # SEX
    "Recoded race",                                # RAC1P
]

In [167]:
def _read_code_table(path, key_end, value_start):
    """Parse a fixed-width data-dictionary file into a {code: label} dict.

    Each non-blank line contributes one entry: the integer code is read from
    ``line[:key_end]`` and the label is ``line[value_start:]``.

    Blank lines (for example the empty string produced by a trailing newline
    at the end of the file) are skipped instead of raising ``ValueError``.
    """
    table = {}
    with open(path, 'r') as file:
        for line in file.read().split('\n'):
            if not line.strip():
                continue
            table[int(line[:key_end])] = line[value_start:]
    return table


# COW
COW_dict = _read_code_table('PUMS_Data_Dictionary_2022/COW.txt', 1, 2)

# SCHL
SCHL_dict = _read_code_table('PUMS_Data_Dictionary_2022/SCHL.txt', 2, 3)

# MAR
MAR_dict = _read_code_table('PUMS_Data_Dictionary_2022/MAR.txt', 1, 2)

# OCCP
OCCP_dict = _read_code_table('PUMS_Data_Dictionary_2022/OCCP.txt', 4, 10)
# Code 9920 is set by hand, overriding whatever OCCP.txt provides for it.
OCCP_dict[9920] = "Unemployed, With No Work Experience In The Last 5 Years Or Earlier Or Never Worked"

# POBP
POBP_dict = _read_code_table('PUMS_Data_Dictionary_2022/POBP.txt', 3, 5)

# SEX
SEX_dict = {1: "Male", 2: "Female"}

# RAC1P
RAC1P_dict = _read_code_table('PUMS_Data_Dictionary_2022/RAC1P.txt', 1, 3)

In [69]:
# Build the text version of the income features on a copy, so the numeric
# X_data stays untouched.
X_data_llm = X_data.copy()

# Age and weekly hours stay numeric but are shown as whole numbers.
for numeric_column in ('AGEP', 'WKHP'):
    X_data_llm[numeric_column] = X_data_llm[numeric_column].astype(int)

# Every coded column is replaced by its data-dictionary label; codes that are
# missing from a dictionary turn into NaN.
income_code_tables = {
    'COW': COW_dict,
    'SCHL': SCHL_dict,
    'MAR': MAR_dict,
    'OCCP': OCCP_dict,
    'POBP': POBP_dict,
    'SEX': SEX_dict,
    'RAC1P': RAC1P_dict,
}
for coded_column, code_table in income_code_tables.items():
    X_data_llm[coded_column] = X_data_llm[coded_column].map(code_table)

In [70]:
# Spell the boolean income target out as text.
# NOTE(review): the target is `PINCP > 50000`, so the False label also covers
# an income of exactly $50,000.
income_labels = {
    True: "More than $50,000 per year.",
    False: "Less than $50,000 per year.",
}
y_data_llm = y_data['PINCP'].map(income_labels)

In [71]:
# Preview the first rows of the decoded income features.
X_data_llm.head()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,WKHP,SEX,RAC1P
0,26,Employee of a private for-profit company or bu...,"1 or more years of college credit, no degree",Never married or under 15 years old,Food Service Managers,Mexico,30,Female,Some Other Race alone
1,38,Federal government employee,Regular high school diploma,Divorced,Photographers,Arizona/AZ,40,Female,White alone
2,23,Federal government employee,Regular high school diploma,Never married or under 15 years old,Military Enlisted Tactical Operations And Air/...,Montana/MT,40,Male,Two or More Races
3,20,Employee of a private for-profit company or bu...,"1 or more years of college credit, no degree",Never married or under 15 years old,Fast Food And Counter Workers,Nevada/NV,20,Male,Some Other Race alone
4,20,Federal government employee,Regular high school diploma,Never married or under 15 years old,"Military, Rank Not Specified",Tennessee/TN,50,Female,Two or More Races


### Save the LLM data to CSV

In [None]:
# Put the decoded features and the text target side by side.
df = pd.concat((X_data_llm, y_data_llm), axis='columns')
# Replace the ACS variable codes with the readable names; the target is last.
df.columns = [*feature_names, 'Target']
# NOTE(review): the travel dataset is written to '../' rather than '../csv/'
# — confirm which output directory is intended.
df.to_csv('../csv/acs-income-2022.csv', index=False)
df.head()

# ACSTravelTime

In [184]:
# from https://github.com/socialfoundations/folktables/blob/731b8d1470d36bbc1821e1953ba1308303eef95f/folktables/acs.py#L213
def travel_time_filter(data):
    """
    Select the records used for the travel time prediction task.

    Keeps the rows whose AGEP is above 16, whose PWGTP is at least 1 and
    whose ESR equals 1. Rows with a missing value in any of these columns
    are dropped. The input frame is not modified.
    """
    old_enough = data['AGEP'] > 16
    positive_weight = data['PWGTP'] >= 1
    esr_is_one = data['ESR'] == 1
    return data[old_enough & positive_weight & esr_is_one]

# Travel time prediction task on the 2022 data.
# 'RELP', 'JWTR' and 'POVPIP' are not included in the feature list.
ACSTravelTimeNew = folktables.BasicProblem(
    features=[
        'AGEP',
        'SCHL',
        'MAR',
        'SEX',
        'DIS',
        'ESP',
        'MIG',
        'RAC1P',
        'PUMA',
        'ST',
        'CIT',
        'OCCP',
        'POWPUMA',
        'POWSP',  # needed for pre-processing POWPUMA
    ],
    target="JWMNP",
    # Binary target: JWMNP strictly above 20.
    target_transform=lambda x: x > 20,
    group='RAC1P',
    preprocess=travel_time_filter,
    # Missing values are encoded as 0.0. This used to be written as
    # `np.nan_to_num(x, -1)`, which looks like "replace NaN with -1" but is
    # not: the second positional parameter of `nan_to_num` is `copy`, so NaNs
    # were already replaced by the default 0.0. The keyword form keeps that
    # exact behavior; the label tables built later in this notebook contain
    # entries for code 0 (e.g. MIG), so 0.0 must stay the fill value.
    postprocess=lambda x: np.nan_to_num(x, nan=0.0),
)
X_data, y_data, _ = ACSTravelTimeNew.df_to_pandas(ca_data)

In [185]:
# Display the numeric travel-time features.
X_data

Unnamed: 0,AGEP,SCHL,MAR,SEX,DIS,ESP,MIG,RAC1P,PUMA,ST,CIT,OCCP,POWPUMA,POWSP
0,26.0,19.0,5.0,2.0,1.0,0.0,1.0,8.0,8303.0,6.0,4.0,310.0,8300.0,6.0
1,47.0,14.0,3.0,2.0,2.0,0.0,3.0,1.0,3712.0,6.0,1.0,8990.0,7300.0,6.0
2,18.0,16.0,5.0,2.0,2.0,0.0,3.0,1.0,11301.0,6.0,1.0,3960.0,5900.0,6.0
3,21.0,16.0,5.0,1.0,2.0,0.0,1.0,2.0,116.0,6.0,1.0,4720.0,100.0,6.0
4,23.0,20.0,5.0,1.0,2.0,0.0,1.0,1.0,8702.0,6.0,1.0,5850.0,8700.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177719,32.0,22.0,5.0,2.0,1.0,0.0,1.0,6.0,5918.0,6.0,1.0,2004.0,5900.0,6.0
177720,33.0,19.0,1.0,2.0,2.0,0.0,1.0,9.0,2908.0,6.0,1.0,2016.0,2900.0,6.0
177721,53.0,16.0,1.0,2.0,2.0,0.0,1.0,8.0,3752.0,6.0,5.0,5000.0,3700.0,6.0
177722,51.0,11.0,1.0,1.0,2.0,0.0,1.0,8.0,3752.0,6.0,5.0,9620.0,3700.0,6.0


In [186]:
# Ballpark the amount of signal in the numeric features by fitting a
# gradient boosted tree on an 80/20 train/test split.
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

model = xgb.XGBClassifier(random_state=42)
model.fit(X_train, y_train)

# Accuracy on the held-out split; the trailing bare name makes the notebook
# display the value.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7010268673512449

In [187]:
def _dictionary_lines(path):
    """Return the non-blank lines of a data-dictionary text file.

    Blank lines (for example the empty string produced by a trailing newline
    at the end of the file) are dropped so the fixed-width parsing below does
    not fail on ``int('')``.
    """
    with open(path, 'r') as file:
        return [line for line in file.read().split('\n') if line.strip()]


# ESP: one-character code, label starts at column 6.
ESP_dict = {
    int(line[:1].strip()): line[6:]
    for line in _dictionary_lines('PUMS_Data_Dictionary_2022/ESP.txt')
}

# DIS
DIS_dict = {1: "With a disability", 2: "Without a disability"}

# PUMA: nested as {state code: {PUMA code: area name}}.
# Columns 0-1 hold the state code, columns 3-7 the PUMA code and the area
# name starts at column 9.
PUMA_dict = {}
for line in _dictionary_lines('PUMS_Data_Dictionary_2022/PUMA.txt'):
    state_code = int(line[:2])
    puma_code = int(line[3:8])
    PUMA_dict.setdefault(state_code, {})[puma_code] = line[9:].strip()

# ST: the first character of the stripped label is dropped
# (presumably a leading separator in ST.txt — TODO confirm).
ST_dict = {
    int(line[:2].strip()): line[3:].strip()[1:]
    for line in _dictionary_lines('PUMS_Data_Dictionary_2022/ST.txt')
}

# CIT
CIT_dict = {
    1: "Born in the United States",
    2: "Born in Puerto Rico, Guam, the U.S. Virgin Islands, or Northern Marianas",
    3: "Born abroad of U.S. citizen parent or parents",
    4: "U.S. citizen by naturalization",
    5: "Not a U.S. citizen",
}

# MIG
# The label for code 0 used to start with a stray "." (".N/A ..."); it is
# removed so the text matches the other "N/A" labels.
MIG_dict = {
    0: "N/A (less than 1 year old)",
    1: "Yes, same house (nonmovers)",
    2: "No, outside US and Puerto Rico",
    3: "No, different house in US or Puerto Rico",
}

In [188]:
from os.path import commonprefix


def _place_of_work_area(state_code, powpuma_code):
    """Return the area name for a (POWSP, POWPUMA) pair of integer codes.

    POWPUMA codes are coarser than PUMA codes, so there is no direct entry
    for them in ``PUMA_dict``. The 100 PUMA codes starting at
    ``powpuma_code`` are looked up for the given state and the text their
    names have in common is used as the name of the larger area.

    Returns "N/A" when the state code has no entry in ``PUMA_dict`` and an
    empty string when none of the 100 candidate codes exists.
    """
    state_pumas = PUMA_dict.get(state_code)
    if state_pumas is None:
        return "N/A"
    sub_area_names = [
        state_pumas[powpuma_code + offset]
        for offset in range(100)
        if (powpuma_code + offset) in state_pumas
    ]
    # The shared prefix often ends in " (" (e.g. "Orange County ("). Strip the
    # parenthesis and then the whitespace in front of it; without the final
    # strip() the name kept a trailing space.
    return commonprefix(sub_area_names).strip().strip('(').strip()


# Build the text version of the travel features on a copy, so the numeric
# X_data stays untouched.
X_data_llm = X_data.copy()

X_data_llm['AGEP'] = X_data_llm['AGEP'].astype(int)
X_data_llm['SCHL'] = X_data_llm['SCHL'].map(SCHL_dict)
X_data_llm['MAR'] = X_data_llm['MAR'].map(MAR_dict)
X_data_llm['SEX'] = X_data_llm['SEX'].map(SEX_dict)
X_data_llm['ESP'] = X_data_llm['ESP'].map(ESP_dict)
X_data_llm['DIS'] = X_data_llm['DIS'].map(DIS_dict)

# PUMA: area names are keyed by state first, so ST (still numeric at this
# point) is needed for the lookup. Iterating over the two columns avoids
# building a row object for every record.
X_data_llm['PUMA'] = [
    PUMA_dict[int(st)][int(puma)]
    for st, puma in zip(X_data_llm['ST'], X_data_llm['PUMA'])
]

# POWPUMA: resolve every distinct (POWSP, POWPUMA) pair once and reuse the
# result for all rows that share it.
work_codes = [
    (int(st), int(puma))
    for st, puma in zip(X_data_llm['POWSP'], X_data_llm['POWPUMA'])
]
work_area_by_code = {
    code: _place_of_work_area(*code) for code in set(work_codes)
}
X_data_llm['POWPUMA'] = [work_area_by_code[code] for code in work_codes]

# POWSP was only needed to resolve POWPUMA.
X_data_llm = X_data_llm.drop(columns=['POWSP'])

X_data_llm['MIG'] = X_data_llm['MIG'].map(MIG_dict)
X_data_llm['CIT'] = X_data_llm['CIT'].map(CIT_dict)
X_data_llm['ST'] = X_data_llm['ST'].map(ST_dict)
X_data_llm['RAC1P'] = X_data_llm['RAC1P'].map(RAC1P_dict)
X_data_llm['OCCP'] = X_data_llm['OCCP'].map(OCCP_dict)

In [189]:
# Preview the first rows of the decoded travel-time features.
X_data_llm.head()

Unnamed: 0,AGEP,SCHL,MAR,SEX,DIS,ESP,MIG,RAC1P,PUMA,ST,CIT,OCCP,POWPUMA
0,26,"1 or more years of college credit, no degree",Never married or under 15 years old,Female,With a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",Some Other Race alone,Santa Barbara County--South Coast Region,California/CA,U.S. citizen by naturalization,Food Service Managers,Santa Barbara County
1,47,Grade 11,Divorced,Female,Without a disability,"N/A (not own child of householder, and not chi...","No, different house in US or Puerto Rico",White alone,Los Angeles County (East Central)--Pomona City,California/CA,Born in the United States,"Miscellaneous Production Workers, Including Eq...",San Diego County
2,18,Regular high school diploma,Never married or under 15 years old,Female,Without a disability,"N/A (not own child of householder, and not chi...","No, different house in US or Puerto Rico",White alone,Yolo County (North)--Woodland & Winters Cities...,California/CA,Born in the United States,Other Protective Service Workers,Orange County
3,21,Regular high school diploma,Never married or under 15 years old,Male,Without a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",Black or African American alone,Alameda County (Central)--Hayward City (East),California/CA,Born in the United States,Cashiers,Alameda County
4,23,Associate's degree,Never married or under 15 years old,Male,Without a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",White alone,Santa Cruz County (South & Coastal)--Santa Cru...,California/CA,Born in the United States,"Mail Clerks And Mail Machine Operators, Except...",Santa Cruz County


In [190]:
# Readable column names for the travel dataset, one per remaining column of
# X_data_llm and in the same order.
# NOTE(review): "Recorded race" differs from the "Recoded race" label used
# for the income dataset — confirm which spelling is intended before
# changing the published column name.
X_data_llm.columns = [
    "Age",
    "Educational attainment",
    "Marital status",
    "Sex",
    "Disability",
    "Employment status of parents",
    "Lived here 1 year ago",
    "Recorded race",
    "Living Area",
    "State",
    "Citizenship",
    "Occupation",
    "Place of Work Area",
]

In [191]:
# Display the decoded travel-time features with their readable column names.
X_data_llm

Unnamed: 0,Age,Educational attainment,Marital status,Sex,Disability,Employment status of parents,Lived here 1 year ago,Recorded race,Living Area,State,Citizenship,Occupation,Place of Work Area
0,26,"1 or more years of college credit, no degree",Never married or under 15 years old,Female,With a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",Some Other Race alone,Santa Barbara County--South Coast Region,California/CA,U.S. citizen by naturalization,Food Service Managers,Santa Barbara County
1,47,Grade 11,Divorced,Female,Without a disability,"N/A (not own child of householder, and not chi...","No, different house in US or Puerto Rico",White alone,Los Angeles County (East Central)--Pomona City,California/CA,Born in the United States,"Miscellaneous Production Workers, Including Eq...",San Diego County
2,18,Regular high school diploma,Never married or under 15 years old,Female,Without a disability,"N/A (not own child of householder, and not chi...","No, different house in US or Puerto Rico",White alone,Yolo County (North)--Woodland & Winters Cities...,California/CA,Born in the United States,Other Protective Service Workers,Orange County
3,21,Regular high school diploma,Never married or under 15 years old,Male,Without a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",Black or African American alone,Alameda County (Central)--Hayward City (East),California/CA,Born in the United States,Cashiers,Alameda County
4,23,Associate's degree,Never married or under 15 years old,Male,Without a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",White alone,Santa Cruz County (South & Coastal)--Santa Cru...,California/CA,Born in the United States,"Mail Clerks And Mail Machine Operators, Except...",Santa Cruz County
...,...,...,...,...,...,...,...,...,...,...,...,...,...
177719,32,Master's degree,Never married or under 15 years old,Female,With a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",Asian alone,Orange County (Central)--Costa Mesa & Fountain...,California/CA,Born in the United States,Mental Health Counselors,Orange County
177720,33,"1 or more years of college credit, no degree",Married,Female,Without a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",Two or More Races,Kern County (Central)--Bakersfield City (South...,California/CA,Born in the United States,Social And Human Service Assistants,Kern County
177721,53,Regular high school diploma,Married,Female,Without a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",Some Other Race alone,Los Angeles County (South)--South Gate & Lynwo...,California/CA,Not a U.S. citizen,First-Line Supervisors Of Office And Administr...,Los Angeles County
177722,51,Grade 8,Married,Male,Without a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",Some Other Race alone,Los Angeles County (South)--South Gate & Lynwo...,California/CA,Not a U.S. citizen,"Laborers And Freight, Stock, And Material Move...",Los Angeles County


In [192]:
# Give the target column its readable name, then spell the boolean out as text.
# NOTE(review): the target is `JWMNP > 20`, so the False label also covers a
# travel time of exactly 20 minutes.
travel_time_labels = {
    True: "More than 20 minutes",
    False: "Less than 20 minutes",
}
y_data.columns = ["Travel Time to Work"]
y_data = y_data['Travel Time to Work'].map(travel_time_labels)

In [193]:
# Put the readable features and the text target side by side; both already
# carry their final column names.
df = pd.concat((X_data_llm, y_data), axis='columns')

# NOTE(review): the income dataset is written to '../csv/' rather than '../'
# — confirm which output directory is intended.
df.to_csv('../acs-travel-2022.csv', index=False)
df.head()

Unnamed: 0,Age,Educational attainment,Marital status,Sex,Disability,Employment status of parents,Lived here 1 year ago,Recorded race,Living Area,State,Citizenship,Occupation,Place of Work Area,Travel Time to Work
0,26,"1 or more years of college credit, no degree",Never married or under 15 years old,Female,With a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",Some Other Race alone,Santa Barbara County--South Coast Region,California/CA,U.S. citizen by naturalization,Food Service Managers,Santa Barbara County,Less than 20 minutes
1,47,Grade 11,Divorced,Female,Without a disability,"N/A (not own child of householder, and not chi...","No, different house in US or Puerto Rico",White alone,Los Angeles County (East Central)--Pomona City,California/CA,Born in the United States,"Miscellaneous Production Workers, Including Eq...",San Diego County,More than 20 minutes
2,18,Regular high school diploma,Never married or under 15 years old,Female,Without a disability,"N/A (not own child of householder, and not chi...","No, different house in US or Puerto Rico",White alone,Yolo County (North)--Woodland & Winters Cities...,California/CA,Born in the United States,Other Protective Service Workers,Orange County,Less than 20 minutes
3,21,Regular high school diploma,Never married or under 15 years old,Male,Without a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",Black or African American alone,Alameda County (Central)--Hayward City (East),California/CA,Born in the United States,Cashiers,Alameda County,Less than 20 minutes
4,23,Associate's degree,Never married or under 15 years old,Male,Without a disability,"N/A (not own child of householder, and not chi...","Yes, same house (nonmovers)",White alone,Santa Cruz County (South & Coastal)--Santa Cru...,California/CA,Born in the United States,"Mail Clerks And Mail Machine Operators, Except...",Santa Cruz County,Less than 20 minutes
