In [1]:
import pandas as pd
import numpy as np
import pickle
from datetime import datetime

# ACS API Queries
Variables: https://api.census.gov/data/2019/acs/acs1/variables.html

In [8]:
import requests
import json
df_model = pd.DataFrame()
host = 'https://api.census.gov/data'
years = ['2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020','2021','2022']
dataset_acronym = '/acs/acs5'
g = '?get='
variables = 'NAME,B02001_001E,B02001_002E,B02001_003E,B02001_004E,B02001_005E,B02001_006E,B03001_003E,B06009_001E,B06009_005E,B06009_006E,B06009_007E,B19013_001E,B25035_001E,B25036_002E,B25036_013E,B25077_001E,B25039_001E'
location = '&for=TRACT:*&in=state:36%county:005,047,061,081,085'
usr_key = f"&key=332de56715b7f43a68ee347091c29afad759be55"

for year in years:
    print(year)
    query_url = f"{host}/{year}{dataset_acronym}{g}{variables}{location}{usr_key}"
    
    # Use requests package to call out to the API
    response = requests.get(query_url)
    
    ## Rename columns for clarity
    
    table_lists = json.loads(response.text)
    df = pd.DataFrame(table_lists[1:], columns=table_lists[0])
    df['year'] = year
    
    col_names = {"B02001_001E":"total_pop",
                 "B02001_002E":"percent_white",
                 "B02001_003E":"percent_black",
                 "B02001_004E":"percent_native",
                 "B02001_005E":"percent_asian",
                 "B02001_006E":"percent_pacific",
                 "B03001_003E":"percent_latino",
                 "B06009_001E":"total_edu",
                 "B06009_005E":"percent_bachelors",
                 "B06009_006E":"percent_grad",
                 "B06009_007E":"percent_instate",
                 "B19013_001E":"med_income",
                 "B25035_001E":"med_yr_built",
                 "B25036_002E":"percent_owned",
                 "B25036_013E":"percent_rented",
                 "B25077_001E":"med_value",
                 "B25039_001E":"med_yr_moved_in"}
    df.rename(columns= col_names, inplace=True)

    
    ## GENERATE FEATURES
    
    # change counts into percentages
    for idx in [2,3,4,5,6,7,11]:
        df.iloc[:,idx] = df.iloc[:,idx].astype(float)/df.iloc[:,1].astype(float)
    
    for idx in [9,10]:
        df.iloc[:,idx] = df.iloc[:,idx].astype(float)/df.iloc[:,8].astype(float)
    
    df['total_hh'] = df.iloc[:,14].astype(float) +df.iloc[:,15].astype(float)
    for idx in [14,15]:
        df.iloc[:,idx] = df.iloc[:,idx].astype(float)/df['total_hh']
    
    # change year into number of years from present
    
    df['med_yr_built'] = df['med_yr_built'].replace("-",np.nan)
    df['med_yr_built'] = df['med_yr_built'].apply(lambda x: str(x).replace("-",""))
    df['med_yr_built'] = df['med_yr_built'].apply(lambda x: str(x).replace("+",""))
    current_year = datetime.now().year
    df['med_hh_age'] = current_year - df['med_yr_built'].astype(float)
    
    df['med_yr_moved_in']= df['med_yr_moved_in'].replace("-",np.nan)
    df['med_yr_moved_in'] = df['med_yr_moved_in'].apply(lambda x: str(x).replace("+",""))
    current_year = datetime.now().year
    df['med_hh_residence'] = current_year - df['med_yr_moved_in'].astype(float)
    
    df.drop(["total_pop", "total_edu", "med_yr_moved_in","med_yr_built", "state","county","total_hh"],axis=1,inplace=True)
    
    # JOIN THIS YEAR INTO THE REST
    df_model=pd.concat([df_model,df])
    

2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022


In [9]:
### encode the counties

conditions = [df_model['NAME'].str.contains("Bronx"),
             df_model['NAME'].str.contains("New York County"),
             df_model['NAME'].str.contains("Kings"),
             df_model['NAME'].str.contains("Queens"),
             df_model['NAME'].str.contains("Richmond")]
choices = ['005','061','047','081','085']

df_model['county'] = np.select(conditions, choices, -1)
df_model['full_tract'] = df_model.county + df_model.tract

In [11]:
# remove nulls
df_model.drop(df_model[df_model.percent_white.isnull()].index, inplace=True) # drop these bc they are entirely null

# remove rows with no med_income (no label)
df_model.drop(df_model[df_model.med_income.astype(float)<0].index, inplace=True)

# sort dataframe on tracts and years, then backfill missing vals
df_model.sort_values(['full_tract','year'], inplace=True)
df_model = df_model.bfill(limit=5)

df_model.drop(['tract','county'], axis=1, inplace=True)

### TODO -> address -66666666 values later!!!
df_model['med_value'] = df_model['med_value'].astype(float)

  df_model = df_model.bfill(limit=5)


In [12]:
df_model

Unnamed: 0,NAME,percent_white,percent_black,percent_native,percent_asian,percent_pacific,percent_latino,percent_bachelors,percent_grad,percent_instate,med_income,percent_owned,percent_rented,med_value,year,med_hh_age,med_hh_residence,full_tract
203,"Census Tract 2, Bronx County, New York",0.145124,0.309146,0.008866,0.042697,0.0,0.715119,0.102853,0.039775,0.231918,59826,0.992405,0.007595,392500.0,2010,76.0,22.0,005000200
1182,"Census Tract 2, Bronx County, New York",0.165115,0.321145,0.008858,0.036339,0.0,0.695889,0.141304,0.036879,0.214626,63449,0.956407,0.043593,422900.0,2011,72.0,20.0,005000200
1142,"Census Tract 2, Bronx County, New York",0.176806,0.318547,0.007816,0.057879,0.0,0.658428,0.157368,0.042918,0.250739,71250,1.000000,0.000000,422200.0,2012,74.0,22.0,005000200
1082,"Census Tract 2, Bronx County, New York",0.278304,0.309592,0.007205,0.053314,0.0,0.666118,0.131664,0.036086,0.237546,69514,1.000000,0.000000,418100.0,2013,71.0,22.0,005000200
95,"Census Tract 2, Bronx County, New York",0.315211,0.279293,0.001374,0.064966,0.0,0.687929,0.123925,0.022828,0.244553,74837,1.000000,0.000000,410400.0,2014,64.0,21.0,005000200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
730,"Census Tract 323, Richmond County, New York",0.399824,0.509666,0.000000,0.006151,0.0,0.318102,0.127686,0.069532,0.358524,71250,0.756818,0.243182,270200.0,2018,31.0,18.0,085032300
46,"Census Tract 323, Richmond County, New York",0.465983,0.391426,0.000000,0.014911,0.0,0.422181,0.132791,0.062331,0.375582,65089,0.694013,0.305987,267000.0,2019,33.0,19.0,085032300
1419,"Census Tract 323, Richmond County, New York",0.386667,0.345833,0.000000,0.012500,0.0,0.323333,0.123762,0.237624,0.417500,95469,0.838202,0.161798,256600.0,2020,33.0,19.0,085032300
2325,"Census Tract 323, Richmond County, New York",0.305882,0.390045,0.000000,0.029864,0.0,0.297738,0.093315,0.207521,0.396380,86471,0.752381,0.247619,237100.0,2021,33.0,19.0,085032300


In [14]:
# df_model.to_parquet("Data/Cleaned/ACS.parquet")