# Census Bureau ACS 1-Year - ETL

This notebook extracts ACS Supplemental Data at the 1-year estimate level of analysis for targeted geographies at meso levels of analysis. 

This ETL process uses `COUNTY` as the reference geography from which all other geographies are based. For example: if the Texas county of Bexar is the `COUNTY` of reference, data associated with any `CD<current_congress>`, `PLACE`, `PUMA20`, and `ZIP` geographies that intersect with Bexar `COUNTY` (have any overlapping areas) will also be collected for analysis. 

*Note: running this notebook requires Shapefiles for `CD<current_congress>`, `COUNTY`, `PLACE` (Texas), and `PUMA20`.*

## Extract

### Preparation

In [1]:
import pandas as pd
import geopandas as gpd
import warnings
import requests
import os
import sqlalchemy

In [2]:
"""
user variables
"""
# if initial run (if True, initializes databases, etc.)
initializing = True
# reference county or counties, as FIPS state + county code
county_or_counties = ['48029']
# specify the data source by year
year = '2022'
ignore_database = True
debug = True
"""
end user variables
"""

'\nend user variables\n'

In [3]:
# list of variables for ACS SE data set: https://api.census.gov/data/2019/acs/acsse/variables.html - 49 most relevant data sets below
tables = {'population_by_sex': ['K200101'],
              'population_by_age': ['K200104'],
              'race': ['K200201'],
              'hispanic_or_latino_origin': ['K200301'],
              "citizenship_status": ['K200501'],
              "place_of_birth_within_US": ['K200503'],
              "geographic_mobility_within_US": ['K200701'],
              "means_of_transport_to_work": ['K200801'],
              "travel_time_to_work": ['K200802'],
              "household_type": ['K200901'],
              'marital_status': ['K201001'],
              'own_children_under_18_by_family_type': ['K201101'],
              'presence_of_people_over_60yo_by_household_type': ['K201102'],
              'school_enrollment_by_school_lvl_for_population_3yo+': ['K201401'],
              'educational_attainment_for_population_25yo+': ['K201501'],
              'household_language': ['K201601'],
              'poverty_status_past_12_months_by_age': ['K201701'],
              'ratio_income_to_poverty_past_12_months': ['K201702'],
              'poverty_status_by_household_type_past_12_months': ['K201703'],
              'disability_status_by_age': ['K201801'],
              'work_experience_by_disability_status': ['K201802'],
              'types_of_disabilities': ['K201803'],
              'household_income_past_12_months': ['K201901'],
              'median_household_income_past_12_months': ['K201902'],
              'family_income_past_12_months': ['K201903'],   # see https://www2.census.gov/library/publications/decennial/2020/census-briefs/c2020br-10.pdf for difference between "household" and "family"
              'median_family_income_past_12_months': ['K201904'],
              'median_nonfamily_household_income_past_12_months': ['K201905'],
              'median_earnings_past_12_months_by_sex_by_work_experience_16yo+': ['K202002'],
              'veteran_status': ['K202101'],
              'service_connected_disability_rating_status_for_veterans': ['K202102'],
              'receipt_of_SNAP_benefits_past_12_months_by_presence_of_children': ['K202201'],
              'employment_status_population_16yo+': ['K202301'],
              'sex_by_full-time_work_status_population_16-to-64yo': ['K202302'],
              'occupation_for_employed_population_16yo+': ['K202401'],
              'class_of_worker_for_employed_population_16-to-64yo': ['K202402'],
              'industry_for_employed_population_16-to-64yo': ['K202403'],
              'housing_occupancy_status': ['K202501'],
              'housing_tenure': ['K202502'],    # see https://www2.census.gov/library/publications/decennial/2020/census-briefs/c2020br-09.pdf for more definition: "housing tenure identifies whether a housing unit is owner- or renter-occupied"
              'total_population_in_occupied_housing_units_by_tenure': ['K202503'],
              'year_householder_moved_into_unit': ['K202506'],
              'gross_rent': ['K202507'],
              'mortgage_status': ['K202508'],
              'housing_value': ['K202509'],
              'median_housing_value': ['K202510'],
              'median_gross_rent': ['K202511'],
              'age_by_health_insurance_coverage_status': ['K202701'],
              'private_health_insurance_status': ['K202702'],
              'public_health_insurance_status': ['K202703'],
              'computer_presence_and_internet_subscription_type_in_household': ['K202801']}

In [4]:
# loads GeoDataFrame from Shapefiles for reference geographies and turns county UCGIDs into an iterable list
county_ucgids_list = []
county_ucgids_list_of_lists = []
target_counties_gdf = gpd.GeoDataFrame()

counties_gdf = gpd.read_file('data/geospatial_files/shapefiles/counties/tl_2023_us_county.shp')
counties_gdf.set_crs(epsg='3395', inplace=True)
for county in county_or_counties:
    county_gdf = counties_gdf[counties_gdf['GEOID'] == county]
    county_ucgids_list_of_lists.append(list(counties_gdf['GEOIDFQ'][counties_gdf['GEOID'] == county])) 
    target_counties_gdf = pd.concat([target_counties_gdf, county_gdf])

for ucgid in county_ucgids_list_of_lists:
    county_ucgids_list.append(ucgid[0])

target_counties_gdf.head()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,GEOIDFQ,NAME,NAMELSAD,LSAD,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
615,48,29,1383800,48029,0500000US48029,Bexar,Bexar County,6,H1,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.327 29.210, -98.327 29.210, -98...."


In [5]:
# loads GeoDataFrame from Shapefiles for `CD<current_congress>` geographies based on reference geographies
congressional_districts_gdf = gpd.read_file('data/geospatial_files/shapefiles/congressional_districts/118th_congress/tl_2023_48_cd118.shp')
congressional_districts_gdf.set_crs(epsg='3395', inplace=True)
# creates overlay, keeping only polygons that exist in both GeoDataFrames
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    congressional_districts_by_county_gdf = congressional_districts_gdf.overlay(target_counties_gdf, how='intersection')
# creates list of UCGIDs to use as inputs for API caller
congressional_districts_by_county_ucgid_list = list(congressional_districts_by_county_gdf['GEOIDFQ_1'])

congressional_districts_by_county_gdf.head()

Unnamed: 0,STATEFP_1,CD118FP,GEOID_1,GEOIDFQ_1,NAMELSAD_1,LSAD_1,CDSESSN,MTFCC_1,FUNCSTAT_1,ALAND_1,...,MTFCC_2,CSAFP,CBSAFP,METDIVFP,FUNCSTAT_2,ALAND_2,AWATER_2,INTPTLAT_2,INTPTLON_2,geometry
0,48,23,4823,5001800US4823,Congressional District 23,C2,118,G5200,N,152261432812,...,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.805 29.692, -98.803 29.695, -98...."
1,48,28,4828,5001800US4828,Congressional District 28,C2,118,G5200,N,29415114978,...,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.484 29.141, -98.484 29.142, -98...."
2,48,35,4835,5001800US4835,Congressional District 35,C2,118,G5200,N,1348685093,...,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.543 29.428, -98.543 29.428, -98...."
3,48,20,4820,5001800US4820,Congressional District 20,C2,118,G5200,N,464891989,...,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.788 29.501, -98.788 29.501, -98...."
4,48,21,4821,5001800US4821,Congressional District 21,C2,118,G5200,N,16309930932,...,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.314 29.602, -98.314 29.602, -98...."


In [6]:
# loads GeoDataFrame from Shapefiles for `PLACE` geographies based on reference geographies
places_gdf = gpd.read_file('data/geospatial_files/shapefiles/places/tl_2023_48_place.shp')
places_gdf.set_crs(epsg='3395', inplace=True)
# creates overlay, keeping only polygons that exist in both GeoDataFrames
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    places_by_county_gdf = places_gdf.overlay(target_counties_gdf, how='intersection')
# creates list of UCGIDs to use as inputs for API caller
places_by_county_ucgid_list = list(places_by_county_gdf['GEOIDFQ_1'])

places_by_county_gdf.head()

Unnamed: 0,STATEFP_1,PLACEFP,PLACENS,GEOID_1,GEOIDFQ_1,NAME_1,NAMELSAD_1,LSAD_1,CLASSFP_1,PCICBSA,...,MTFCC_2,CSAFP,CBSAFP,METDIVFP,FUNCSTAT_2,ALAND_2,AWATER_2,INTPTLAT_2,INTPTLON_2,geometry
0,48,67268,2411878,4867268,1600000US4867268,Shavano Park,Shavano Park city,25,C1,N,...,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.576 29.592, -98.576 29.592, -98...."
1,48,64172,2412593,4864172,1600000US4864172,St. Hedwig,St. Hedwig town,43,C1,N,...,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.272 29.421, -98.272 29.421, -98...."
2,48,74408,2412134,4874408,1600000US4874408,Universal City,Universal City city,25,C1,N,...,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.330 29.539, -98.327 29.541, -98...."
3,48,33146,2410736,4833146,1600000US4833146,Helotes,Helotes city,25,C1,N,...,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.728 29.531, -98.727 29.532, -98...."
4,48,68708,2411926,4868708,1600000US4868708,Somerset,Somerset city,25,C1,N,...,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.674 29.228, -98.674 29.228, -98...."


In [7]:
# loads GeoDataFrame from Shapefiles for `PUMA20` geographies based on reference geographies
pumas_gdf = gpd.read_file('data/geospatial_files/shapefiles/pumas/tl_2023_48_puma20.shp')
pumas_gdf.set_crs(epsg='3395', inplace=True)
# creates overlay, keeping only polygons that exist in both GeoDataFrames
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    pumas_by_county_gdf = pumas_gdf.overlay(target_counties_gdf, how='intersection')
# creates list of UCGIDs to use as inputs for API caller
pumas_by_county_ucgid_list = list(pumas_by_county_gdf['GEOIDFQ20'])

pumas_by_county_gdf.head()

Unnamed: 0,STATEFP20,PUMACE20,GEOID20,GEOIDFQ20,NAMELSAD20,MTFCC20,FUNCSTAT20,ALAND20,AWATER20,INTPTLAT20,...,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,48,5907,4805907,795P200US4805907,Bexar County (South)--San Antonio City (Far So...,G6120,S,1271464920,33914665,29.3069306,...,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.305 29.237, -98.307 29.235, -98...."
1,48,5908,4805908,795P200US4805908,San Antonio City (West)--Between Loop TX-1604 ...,G6120,S,68356557,216036,29.4400578,...,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.671 29.389, -98.672 29.389, -98...."
2,48,5914,4805914,795P200US4805914,Bexar County (Northwest)--San Antonio (Far Nor...,G6120,S,473825286,874684,29.595835,...,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.716 29.435, -98.716 29.434, -98...."
3,48,5903,4805903,795P200US4805903,San Antonio City (Southeast)--Inside Loop I-41...,G6120,S,89668983,423359,29.3672916,...,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.437 29.413, -98.437 29.413, -98...."
4,48,5906,4805906,795P200US4805906,San Antonio City (Southwest)--Inside Loop I-41...,G6120,S,81573819,255474,29.3425496,...,G4020,484,41700,,A,3212426728,40788666,29.4486708,-98.5201465,"POLYGON ((-98.504 29.320, -98.504 29.320, -98...."


In [8]:
# grabs "crosswalk" table for name-label-concept list available through Census Bureau website that contains names for each individual field in each table, which will be used to programmatically give human-readable names to DataFrame/database columns
# crosswalk_df = pd.DataFrame()
crosswalk_url = f'https://api.census.gov/data/{year}/acs/acsse/variables/'

crosswalk_response = requests.get(crosswalk_url)
if crosswalk_response.status_code == 200:
    crosswalk_df = pd.DataFrame(crosswalk_response.json())
else:
    print('API mapper failure', crosswalk_response.status_code, crosswalk_response.text)

# convert first row into column headers, then deletes the row
crosswalk_df.columns = crosswalk_df.iloc[0]
crosswalk_df = crosswalk_df[1:]
# removes `ucgid` row, since this DataFrame is used as source to convert primary DataFrame column dtypes from objects to ints, and converting this column to int will fail
crosswalk_df.drop([3], inplace=True)

crosswalk_df.head()

Unnamed: 0,name,label,concept
1,for,Census API FIPS 'for' clause,Census API Geography Specification
2,in,Census API FIPS 'in' clause,Census API Geography Specification
4,K202101_002E,Estimate!!Total:!!Veteran:,Veteran Status for the Civilian Population 18 ...
5,K200201_006E,Estimate!!Total:!!Native Hawaiian and Other Pa...,Race
6,K202505_006E,Estimate!!Total:!!Built 1940 to 1959,Year Structure Built


In [9]:
# iterates through list of tables being collected AND through the list of GEOIDs collected from all targeted geographies to create an API URL call for each
base_url = f'https://api.census.gov/data/{year}/acs/acsse'

ucgid_list = county_ucgids_list + congressional_districts_by_county_ucgid_list + places_by_county_ucgid_list + pumas_by_county_ucgid_list

api_call_url_list = []
for key, value in tables.items():
    for county in county_ucgids_list:
        # ucgid docs - https://www.census.gov/data/developers/guidance/api-user-guide.Ucgid_Predicate.html 
        # ucgid ex.  - api.census.gov/data/2022/acs/acs1/profile?get=NAME,DP05_0001E&ucgid=0400000US06,0400000US41
        data_url = f'{base_url}?get=group({value[0]})&ucgid={",".join(ucgid_list)}'
        api_call_url_list.append(data_url)
        
api_call_url_list[0]

'https://api.census.gov/data/2022/acs/acsse?get=group(K200101)&ucgid=0500000US48029,5001800US4823,5001800US4828,5001800US4835,5001800US4820,5001800US4821,1600000US4867268,1600000US4864172,1600000US4874408,1600000US4833146,1600000US4868708,1600000US4865344,1600000US4866704,1600000US4865000,1600000US4875764,1600000US4872296,1600000US4879672,1600000US4825168,1600000US4801600,1600000US4805384,1600000US4813276,1600000US4814716,1600000US4816468,1600000US4823272,1600000US4845288,1600000US4831100,1600000US4833968,1600000US4834628,1600000US4839448,1600000US4842388,1600000US4843096,1600000US4866128,1600000US4814920,1600000US4853988,1600000US4817811,1600000US4845576,1600000US4840036,1600000US4866089,1600000US4860608,1600000US4873057,795P200US4805907,795P200US4805908,795P200US4805914,795P200US4805903,795P200US4805906,795P200US4805916,795P200US4805913,795P200US4805902,795P200US4805905,795P200US4805915,795P200US4805901,795P200US4805909,795P200US4805910,795P200US4805911,795P200US4805912,795P200US4805

### Extraction

In [10]:
# calls the API with a single URL containing one group of tables (max allowed) and COUNTY, PLACE, and CD<current_congress> geographies (only returns any of these geographies containing more than 20,000 total population)
def api_caller(url):
    r = requests.get(url)
    if r.status_code == 200:
        return r
    else:
        print(r.status_code)
        print(r.text)

In [11]:
# iterates through the list of URLs, calling the API caller once for each URL, and joins all the results into one DataFrame - joining process requires removing any columns that will be duplicated, else DataFrame merge will fail
df = pd.DataFrame()

for index, url in enumerate(api_call_url_list):
    # calls the API caller
    response = api_caller(url)
    # converts API response JSON object into a local-scope DataFrame
    temp_df = pd.DataFrame(response.json())
    # converts first row into column headers, then deletes row
    temp_df.columns = temp_df.iloc[0]
    temp_df = temp_df[1:]
    for series_name, series in temp_df.items():
        # drops the 'NAME' column for all but the first DataFrame
        if series_name == 'NAME':
            if index > 0:
                temp_df.drop(columns=[series_name], inplace=True)
        if series_name == 'GEO_ID':
            temp_df.drop(columns=[series_name], inplace=True)
    # if this is the first run, set non-local scope DataFrame, otherwise merge local and non-local DataFrames
    if index == 0:
        df = temp_df
    else:
        try: 
            df = df.merge(temp_df, on='ucgid')
        except (KeyError, IndexError):
            print('error on merge')
    if debug is True:
        if index > 1:
            break

In [12]:
df.shape

(27, 78)

## Transform

Once we've loaded the API data into memory, we can modify the data to exclude unnecesary fields before saving to the database. 

In [13]:
# remove columns representing annotations of estimates (*EA), margins of error (*M), and annotations of margins of error (*MA)
df.drop(columns=df.columns[df.columns.str.endswith(('EA', 'M', 'MA'))], inplace=True)

for series_name in df.columns:
    with warnings.catch_warnings():
        if crosswalk_df['name'].str.contains(series_name).any():
            # if the crosswalk contains the name of the DataFrame column (i.e., for any data column as opposed to names, descriptors, etc. ) replace table name based on key to one based on descriptor, then strip spaces, punctuation, etc. and replace with underscores for easier data manipulation and normalization, then convert from Series object to int dtype
            new_label = str(crosswalk_df[crosswalk_df['name'].str.contains(series_name)][['concept', 'label']].values)
            new_label = new_label.replace('[', '').replace(']', '').replace('\' \'', '__').replace(' ', '_').replace('\'', '').replace('!!', '_').replace(':', '').lower()
            try:
                df = df.astype({series_name: 'int'})
            except TypeError:
                pass
            df.rename(columns={series_name: new_label + '__' + series_name}, inplace=True, errors='raise')

In [14]:
df.shape

(27, 21)

The following cells separate out each geographic level of analysis into their own DataFrame - one each for `COUNTY`, `PLACE`, `CD<congressional_term>`, and `PUMA`. 

Once they are separated out, they are merged with their associated GeoDataFrame in order to save the GeoDataFrame's `geography` column, which contains the Shapefile polygons that can be used for geospatial analysis. 

In [15]:
# the following cells separate out each geography level of analysis into its own DataFrame - one each for COUNTY, PLACE, CD<congressional_term>, and PUMA
county_df = df[df['ucgid'].str.startswith('050')]
final_county_df = pd.merge(county_df, county_gdf[['GEOIDFQ', 'geometry']], left_on='ucgid', right_on='GEOIDFQ')

final_county_df.head()

Unnamed: 0,NAME,population_by_sex__estimate_total__K200101_001E,population_by_sex__estimate_total_male__K200101_002E,population_by_sex__estimate_total_female__K200101_003E,ucgid,population_by_age__estimate_total__K200104_001E,population_by_age__estimate_total_under_18_years__K200104_002E,population_by_age__estimate_total_18_to_24_years__K200104_003E,population_by_age__estimate_total_25_to_34_years__K200104_004E,population_by_age__estimate_total_35_to_44_years__K200104_005E,...,race__estimate_total__K200201_001E,race__estimate_total_white_alone__K200201_002E,race__estimate_total_black_or_african_american_alone__K200201_003E,race__estimate_total_american_indian_and_alaska_native_alone__K200201_004E,race__estimate_total_asian_alone__K200201_005E,race\n__estimate_total_native_hawaiian_and_other_pacific_islander_alone__K200201_006E,race__estimate_total_some_other_race_alone__K200201_007E,race__estimate_total_two_or_more_races__K200201_008E,GEOIDFQ,geometry
0,"Bexar County, Texas",2059530,1024634,1034896,0500000US48029,2059530,506192,215838,320508,296861,...,2059530,858997,160295,27710,66438,3075,231716,711299,0500000US48029,"POLYGON ((-98.327 29.210, -98.327 29.210, -98...."


In [16]:
place_df = df[df['ucgid'].str.startswith('160')]
final_place_df = pd.merge(place_df, places_gdf[['GEOIDFQ', 'geometry']], left_on='ucgid', right_on='GEOIDFQ')

final_place_df.head()

Unnamed: 0,NAME,population_by_sex__estimate_total__K200101_001E,population_by_sex__estimate_total_male__K200101_002E,population_by_sex__estimate_total_female__K200101_003E,ucgid,population_by_age__estimate_total__K200104_001E,population_by_age__estimate_total_under_18_years__K200104_002E,population_by_age__estimate_total_18_to_24_years__K200104_003E,population_by_age__estimate_total_25_to_34_years__K200104_004E,population_by_age__estimate_total_35_to_44_years__K200104_005E,...,race__estimate_total__K200201_001E,race__estimate_total_white_alone__K200201_002E,race__estimate_total_black_or_african_american_alone__K200201_003E,race__estimate_total_american_indian_and_alaska_native_alone__K200201_004E,race__estimate_total_asian_alone__K200201_005E,race\n__estimate_total_native_hawaiian_and_other_pacific_islander_alone__K200201_006E,race__estimate_total_some_other_race_alone__K200201_007E,race__estimate_total_two_or_more_races__K200201_008E,GEOIDFQ,geometry
0,"Cibolo city, Texas",34807,17825,16982,1600000US4814920,34807,9377,4075,3118,5878,...,34807,16029,6126,1217,1742,0,3211,6482,1600000US4814920,"MULTIPOLYGON (((-98.146 29.532, -98.145 29.532..."
1,"Converse city, Texas",29597,13336,16261,1600000US4816468,29597,9216,1879,4948,4660,...,29597,9714,6604,0,1136,0,2139,10004,1600000US4816468,"POLYGON ((-98.342 29.536, -98.342 29.536, -98...."
2,"San Antonio city, Texas",1472904,736985,735919,1600000US4865000,1472904,352299,163135,240895,203135,...,1472904,585287,99643,17594,45471,1789,182468,540652,1600000US4865000,"MULTIPOLYGON (((-98.305 29.455, -98.304 29.456..."
3,"Schertz city, Texas",45567,23010,22557,1600000US4866128,45567,10794,2983,3600,9914,...,45567,19325,4153,518,1637,0,5190,14744,1600000US4866128,"MULTIPOLYGON (((-98.201 29.509, -98.201 29.509..."
4,"Timberwood Park CDP, Texas",40601,18702,21899,1600000US4873057,40601,12299,3193,3846,6402,...,40601,21913,2099,0,802,0,1264,14523,1600000US4873057,"POLYGON ((-98.523 29.678, -98.523 29.678, -98...."


In [17]:
congressional_district_df = df[df['ucgid'].str.startswith('500')]
final_congressional_district_df = pd.merge(congressional_district_df, congressional_districts_gdf[['GEOIDFQ', 'geometry']], left_on='ucgid', right_on='GEOIDFQ')

final_congressional_district_df

Unnamed: 0,NAME,population_by_sex__estimate_total__K200101_001E,population_by_sex__estimate_total_male__K200101_002E,population_by_sex__estimate_total_female__K200101_003E,ucgid,population_by_age__estimate_total__K200104_001E,population_by_age__estimate_total_under_18_years__K200104_002E,population_by_age__estimate_total_18_to_24_years__K200104_003E,population_by_age__estimate_total_25_to_34_years__K200104_004E,population_by_age__estimate_total_35_to_44_years__K200104_005E,...,race__estimate_total__K200201_001E,race__estimate_total_white_alone__K200201_002E,race__estimate_total_black_or_african_american_alone__K200201_003E,race__estimate_total_american_indian_and_alaska_native_alone__K200201_004E,race__estimate_total_asian_alone__K200201_005E,race\n__estimate_total_native_hawaiian_and_other_pacific_islander_alone__K200201_006E,race__estimate_total_some_other_race_alone__K200201_007E,race__estimate_total_two_or_more_races__K200201_008E,GEOIDFQ,geometry
0,"Congressional District 20 (118th Congress), Texas",781188,392007,389181,5001800US4820,781188,189637,91966,122944,115857,...,781188,296043,48095,9425,33417,487,98412,295309,5001800US4820,"POLYGON ((-98.788 29.501, -98.788 29.501, -98...."
1,"Congressional District 23 (118th Congress), Texas",778355,396569,381786,5001800US4823,778355,203320,80918,109003,108626,...,778355,339088,26683,10982,17723,750,94047,289082,5001800US4823,"POLYGON ((-106.514 32.001, -106.510 32.001, -1..."
2,"Congressional District 28 (118th Congress), Texas",777758,386933,390825,5001800US4828,777758,217927,76243,108506,103976,...,777758,248104,36557,8341,7458,903,93044,383351,5001800US4828,"POLYGON ((-100.212 28.197, -100.212 28.197, -1..."
3,"Congressional District 35 (118th Congress), Texas",802077,403919,398158,5001800US4835,802077,173723,96034,163125,119418,...,802077,322027,94886,8388,29017,749,103894,243116,5001800US4835,"POLYGON ((-98.543 29.427, -98.543 29.428, -98...."
4,"Congressional District 21 (118th Congress), Texas",807859,402499,405360,5001800US4821,807859,175078,70471,101525,108224,...,807859,523543,31169,5672,15981,955,48611,181928,5001800US4821,"POLYGON ((-100.064 29.711, -100.064 29.711, -1..."


In [19]:
puma_df = df[df['ucgid'].str.startswith('795')]
final_puma_df = pd.merge(puma_df, pumas_gdf[['GEOIDFQ20', 'geometry']], left_on='ucgid', right_on='GEOIDFQ20')

final_puma_df.head()

Unnamed: 0,NAME,population_by_sex__estimate_total__K200101_001E,population_by_sex__estimate_total_male__K200101_002E,population_by_sex__estimate_total_female__K200101_003E,ucgid,population_by_age__estimate_total__K200104_001E,population_by_age__estimate_total_under_18_years__K200104_002E,population_by_age__estimate_total_18_to_24_years__K200104_003E,population_by_age__estimate_total_25_to_34_years__K200104_004E,population_by_age__estimate_total_35_to_44_years__K200104_005E,...,race__estimate_total__K200201_001E,race__estimate_total_white_alone__K200201_002E,race__estimate_total_black_or_african_american_alone__K200201_003E,race__estimate_total_american_indian_and_alaska_native_alone__K200201_004E,race__estimate_total_asian_alone__K200201_005E,race\n__estimate_total_native_hawaiian_and_other_pacific_islander_alone__K200201_006E,race__estimate_total_some_other_race_alone__K200201_007E,race__estimate_total_two_or_more_races__K200201_008E,GEOIDFQ20,geometry
0,San Antonio City (Southwest)--Inside Loop I-41...,116206,59026,57180,795P200US4805906,116206,32103,14417,13727,14397,...,116206,40651,3664,1263,937,274,27907,41510,795P200US4805906,"POLYGON ((-98.504 29.320, -98.504 29.320, -98...."
1,Bexar County (South)--San Antonio City (Far So...,155224,79243,75981,795P200US4805907,155224,38131,16498,25816,25524,...,155224,70927,12639,2864,2119,0,15238,51437,795P200US4805907,"POLYGON ((-98.305 29.237, -98.305 29.237, -98...."
2,"San Antonio (Northeast), Kirby & Windcrest Cit...",135956,67598,68358,795P200US4805913,135956,30607,13982,21932,16461,...,135956,44962,25751,1775,4233,336,21124,37775,795P200US4805913,"POLYGON ((-98.317 29.459, -98.317 29.459, -98...."
3,San Antonio City (Northwest)--Inside Loop I-41...,101698,52478,49220,795P200US4805904,101698,24053,10393,12265,15770,...,101698,30071,1467,1065,135,245,8187,60528,795P200US4805904,"POLYGON ((-98.526 29.517, -98.526 29.518, -98...."
4,San Antonio City (Southeast)--Inside Loop I-41...,120621,59441,61180,795P200US4805903,120621,30282,10140,18694,12813,...,120621,38926,5629,2823,452,152,25416,47223,795P200US4805903,"POLYGON ((-98.439 29.413, -98.437 29.413, -98...."


## Load

The following code loads the DataFrame/GeoDataFrames into the database for future analysis.

In [21]:
# creates connection to local SQLite database
databases_dirpath = os.path.join('data/databases')
demographics_db_filepath = os.path.join(databases_dirpath, 'census_acs_1yr_2022.db')
# uses user flag from first notebook cell to determine whether to replace database contents or add to each table
if initializing:
    replace_or_append = 'replace'
else:
    # todo: add code to check if fields exist before appending to writers
    replace_or_append = 'append'

# creates connection to SQLite database
sql_engine = sqlalchemy.create_engine('sqlite:///' + demographics_db_filepath)

The following cells modify the DataFrame to ensure column dtype compatibility with SQLAlchemy ORM (Polygon object must be changed to object), then write each DataFrame to the database into their own table (one each for `COUNTY`, `CD<current_congress>`, `PLACE`, and `PUMA`), and  returns the number of rows successfully written.

In [21]:
if ignore_database is False:    
    final_county_df['geometry'] = final_county_df['geometry'].astype(str)
    final_county_df.to_sql('county', sql_engine, if_exists=replace_or_append)

1

In [22]:
if ignore_database is False:    
    final_congressional_district_df['geometry'] = final_congressional_district_df['geometry'].astype(str)
    final_congressional_district_df.to_sql('congressional_district', sql_engine, if_exists=replace_or_append)

5

In [23]:
if ignore_database is False:    
    final_place_df['geometry'] = final_place_df['geometry'].astype(str)
    final_place_df.to_sql('place', sql_engine, if_exists=replace_or_append)

5

In [24]:
if ignore_database is False:    
    final_puma_df['geometry'] = final_puma_df['geometry'].astype(str)
    final_puma_df.to_sql('puma', sql_engine, if_exists=replace_or_append)

16