# Geography and Table Export Selector

This notebook allows users to select multiple geographies and demographic tables, then extracts the data in a format that can be imported into Tableau Public Desktop for data visualization.

## Data Preparation
Import required libraries and make data calls that do not require user input. 


In [1]:
import pandas as pd
import os
import sqlalchemy
import datetime
import geopandas as gpd
import ipywidgets as widgets

In [2]:
# turn on pandas copy-on-write for every DataFrame created in this session
pd.options.mode.copy_on_write = True

## User Input

Select `Survey Year`, `Survey Type`, and `Geography Type`.

If `Survey Type` is `5-Year Estimate`, select the overarching geography type here and, in a later selection box, the specific geography from which
the block groups will be drawn. All block groups within the selected overarching geography will be extracted.

In [3]:
# survey years offered in the first dropdown; `2022` is preselected
years_available = ['2022']
acs_survey_year = widgets.Dropdown(options=years_available, value='2022', disabled=False)

# each option pairs the label shown in the dropdown with the value later read through `.value`
survey_type = widgets.Dropdown(
    options=[
        ('1-Year Estimates', 'acsse'),
        ('5-Year Estimate', 'acs5'),
    ],
    disabled=False,
    layout=widgets.Layout(width='max-content'),
)

geography_type = widgets.Dropdown(
    options=[
        ('Congressional District', 'congressional_district'),
        ('County', 'county'),
        ('Place', 'place'),
        ('PUMA', 'puma'),
    ],
    value='county',
    disabled=False,
)

# one collapsible panel per dropdown, with the first panel opened
selector_panels = [acs_survey_year, survey_type, geography_type]
panel_titles = ('ACS Survey Year', 'Survey Type', 'Geography Type')
accordion = widgets.Accordion(children=selector_panels, titles=panel_titles, selected_index=0)

# todo: add toggle for selecting between `All Available Tables` and `Cross-Referenced Tables Only`

accordion

Accordion(children=(Dropdown(options=('2022',), value='2022'), Dropdown(layout=Layout(width='max-content'), op…

In [4]:
# todo: add logic for showing tables based on user-selected `All Available Tables` or `Cross-Referenced Tables Only`
# loads the crosswalk that matches ACS SE 1yr tables with their ACS 5yr counterparts
db_filepath = f'data/databases/census_bureau/census_acs-acsse_crosswalk.db'
sql_engine = sqlalchemy.create_engine('sqlite:///' + db_filepath)
crosswalk_tables = pd.read_sql_table(
    f'acs5_{acs_survey_year.value}-acsse_{acs_survey_year.value}_crosswalk',
    sql_engine,
)
concepts = list(crosswalk_tables['concept'])

# the ACS SE database is opened for either survey type; the ACS5 block group database only when it was chosen
acsse_survey_db_filepath = f'data/databases/census_bureau/census_acsse_{acs_survey_year.value}_{geography_type.value}.db'
acsse_survey_sql_engine = sqlalchemy.create_engine('sqlite:///' + acsse_survey_db_filepath)
if survey_type.value != 'acs5':
    tables = pd.read_sql_table('tables', acsse_survey_sql_engine)
else:
    acs5_survey_db_filepath = f'data/databases/census_bureau/census_{survey_type.value}_{acs_survey_year.value}_block_group.db'
    acs5_survey_sql_engine = sqlalchemy.create_engine('sqlite:///' + acs5_survey_db_filepath)
    tables = pd.read_sql_table('tables', acs5_survey_sql_engine)

# widget options are (display label, row position in `crosswalk_tables`) pairs
concepts_for_widget = [
    (str(concept).capitalize().replace('_', ' '), position)
    for position, concept in enumerate(concepts)
]

select_widget_tables = widgets.SelectMultiple(
    options=list(concepts_for_widget),
    rows=len(concepts_for_widget),
    layout=widgets.Layout(width='max-content'),
    disabled=False,
)

select_widget_tables

SelectMultiple(layout=Layout(width='max-content'), options=(('Race', 0), ('Year structure built', 1), ('School…

### Select Specific Geography

In [9]:
# loads all database data locally and conducts any further filtering from this notebook locally

# translates each selected widget entry (a row position in `crosswalk_tables`) into the table name for the chosen survey
selected_tables = [
    crosswalk_tables.iloc[table][f'{survey_type.value}_table_name']
    for table in select_widget_tables.value
]
if not selected_tables:
    # without this guard the cell fails further down with an opaque IndexError
    raise ValueError('No tables are selected: choose at least one table in the table selector above, then re-run this cell.')

# picks the database that holds the selected survey's tables
if survey_type.value == 'acs5':
    selected_survey_sql_engine = acs5_survey_sql_engine
elif survey_type.value == 'acsse':
    selected_survey_sql_engine = acsse_survey_sql_engine
else:
    raise ValueError(f'Unsupported survey type: {survey_type.value!r}')

# extracts every selected table into its own dataframe
list_of_dataframes = [pd.read_sql_table(table, selected_survey_sql_engine) for table in selected_tables]

# pulls specific geographies from the first dataframe for user selection
if survey_type.value == 'acsse':
    geography_names = list(list_of_dataframes[0]['NAME'])
elif survey_type.value == 'acs5':
    # geography names are read from the ACS SE table that the crosswalk pairs with the first selected ACS5 table
    acsse_correlate = crosswalk_tables.loc[crosswalk_tables['acs5_table_name'] == selected_tables[0]]
    # takes the first matching value directly instead of parsing the Series' printed form
    table_for_geography = str(acsse_correlate['acsse_table_name'].iloc[0])
    available_geography_df = pd.read_sql_table(table_for_geography, acsse_survey_sql_engine)
    geography_names = list(available_geography_df['NAME'])

select_geography_tables = widgets.SelectMultiple(
    options=geography_names,
    rows=len(geography_names),
    layout=widgets.Layout(width='max-content'),
    disabled=False
)
select_geography_tables

SelectMultiple(layout=Layout(width='max-content'), options=('Travis County, Texas',), rows=1, value=())

In [10]:
# loads crosswalk table from the selected survey's database that correlates table column code `name` fields with human-readable `label`
select_geography_tables_list = select_geography_tables.value

crosswalk_df = pd.DataFrame()

# filters available data by the geographies we selected and creates crosswalk dataframe to match table
if survey_type.value == 'acs5':
    crosswalk_df = pd.read_sql_table('crosswalk', acs5_survey_sql_engine)
    if geography_type.value != 'county':
        # only the county branch defines `selected_ucgids`, which the export cell needs for ACS5 data
        raise NotImplementedError(
            f'5-year block group extraction is only implemented for counties, not {geography_type.value!r}.')

    # restricts the available geographies to the ones picked in the selector; with nothing picked,
    # every available geography is used
    selected_geography_df = available_geography_df[available_geography_df['NAME'].isin(select_geography_tables_list)]
    if selected_geography_df.empty:
        print('No geography selected: using every available geography.')
        selected_geography_df = available_geography_df

    county_ucgids = selected_geography_df['ucgid'].astype(str).str.strip().tolist()
    if not county_ucgids:
        raise ValueError('No geographies are available for the selected table.')

    # parses state and county FIPS codes: characters 9-10 of the ucgid are read as the state code and the
    # remainder as the county code (assumes ucgids shaped like `0500000US48453`)
    counties_by_state = {}
    for county_ucgid in county_ucgids:
        counties_by_state.setdefault(county_ucgid[9:11], set()).add(county_ucgid[11:])

    # kept as plain strings for the first selected county because the export cell builds shapefile paths from `state_fips`
    state_fips = county_ucgids[0][9:11]
    county_fips = county_ucgids[0][11:]

    # NOTE(review): the export cell reads block groups from `data/geospatial_files/shapefiles/census_bureau/...`;
    # this path has no `shapefiles/` segment — confirm which location is correct
    block_group_frames = []
    for state_code, county_codes in counties_by_state.items():
        state_block_groups = gpd.read_file(
            f'data/geospatial_files/census_bureau/block_groups/{acs_survey_year.value}/tl_{acs_survey_year.value}_{state_code}_bg.shp')
        block_group_frames.append(state_block_groups[state_block_groups['COUNTYFP'].isin(county_codes)])
    block_group_gpd = pd.concat(block_group_frames)

    # `set_crs` only labels data that has no CRS and raises when the file already declares a different one,
    # so data with a declared CRS is reprojected instead
    if block_group_gpd.crs is None:
        block_group_gpd = block_group_gpd.set_crs(epsg=3395)
    else:
        block_group_gpd = block_group_gpd.to_crs(epsg=3395)

    selected_ucgids = block_group_gpd['GEOID'].tolist()
elif survey_type.value == 'acsse':
    crosswalk_df = pd.read_sql_table('crosswalk', acsse_survey_sql_engine)

crosswalk_df.head()

Unnamed: 0,index,name,label,concept
0,4,B24022_060E,Estimate!!Total:!!Female:!!Service occupations...,Sex by Occupation and Median Earnings in the P...
1,5,B19001B_014E,"Estimate!!Total:!!$100,000 to $124,999",Household Income in the Past 12 Months (in 202...
2,6,B07007PR_019E,Estimate!!Total:!!Moved from different municip...,Geographical Mobility in the Past Year by Citi...
3,7,B19101A_004E,"Estimate!!Total:!!$15,000 to $19,999",Family Income in the Past 12 Months (in 2022 I...
4,8,B24022_061E,Estimate!!Total:!!Female:!!Service occupations...,Sex by Occupation and Median Earnings in the P...


In [11]:
# list to append human-readable table names to for printing later
table_names = []

# sets timestamp and geography type values to create unique directory name for each set of files created
# NOTE(review): the timestamp contains `:` characters, which are not valid in Windows directory names — confirm target platforms
timestamp = datetime.datetime.now().strftime("%m-%d-%Y %I:%M:%S%p").lower()
output_path_dir = ''
if survey_type.value == 'acs5':
    output_path_dir = os.path.join('data', 'outputs', 'census_bureau', f'{timestamp} - block group.csv')
elif survey_type.value == 'acsse':
    output_path_dir = os.path.join('data', 'outputs', 'census_bureau', f'{timestamp} - {geography_type.value}.csv')
os.makedirs(output_path_dir, exist_ok=True)

# ordered clean-up applied to crosswalk labels: drops brackets/quotes, turns the `!!` separators into
# spaces, and removes the leading `Estimate ` / `Total ` levels
_LABEL_REPLACEMENTS = (
    ('[', ''), (']', ''), ('\'', ''), ('!!', ': '), (':', ''),
    ('\n', ''), ('Estimate ', ''), ('Total ', ''), ('"', ''),
)
# prefix that ACS5 block group ucgids carry in front of the id stored in the shapefile `GEOID` column
_BLOCK_GROUP_PREFIX = '1500000US'
_SHAPEFILE_DIR = 'data/geospatial_files/shapefiles/census_bureau'

crosswalk_names = crosswalk_df['name'].astype(str)
# shapefiles already read in this run, keyed by path, so each file is loaded once rather than once per table
_shapefile_cache = {}


def _readable_label(raw_label):
    """Strip crosswalk markup from a label, e.g. `Estimate!!Total:!!Male:` becomes `Male`."""
    label = str(raw_label)
    for old, new in _LABEL_REPLACEMENTS:
        label = label.replace(old, new)
    return label


def _crosswalk_label(series_name):
    """Return the crosswalk `label` for a column code, or None when the crosswalk has no entry for it.

    An exact `name` match is preferred; the first `name` that starts with the code is the fallback.
    """
    matches = crosswalk_df.loc[crosswalk_names == series_name, 'label']
    if matches.empty:
        matches = crosswalk_df.loc[crosswalk_names.str.startswith(series_name), 'label']
    return None if matches.empty else matches.iloc[0]


def _to_int_if_lossless(series):
    """Return `series` as int dtype when every value is a whole number; otherwise return it unchanged."""
    try:
        numeric = pd.to_numeric(series)
    except (TypeError, ValueError):
        return series
    if numeric.isna().any() or (numeric % 1 != 0).any():
        # missing or fractional values cannot be stored as int without losing information
        return series
    return numeric.astype('int')


def _state_fips_codes(ucgids):
    """Return the sorted, distinct two-character state codes of the given ids.

    Assumes the state FIPS code is the first two characters after `US`, or the first two characters
    of the id when it has no `US` marker (block group GEOIDs).
    """
    return sorted({str(ucgid).split('US', 1)[-1][:2] for ucgid in ucgids if ucgid is not None})


def _shapefile_sources(state_codes):
    """Return (shapefile paths, shapefile column holding the ucgid) for the selected survey and geography."""
    year = acs_survey_year.value
    if survey_type.value == 'acs5':
        return [f'{_SHAPEFILE_DIR}/block_groups/{year}/tl_{year}_{state}_bg.shp' for state in state_codes], 'GEOID'
    if geography_type.value == 'county':
        # counties come from a single national file, so no state code is needed
        return [f'{_SHAPEFILE_DIR}/counties/tl_2023_us_county.shp'], 'GEOIDFQ'
    per_state_sources = {
        'puma': ('pumas/tl_2023_{state}_puma20.shp', 'GEOIDFQ20'),
        'congressional_district': ('congressional_districts/118th_congress/tl_2023_{state}_cd118.shp', 'GEOIDFQ'),
        'place': ('places/tl_2023_{state}_place.shp', 'GEOIDFQ'),
    }
    template, ucgid_column = per_state_sources[geography_type.value]
    return [f'{_SHAPEFILE_DIR}/{template.format(state=state)}' for state in state_codes], ucgid_column


def _read_shapefile(path):
    """Read a shapefile once and reuse the loaded frame for every later table."""
    if path not in _shapefile_cache:
        _shapefile_cache[path] = gpd.read_file(path)
    return _shapefile_cache[path]


if survey_type.value == 'acsse' and not select_geography_tables_list:
    print('Warning: no geographies are selected, so the exported tables will contain no rows.')

combined_dataframe = pd.DataFrame()

for dataframe in list_of_dataframes:
    # subset dataframe by selected geographies in select geography list
    if survey_type.value == 'acsse':
        dataframe = dataframe[dataframe['NAME'].isin(select_geography_tables_list)]
    elif survey_type.value == 'acs5':
        # removes the exact prefix; `str.lstrip` would strip any leading 0/1/5/U/S characters and
        # corrupt ids whose state code starts with one of those digits
        dataframe = dataframe.assign(ucgid=dataframe['ucgid'].str.removeprefix(_BLOCK_GROUP_PREFIX))
        dataframe = dataframe[dataframe['ucgid'].isin(selected_ucgids)]

    # parses table name from table column, before it gets renamed using crosswalk
    table_code = str(dataframe.columns[2]).split('_')[0]
    table_name_df = tables[tables['name'].isin([table_code])]
    table_name = table_name_df['concept'].iloc[0]
    table_names.append(table_name)

    # renames data columns with their crosswalk label and stores whole-number columns as int;
    # the conversion is done under the original column name, before the rename takes effect
    column_renames = {}
    converted_columns = {}
    for series_name in dataframe.columns:
        if series_name == 'ucgid':
            # merge key: never renamed
            continue
        raw_label = _crosswalk_label(str(series_name))
        if raw_label is not None:
            converted_columns[series_name] = _to_int_if_lossless(dataframe[series_name])
            column_renames[series_name] = _readable_label(raw_label)
        elif series_name == 'NAME':
            column_renames[series_name] = str(series_name).title()
    dataframe = dataframe.assign(**converted_columns).rename(columns=column_renames)

    # loads shapefiles, extracts key information, then merges that data with our dataframe
    dataframe_ucgids = dataframe['ucgid'].values.tolist()
    shapefiles, shapefile_ucgid_column_name = _shapefile_sources(_state_fips_codes(dataframe_ucgids))
    if not shapefiles:
        raise ValueError(f'No geographies matched table {table_name!r}, so no shapefile could be chosen.')
    geodf = pd.concat([_read_shapefile(path) for path in shapefiles], ignore_index=True)

    # searches shapefile UCGID field for correlates from dataframe UCGID field
    geodf = geodf[geodf[shapefile_ucgid_column_name].isin(dataframe_ucgids)]
    # drops all columns that are not necessary for merging or geospatial analysis
    geodf = geodf.loc[:, geodf.columns.isin([shapefile_ucgid_column_name, 'geometry'])]
    # merges the geodataframe and dataframe
    combined_dataframe = dataframe.merge(geodf, how='right', left_on='ucgid', right_on=shapefile_ucgid_column_name)

    # drops `index` and non-standard UCGID columns (does not drop *actual* index, just the column named `index`)
    columns_to_remove = combined_dataframe.columns[combined_dataframe.columns.str.startswith(('index', 'GEOID'))]
    combined_dataframe = combined_dataframe.drop(columns_to_remove, axis=1)

    # written once per table, after every column has been processed
    combined_dataframe.to_csv(os.path.join(output_path_dir, f'{table_name}.csv'))

print(f'{len(table_names)} table(s) saved: {" | ".join(table_names)}')
if table_names:
    print(f'\n\tTable shown below (head only): {table_names[-1]}')
    print(f'\tTable shape: {combined_dataframe.shape[0]} rows, {combined_dataframe.shape[1]} columns')
combined_dataframe.head()

2 table(s) saved: race | educational_attainment_for_the_population_25_years_and_over

	Table shown below (head only): educational_attainment_for_the_population_25_years_and_over
	Table shape: 0 rows, 28 columns


Unnamed: 0,Total,No schooling completed,Nursery school,Kindergarten,1st grade,2nd grade,3rd grade,4th grade,5th grade,6th grade,...,"Some college, less than 1 year","Some college, 1 or more years, no degree",Associates degree,Bachelors degree,Masters degree,Professional school degree,Doctorate degree,Name,ucgid,geometry
