# Demographics by Geography EDA

This notebook lets users select multiple geographies and demographic tables, then saves the data in a format that can be imported into Tableau Public Desktop for data visualization.

## Data Preparation
Import required libraries and make data calls that do not require user input. 


In [7]:
from unittest.mock import right

import pandas as pd
import os
import sqlalchemy
import datetime
import geopandas as gpd
import ipywidgets as widgets

## User Input

Select `Survey Year`, `Survey Type`, and `Geography Type` 

In [55]:
# Builds the three selector widgets (survey year, survey type, geography type)
# and displays them together in one accordion. Later cells read each
# selection through the widget's `.value` attribute.
years_available = ['2022']
acs_survey_year = widgets.Dropdown(
    options=years_available,
    value='2022',
    disabled=False
)

survey_type = widgets.Dropdown(
    # todo: add logic for `acs5` survey
    # options=[('1-Year Estimates', 'acsse'), ('5-Year Estimate', 'acs5')],
    options=[('1-Year Estimates', 'acsse')],
    disabled=False,
    layout=widgets.Layout(width='max-content'),
)

# NOTE: this branch is evaluated once, when the cell runs; it does not react
# to later changes of the `survey_type` dropdown.
if survey_type.value == 'acsse':
    geography_type = widgets.Dropdown(
        options=[
            ('Congressional District', 'congressional_district'),
            ('County', 'county'),
            ('Place', 'place'),
            ('PUMA', 'puma'),
        ],
        value='county',
        disabled=False
    )
elif survey_type.value == 'acs5':
    # Kept as a (single-option) Dropdown rather than a bare string so that it
    # is a valid Accordion child and still exposes `.value` to later cells.
    geography_type = widgets.Dropdown(
        options=[('Block Group', 'block_group')],
        value='block_group',
        disabled=False
    )
# todo: add logic for toggling between `All Available Tables` and `Cross-Referenced Tables Only`

accordion = widgets.Accordion(
    children=[acs_survey_year, survey_type, geography_type],
    titles=('ACS Survey Year', 'Survey Type', 'Geography Type'),
    selected_index=0)
accordion

Accordion(children=(Dropdown(options=('2022',), value='2022'), Dropdown(layout=Layout(width='max-content'), op…

In [58]:
# todo: add logic for showing tables based on user-selected `All Available Tables` or `Cross-Referenced Tables Only`

# Crosswalk database: the table read here is used to match ACS SE 1yr tables
# with ACS 5yr tables; its `concept` column feeds the selection widget below.
db_filepath = 'data/databases/census_bureau/census_acs-acsse_crosswalk.db'
sql_engine = sqlalchemy.create_engine(f'sqlite:///{db_filepath}')
crosswalk_tables = pd.read_sql_table(
    'acs5_{0}-acsse_{0}_crosswalk'.format(acs_survey_year.value),
    sql_engine,
)
concepts = crosswalk_tables.concept.tolist()

# Census survey database for the chosen survey type, year and geography type.
survey_db_filepath = (
    'data/databases/census_bureau/'
    f'census_{survey_type.value}_{acs_survey_year.value}_{geography_type.value}.db'
)
survey_sql_engine = sqlalchemy.create_engine(f'sqlite:///{survey_db_filepath}')
tables = pd.read_sql_table('tables', survey_sql_engine)

# Widget options are (display label, value) pairs: the label is the concept
# capitalised with underscores turned into spaces, the value is the concept's
# position in `concepts` (and therefore its row position in `crosswalk_tables`).
concepts_for_widget = [
    (str(raw_concept).capitalize().replace('_', ' '), position)
    for position, raw_concept in enumerate(concepts)
]

select_widget_tables = widgets.SelectMultiple(
    options=concepts_for_widget,
    rows=len(concepts_for_widget),  # one row per concept
    layout=widgets.Layout(width='max-content'),
    disabled=False,
)

select_widget_tables

SelectMultiple(layout=Layout(width='max-content'), options=(('Race', 0), ('Year structure built', 1), ('School…

### Select Specific Geography

In [59]:
# Loads every selected table from the survey database into memory (all further
# filtering happens locally in this notebook), then offers the geography names
# found in the first loaded table for selection.

# `select_widget_tables.value` holds row positions into `crosswalk_tables`;
# translate each one into the table name for the chosen survey type.
selected_tables = [
    crosswalk_tables.iloc[table][f'{survey_type.value}_table_name']
    for table in select_widget_tables.value
]

list_of_dataframes = []
if geography_type.value == 'block_group':
    # todo: add `stack` widget to display block group within geography type selector
    #  geography_selector = widgets.Text(
    #     value = '23893US9834092834',
    #     placeholder = '23893US9834092834',
    #     # description = 'Block Group UCGID',
    #     disabled=False
    # )
    pass
else:
    # one DataFrame per selected table
    for table in selected_tables:
        df = pd.read_sql_table(table, survey_sql_engine)
        list_of_dataframes.append(df)

# Fail with an actionable message instead of an IndexError on
# `list_of_dataframes[0]` when nothing was loaded.
if not list_of_dataframes:
    raise ValueError(
        'No tables were loaded: select at least one concept in the table selector '
        'above and re-run this cell (block group geographies are not supported yet).'
    )

# geography names are taken from the first table only
geography_names = list(list_of_dataframes[0]['NAME'])

select_geography_tables = widgets.SelectMultiple(
    options=geography_names,
    rows=len(geography_names),
    # description='Select Concept',
    layout=widgets.Layout(width='max-content'),
    disabled=False
)
select_geography_tables

SelectMultiple(layout=Layout(width='max-content'), options=('San Antonio City (Southwest)--Inside Loop I-410 P…

In [60]:
# Export cell: for each table loaded above, keep only the selected geographies,
# rename the data columns using the crosswalk labels, attach the boundary
# geometry from the matching shapefile and write one CSV per table into a
# timestamped output directory.


def _clean_label(raw_label):
    """Turn a crosswalk label into a plain column name.

    Removes brackets, apostrophes, colons and newlines, turns the `!!`
    separator into a space and drops the `Estimate ` / `Total ` prefixes,
    e.g. 'Estimate!!Total:!!Less than 9th grade' -> 'Less than 9th grade'.
    """
    cleaned = str(raw_label)
    # order matters: '!!' becomes ': ' first, then every ':' is removed
    for old, new in (
        ('[', ''), (']', ''), ('\'', ''), ('!!', ': '), (':', ''),
        ('\n', ''), ('Estimate ', ''), ('Total ', ''),
    ):
        cleaned = cleaned.replace(old, new)
    return cleaned


select_geography_tables_list = select_geography_tables.value
crosswalk_df = pd.read_sql_table('crosswalk', survey_sql_engine)

# NOTE(review): the timestamp contains ':' characters, which are not allowed in
# Windows directory names - confirm the target platform before relying on this.
timestamp = datetime.datetime.strftime(datetime.datetime.now(), "%a %b %d %Y %I:%M:%S%p")
output_path_dir = f'data/outputs/{timestamp}'
os.makedirs(output_path_dir, exist_ok=True)

# Shapefile path and the name of its UCGID column, keyed by geography type.
shapefile_sources = {
    'puma': (
        'data/geospatial_files/shapefiles/census_bureau/pumas/tl_2023_48_puma20.shp',
        'GEOIDFQ20',
    ),
    'congressional_district': (
        'data/geospatial_files/shapefiles/census_bureau/congressional_districts/118th_congress/tl_2023_48_cd118.shp',
        'GEOIDFQ',
    ),
    'county': (
        'data/geospatial_files/shapefiles/census_bureau/counties/tl_2023_us_county.shp',
        'GEOIDFQ',
    ),
    'place': (
        'data/geospatial_files/shapefiles/census_bureau/places/tl_2023_48_place.shp',
        'GEOIDFQ',
    ),
    # todo: add column combining '1500000US' and GEOID_1 column
    'block_group': (
        'data/geospatial_files/shapefiles/census_bureau/block_groups/2022/tl_2022_48_bg.shp',
        'ucgid',
    ),
}
shapefile, shapefile_ucgid_column_name = shapefile_sources[geography_type.value]

# The shapefile is the same for every table, so it is read once here instead
# of once per column of every table.
all_boundaries = gpd.read_file(shapefile)

for dataframe in list_of_dataframes:

    # subset dataframe by selected geographies; `.copy()` so the column edits
    # below work on an independent frame rather than a slice of the loaded table
    dataframe = dataframe[dataframe['NAME'].isin(select_geography_tables_list)].copy()

    # parses table name from table column, before it gets renamed using crosswalk
    # (assumes the third column is a data column named `<table code>_...` - TODO confirm)
    table_code = str(dataframe.columns[2]).split('_')[0]

    table_name_df = tables[tables['name'].isin([table_code])]
    table_name = table_name_df['concept'].iloc[0]

    # renames data columns with the descriptive labels from the crosswalk
    for series_name in list(dataframe.columns):
        if series_name == 'ucgid':
            # the identifier column is the merge key below and keeps its name
            continue

        # literal prefix match; the same test decides both whether the column
        # is a crosswalk column and which label(s) it receives
        is_match = crosswalk_df['name'].str.startswith(series_name, na=False)
        matching_labels = crosswalk_df.loc[is_match, 'label']

        if not matching_labels.empty:
            new_label = _clean_label(' '.join(matching_labels.astype(str)))
            dataframe = dataframe.rename(columns={series_name: new_label})
            try:
                # cast the *renamed* column; the old name no longer exists
                dataframe = dataframe.astype({new_label: 'int'})
            except (TypeError, ValueError):
                # best effort: columns that cannot be cast to int keep their dtype
                pass
        elif series_name == 'NAME':
            dataframe = dataframe.rename(columns={series_name: str(series_name).title()}, errors='raise')

    # loads shapefile rows for the selected geographies, then merges them with our dataframe
    dataframe_ucgids = dataframe['ucgid'].values.tolist()

    # searches shapefile UCGID field for correlates from dataframe UCGID field
    geodf = all_boundaries[all_boundaries[shapefile_ucgid_column_name].isin(dataframe_ucgids)]
    # drops all columns that are not necessary for merging or geospatial analysis
    geodf = geodf.loc[:, geodf.columns.isin([shapefile_ucgid_column_name, 'geometry'])]
    # merges the geodataframe and dataframe
    combined_dataframe = dataframe.merge(geodf, how='right', left_on='ucgid', right_on=shapefile_ucgid_column_name)

    # drops `index` and non-standard UCGID columns (does not drop *actual* index, just the column named `index`)
    columns_to_remove = combined_dataframe.columns[combined_dataframe.columns.str.startswith(('index', 'GEOID'))]
    combined_dataframe = combined_dataframe.drop(columns_to_remove, axis=1)

    combined_dataframe.to_csv(os.path.join(output_path_dir, f'{table_name}.csv'))

# shows the result for the last table processed
combined_dataframe

Unnamed: 0,Name,Total,Less than 9th grade,"9th to 12th grade, no diploma",High school graduate (includes equivalency),"Some college, no degree","""Associates degree""","""Bachelors degree""",Graduate or professional degree,ucgid,geometry
0,Bexar County (South)--San Antonio City (Far So...,100595,4675,7617,34319,19183,13087,16332,5382,795P200US4805907,"POLYGON ((-98.30456 29.23700, -98.30470 29.236..."
1,San Antonio City (West)--Between Loop TX-1604 ...,82676,3382,3473,23569,22752,9167,15120,5213,795P200US4805908,"POLYGON ((-98.67001 29.38974, -98.67137 29.389..."
2,Bexar County (Northwest)--San Antonio (Far Nor...,130117,1946,3460,19807,24777,13098,40165,26864,795P200US4805914,"POLYGON ((-98.71555 29.43463, -98.71570 29.434..."
3,San Antonio City (Southeast)--Inside Loop I-41...,80199,11158,8892,30212,17598,4947,4890,2502,795P200US4805903,"POLYGON ((-98.43892 29.41342, -98.43722 29.413..."
4,San Antonio City (Southwest)--Inside Loop I-41...,69686,8165,10419,23643,13660,5188,5902,2709,795P200US4805906,"POLYGON ((-98.50378 29.32028, -98.50380 29.320..."
5,Bexar County (Northeast)--San Antonio (Northea...,83451,1859,3802,17449,19059,10799,19089,11394,795P200US4805916,"POLYGON ((-98.22136 29.53166, -98.22120 29.531..."
6,"San Antonio (Northeast), Kirby & Windcrest Cit...",91367,6008,5774,26426,20406,10023,14619,8111,795P200US4805913,"POLYGON ((-98.31702 29.45933, -98.31714 29.458..."
7,San Antonio City (Northeast)--Inside Loop I-41...,71326,5245,3595,15775,13578,4374,14095,14664,795P200US4805902,"POLYGON ((-98.49931 29.46107, -98.49930 29.461..."
8,San Antonio City (West)--Inside Loop I-410 PUM...,70733,10437,7910,25906,12645,4753,6992,2090,795P200US4805905,"POLYGON ((-98.57285 29.47057, -98.57135 29.469..."
9,Bexar County (North)--San Antonio City (Far No...,107216,1515,1776,11117,21438,9376,37239,24755,795P200US4805915,"POLYGON ((-98.44425 29.63604, -98.44448 29.635..."
