# Demographics by Geography EDA

This notebook lets users select multiple geographies and demographic tables, then saves the data in a format that can be imported into Tableau Public Desktop for data visualization.

## Data Preparation
Import required libraries and make data calls that do not require user input. 


In [1]:
import pandas as pd
import os
import sqlalchemy
import datetime
import ipywidgets as widgets

## User Input

 ### Select Year of Analysis

In [2]:
# Survey years offered in the picker; the chosen value is used later to build
# the database file name.
years_available = ['2022']

# Year picker. The default is taken from the list itself (its last entry) so it
# cannot drift out of sync with `years_available` when more years are added.
acs_survey_year = widgets.Dropdown(
    options=years_available,
    value=years_available[-1],
    disabled=False
)
acs_survey_year

Dropdown(options=('2022',), value='2022')

### Select Survey
Select whether to view ACS 1-year geographies (congressional district, county, place, or PUMA) or a specific block group from the ACS 5-year estimates.

In [3]:
# Survey picker. The option values ('acsse' / 'acs5') are the codes used later
# to build the database file name. Geographies offered per survey:
#   1-Year Estimates -> Congressional District, County, Place, or PUMA
#   5-Year Estimate  -> Block Group
# `Dropdown` has no `tooltips` trait (that one belongs to `ToggleButtons`), so
# the hints live in this comment rather than in an ignored keyword argument.
survey_type = widgets.Dropdown(
    options=[('1-Year Estimates', 'acsse'), ('5-Year Estimate', 'acs5')],
    disabled=False,
    layout=widgets.Layout(width='max-content'),
)
survey_type

Dropdown(layout=Layout(width='max-content'), options=(('1-Year Estimates', 'acsse'), ('5-Year Estimate', 'acs5…

### Select Geographic Level of Analysis

In [4]:
# Geography picker. The available geographies depend on the survey selected
# above; the survey value is read when this cell runs, so re-run the cell after
# changing the survey.
if survey_type.value == 'acsse':
    geography_options = [
        ('Congressional District', 'congressional_district'),
        ('County', 'county'),
        ('Place', 'place'),
        ('PUMA', 'puma'),
    ]
    default_geography = 'county'
elif survey_type.value == 'acs5':
    geography_options = [('Block Group', 'block_group')]
    default_geography = 'block_group'
else:
    raise ValueError(f'Unsupported survey type: {survey_type.value!r}')

# Always a widget (even when there is only one choice) because later cells read
# `geography_type.value`; a plain string here would break them.
geography_type = widgets.Dropdown(
    options=geography_options,
    value=default_geography,
    disabled=False
)
geography_type

Dropdown(index=1, options=(('Congressional District', 'congressional_district'), ('County', 'county'), ('Place…

### Select Tables From Survey for Selected Geography

In [6]:
# Opens the local Census database matching the survey, year and geography
# selected in the widgets above.
db_filepath = f'data/databases/census_{survey_type.value}_{acs_survey_year.value}_{geography_type.value}.db'

# SQLite creates a new, empty database when asked to open a missing file, which
# would only surface below as a confusing "table not found" error.
if not os.path.isfile(db_filepath):
    raise FileNotFoundError(f'Census database not found: {db_filepath}')

sql_engine = sqlalchemy.create_engine('sqlite:///' + db_filepath)
tables = pd.read_sql_table('tables', sql_engine)
concepts = list(tables.concept)

# (display label, row position) pairs for the widget: the label is the concept
# made human-readable, the value is the concept's position in `tables`.
concepts_for_widget = [
    (str(concept).capitalize().replace('_', ' '), index)
    for index, concept in enumerate(concepts)
]

select_widget_tables = widgets.SelectMultiple(
    options=concepts_for_widget,
    rows=len(concepts_for_widget),
    layout=widgets.Layout(width='max-content'),
    disabled=False
)

select_widget_tables

SelectMultiple(layout=Layout(width='max-content'), options=(('Veteran status for the civilian population 18 ye…

### Select Specific Geography

In [7]:
# Loads every selected table from the database into memory; any further
# filtering is done locally in this notebook.

# Each widget value is the row position of the chosen concept in `tables`.
selected_tables = [tables.iloc[position]['name'] for position in select_widget_tables.value]

if not selected_tables:
    raise ValueError('No tables selected: pick at least one table above, then re-run this cell.')
if geography_type.value == 'block_group':
    # TODO: block groups need a UCGID text input rather than a list of names.
    raise NotImplementedError('Block group selection is not implemented yet.')

list_of_dataframes = [pd.read_sql_table(table, sql_engine) for table in selected_tables]

# Geography names offered for selection are taken from the first loaded table.
geography_names = list(list_of_dataframes[0]['NAME'])

select_geography_tables = widgets.SelectMultiple(
    options=geography_names,
    rows=len(geography_names),
    layout=widgets.Layout(width='max-content'),
    disabled=False
)
select_geography_tables

SelectMultiple(layout=Layout(width='max-content'), options=('San Antonio City (Southwest)--Inside Loop I-410 P…

In [8]:
def _readable_label(series_name, crosswalk):
    """Return the cleaned crosswalk label for a column, or None if it has none.

    An exact match on the crosswalk `name` column is preferred; otherwise the
    first crosswalk name that starts with `series_name` is used. The raw label
    has its '!!' separators turned into spaces, colons and apostrophes removed,
    the 'Estimate ' prefix dropped, and is then title-cased.
    """
    series_name = str(series_name)
    names = crosswalk['name']
    matches = crosswalk.loc[names == series_name, 'label']
    if matches.empty:
        # Literal prefix match (no regex), so special characters are harmless.
        matches = crosswalk.loc[names.str.startswith(series_name, na=False), 'label']
    if matches.empty:
        return None
    raw_label = str(matches.iloc[0])
    cleaned = raw_label.replace('!!', ' ').replace(':', '').replace('\'', '')
    return cleaned.replace('Estimate ', '').strip().title()


def _prepare_for_export(frame, crosswalk, selected_names):
    """Return a renamed, filtered copy of `frame`; `frame` itself is not modified.

    Rows are limited to the geographies in `selected_names` (matched on the
    'NAME' column when present). Columns found in the crosswalk are renamed to
    their readable label and converted to a numeric dtype where possible;
    'NAME' becomes 'Name'; 'ucgid' is always left as-is.
    """
    if 'NAME' in frame.columns:
        prepared = frame[frame['NAME'].isin(selected_names)].copy()
    else:
        prepared = frame.copy()

    renames = {}
    data_labels = []
    for series_name in prepared.columns:
        if series_name == 'ucgid':
            continue
        label = _readable_label(series_name, crosswalk)
        if label:
            renames[series_name] = label
            data_labels.append(label)
        elif series_name == 'NAME':
            renames[series_name] = str(series_name).title()

    prepared = prepared.rename(columns=renames)

    # Convert using the NEW column names; columns that cannot be parsed as
    # numbers (or whose label collides with another column) are left unchanged.
    for label in data_labels:
        try:
            prepared[label] = pd.to_numeric(prepared[label])
        except (TypeError, ValueError):
            pass

    return prepared


select_geography_tables_list = list(select_geography_tables.value)
crosswalk_df = pd.read_sql_table('crosswalk', sql_engine)

# Timestamped output folder. The default str(datetime) form contains colons and
# spaces, which are not valid in directory names on every operating system.
output_path_dir = f'data/outputs/{datetime.datetime.now():%Y-%m-%d_%H-%M-%S-%f}'

dataframe = pd.DataFrame()
if not select_geography_tables_list:
    print('No geographies selected; nothing was exported.')
else:
    os.makedirs(output_path_dir, exist_ok=True)
    for source_frame in list_of_dataframes:
        # The table id is the prefix of a data column name; it is read from the
        # untouched source frame so that re-running this cell gives the same
        # file names.
        table_name = str(source_frame.columns[3]).split('_')[0]

        dataframe = _prepare_for_export(source_frame, crosswalk_df, select_geography_tables_list)

        # todo: drop 'index' column
        # todo: convert to JSON/GeoJSON
        dataframe.to_csv(os.path.join(output_path_dir, f'{table_name}.csv'))

dataframe

Unnamed: 0,index,Name,Total,Total White Alone,Total Black Or African American Alone,Total American Indian And Alaska Native Alone,Total Asian Alone,Total Native Hawaiian And Other Pacific Islander Alone,Total Some Other Race Alone,Total Two Or More Races,ucgid
0,4,San Antonio City (Southwest)--Inside Loop I-41...,116206,40651,3664,1263,937,274,27907,41510,795P200US4805906
1,5,Bexar County (South)--San Antonio City (Far So...,155224,70927,12639,2864,2119,0,15238,51437,795P200US4805907
2,6,"San Antonio (Northeast), Kirby & Windcrest Cit...",135956,44962,25751,1775,4233,336,21124,37775,795P200US4805913
3,8,San Antonio City (Northwest)--Inside Loop I-41...,101698,30071,1467,1065,135,245,8187,60528,795P200US4805904
4,11,San Antonio City (Southeast)--Inside Loop I-41...,120621,38926,5629,2823,452,152,25416,47223,795P200US4805903
5,12,San Antonio City (Northwest)--Between Loop TX-...,140920,57275,13638,1210,17324,0,15257,36216,795P200US4805910
6,13,San Antonio City (Northeast)--Inside Loop I-41...,112214,52660,12913,160,3795,253,10423,32010,795P200US4805902
7,14,Bexar County (North)--San Antonio City (Far No...,159997,84888,6396,2188,7498,0,10486,48541,795P200US4805915
8,15,San Antonio City (West)--Between Loop TX-1604 ...,127760,58786,11398,2977,5374,0,11540,37685,795P200US4805908
9,20,San Antonio (Northwest) & Leon Valley Cities P...,110651,46685,3732,930,4228,0,11024,44052,795P200US4805909
