# Demographics by Geography EDA

This notebook lets you select multiple geographies and demographic tables, then provides visualizations for one geography–table combination at a time.

## Data Preparation
Import required libraries and make data calls that do not require user input. 


In [1]:
from audioop import reverse

import pandas as pd
import os
import sqlalchemy
import ipywidgets as widgets
from pandas.errors import MergeError

## User Input

### Select Year of Analysis

In [2]:
# Years offered in the year selector.
years_available = ['2022']

# Dropdown for the ACS survey year; later cells read the choice via `.value`.
acs_survey_year = widgets.Dropdown(
    options=years_available,
    value='2022',
    disabled=False,
)
acs_survey_year

Dropdown(options=('2022',), value='2022')

### Select Survey
Select whether to view ACS 1-year geographies (congressional district, county, place, or PUMA) or a specific block group from the 5-year estimates.

In [3]:
# Dropdown for the ACS survey; later cells read the selected code via `.value`.
#   'acsse' (1-Year Estimates) -> Congressional District, County, Place, or PUMA
#   'acs5'  (5-Year Estimate)  -> Block Group
# NOTE: `tooltips` is a ToggleButtons option, not a Dropdown trait, so it was
# never applied here; the hints are kept as the comment above instead.
survey_type = widgets.Dropdown(
    options=[('1-Year Estimates', 'acsse'), ('5-Year Estimate', 'acs5')],
    disabled=False,
    layout=widgets.Layout(width='max-content'),
)
survey_type

Dropdown(layout=Layout(width='max-content'), options=(('1-Year Estimates', 'acsse'), ('5-Year Estimate', 'acs5…

### Select Geographic Level of Analysis

In [4]:
# Build the geography selector for the chosen survey. It is always a Dropdown,
# because later cells read the selection through `geography_type.value`; the
# previous plain-string value for 'acs5' made those reads fail.
if survey_type.value == 'acsse':
    geography_options = [
        ('Congressional District', 'congressional_district'),
        ('County', 'county'),
        ('Place', 'place'),
        ('PUMA', 'puma'),
    ]
    default_geography = 'county'
elif survey_type.value == 'acs5':
    # Only one level is offered for this survey, so the dropdown has a single entry.
    geography_options = [('Block Group', 'block_group')]
    default_geography = 'block_group'
else:
    raise ValueError(f'Unsupported survey type: {survey_type.value!r}')

geography_type = widgets.Dropdown(
    options=geography_options,
    value=default_geography,
    disabled=False,
)
geography_type

Dropdown(index=1, options=(('Congressional District', 'congressional_district'), ('County', 'county'), ('Place…

### Select Tables From Survey for Selected Geography

In [5]:
# Path of the SQLite database for the selected survey / year / geography.
db_filepath = f'data/databases/census_{survey_type.value}_{acs_survey_year.value}_{geography_type.value}.db'

# Fail early with a clear message: for a missing file SQLite would create an
# empty database and the read below would report a misleading "table not found".
if not os.path.isfile(db_filepath):
    raise FileNotFoundError(f'Census database not found: {db_filepath}')

sql_engine = sqlalchemy.create_engine('sqlite:///' + db_filepath)

# The `tables` table lists each available table; its `concept` and `name`
# columns are read here and in later cells.
tables = pd.read_sql_table('tables', sql_engine)
concepts = list(tables.concept)

# (label, row position) pairs for the widget: the label is the human-readable
# concept, and the position is used later to look the row up in `tables`.
concepts_for_widget = [
    (str(concept).capitalize().replace('_', ' '), index)
    for index, concept in enumerate(concepts)
]

select_widget_tables = widgets.SelectMultiple(
    options=concepts_for_widget,
    rows=len(concepts_for_widget),  # show every concept without scrolling
    layout=widgets.Layout(width='max-content'),
    disabled=False,
)

select_widget_tables

SelectMultiple(layout=Layout(width='max-content'), options=(('Veteran status for the civilian population 18 ye…

### Select Specific Geography

In [9]:
# Loads every selected table from the database in full; any further filtering
# is done locally in this notebook.

# Translate the selected widget positions into database table names.
selected_tables = [tables.iloc[position]['name'] for position in select_widget_tables.value]

list_of_dataframes = []
df = pd.DataFrame()
if geography_type.value == 'block_group':
    # TODO: block-group selection is not implemented yet (planned: a text input
    # for the block group UCGID); no tables are loaded for this geography.
    pass
else:
    for table in selected_tables:
        df = pd.read_sql_table(table, sql_engine)
        list_of_dataframes.append(df)

# Geography names come from the first loaded table (presumably every table for
# a given geography level lists the same NAME values — verify). When nothing
# was loaded, fall back to an empty list instead of raising IndexError.
geography_names = list(list_of_dataframes[0]['NAME']) if list_of_dataframes else []

select_geography_tables = widgets.SelectMultiple(
    options=geography_names,
    rows=max(len(geography_names), 1),  # keep the list visible even when empty
    layout=widgets.Layout(width='max-content'),
    disabled=False,
)
select_geography_tables

SelectMultiple(layout=Layout(width='max-content'), options=('San Antonio City (Central) PUMA; Texas', 'San Ant…

In [66]:
# Build one single-geography slice per (table, selected geography) pair.
select_geography_tables_list = select_geography_tables.value
finalized_list_of_dataframes = []

# Crosswalk table from the database (loaded for the column-renaming work planned below).
crosswalk_df = pd.read_sql_table('crosswalk', sql_engine)

for dataframe in list_of_dataframes:
    # Table metadata does not depend on the geography, so derive it once per table.
    # Assumes the fourth column is a data column named like `<table>_<nnn>E`
    # (e.g. `K201703_002E`) — TODO confirm against the database schema.
    table_name = str(dataframe.columns[3]).split('_')[0]
    concept_name = (tables[tables['name'] == table_name]['concept']).to_string(index=False).capitalize().replace('_', ' ')

    for geography_name in select_geography_tables_list:
        # `.assign` tags the filtered copy, leaving the frames in
        # `list_of_dataframes` unmodified so this cell can be re-run safely.
        finalized_list_of_dataframes.append(
            dataframe[dataframe['NAME'] == geography_name].assign(table=table_name, concept=concept_name)
        )

# Preview the second slice when there is one (as before); otherwise fall back
# to the first slice, or show nothing when no slices were produced.
preview_position = min(1, len(finalized_list_of_dataframes) - 1)
finalized_list_of_dataframes[preview_position] if finalized_list_of_dataframes else None

Unnamed: 0,index,NAME,K201703_001E,K201703_002E,K201703_003E,K201703_004E,K201703_005E,K201703_006E,K201703_007E,K201703_008E,K201703_009E,K201703_010E,K201703_011E,ucgid,table_name,concept,table
8,20,San Antonio (Northwest) & Leon Valley Cities P...,26126,1708,821,887,0,887,24418,18894,5524,1587,3937,795P200US4805909,K201703,Poverty status in the past 12 months of famili...,K201703


## Visualizations

In [None]:
# todo: for each table, add row for totals
# todo: for each table, create bar chart to show distributions
# todo: for each geography, display map of state / county (depending on geography) with selected geography highlighted
# todo: rename each column with crosswalk info
# selected_geography_df = pd.DataFrame()

# for table in select_geography_tables_list:
#     temp_df = df[df['NAME'] == table]
#     selected_geography_df = pd.concat([selected_geography_df, temp_df])
#     
# # renames table column names with data from crosswalk database
# for series_name in selected_geography_df.columns:
#     if crosswalk_df['name'].str.contains(series_name).any():
#         # if the crosswalk contains the name of the DataFrame column (i.e., for any data column as opposed to names, descriptors, etc. ) replace table name based on key to one based on descriptor, then strip spaces, punctuation, etc. and replace with underscores for easier data manipulation and normalization, then convert from Series object to int dtype
#         new_label = str(crosswalk_df[crosswalk_df['name'].str.startswith(series_name)][['concept', 'label']].values)
#         new_label = new_label.replace('[', '').replace(']', '').replace('\' \'', '__').replace(' ', '_').replace('\'', '').replace('!!', '_').replace(':', '').replace('\n', '').lower()
#         try:
#             if series_name != 'ucgid':
#                 selected_geography_df.rename(columns={series_name: new_label + '::' + series_name}, inplace=True, errors='raise')
#                 selected_geography_df.rename(columns={series_name: new_label}, inplace=True, errors='raise')
#                 selected_geography_df = selected_geography_df.astype({series_name: 'int'})
#         except (TypeError, ValueError, KeyError):
#             pass
#     elif series_name == 'NAME':
#         selected_geography_df.rename(columns={series_name: str(series_name).lower()}, inplace=True, errors='raise')