# Demographics by Geography EDA

This notebook looks the available demographic data available collected geographies. 

## Data Preparation
First, we import required libraries and make data calls that do not require user input. 


In [1]:
from audioop import reverse

import pandas as pd
import os
import sqlalchemy
import ipywidgets as widgets
from pandas.errors import MergeError

## User Input

 ### Select Year of Analysis

In [2]:
years_available = ['2022']
acs_survey_year = ''
acs_survey_year = widgets.Dropdown(
    options = years_available,
    value = '2022',
    # description = 'Select Year of Analysis',
    disabled=False
)
acs_survey_year

Dropdown(options=('2022',), value='2022')

### Select Survey
Select whether to view ACS 1yr geographies (congressional district, county, place, or PUMA), or a specific block group at the 5yr level.

In [3]:
survey_type = widgets.Dropdown(
    options=[('1-Year Estimates', 'acsse'), ('5-Year Estimate', 'acs5')],
    # description='ACS Survey Type',
    disabled=False,
    layout=widgets.Layout(width='max-content'),
    tooltips=['Congressional District, County, Place, or PUMA', 'Block Group']
)
survey_type

Dropdown(layout=Layout(width='max-content'), options=(('1-Year Estimates', 'acsse'), ('5-Year Estimate', 'acs5…

### Select Geographic Level of Analysis

In [4]:
geography_type = ''
if survey_type.value == 'acsse':
    geography_type = widgets.Dropdown(
        options = [('Congressional District', 'congressional_district'), ('County', 'county'), ('Place', 'place'), ('PUMA', 'puma')],
        value = 'county',
        # description = 'Select Geography',
        disabled=False
    )
elif survey_type.value == 'acs5':
    geography_type = ('Block Group', 'block_group')[1]
geography_type

Dropdown(index=1, options=(('Congressional District', 'congressional_district'), ('County', 'county'), ('Place…

### Select Tables From Survey for Selected Geography

In [9]:
# imports Census survey data
db_filepath = os.path.join(f'data/databases/census_{survey_type.value}_{acs_survey_year.value}_{geography_type.value}.db')

sql_engine = sqlalchemy.create_engine('sqlite:///' + db_filepath)
tables = pd.read_sql_table('tables', sql_engine)
concepts = list(tables.concept)

# adds index as second item in each tuple for loading into widget
concepts_for_widget = []
for index, concept in enumerate(concepts):
    concept = str(concept).capitalize().replace('_', ' ')
    concepts_for_widget.append((concept, index))

select_widget_tables = widgets.SelectMultiple(
    options=list(concepts_for_widget),
    rows=len(concepts_for_widget),
    # description='Select Concept',
    layout=widgets.Layout(width='max-content'),
    disabled=False
)

select_widget_tables

SelectMultiple(layout=Layout(width='max-content'), options=(('Veteran status for the civilian population 18 ye…

### Select Specific Geography

In [20]:
# loads all database data locally and conducts any further filtering from this notebook locally
selected_tables = []
for table in select_widget_tables.value:
    selected_tables.append(tables.iloc[table]['name'])
    
df = pd.DataFrame()
if geography_type.value == 'block_group':    
     geography_selector = widgets.Text(
        value = '23893US9834092834',
        placeholder = '23893US9834092834',
        # description = 'Block Group UCGID',
        disabled=False
    )
else:
    # selects name of geographies available (each table in a given survey will have the same, so data only needs to be extracted from the first
    for table in selected_tables:
        try:
            df = pd.merge(df, (pd.read_sql_table(table, sql_engine)), on=['ucgid', 'NAME'])
            try:    
                df.drop(['index_x'], inplace=True, axis=1)
            except:
                pass
            try:
                df.drop(['index_y'], inplace=True, axis=1)
            except:
                pass
        except Exception as e:
            df = pd.read_sql_table(table, sql_engine)    

geography_names = list(df['NAME'])

select_geography_tables = widgets.SelectMultiple(
    options=geography_names,
    rows=len(geography_names),
    # description='Select Concept',
    layout=widgets.Layout(width='max-content'),
    disabled=False
)
select_geography_tables

SelectMultiple(layout=Layout(width='max-content'), options=('Congressional District 20 (118th Congress), Texas…

In [31]:
select_geography_tables_list = select_geography_tables.value
selected_geography_df = pd.DataFrame()

for table in select_geography_tables_list:
    temp_df = df[df['NAME'] == table]
    selected_geography_df = pd.concat([selected_geography_df, temp_df])
    
# renames table column names with data from crosswalk database
crosswalk_df = pd.read_sql_table('crosswalk', sql_engine)

for series_name in selected_geography_df.columns:
    if crosswalk_df['name'].str.contains(series_name).any():
        # if the crosswalk contains the name of the DataFrame column (i.e., for any data column as opposed to names, descriptors, etc. ) replace table name based on key to one based on descriptor, then strip spaces, punctuation, etc. and replace with underscores for easier data manipulation and normalization, then convert from Series object to int dtype
        new_label = str(crosswalk_df[crosswalk_df['name'].str.startswith(series_name)][['concept', 'label']].values)
        new_label = new_label.replace('[', '').replace(']', '').replace('\' \'', '__').replace(' ', '_').replace('\'', '').replace('!!', '_').replace(':', '').replace('\n', '').lower()
        try:
            if series_name != 'ucgid':
                selected_geography_df.rename(columns={series_name: new_label + '::' + series_name}, inplace=True, errors='raise')
                selected_geography_df.rename(columns={series_name: new_label}, inplace=True, errors='raise')
                selected_geography_df = selected_geography_df.astype({series_name: 'int'})
        except (TypeError, ValueError, KeyError):
            pass
    elif series_name == 'NAME':
        selected_geography_df.rename(columns={series_name: str(series_name).lower()}, inplace=True, errors='raise')

selected_geography_df

Unnamed: 0,name,race__estimate_total::K200201_001E,race__estimate_total_white_alone::K200201_002E,race__estimate_total_black_or_african_american_alone::K200201_003E,race__estimate_total_american_indian_and_alaska_native_alone::K200201_004E,race__estimate_total_asian_alone::K200201_005E,race__estimate_total_native_hawaiian_and_other_pacific_islander_alone::K200201_006E,race__estimate_total_some_other_race_alone::K200201_007E,race__estimate_total_two_or_more_races::K200201_008E,ucgid,...,population_by_age__estimate_total_65_years_and_over::K200104_008E,index,educational_attainment_for_the_population_25_years_and_over__estimate_total::K201501_001E,educational_attainment_for_the_population_25_years_and_over__estimate_total_less_than_9th_grade::K201501_002E,"educational_attainment_for_the_population_25_years_and_over__estimate_total_9th_to_12th_grade,_no_diploma::K201501_003E",educational_attainment_for_the_population_25_years_and_over__estimate_total_high_school_graduate_(includes_equivalency)::K201501_004E,"educational_attainment_for_the_population_25_years_and_over__estimate_total_some_college,_no_degree::K201501_005E","educational_attainment_for_the_population_25_years_and_over__""estimate_total_associates_degree""::K201501_006E","educational_attainment_for_the_population_25_years_and_over__""estimate_total_bachelors_degree""::K201501_007E",educational_attainment_for_the_population_25_years_and_over__estimate_total_graduate_or_professional_degree::K201501_008E
0,"Congressional District 20 (118th Congress), Texas",781188,296043,48095,9425,33417,487,98412,295309,5001800US4820,...,95295,7,499585,36455,37804,139182,113997,45328,82239,44580
1,"Congressional District 23 (118th Congress), Texas",778355,339088,26683,10982,17723,750,94047,289082,5001800US4823,...,106198,9,494117,38065,35651,123924,98384,43780,96308,58005
2,"Congressional District 28 (118th Congress), Texas",777758,248104,36557,8341,7458,903,93044,383351,5001800US4828,...,97208,10,483588,53291,51965,134121,97416,42800,72869,31126
3,"Congressional District 35 (118th Congress), Texas",802077,322027,94886,8388,29017,749,103894,243116,5001800US4835,...,74073,11,532320,36526,35635,126553,104509,42786,122817,63494
4,"Congressional District 21 (118th Congress), Texas",807859,523543,31169,5672,15981,955,48611,181928,5001800US4821,...,152369,8,562310,14461,18080,99684,112936,44805,167138,105206
