In [2]:
import pandas as pd
import os
from collections import OrderedDict
from sqlalchemy import create_engine, inspect
import pymysql
pymysql.install_as_MySQLdb()

In [3]:
TBL3_POVERTY_BY_RACE = os.path.join('data', 'raw', 'Table_3_Poverty Status of People_Age-Race-Hispanic Origin-hstpov3.xls')
TBL3_POVERTY_BY_SELECTED_CHAR = os.path.join('data', 'raw', 'Table_3_People_Poverty_Selected_Characteristics_2014-2015 .xls')

### Load Excel Files into DataFrames

This step loads the Excel files into memory as Pandas Data Frames. Here we pass sheet_name parameter as None, which enables load of all worksheets into a dictionary of data frames.

In [6]:
pbr_dfs = pd.read_excel(TBL3_POVERTY_BY_RACE, sheet_name=None)
pbsc_dfs = pd.read_excel(TBL3_POVERTY_BY_SELECTED_CHAR, sheet_name=None)

In [7]:
print("TBL3_POVERTY_BY_RACE:")
for key, df in pbr_dfs.items():
    print(f"    {key} - {df.columns}\n")
    
print("TBL3_POVERTY_BY_SELECTED_CHARS:")
for key, df in pbsc_dfs.items():
    print(f"    {key} - {df.columns}\n")

TBL3_POVERTY_BY_RACE:
    all races - Index(['ALL RACES- Year and Characteristic',
       'Under 18 years - All People - Total ',
       'Under 18 years - All People - Below Poverty - Number',
       'Under 18 years - All People - Below Poverty - Percent',
       'Under 18 years - Related Children in Families - Total',
       'Under 18 years - Related Children in Families - Below Poverty - Number',
       'Under 18 years - Related Children in Families - Below Poverty - Percent',
       '18 to 64 years - Total', '18 to 64 years - Below Poverty - Number',
       '18 to 64 years - Below Poverty - Percent', '65 years and over - Total',
       '65 years and over - Below Poverty - Number',
       '65 years and over - Below Poverty - Percent'],
      dtype='object')

    WHITE, NOT HISPANIC - Index(['WHITE, NOT HISPANIC- Year and Characteristic',
       'Under 18 years - All People - Total ',
       'Under 18 years - All People - Below Poverty - Number',
       'Under 18 years - All People - 

In [8]:
pbsc_dfs.keys()

odict_keys(['All People', 'Education Attainment', 'Work Experience', 'Disability Status', 'Residence', 'Age', 'Sex', 'Family Status', 'Race and Hispanic Origin', 'Naitvity', 'Region'])

In [14]:
all_pbsc_df = pd.DataFrame({"Year":[], "Race":[], 
                            "Education":[], "WorkExperience":[], "DisabilityStatus":[], 
                            "Residence":[], "AgeBand":[], "Nativity":[],
                            "Gender":[], "FamilyStatus":[], "Region":[],
                            "Total":[], "BelowPoverty":[]})

char_key_to_value_map = {
    "All People": "DROP",
    "Education Attainment": "Education",
    "Work Experience": "WorkExperience",
    "Disability Status": "DisabilityStatus",
    "Residence": "Residence",
    "Age": "AgeBand",
    "Sex": "Gender",
    "Family Status": "FamilyStatus",
    "Race and Hispanic Origin": "Race",
    "Naitvity": "Nativity",
    "Region": "Region"
}

for key, df in pbsc_dfs.items():
    if key == "All People":
        continue;
    clean_column_names = {df.columns[0]:char_key_to_value_map[key], 
                          df.columns[1]: "2014 - Total", 
                          df.columns[2]: "2014 - Below Poverty", 
                          df.columns[3]: "2014 - Below Poverty MOE", 
                          df.columns[4]: "2014 - Below Poverty %", 
                          df.columns[5]: "2014 - Below Poverty % MOE", 
                          df.columns[6]: "2015 - Total", 
                          df.columns[7]: "2015 - Below Poverty", 
                          df.columns[8]: "2015 - Below Poverty MOE", 
                          df.columns[9]: "2015 - Below Poverty %", 
                          df.columns[10]: "2015 - Below Poverty % MOE", 
                          df.columns[11]: "2014-15 - Below Poverty Change", 
                          df.columns[12]: "2014-15 - Below Poverty % Change"
                         }
    clean_df = df.rename(columns=clean_column_names)[[char_key_to_value_map[key],
                                                      "2014 - Total", "2014 - Below Poverty",
                                                      "2015 - Total", "2015 - Below Poverty"]]
    
    y2014_clean_df = clean_df[[char_key_to_value_map[key],"2014 - Total", "2014 - Below Poverty"]]
    y2014_normalized_columns = {"2014 - Total": "Total", "2014 - Below Poverty":"BelowPoverty"}
    y2014_clean_df = y2014_clean_df.rename(columns=y2014_normalized_columns)
    y2015_clean_df = clean_df[[char_key_to_value_map[key],"2015 - Total", "2015 - Below Poverty"]]
    y2015_normalized_columns = {"2015 - Total": "Total", "2015 - Below Poverty":"BelowPoverty"}
    y2015_clean_df = y2015_clean_df.rename(columns=y2015_normalized_columns)
    
    clean_df = pd.concat([y2014_clean_df, y2015_clean_df])
    
    for col in all_pbsc_df.columns:
        if col not in clean_df.columns:
            clean_df[col] = "" ## Add empty columns to transform to common structure
    
    all_pbsc_df = pd.concat([all_pbsc_df, clean_df], sort=False)
    
print(all_pbsc_df.count())
all_pbsc_df.head()

Year                86
Race                86
Education           86
WorkExperience      86
DisabilityStatus    86
Residence           86
AgeBand             86
Nativity            86
Gender              86
FamilyStatus        86
Region              86
Total               86
BelowPoverty        86
dtype: int64


Unnamed: 0,Year,Race,Education,WorkExperience,DisabilityStatus,Residence,AgeBand,Nativity,Gender,FamilyStatus,Region,Total,BelowPoverty
0,,,"Total, aged 25 and older",,,,,,,,,212132.0,25163.0
1,,,No high school diploma,,,,,,,,,24582.0,7098.0
2,,,"High school, no college",,,,,,,,,62575.0,8898.0
3,,,"Some college, no degree",,,,,,,,,56031.0,5719.0
4,,,Bachelor's degree or higher,,,,,,,,,68945.0,3449.0


In [16]:
def prepare_dim_df(dim):
    """
       This function returns a dimesion dataframe with unique dimension records
    """
    dim_info = []
    for index, value in enumerate(all_pbsc_df[dim].unique()):
        if value is None or (type(value) == str and len(value) == 0):
            continue
        dim_info_row = OrderedDict()
        dim_info_row['ID'] =  index
        if (isinstance(value, str)):
            dim_info_row['CODE'] =  (value.replace(',', '').replace(' ', '').upper())
        else:
            dim_info_row['CODE'] =  str(value)
        dim_info_row['DESCRIPTION'] = value
        dim_info.append(dim_info_row)
    
    dim_df = pd.DataFrame(dim_info)

    return dim_df

### Prepare Region

In [17]:
region_df = prepare_dim_df('Region')
region_df.head()

Unnamed: 0,ID,CODE,DESCRIPTION
0,1,NORTHEAST,Northeast
1,2,MIDWEST,Midwest
2,3,SOUTH,South
3,4,WEST,West


### Prepare Residence

In [18]:
residence_df = prepare_dim_df('Residence')
residence_df.head()

Unnamed: 0,ID,CODE,DESCRIPTION
0,1,INSIDEMETROPOLITANSTATISTICALAREAS,Inside metropolitan statistical areas
1,2,INSIDEPRINCIPALCITIES,Inside principal cities
2,3,OUTSIDEPRINCIPALCITIES,Outside principal cities
3,4,OUTSIDEMETROPOLITANSTATISTICALAREAS4,Outside metropolitan statistical areas4
