In [1]:
import pandas as pd
import os

In [2]:
# Load the poverty-status workbook with pandas. Passing sheet_name=None asks
# read_excel for every sheet at once, returned as a mapping of sheet name to
# DataFrame (the later cells iterate it with .items()).
filepath = os.path.join(
    "data",
    "raw",
    "Table_3_Poverty Status of People_Age-Race-Hispanic Origin-hstpov3.xls",
)
pbr_dfs = pd.read_excel(
    filepath,
    sheet_name=None,
)

In [3]:
# Print the column headers of every Excel sheet, one sheet per paragraph, so
# they can be compared by eye. They're a mess.
for sheet_name, sheet_df in pbr_dfs.items():
    header_index = sheet_df.columns
    print(f"{sheet_name} - {header_index}\n")

all races - Index(['ALL RACES- Year and Characteristic',
       'Under 18 years - All People - Total ',
       'Under 18 years - All People - Below Poverty - Number',
       'Under 18 years - All People - Below Poverty - Percent',
       'Under 18 years - Related Children in Families - Total',
       'Under 18 years - Related Children in Families - Below Poverty - Number',
       'Under 18 years - Related Children in Families - Below Poverty - Percent',
       '18 to 64 years - Total', '18 to 64 years - Below Poverty - Number',
       '18 to 64 years - Below Poverty - Percent', '65 years and over - Total',
       '65 years and over - Below Poverty - Number',
       '65 years and over - Below Poverty - Percent'],
      dtype='object')

WHITE, NOT HISPANIC - Index(['WHITE, NOT HISPANIC- Year and Characteristic',
       'Under 18 years - All People - Total ',
       'Under 18 years - All People - Below Poverty - Number',
       'Under 18 years - All People - Below Poverty - Percent',
    

In [4]:
# Builds the overall dataframe that combines all the Excel sheets in long format: one row per
# (sheet row, age band), keeping only the year, age band, race (the sheet name), total population,
# and population below the poverty line.

# Column order of the final dataframe.
final_columns = ["Year", "AgeBand", "Race", "Total", "BelowPoverty"]

# Short names applied BY POSITION to the first 13 columns of every sheet, so that the differing
# "<RACE>- Year and Characteristic" / "Under 18 years - ..." headers all line up.
# NOTE(review): this assumes every sheet lays its columns out in the same order as the
# "all races" sheet -- confirm against the header listing printed above.
positional_column_names = [
    "Year",
    "Under 18 - Total",
    "Under 18 - Below Poverty",
    "Under 18 - Below Poverty %",
    "Under 18 - Children Total",
    "Under 18 - Children Below Poverty",
    "Under 18 - Children Below Poverty %",
    "18 to 64 - Total",
    "18 to 64 - Below Poverty",
    "18 to 64 - Below Poverty %",
    "65 and Over - Total",
    "65 and Over - Below Poverty",
    "65 and Over - Below Poverty %",
]

# (AgeBand label, total column, below-poverty column) for each band we keep, in output order.
# We don't need the percentages for the final database.  The "Under 18 years - Related Children
# in Families" columns are assumed to be only part of the Under 18 population, while the
# "Under 18 years - All People" columns are assumed to be the entire Under 18 population, so we
# keep the entire Under 18 population and drop the "in families" columns to simplify the database.
age_bands = [
    ("Under age 18", "Under 18 - Total", "Under 18 - Below Poverty"),
    ("Aged 18 to 64", "18 to 64 - Total", "18 to 64 - Below Poverty"),
    ("Aged 65 and older", "65 and Over - Total", "65 and Over - Below Poverty"),
]

# Collect the per-sheet, per-band pieces in a list and concatenate ONCE at the end.  Growing a
# dataframe with pd.concat inside the loop (starting from an empty seed frame) re-copies every
# earlier row on each pass, and concatenating with an empty frame can upcast the numeric columns
# and raises a FutureWarning on recent pandas versions.
band_frames = []

# For loop to go through each Excel sheet.
for key, df in pbr_dfs.items():

    # Fail with a message that names the sheet instead of a bare IndexError from positional lookup.
    if len(df.columns) < len(positional_column_names):
        raise ValueError(
            f"Sheet {key!r} has {len(df.columns)} columns; "
            f"expected at least {len(positional_column_names)}."
        )

    # Map this sheet's own headers onto the short names (zip stops after the 13 named columns).
    clean_column_names = dict(zip(df.columns, positional_column_names))
    clean_df = df.rename(columns=clean_column_names)

    # One slice per age band, normalised to the shared Total / BelowPoverty column names and
    # tagged with the sheet name (Race) and the band label (AgeBand).
    for age_band, total_column, below_poverty_column in age_bands:
        band_df = (
            clean_df[["Year", total_column, below_poverty_column]]
            .rename(columns={total_column: "Total", below_poverty_column: "BelowPoverty"})
            .assign(Race=key, AgeBand=age_band)
        )
        band_frames.append(band_df[final_columns])

# pd.concat raises on an empty list, so fall back to an empty frame with the right columns when
# the workbook has no sheets.  Row labels are deliberately left as-is (they restart for every
# sheet and age band, as before); call .reset_index(drop=True) if unique labels are needed.
if band_frames:
    all_pbr_df = pd.concat(band_frames, sort=False)
else:
    all_pbr_df = pd.DataFrame(columns=final_columns)

# Display the final dataframe
all_pbr_df

Unnamed: 0,Year,AgeBand,Race,Total,BelowPoverty
0,2015,Under age 18,all races,73647,14509
1,2014,Under age 18,all races,73556,15540
2,2013 (19),Under age 18,all races,73439,15801
3,2013 (18),Under age 18,all races,73625,14659
4,2012,Under age 18,all races,73719,16073
5,2011,Under age 18,all races,73737,16134
6,2010 (17),Under age 18,all races,73873,16286
7,2009,Under age 18,all races,74579,15451
8,2008,Under age 18,all races,74068,14068
9,2007,Under age 18,all races,73996,13324
