In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
def _parse_percentage(estimate):
    """Convert an estimate such as '12.3%' to the fraction 0.123; anything else gives None."""
    # Missing estimates are read by pandas as float NaN, which does not
    # support the `in` operator, so non-strings are rejected explicitly.
    if isinstance(estimate, str) and '%' in estimate:
        return float(estimate.rstrip('%')) / 100
    return None


def read_file_details(file, year):
    """Build a one-row table of population estimates for a single year.

    The CSV is expected to have 'Label' and 'Estimate' columns. Only the
    first 13 rows are used. The 'Estimate' of the very first row is taken
    as the total population (thousands separators are removed); every other
    kept row whose 'Estimate' contains '%' is converted to a head count as
    percentage * total population.

    Parameters
    ----------
    file : path or file-like object
        Anything accepted by ``pd.read_csv``.
    year : int
        Value used as the index of the single returned row.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``year`` with one column per remaining 'Label'.
    """
    # Read in file
    raw = pd.read_csv(file)

    # Keep only full population stats. Copy so the column assignments below
    # never write into a slice of the raw frame.
    stats = raw.iloc[:13].copy()

    # Get rid of leading spaces in Labels
    stats['Label'] = stats['Label'].str.lstrip()

    # Total population is the estimate of the first row, e.g. "79,608"
    total_population = int(stats.iloc[0]['Estimate'].replace(',', ''))

    # Convert percentage estimates to float; non-percentage rows become missing
    stats['Percentage'] = stats['Estimate'].apply(_parse_percentage)

    # Discard every row with a missing value in any column; this removes the
    # rows that carry no percentage (including the total-population row).
    stats = stats.dropna()

    # Calculate population estimates and keep only what the result needs
    population = pd.DataFrame({
        'Label': stats['Label'],
        'Population Estimate': stats['Percentage'] * total_population,
    })

    # Change index and transpose table so each label becomes a column
    result = population.set_index('Label').T

    # Change index to year
    result.index = [year]

    return result

In [3]:
# Check which yearly files can be loaded. A missing file is reported as such;
# any other failure (e.g. a malformed CSV) gets its own message instead of
# being mislabelled as "no file".
for year in range(2010,2023):
    file = 'data/'+str(year)+'/SEX_AND_AGE.CSV'

    try:
        read_file_details(file,year)
    except FileNotFoundError:
        print(f"No file for year '{year}'")
    except Exception as err:
        print(f"Could not process file for year '{year}': {err!r}")

No file for year '2020'


In [4]:
# Smoke test: run the loader on a single year and display the resulting row
read_file_details(file='data/2014/SEX_AND_AGE.CSV', year=2014)

Label,Male,Female,Under 5 years,5 to 17 years,18 to 24 years,25 to 34 years,35 to 44 years,45 to 54 years,55 to 64 years,65 to 74 years,75 years and over
2014,36062.424,43545.576,3582.36,11941.2,7323.936,10985.904,12896.496,11463.552,10349.04,6766.68,4139.616


In [8]:
# Load one frame per year, newest first. 2020 is not loaded (the availability
# check reported no file for it).
(
    SEX_AND_AGE_2022, SEX_AND_AGE_2021, SEX_AND_AGE_2019, SEX_AND_AGE_2018,
    SEX_AND_AGE_2017, SEX_AND_AGE_2016, SEX_AND_AGE_2015, SEX_AND_AGE_2014,
    SEX_AND_AGE_2013, SEX_AND_AGE_2012, SEX_AND_AGE_2011, SEX_AND_AGE_2010,
) = (
    read_file_details(f'data/{year}/SEX_AND_AGE.CSV', year)
    for year in (2022, 2021, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010)
)

In [10]:
# Stack the per-year rows into a single table, newest year first
SEX_AND_AGE = pd.concat([
    SEX_AND_AGE_2022,
    SEX_AND_AGE_2021,
    SEX_AND_AGE_2019,
    SEX_AND_AGE_2018,
    SEX_AND_AGE_2017,
    SEX_AND_AGE_2016,
    SEX_AND_AGE_2015,
    SEX_AND_AGE_2014,
    SEX_AND_AGE_2013,
    SEX_AND_AGE_2012,
    SEX_AND_AGE_2011,
    SEX_AND_AGE_2010,
])
SEX_AND_AGE

Label,Male,Female,Under 5 years,5 to 17 years,18 to 24 years,25 to 34 years,35 to 44 years,45 to 54 years,55 to 64 years,65 to 74 years,75 years and over
2022,40754.7,49811.3,4528.3,14943.39,5071.696,12316.976,16030.182,11864.146,9962.26,9147.166,6701.884
2021,41552.7,46857.3,4420.5,13615.14,6984.39,11493.3,12731.04,12731.04,10167.15,9813.51,6542.34
2019,42221.464,48382.536,4711.408,14134.224,6795.3,12503.352,13318.788,12503.352,9694.628,9151.004,7701.34
2018,38832.694,50029.306,4798.548,13595.886,6309.202,12174.094,12440.68,13062.714,10930.026,9330.51,6309.202
2017,37334.66,46375.34,5441.15,12221.66,5775.99,11886.82,12640.21,10296.33,10798.59,9375.52,5273.73
2016,34229.65,41000.35,3836.73,10381.74,6394.55,11284.5,12412.95,10532.2,9403.75,7372.54,3611.04
2015,36407.872,45224.128,4489.76,13224.384,5959.136,12489.696,14693.76,9469.312,9387.68,7918.304,4081.6
2014,36062.424,43545.576,3582.36,11941.2,7323.936,10985.904,12896.496,11463.552,10349.04,6766.68,4139.616
2013,36811.152,46660.848,5175.264,13105.104,5926.512,13105.104,13272.048,10600.944,10851.36,7595.952,3839.712
2012,35619.528,42493.472,5702.249,10935.82,6717.718,11716.95,14997.696,8904.882,8748.656,7186.396,3280.746


In [11]:
# adjust axes: move the year index into a regular 'Year' column.
# The guard keeps this cell idempotent: re-running it would otherwise push the
# fresh RangeIndex into another 'index' column and produce a duplicate 'Year'.
if 'Year' not in SEX_AND_AGE.columns:
    SEX_AND_AGE = SEX_AND_AGE.reset_index().rename(columns={'index':'Year'})
SEX_AND_AGE

Label,Year,Male,Female,Under 5 years,5 to 17 years,18 to 24 years,25 to 34 years,35 to 44 years,45 to 54 years,55 to 64 years,65 to 74 years,75 years and over
0,2022,40754.7,49811.3,4528.3,14943.39,5071.696,12316.976,16030.182,11864.146,9962.26,9147.166,6701.884
1,2021,41552.7,46857.3,4420.5,13615.14,6984.39,11493.3,12731.04,12731.04,10167.15,9813.51,6542.34
2,2019,42221.464,48382.536,4711.408,14134.224,6795.3,12503.352,13318.788,12503.352,9694.628,9151.004,7701.34
3,2018,38832.694,50029.306,4798.548,13595.886,6309.202,12174.094,12440.68,13062.714,10930.026,9330.51,6309.202
4,2017,37334.66,46375.34,5441.15,12221.66,5775.99,11886.82,12640.21,10296.33,10798.59,9375.52,5273.73
5,2016,34229.65,41000.35,3836.73,10381.74,6394.55,11284.5,12412.95,10532.2,9403.75,7372.54,3611.04
6,2015,36407.872,45224.128,4489.76,13224.384,5959.136,12489.696,14693.76,9469.312,9387.68,7918.304,4081.6
7,2014,36062.424,43545.576,3582.36,11941.2,7323.936,10985.904,12896.496,11463.552,10349.04,6766.68,4139.616
8,2013,36811.152,46660.848,5175.264,13105.104,5926.512,13105.104,13272.048,10600.944,10851.36,7595.952,3839.712
9,2012,35619.528,42493.472,5702.249,10935.82,6717.718,11716.95,14997.696,8904.882,8748.656,7186.396,3280.746


In [12]:
# Persist the combined table; the DataFrame's row index is written as the first column
SEX_AND_AGE.to_csv(path_or_buf="data/final/SEX_AND_AGE.CSV")