In [1]:
#---------------------------------------------------------------------------------------------------------------------------
# ABS Population Data
# This program will process two input excel files:
#  'ABS2021AgeBySex.xlsx' - a multi-sheet (by State) excel file which contains Male and Female population counts from Age=0 to Age=79
#  'ABS2021AgeBySexGrp.xlsx' - a multi-sheet (by State) excel file which contains Male and Female population counts for age groups:
#                              '80-84 years',  '85-89 years', '90-94 years', '95-99 years', '100 years and over'
# It will output two excel files:
#  'ABS2021StateAgegrpPopulation.xlsx' - population count for all States with the following groupings:
#          '0-9 years', '10-24 years', '25-39 years', '40-54 years', '55-74 years', '75-79 years',
#          '80-84 years', '85-89 years', '90-94 years', '95-99 years', '100 years and over'
#  'ABS2021StateAgegrpIntPopulation.xlsx' - population count for all States with the following groupings (this is to
#                                    support the Population by Generation Bar Chart in the ABS website, Population topic;
#                                    see => https://www.abs.gov.au/statistics/people/population/population-census/2021):
#          '0-9 years', '10-24 years', '25-39 years', '40-54 years', '55-74 years', '75 years and over'
#---------------------------------------------------------------------------------------------------------------------------
import os               # needed for directory and file listing
import re               # import regular expressions module for string search
import pandas as pd     # import pandas

# File locations and names used by this script

# Folder holding the downloaded ABS data (relative to the pycharm root directory)
datapath = 'datapandas/'

# Input workbooks: single-year-of-age counts, and ABS-supplied age-group counts
file1, file2 = 'ABS2021AgeBySex.xlsx', 'ABS2021AgeBySexGrp.xlsx'

# Output workbooks: detailed age groups, and the "75 years and over" roll-up
outfile1, outfile2 = 'ABS2021StateAgegrpPopulation.xlsx', 'ABS2021StateAgegrpIntPopulation.xlsx'

# Build two State-by-age-group population tables from the ABS workbooks and
# save each one to its own excel file.
#
# For every State sheet:
#   1. file1 (single year of age) is rolled up into the age groups in ageGrps.
#   2. file2 (ABS-supplied age groups) is appended underneath.
#   3. A second copy is made in which the rows from '75-79 years' onwards are
#      collapsed into a single '75 years and over' row.

stateList = ['NSW','VIC','QLD','SA','WA','TAS','NT','ACT','OTH']  # one sheet per State in both input files

# Age groups computed from the single-year-of-age sheet:
# label -> [first age, last age], both ends inclusive.
ageGrps = {'0-9 years'   : [0,   9], '10-24 years' : [10, 24], '25-39 years' : [25, 39],
           '40-54 years' : [40, 54], '55-74 years' : [55, 74], '75-79 years' : [75, 79]}

# Row positions (within one State's combined table) that make up '75 years and over':
# '75-79 years' followed by the ABS-supplied groups. This assumes each file2 sheet
# holds exactly the five groups '80-84 years' .. '100 years and over', in that order.
over75Start = list(ageGrps).index('75-79 years')
over75Stop = over75Start + 6

cols = ['State', 'AgeGroup', 'Population']  # column order of both output files

# Per-State frames are collected in lists and concatenated once after the loop;
# this avoids first-iteration flags and repeated re-copying of the growing result.
detailFrames = []   # age groups '0-9 years' ... '100 years and over'
over75Frames = []   # age groups '0-9 years' ... '75 years and over'

for state in stateList:  # process each State
    # Single-year-of-age data, indexed by Age so age ranges can be sliced by label.
    df = pd.read_excel(datapath+file1, sheet_name=state).set_index("Age", drop=False)
    personsByAge = df[['Males', 'Females']].sum(axis=1)  # Males + Females for each age

    # Total persons per age group. .loc label slicing includes both end points.
    # Summing the Series directly (rather than appending a 'Total' row to a slice)
    # avoids assigning into a slice and keeps the counts in their original numeric type.
    df3 = pd.DataFrame({
        'Population': [personsByAge.loc[first:last].sum() for first, last in ageGrps.values()],
        'AgeGroup': list(ageGrps),
    })

    # ABS-supplied grouped data for the same State, reshaped to match df3.
    df4 = pd.read_excel(datapath+file2, sheet_name=state).rename(columns={'Age': 'AgeGroup'})
    df4['Population'] = df4[['Males', 'Females']].sum(axis=1)
    df4 = df4[['Population', 'AgeGroup']]

    # Combined table for this State: computed groups first, ABS-supplied groups after.
    dfGrpTot = pd.concat([df3, df4], ignore_index=True)
    dfGrpTot['State'] = state
    dfGrpTot = dfGrpTot[cols]
    detailFrames.append(dfGrpTot)

    # Collapse the rows from '75-79 years' onwards into one '75 years and over' row.
    # Only the Population column is summed; State and AgeGroup are set explicitly.
    over75 = pd.DataFrame({
        'State': [state],
        'AgeGroup': ['75 years and over'],
        'Population': [dfGrpTot['Population'].iloc[over75Start:over75Stop].sum()],
    })
    over75Frames.append(
        pd.concat([dfGrpTot.iloc[:over75Start], over75], ignore_index=True)[cols]
    )

# All States, age groups '0-9 years' ... '100 years and over'
df5 = pd.concat(detailFrames, ignore_index=True)
df5.to_excel(datapath+outfile1, index=False)

# All States, age groups '0-9 years' ... '75 years and over'
df7 = pd.concat(over75Frames, ignore_index=True)
df7.to_excel(datapath+outfile2, index=False)


#end of file

Grouped lowest level data df3. 
   Population     AgeGroup
0   968874.0    0-9 years
1  1455220.0  10-24 years
2  1722217.0  25-39 years
3  1539927.0  40-54 years
4  1750502.0  55-74 years
5   268109.0  75-79 years 

Grouped lowest level data df3. 
   Population     AgeGroup
0   778082.0    0-9 years
1  1164934.0  10-24 years
2  1465421.0  25-39 years
3  1255684.0  40-54 years
4  1352109.0  55-74 years
5   203205.0  75-79 years 

Grouped lowest level data df3. 
   Population     AgeGroup
0   621760.0    0-9 years
1   979801.0  10-24 years
2  1058850.0  25-39 years
3  1004827.0  40-54 years
4  1118765.0  55-74 years
5   168381.0  75-79 years 

Grouped lowest level data df3. 
   Population     AgeGroup
0   197982.0    0-9 years
1   313493.0  10-24 years
2   354299.0  25-39 years
3   332296.0  40-54 years
4   423458.0  55-74 years
5    66996.0  75-79 years 

Grouped lowest level data df3. 
   Population     AgeGroup
0   334413.0    0-9 years
1   483467.0  10-24 years
2   573256.0  25-39 y