# In [36]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
#---------------------------------------------------------------------------------------------------------------------------
# ABS Population Data
# This program will process two input excel files:
#  'ABS2021AgeBySex.xlsx' - a multi-sheet (by State) excel file which contains Male and Female population counts from Age=0 to Age=79
#  'ABS2021AgeBySexGrp.xlsx' - a multi-sheet (by State) excel file which contains Male and Female population counts for age groups:
#                              '80-84 years',  '85-89 years', '90-94 years', '95-99 years', '100 years and over'
# It will output an excel file:
#  'ABS2021StatePopulation.xlsx' - population counts for all States with the following columns:
#          'State'   'AgeGroup'   'Sex'    'Desc'            'Population'
#       eg. NSW       0-9 Yrs      Male     0-9 Yrs Male      498144
#
#---------------------------------------------------------------------------------------------------------------------------
import pandas as pd     # import pandas

# Define variables

datapath = 'datapandas/'  # downloaded ABS data location under pycharm root directory
file1 = 'ABS2021AgeBySex.xlsx'  # input file: single-year ages, one sheet per State
file2 = 'ABS2021AgeBySexGrp.xlsx'  # input file: ABS-supplied age groups, one sheet per State
outfile1 = 'ABS2021StatePopulation.xlsx'  # output file

# Sheet names to process; each one is also the value written to the 'State' column.
stateList = ['NSW', 'VIC', 'QLD', 'SA', 'WA', 'TAS', 'NT', 'ACT', 'OTH']

# Age bands built from the single-year sheet: label -> [first age, last age], both inclusive.
# The labels deliberately end with a space so that label + sex reads e.g. '0-9 Yrs Male'.
ageGrps = {'0-9 Yrs ': [0, 9], '10-24 Yrs ': [10, 24], '25-39 Yrs ': [25, 39],
           '40-54 Yrs ': [40, 54], '55-74 Yrs ': [55, 74], '75-79 Yrs ': [75, 79]}

# ABS-supplied group names -> labels in the same style as the keys of ageGrps.
# Any value not listed here is passed through unchanged.
absGrpLabels = {
    '80-84 years': '80-84 Yrs ',
    '85-89 years': '85-89 Yrs ',
    '90-94 years': '90-94 Yrs ',
    '95-99 years': '95-99 Yrs ',
    '100 years and over': '100+ Yrs ',
}

# (value for the 'Sex' column, name of the input column holding that sex's counts)
sexColumns = (('Male', 'Males'), ('Female', 'Females'))

# Column order of the output file.
cols = ['State', 'AgeGroup', 'Sex', 'Desc', 'Population']


def summarise_single_year_ages(df, age_groups=None):
    """Total a single-year-of-age sheet into age bands, one row per band and sex.

    Parameters:
        df: DataFrame with columns 'Age', 'Males' and 'Females' (one row per age).
        age_groups: mapping of label -> [first age, last age] (inclusive);
            defaults to the module-level ``ageGrps``.

    Returns:
        DataFrame with columns AgeGroup, Population, Sex, Desc. For every band
        the Male row comes first, then the Female row, e.g.

            AgeGroup    Population  Sex     Desc
            '0-9 Yrs '  498144      Male    '0-9 Yrs Male'
            '0-9 Yrs '  470730      Female  '0-9 Yrs Female'
    """
    if age_groups is None:
        age_groups = ageGrps

    # Select rows by age value rather than by label slice, so the result does not
    # depend on the row order of the sheet; non-numeric 'Age' cells never match.
    ages = pd.to_numeric(df['Age'], errors='coerce')

    records = []
    for label, (first_age, last_age) in age_groups.items():
        in_band = ages.between(first_age, last_age)
        totals = df.loc[in_band, ['Males', 'Females']].sum()
        for sex, column in sexColumns:
            records.append({
                'AgeGroup': label,
                # Taken straight from the column sum, so integer counts stay integers.
                'Population': totals[column],
                'Sex': sex,
                'Desc': label + sex,
            })
    return pd.DataFrame(records, columns=['AgeGroup', 'Population', 'Sex', 'Desc'])


def reshape_grouped_ages(df):
    """Turn an ABS-supplied grouped-age sheet into one row per group and sex.

    Parameters:
        df: DataFrame with columns 'Age' (group name such as '80-84 years'),
            'Males' and 'Females'.

    Returns:
        DataFrame with columns AgeGroup, Population, Sex, Desc: all Male rows
        first (in sheet order), followed by all Female rows (in sheet order).
    """
    labels = df['Age'].replace(absGrpLabels)
    per_sex = [
        pd.DataFrame({
            'AgeGroup': labels,
            'Population': df[column],
            'Sex': sex,
            'Desc': labels + sex,
        })
        for sex, column in sexColumns
    ]
    return pd.concat(per_sex, ignore_index=True)


def build_state_table(single_year_df, grouped_df, state):
    """Combine both sheets of one State and tag every row with the State name.

    Rows derived from the single-year sheet come first, then the rows from the
    ABS-supplied grouped sheet. Returned columns: AgeGroup, Population, Sex,
    Desc, State.
    """
    table = pd.concat(
        [summarise_single_year_ages(single_year_df), reshape_grouped_ages(grouped_df)],
        ignore_index=True,
    )
    table['State'] = state
    return table


def build_population_table(single_year_sheets, grouped_sheets, states):
    """Build the final table for all States.

    Parameters:
        single_year_sheets: mapping of State -> single-year-of-age DataFrame.
        grouped_sheets: mapping of State -> ABS-supplied grouped-age DataFrame.
        states: States to include, in processing order.

    Returns:
        DataFrame with the columns listed in ``cols``, sorted by State and then
        by AgeGroup.

    Raises:
        KeyError: if a State is missing from either mapping.
        ValueError: if ``states`` is empty (nothing to concatenate).
    """
    state_tables = [
        build_state_table(single_year_sheets[state], grouped_sheets[state], state)
        for state in states
    ]
    combined = pd.concat(state_tables, ignore_index=True)[cols]
    # NOTE(review): AgeGroup is compared as text, so '100+ Yrs ' lands between
    # '10-24 Yrs ' and '25-39 Yrs '. Kept as-is so the output file is unchanged.
    return combined.sort_values(by=['State', 'AgeGroup'])


# Runs when this file is executed as a script or as a notebook cell.
if __name__ == "__main__":
    # Passing a list of sheet names makes read_excel open each workbook once and
    # return a dict of DataFrames keyed by sheet name.
    singleYearSheets = pd.read_excel(datapath + file1, sheet_name=stateList)
    groupedSheets = pd.read_excel(datapath + file2, sheet_name=stateList)

    df5 = build_population_table(singleYearSheets, groupedSheets, stateList)
    #print(df5,'\n')
    #    State    AgeGroup     Sex              Desc  Population
    #      ACT    0-9 Yrs     Male      0-9 Yrs Male         ...
    #      ACT    0-9 Yrs   Female    0-9 Yrs Female         ...
    #      ...         ...     ...               ...         ...

    # Save df5 into an excel file;
    df5.to_excel(datapath + outfile1, index=False)




#end of file

