In [2]:
#---------------------------------------------------------------------------------------------------------------------------
# ABS Education Institution Data - raw data in excel multi-sheet file, sheet G15
# Process ABS 'General Community Profile' (GCP) data downloaded from the ABS online site as excel files into the 'datapandas/' directory.
# Each State and Territory of Australia has its own GCP excel multi-sheet file.
# Each file has sheet 'G15', which this script reads for its 'Institution' rows and their counts by Sex at the time of the 2021 census.
# Each file has been renamed as follows:
#    'GCP1GeneralCommunityProfileNSW.xlsx', 'GCP2GeneralCommunityProfileVIC.xlsx', 'GCP3GeneralCommunityProfileQLD.xlsx'
#    'GCP4GeneralCommunityProfileSA.xlsx', 'GCP5GeneralCommunityProfileWA.xlsx', 'GCP6GeneralCommunityProfileTAS.xlsx'
#    'GCP7GeneralCommunityProfileNT.xlsx', 'GCP8GeneralCommunityProfileACT.xlsx', 'GCP9GeneralCommunityProfileOTH.xlsx'
# All GCP files will be read and their G15 sheet processed by pandas; the selected 'Institution' rows of every State are
# combined into one table with an added 'State' column.
# Output file:
#  1. 'ABS2021EduInstBySex.xlsx' - single-sheet excel file written to 'datapandas/' containing the selected
#          'Institution' rows (relabelled 'Primary', 'Secondary', 'Vocational', 'University', 'Other') for every State.
#          The 'Persons' total column is dropped before saving.
# NOTE: the age-by-sex workbooks 'ABS2021AgeBySex.xlsx' and 'ABS2021AgeBySexGrp.xlsx' are not produced by this script.
# NOTE: Row 9 of the G15 sheet in each State's GCP excel file WAS EDITED to put labels/headers so that data can be
#       properly loaded by pandas. IT IS JUST A ONE-LINER MINOR EDIT PER FILE.
#---------------------------------------------------------------------------------------------------------------------------
import os               # needed for directory and file listing
import re               # import regular expressions module for string search
import pandas as pd     # import pandas

# Define variables
datapath = 'datapandas/'  # downloaded ABS data location under pycharm root directory
substring = "GCP"  # only files whose name contains this text are treated as ABS GCP workbooks
sheetFlag = 0  # becomes 1 once at least one State workbook has been processed
outfilename = "ABS2021EduInstBySex.xlsx"  # output file, written into datapath

# State/Territory codes looked for in each file name; the first code found (in this order) wins.
stateList = ['NSW', 'VIC', 'QLD', 'SA', 'WA', 'TAS', 'NT', 'ACT', 'OTH']

# Short labels for the 'Institution' values that are renamed before saving.
# NOTE(review): the sheet's plain 'Total' row is relabelled 'Other' - confirm this is the intended meaning.
institutionLabels = {
    'Total Primary(a)': 'Primary',
    'Total Secondary(b)': 'Secondary',
    'Total Vocational education (including TAFE and private training providers)': 'Vocational',
    'Total University or higher education': 'University',
    'Total': 'Other',
}

# Positional rows of the loaded G15 sheet that are kept in the output.
# NOTE(review): these positions depend on the layout of the (edited) G15 sheet - re-check them if the source files change.
requiredRows = [1, 7, 13, 24, 34, 42]

stateFrames = []  # one reformatted dataframe per State; combined once after the loop

# Read datasource directory and process each ABS GCP State excel file.
# os.scandir yields entries in arbitrary order, so sort by name to keep the output row order stable between runs.
with os.scandir(datapath) as entries:
    for entry in sorted(entries, key=lambda item: item.name):
        if not entry.is_file() or substring not in entry.name:
            continue
        if entry.name.startswith('~$'):
            continue  # Excel lock file left next to a workbook that is currently open - not a real workbook

        # Work out which State this file is for before doing any excel parsing.
        state = next((code for code in stateList if code in entry.name), None)
        if state is None:
            # Previously such files were parsed and then silently discarded; say so instead.
            print(f"Skipping '{entry.name}': no State code found in file name")
            continue

        # Create a dataframe from sheet G15 of the GCP excel file.
        # skiprows=7 skips the first seven rows (heading information); the next row supplies the column labels.
        df = pd.read_excel(entry.path, sheet_name='G15', skiprows=7)

        # 'Persons' is a total column which is not needed in the output.
        df = df.drop(columns=['Persons'])

        # Rename values in the Institution column (whole-cell matches only).
        df['Institution'] = df['Institution'].replace(institutionLabels)

        # Get required rows and renumber them from 0.
        df = df.iloc[requiredRows, :].reset_index(drop=True)

        # Add State column and keep the frame for the final concatenation.
        df['State'] = state
        stateFrames.append(df)
        sheetFlag = 1

# Fail with a clear message instead of a NameError on dfSave when nothing was loaded.
if not stateFrames:
    raise FileNotFoundError(
        f"No ABS GCP State workbooks (file names containing '{substring}' and a State code) found in '{datapath}'"
    )

# Concatenate all State dataframes in one pass and save the result to excel.
dfSave = pd.concat(stateFrames, ignore_index=True)
dfSave.to_excel(os.path.join(datapath, outfilename), index=False)

#end of file