# Download for 2021 census demographic by postcode

In [145]:
import requests
import os
import pandas as pd
from urllib.request import urlretrieve
from urllib.error import HTTPError, URLError

In [147]:
# Ensure the folders are set up - from data download notebook
def create_data_folder(output_dir):
    """
    Create folders for each stage of the ETL pipeline
    :param output_dir: The base directory where the folders will be created
    """
    # set output directory
    import os
    
    # check if data directory exists, if not create it
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # create folders for each stage of the ETL pipeline
    for stage in ['landing', 'raw', 'curated', 'analysis']:
        stage_path = os.path.join(output_dir, stage)
        if not os.path.exists(stage_path):
            os.makedirs(stage_path) 


In [149]:
# puts the landing, raw, curated into data
create_data_folder('../data')

In [151]:
# create the directory for the landing demographic data if it doesn't exist
population_by_postcode_directory = '../data/landing/population_by_postcode'
os.makedirs(population_by_postcode_directory, exist_ok=True)


In [153]:
# open the vic_suburbs_postcodes
postcodes_df = pd.read_csv("../data/geo/vic_suburbs_postcodes.csv")

In [155]:
# URL template for moving annual rent by suburb
used_postcodes = []
for postcode in postcodes_df["postcode"]:
    AREA_CODE = f"POA{postcode}"
    URL_TEMPLATE = f"https://abs.gov.au/census/find-census-data/community-profiles/2021/{AREA_CODE}/download/GCP_{AREA_CODE}.xlsx"
    
    # generate output file path
    output_file_path = f"{population_by_postcode_directory}/{AREA_CODE}_population.xlsx"
    
    # check if output file already exists
    if not os.path.exists(output_file_path):
        # download postcode data with exception handling
        try:
            urlretrieve(URL_TEMPLATE, output_file_path)
            print(f"✅ File downloaded and saved to {output_file_path}")
            used_postcodes.append(postcode)
        except Exception as e:
            print(f"❌ Unexpected error for postcode {postcode}: {e}")

    else:
        print(f"File already exists at {output_file_path}")

✅ File downloaded and saved to ../data/landing/population_by_postcode/POA3000_population.xlsx
❌ Unexpected error for postcode 3001: HTTP Error 504: Gateway Time-out
✅ File downloaded and saved to ../data/landing/population_by_postcode/POA3002_population.xlsx
✅ File downloaded and saved to ../data/landing/population_by_postcode/POA3003_population.xlsx
✅ File downloaded and saved to ../data/landing/population_by_postcode/POA3004_population.xlsx
File already exists at ../data/landing/population_by_postcode/POA3004_population.xlsx
❌ Unexpected error for postcode 3005: HTTP Error 504: Gateway Time-out
✅ File downloaded and saved to ../data/landing/population_by_postcode/POA3006_population.xlsx
File already exists at ../data/landing/population_by_postcode/POA3006_population.xlsx
✅ File downloaded and saved to ../data/landing/population_by_postcode/POA3008_population.xlsx
❌ Unexpected error for postcode 3010: HTTP Error 504: Gateway Time-out
✅ File downloaded and saved to ../data/landing/popu

# open and format the excel sheets into flat csvs


Only doing for selected sheets - can add more

In [185]:
selected_sheets = ["G02", "G04", "G17", "G33", "G34", 
                   "G36", "G40", "G49", "G50", "G56", "G60", "G62"]
# so far only made csvs for the median data - G02 and population breakdown - G04

In [207]:
def excel_to_csv(file_path, selected_sheets, AREA_CODE):
    # creates flat csvs for the data stored in multisheet excel documents
    for sheet in selected_sheets:
        df = pd.read_excel(file_path, sheet_name=sheet, header=None)
        if sheet == "G02":
            
            # Left side table (col 0 = name, col 1 = value)
            left = df[[0, 1]].dropna().rename(columns={0: "Statistic", 1: "Value"})
            
            # Right side table (col 3 = name, col 4 = value)
            right = df[[3, 4]].dropna().rename(columns={3: "Statistic", 4: "Value"})
            
            # Combine both
            g02_cleaned = pd.concat([left, right], ignore_index=True)
            # add sheet identifier
            g02_cleaned["Sheet"] = sheet
            #print(g02_cleaned)

            # save the csv if it doesnt already exisit
            if not os.path.exists(f"../data/landing/population_by_postcode/{AREA_CODE}_median_stats.csv"):
                g02_cleaned.to_csv(f"../data/landing/population_by_postcode/{AREA_CODE}_median_stats.csv", index=False)
        elif sheet == "G04":
            # Find where "Age (years):" appears → start of table
            start_row = df.index[df.iloc[:,0].astype(str).str.contains("Age", na=False)].tolist()[0] + 1
            
            # Slice everything below that row
            table = df.iloc[start_row:, :]
            
            # Define the 3 blocks of columns (start_col, end_col)
            blocks = [(0, 3), (5, 8), (10, 13)]
            
            persons_dfs = []
            for start, end in blocks:
                temp = table.iloc[:, start:end+1].copy()
                temp.columns = ["Age group", "Males", "Females", "Persons"]
            
                # Drop rows where both Age group and Persons are empty
                temp = temp.dropna(subset=["Age group", "Persons"], how="any")
            
                # Keep only the relevant columns
                persons_dfs.append(temp[["Age group", "Persons"]])
            
            # Combine all blocks vertically
            persons_only = pd.concat(persons_dfs, ignore_index=True)

            # Reset index
            g04_cleaned = persons_only.reset_index(drop=True)

            # save the csv if it does not already exisit
            #if not os.path.exists(f"../data/landing/population_by_postcode/{AREA_CODE}_population_breakdown.csv"):
               # g04_cleaned.to_csv(f"../data/landing/population_by_postcode/{AREA_CODE}_population_breakdown.csv", 
                          #         index=False)
            g04_cleaned.to_csv(f"../data/landing/population_by_postcode/{AREA_CODE}_population_breakdown.csv", 
                               index=False)

In [209]:
for postcode in used_postcodes:
    AREA_CODE = f"POA{postcode}"
    file_path = f"{population_by_postcode_directory}/{AREA_CODE}_population.xlsx"
    excel_to_csv(file_path, selected_sheets, AREA_CODE)