# Download 2021 census demographic data by suburb

In [146]:
import requests
import os
import re
import pandas as pd
import glob

from urllib.request import urlretrieve
from urllib.error import HTTPError, URLError

In [42]:
# Ensure the folders are set up - from data download notebook
def create_data_folder(output_dir):
    """
    Create folders for each stage of the ETL pipeline

    Creates ``output_dir`` (including missing parents) and, inside it, one
    sub-folder per stage: landing, raw, curated and analysis. Folders that
    already exist are left untouched, so the call is safe to repeat.

    :param output_dir: The base directory where the folders will be created
    """
    # exist_ok=True replaces the separate "exists?" check: it creates the
    # base directory when needed and cannot fail if the folder appears
    # between the check and the creation.
    for stage in ['landing', 'raw', 'curated', 'analysis']:
        os.makedirs(os.path.join(output_dir, stage), exist_ok=True)


In [44]:
# Create the ETL stage folders (landing, raw, curated, analysis) under ../data
create_data_folder('../data')

# SAL codes
The download loop below covers SAL codes 20001 to 22944 (inclusive).

In [50]:
# Landing folder for the per-suburb census files; exist_ok=True makes the
# call a no-op when the folder is already there
population_by_suburb_directory = '../data/landing/population_by_suburb'
os.makedirs(population_by_suburb_directory, exist_ok=True)


In [52]:
# Download the community-profile workbook for every SAL code in the range.
# no_data collects every code for which no workbook was saved in this run
# (no data published for the suburb, or the download failed); the conversion
# step further down uses it to know which codes to skip.
no_data = []
for i in range(20001, 22945):
    SAL_CODE = f"SAL{i}"
    URL_TEMPLATE = f"https://abs.gov.au/census/find-census-data/community-profiles/2021/{SAL_CODE}/download/GCP_{SAL_CODE}.xlsx"
    # generate output file path
    output_file_path = f"{population_by_suburb_directory}/{SAL_CODE}_population.xlsx"

    # never re-download a workbook that is already in the landing folder
    if os.path.exists(output_file_path):
        print(f"File already exists at {output_file_path}")
        continue

    # Download under a temporary name and rename only on success, so an
    # interrupted transfer can never be mistaken for a complete workbook
    # by the "already exists" check on a later run.
    partial_file_path = f"{output_file_path}.part"
    try:
        urlretrieve(URL_TEMPLATE, partial_file_path)
        os.replace(partial_file_path, output_file_path)
        #print(f"✅ File downloaded and saved to {output_file_path}")
    except HTTPError as e:
        if e.code == 404:
            # some Suburbs have no data available - this is expected
            print(f"❌ No data available for SAL {i}: {e}")
        else:
            # e.g. 504 gateway time-outs: re-running the cell retries these
            print(f"❌ HTTP error for SAL {i}: {e}")
        no_data.append(i)
    except Exception as e:
        # keep the long-running loop alive whatever goes wrong for one code
        print(f"❌ Unexpected error for SAL {i}: {e}")
        no_data.append(i)
    finally:
        # remove whatever a failed transfer left behind
        if os.path.exists(partial_file_path):
            os.remove(partial_file_path)

❌ Unexpected error for SAL 20001: HTTP Error 504: Gateway Time-out
File already exists at ../data/landing/population_by_suburb/SAL20002_population.xlsx
File already exists at ../data/landing/population_by_suburb/SAL20003_population.xlsx
File already exists at ../data/landing/population_by_suburb/SAL20004_population.xlsx
File already exists at ../data/landing/population_by_suburb/SAL20005_population.xlsx
❌ Unexpected error for SAL 20006: HTTP Error 404: Not Found
File already exists at ../data/landing/population_by_suburb/SAL20007_population.xlsx
File already exists at ../data/landing/population_by_suburb/SAL20008_population.xlsx
File already exists at ../data/landing/population_by_suburb/SAL20009_population.xlsx
File already exists at ../data/landing/population_by_suburb/SAL20010_population.xlsx
File already exists at ../data/landing/population_by_suburb/SAL20011_population.xlsx
❌ Unexpected error for SAL 20012: HTTP Error 404: Not Found
File already exists at ../data/landing/populatio

# Create CSV extracts for the sheets of interest

In [54]:
# Workbook sheets that get converted to CSV, one sheet code per line
selected_sheets = [
    "G02",  # written as <SAL>_median_stats.csv
    "G04",  # written as <SAL>_population_breakdown.csv
    "G17",  # written as <SAL>_personal_income.csv
    "G33",  # written as <SAL>_household_income.csv
    "G36",  # written as <SAL>_dwelling_structure.csv
    "G49",  # written as <SAL>_education_level.csv
    "G60",  # written as <SAL>_job_type.csv
]


In [56]:
def extract_suburb_name(excel_data):
    """
    Extract the suburb name from a sheet of the localities excel spreadsheet

    :param excel_data: DataFrame of a sheet read with header=None; the suburb
        label is expected in row 2, first column, in the form
        "<suburb name> (SALxxxxx)"
    :return: the text before "(SALxxxxx)" with surrounding whitespace removed,
        or None when the cell does not follow that pattern
    """
    # suburb info is always in row 2, first column
    # (read from the argument, not from whatever global `df` happens to exist)
    cell_value = str(excel_data.iloc[1, 0])

    # regex: capture everything before (SALxxxxx)
    match = re.search(r"(.+?)\s+\(SAL\d+\)", cell_value)
    if match is None:
        return None

    return match.group(1).strip()

In [94]:
# Folder that receives the per-suburb CSV extracts unless the caller overrides it.
_DEFAULT_CSV_DIR = "../data/landing/population_by_suburb"


def _first_row_containing(df, column, text):
    """Return the index of the first row whose cell in `column` contains `text`.

    Raises IndexError when no row matches.
    """
    matches = df.iloc[:, column].astype(str).str.contains(text, na=False)
    return df.index[matches].tolist()[0]


def _persons_table(df, columns):
    """Slice the table under the "PERSONS" marker (column 1) and name its columns.

    The slice runs from two rows below the "PERSONS" row up to and including
    the row two above the "This table" footnote in column 0.
    """
    start_row = _first_row_containing(df, 1, "PERSONS") + 1
    end_row = _first_row_containing(df, 0, "This table") - 1
    # .copy() so later column assignments act on an independent frame
    table = df.iloc[start_row + 1:end_row, :].copy()
    table.columns = columns
    return table


def _clean_g02(df):
    """Stack the two side-by-side (name, value) tables of sheet G02."""
    # Left side table (col 0 = name, col 1 = value)
    left = df[[0, 1]].dropna().rename(columns={0: "Statistic", 1: "Value"})
    # Right side table (col 3 = name, col 4 = value)
    right = df[[3, 4]].dropna().rename(columns={3: "Statistic", 4: "Value"})
    return pd.concat([left, right], ignore_index=True)


def _clean_g04(df):
    """Return the "Persons" count per age group from the three column blocks of G04."""
    # The table starts on the row after the first cell containing "Age"
    start_row = _first_row_containing(df, 0, "Age") + 1
    table = df.iloc[start_row:, :]

    # The 3 blocks of columns (start_col, end_col), each holding
    # age group / males / females / persons
    blocks = [(0, 3), (5, 8), (10, 13)]

    persons_dfs = []
    for start, end in blocks:
        temp = table.iloc[:, start:end + 1].copy()
        temp.columns = ["Age group", "Males", "Females", "Persons"]
        # Drop rows where either the age group or the persons count is missing
        temp = temp.dropna(subset=["Age group", "Persons"], how="any")
        persons_dfs.append(temp[["Age group", "Persons"]])

    # Combine all blocks vertically with a fresh 0..n-1 index
    return pd.concat(persons_dfs, ignore_index=True)


def _clean_g17(df):
    """Return the personal-income-by-age table of sheet G17."""
    table = _persons_table(
        df,
        ["Price Range", "15-19", "20-24", "25-34", "35-44", "45-54", "55-64",
         "65-74", "75-84", "85+", "Total"],
    )
    # drop empty rows
    return table.dropna(subset=["Price Range"])


def _clean_g33(df):
    """Return the household-income table of sheet G33."""
    start_row = _first_row_containing(df, 0, "Negative") + 1
    end_row = _first_row_containing(df, 0, "Total") + 1
    # NOTE(review): the slice starts two rows below the first "Negative"
    # match, so that row and the one after it are excluded - confirm against
    # the sheet layout that this is intended.
    table = df.iloc[start_row + 1:end_row, :].copy()
    table.columns = ["Income", "Family Households", "Non-family Households", "Total"]
    # drop empty rows
    return table.dropna(subset=["Income"])


def _clean_g36(df):
    """Return the dwelling-structure table of sheet G36 with section-prefixed labels."""
    # Find the start and end of the table
    start_idx = _first_row_containing(df, 0, "Occupied private dwellings")
    end_idx = _first_row_containing(df, 0, "Total private dwellings")

    # first 3 columns only (description, dwellings, persons)
    table_df = df.iloc[start_idx:end_idx + 1, :3].copy()
    table_df.columns = ["Dwelling Type", "Dwellings", "Persons"]

    # Remove empty rows and the "Occupied private dwellings:" header row
    table_df = table_df.dropna(subset=["Dwelling Type"])
    is_header = table_df["Dwelling Type"] == "Occupied private dwellings:"
    table_df = table_df[~is_header].reset_index(drop=True)

    # Rows that are totals for the whole sheet and must not get a section prefix
    global_totals = [
        "Total occupied private dwellings",
        "Unoccupied private dwellings",
        "Total private dwellings",
        "Dwelling structure not stated"
    ]

    current_section = None
    new_labels = []
    for val in table_df["Dwelling Type"]:
        if pd.isna(val):
            new_labels.append(val)
        elif isinstance(val, str) and val.endswith(":"):
            # Section header: remember it, and mark the row for removal
            current_section = val.replace(":", "")
            new_labels.append(None)
        elif val == "Total":
            # Totals inside a section
            new_labels.append(f"{current_section} - Total")
        elif val in global_totals:
            # A global total ends the current section
            current_section = None
            new_labels.append(val)
        elif current_section:
            # Normal row inside a section
            new_labels.append(f"{current_section} - {val}")
        else:
            new_labels.append(val)

    table_df["Dwelling Type"] = new_labels
    # the None labels written for section headers are dropped here
    return table_df.dropna(subset=["Dwelling Type"]).reset_index(drop=True)


def _clean_g49(df):
    """Return the education-level-by-age table of sheet G49."""
    table = _persons_table(
        df,
        ["Highest Education Level", "15-24", "25-34", "35-44", "45-54", "55-64",
         "65-74", "75-84", "85+", "Total"],
    )
    # drop empty rows
    table = table.dropna(subset=["Highest Education Level"]).reset_index(drop=True)
    # drop certificate header and total rows
    # NOTE(review): positions 4 and 8 are hard-coded - verify they hold for
    # every workbook.
    return table.drop([4, 8])


def _clean_g60(df):
    """Return the occupation-by-age table of sheet G60."""
    # NOTE(review): "Proffesionals" is misspelt; kept as-is because the
    # spelling is part of the CSV header that other code may rely on.
    table = _persons_table(
        df,
        ["Age", "Managers", "Proffesionals", "Trades workers", "Community workers",
         "Administrative Workers", "Sales Workers", "Drivers", "Labourers",
         "Not Stated", "Total"],
    )
    # drop empty rows
    return table.dropna(subset=["Age"]).reset_index(drop=True)


# sheet code -> (suffix of the CSV file name, function that cleans the sheet)
_SHEET_EXPORTS = {
    "G02": ("median_stats", _clean_g02),
    "G04": ("population_breakdown", _clean_g04),
    "G17": ("personal_income", _clean_g17),
    "G33": ("household_income", _clean_g33),
    "G36": ("dwelling_structure", _clean_g36),
    "G49": ("education_level", _clean_g49),
    "G60": ("job_type", _clean_g60),
}


def excel_to_csv(file_path, selected_sheets, SAL_CODE, suburb, output_dir=_DEFAULT_CSV_DIR):
    """
    Create flat csvs for the data stored in a multisheet excel document

    For each supported sheet in ``selected_sheets`` the sheet is cleaned, a
    "Suburb" column is added and the result is written to
    ``<output_dir>/<SAL_CODE>_<suffix>.csv``. Existing CSVs are never
    overwritten, and a sheet whose CSV already exists is not read at all.
    Sheet codes without a cleaner are ignored.

    :param file_path: path of the excel workbook to read
    :param selected_sheets: sheet codes to export (G02, G04, G17, G33, G36,
        G49 and G60 are supported)
    :param SAL_CODE: prefix used in the CSV file names
    :param suburb: value written to the "Suburb" column of every CSV
    :param output_dir: folder that receives the CSVs
    :raises IndexError: when an expected marker row is not found in a sheet
    """
    # Work out which CSVs are missing before touching the workbook
    pending = []
    for sheet in selected_sheets:
        if sheet not in _SHEET_EXPORTS:
            continue
        suffix, cleaner = _SHEET_EXPORTS[sheet]
        csv_path = f"{output_dir}/{SAL_CODE}_{suffix}.csv"
        if not os.path.exists(csv_path):
            pending.append((sheet, cleaner, csv_path))

    if not pending:
        return

    # Open the workbook once and parse each pending sheet from it
    with pd.ExcelFile(file_path) as workbook:
        for sheet, cleaner, csv_path in pending:
            df = pd.read_excel(workbook, sheet_name=sheet, header=None)
            cleaned = cleaner(df)
            # add suburb name
            cleaned["Suburb"] = suburb
            cleaned.to_csv(csv_path, index=False)

In [None]:
# Convert every downloaded workbook into its per-sheet CSVs.
# A set makes the "was this code recorded as missing?" test O(1) per code.
codes_without_data = set(no_data)
for i in range(20001, 22945):
    SAL_CODE = f"SAL{i}"
    file_path = f"{population_by_suburb_directory}/{SAL_CODE}_population.xlsx"

    # Skip codes recorded in no_data, and any code whose workbook is simply
    # not on disk (e.g. the download cell was only partially run).
    if i in codes_without_data or not os.path.exists(file_path):
        continue

    # retrieve the suburb name
    df = pd.read_excel(file_path, sheet_name="G02", header=None)
    suburb = extract_suburb_name(df)

    # write the associated csvs
    excel_to_csv(file_path, selected_sheets, SAL_CODE, suburb)



# Merge the per-suburb CSVs into 7 large combined CSVs

In [143]:
# Merge the per-suburb CSVs of each dataset into one combined CSV in the
# landing folder. A combined file that already exists is left untouched, and
# in that case the per-suburb files are not even read.
merged_datasets = [
    "median_stats",          # median stats
    "population_breakdown",  # population breakdown
    "personal_income",       # income - personal
    "household_income",      # household income
    "dwelling_structure",    # dwelling distribution and structure
    "job_type",              # job type
    "education_level",       # education level
]

for dataset_name in merged_datasets:
    merged_path = f"../data/landing/{dataset_name}.csv"
    if os.path.exists(merged_path):
        continue

    # sorted so the row order of the combined file is reproducible
    all_files = sorted(glob.glob(f"../data/landing/population_by_suburb/*_{dataset_name}.csv"))
    if not all_files:
        # pd.concat raises on an empty sequence; report and move on instead
        print(f"No per-suburb files found for {dataset_name}, nothing to merge")
        continue

    df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
    df.to_csv(merged_path, index=False)
