In [2]:
# Setup path and imports
import sys
from pathlib import Path

# Put the parent of the working directory at the front of sys.path (only if it
# is not already listed) so the `src.*` imports below can resolve against it.
project_root = Path.cwd().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

import pandas as pd
from src.api_client import ABSClient
from src.explorer import CensusExplorer
from src.data_extractor import DataExtractor

# Initialize clients
# A single ABSClient instance is created and passed to both helper objects
# through their `client` keyword argument.
client = ABSClient()
explorer = CensusExplorer(client=client)
extractor = DataExtractor(client=client)

print("Clients initialized successfully!")

Clients initialized successfully!


## 1. Explore Dataflow Structure

First, let's understand the structure of the C21_G01_SA2 dataflow - its dimensions and available codes.

In [3]:
# Get dataflow structure
# `details` is indexed by "dimensions" here and by "codelists" in the next cell;
# `dataflow_id` is reused when the data is fetched.
dataflow_id = "C21_G01_SA2"
details = explorer.get_dataflow_details(dataflow_id)

print("Dimensions:")
# Bare last expression so the notebook renders the dimensions table.
details["dimensions"]

Dimensions:


Unnamed: 0,id,position,codelist,concept
0,SEXP,0,urn:sdmx:org.sdmx.infomodel.codelist.Codelist=...,urn:sdmx:org.sdmx.infomodel.conceptscheme.Conc...
1,PCHAR,1,urn:sdmx:org.sdmx.infomodel.codelist.Codelist=...,urn:sdmx:org.sdmx.infomodel.conceptscheme.Conc...
2,REGION,2,urn:sdmx:org.sdmx.infomodel.codelist.Codelist=...,urn:sdmx:org.sdmx.infomodel.conceptscheme.Conc...
3,REGION_TYPE,3,urn:sdmx:org.sdmx.infomodel.codelist.Codelist=...,urn:sdmx:org.sdmx.infomodel.conceptscheme.Conc...
4,STATE,4,urn:sdmx:org.sdmx.infomodel.codelist.Codelist=...,urn:sdmx:org.sdmx.infomodel.conceptscheme.Conc...


In [4]:
# Show every codelist that has fewer than 50 codes; longer codelists are
# skipped so the output stays readable.
print("Available codelists:")
for codelist_id, codes_df in details["codelists"].items():
    if len(codes_df) >= 50:
        continue
    print(f"\n{codelist_id} ({len(codes_df)} codes):")
    display(codes_df)

Available codelists:

CL_C21_PCHAR01 (36 codes):


Unnamed: 0,code,name
0,P_1,Total persons
1,0_4,Age groups: 0-4 years
2,5_14,Age groups: 5-14 years
3,15_19,Age groups: 15-19 years
4,20_24,Age groups: 20-24 years
5,25_34,Age groups: 25-34 years
6,35_44,Age groups: 35-44 years
7,45_54,Age groups: 45-54 years
8,55_64,Age groups: 55-64 years
9,65_74,Age groups: 65-74 years



CL_C21_SEXP01 (3 codes):


Unnamed: 0,code,name
0,3,Persons
1,1,Males
2,2,Females



CL_REGION_TYPE (43 codes):


Unnamed: 0,code,name
0,AUS,Australia
1,STE,States and Territories
2,SA4,Statistical Area Level 4
3,SA3,Statistical Area Level 3
4,SA2,Statistical Area Level 2
5,SA1,Statistical Area Level 1
6,RA,Remoteness Area
7,SOS,Section of State
8,SOSR,Section of State Ranges
9,UC,Urban Centres



CL_STATE (10 codes):


Unnamed: 0,code,name
0,AUS,Australia
1,1,New South Wales
2,2,Victoria
3,3,Queensland
4,4,South Australia
5,5,Western Australia
6,6,Tasmania
7,7,Northern Territory
8,8,Australian Capital Territory
9,9,Other Territories


## 2. Download the Data

Fetch the full dataset from the ABS API.

In [13]:
# Request the whole dataflow (data_key="all") for a single period, asking for
# the labelled CSV response format.
census_year = "2021"
df = client.get_data(
    dataflow_id=dataflow_id,
    data_key="all",
    start_period=census_year,
    end_period=census_year,
    response_format="csv_labels",
)

# Quick sanity check of what came back: size, column labels, first rows.
print(f"Data shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
df.head(10)


Data shape: (321732, 8)

Columns: ['DATAFLOW', 'SEXP: Sex', 'PCHAR: Selected person characteristic', 'REGION: Region', 'REGION_TYPE: Region Type', 'STATE: State', 'TIME_PERIOD: Time Period', 'OBS_VALUE']


Unnamed: 0,DATAFLOW,SEXP: Sex,PCHAR: Selected person characteristic,REGION: Region,REGION_TYPE: Region Type,STATE: State,TIME_PERIOD: Time Period,OBS_VALUE
0,ABS:C21_G01_SA2(1.0.0),3: Persons,65_74: Age groups: 65-74 years,213051588: Truganina - South West,SA2: Statistical Area Level 2,2: Victoria,2021,601
1,ABS:C21_G01_SA2(1.0.0),1: Males,65_74: Age groups: 65-74 years,114: Southern Highlands and Shoalhaven,SA4: Statistical Area Level 4,1: New South Wales,2021,11921
2,ABS:C21_G01_SA2(1.0.0),2: Females,45_54: Age groups: 45-54 years,307: Darling Downs - Maranoa,SA4: Statistical Area Level 4,3: Queensland,2021,8361
3,ABS:C21_G01_SA2(1.0.0),2: Females,45_54: Age groups: 45-54 years,401: Adelaide - Central and Hills,SA4: Statistical Area Level 4,4: South Australia,2021,20641
4,ABS:C21_G01_SA2(1.0.0),3: Persons,35_44: Age groups: 35-44 years,506: Perth - South East,SA4: Statistical Area Level 4,5: Western Australia,2021,80385
5,ABS:C21_G01_SA2(1.0.0),3: Persons,25_34: Age groups: 25-34 years,509: Western Australia - Wheat Belt,SA4: Statistical Area Level 4,5: Western Australia,2021,13823
6,ABS:C21_G01_SA2(1.0.0),1: Males,55_64: Age groups: 55-64 years,10401: Clarence Valley,SA3: Statistical Area Level 3,1: New South Wales,2021,4112
7,ABS:C21_G01_SA2(1.0.0),3: Persons,45_54: Age groups: 45-54 years,21104: Whitehorse - East,SA3: Statistical Area Level 3,2: Victoria,2021,8593
8,ABS:C21_G01_SA2(1.0.0),3: Persons,35_44: Age groups: 35-44 years,21305: Wyndham,SA3: Statistical Area Level 3,2: Victoria,2021,57834
9,ABS:C21_G01_SA2(1.0.0),2: Females,20_24: Age groups: 20-24 years,40103: Burnside,SA3: Statistical Area Level 3,4: South Australia,2021,1234


## 3. Save to Parquet

Save the raw data to Parquet format for efficient storage and future use.

In [17]:
# Inspect the column labels of the frame that is about to be saved.
df.columns

Index(['DATAFLOW', 'SEXP: Sex', 'PCHAR: Selected person characteristic',
       'REGION: Region', 'REGION_TYPE: Region Type', 'STATE: State',
       'TIME_PERIOD: Time Period', 'OBS_VALUE'],
      dtype='object')

In [6]:
# Write the fetched frame to <project_root>/data/raw as a Parquet file,
# creating the directory first if it does not exist yet.
raw_dir = project_root / "data" / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)
output_path = raw_dir / "c21_g01_sa2_selected_person_characteristics.parquet"

# index=False: the frame's index is not stored in the file.
df.to_parquet(output_path, index=False)
print(f"Saved {len(df):,} rows to: {output_path}")

Saved 321,732 rows to: C:\Users\JohnMarquess\Projects\ABS_Data\data\raw\c21_g01_sa2_selected_person_characteristics.parquet


## 4. Explore the Data

Let's explore the unique values in key columns.

In [7]:
# Check column data types and basic info
# (per-column dtype and non-null count, plus overall memory usage)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321732 entries, 0 to 321731
Data columns (total 8 columns):
 #   Column                                 Non-Null Count   Dtype 
---  ------                                 --------------   ----- 
 0   DATAFLOW                               321732 non-null  object
 1   SEXP: Sex                              321732 non-null  object
 2   PCHAR: Selected person characteristic  321732 non-null  object
 3   REGION: Region                         321732 non-null  object
 4   REGION_TYPE: Region Type               321732 non-null  object
 5   STATE: State                           321732 non-null  object
 6   TIME_PERIOD: Time Period               321732 non-null  int64 
 7   OBS_VALUE                              321732 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 19.6+ MB


In [8]:
# Report how many distinct values each column holds; the values themselves are
# printed only for columns with at most `max_listed` distinct entries.
max_listed = 20
for col in df.columns:
    column_values = df[col]
    n_unique = column_values.nunique()
    print(f"\n{col}: {n_unique} unique values")
    if n_unique > max_listed:
        continue
    print(column_values.unique())


DATAFLOW: 1 unique values
['ABS:C21_G01_SA2(1.0.0)']

SEXP: Sex: 3 unique values
['3: Persons' '1: Males' '2: Females']

PCHAR: Selected person characteristic: 36 unique values

REGION: Region: 2981 unique values

REGION_TYPE: Region Type: 6 unique values
['SA2: Statistical Area Level 2' 'SA4: Statistical Area Level 4'
 'SA3: Statistical Area Level 3'
 'GCCSA: Greater Capital City Statistical Areas' 'AUS: Australia'
 'STE: States and Territories']

STATE: State: 10 unique values
['2: Victoria' '1: New South Wales' '3: Queensland' '4: South Australia'
 '5: Western Australia' '6: Tasmania' '7: Northern Territory'
 '8: Australian Capital Territory' '9: Other Territories' 'AUS: Australia']

TIME_PERIOD: Time Period: 1 unique values
[2021]

OBS_VALUE: 26715 unique values


In [9]:
# Summary statistics for the observation values
# NOTE(review): the frame mixes region levels (SA2 up to AUS, see the
# REGION_TYPE column), so these statistics span very different aggregation
# levels — confirm that is intended before interpreting mean/max.
df['OBS_VALUE'].describe()

count    3.217320e+05
mean     7.395901e+03
std      1.429747e+05
min      0.000000e+00
25%      1.310000e+02
50%      4.950000e+02
75%      2.145000e+03
max      2.542279e+07
Name: OBS_VALUE, dtype: float64

## 5. Data Analysis Examples

The cells below show a few starting points for analysing the data, beginning with the categories available in the key dimension columns.

In [10]:
# List the distinct values of the first column whose label mentions
# REGION_TYPE; nothing is printed when no such column exists.
region_type_cols = [
    label for label in df.columns
    if 'REGION_TYPE' in label.upper()
]
if region_type_cols:
    print(f"Region types available: {df[region_type_cols[0]].unique()}")

Region types available: ['SA2: Statistical Area Level 2' 'SA4: Statistical Area Level 4'
 'SA3: Statistical Area Level 3'
 'GCCSA: Greater Capital City Statistical Areas' 'AUS: Australia'
 'STE: States and Territories']


In [11]:
# List the distinct values of the first column whose label mentions SEXP or
# MEASURE; nothing is printed when no such column exists.
measure_cols = [
    label for label in df.columns
    if any(token in label.upper() for token in ('SEXP', 'MEASURE'))
]
if measure_cols:
    print(f"Available measures in {measure_cols[0]}:")
    for m in df[measure_cols[0]].unique():
        print(f"  - {m}")

Available measures in SEXP: Sex:
  - 3: Persons
  - 1: Males
  - 2: Females
