In [14]:
import polars as pl
import os
from dotenv import load_dotenv
from pathlib import Path

# Load variables from a local .env file into the process environment
# (GOOGLE_APPLICATION_CREDENTIALS is read from the environment below).
load_dotenv()

# Fail fast, before any CSV is read, if the service-account key path is missing:
# otherwise None would be handed to write_parquet only after all the loading work.
credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
if not credentials_path:
    raise RuntimeError(
        "GOOGLE_APPLICATION_CREDENTIALS is not set; cannot write to Google Cloud Storage"
    )

# State/territory codes; each one names a sub-directory holding that state's CSV files.
STATES = ["ACT", "NSW", "NT", "QLD", "SA", "TAS", "VIC", "WA"]

# Collect every non-empty frame and concatenate once at the end, rather than
# re-concatenating the growing frame for every file inside the loop.
frames = []

for state in STATES:
    dir_path = Path(f'raw_data/election-preferences/distribution-by-polling-place/{state}')
    # glob() yields entries in arbitrary filesystem order; sort so that the row
    # order of the combined frame is reproducible between runs.
    for csv_file in sorted(dir_path.glob('*.csv')):
        # skip_rows=1 starts parsing (header included) after the first line of the file;
        # presumably that line is a title/metadata row in the export -- confirm.
        data = pl.read_csv(csv_file, skip_rows=1)
        if not data.is_empty():
            frames.append(data)

# Refuse to overwrite the remote parquet file with an empty frame when nothing was loaded
# (e.g. the notebook was started from the wrong working directory).
if not frames:
    raise FileNotFoundError(
        "No non-empty CSV files found under raw_data/election-preferences/distribution-by-polling-place"
    )

# Vertical concatenation: every file is expected to share the same columns and dtypes.
combined_data = pl.concat(frames)

print("Rows: ", combined_data.shape[0], " Columns: ", combined_data.shape[1])

target_path = "gs://australia-analytics/election/distribution-by-poll-place.parquet"

combined_data.write_parquet(
    target_path,
    storage_options={"service_account_key_file": credentials_path},
)

print("Finished persisting data onto Google Cloud Storage")


Finished persisting data onto Google Cloud Storage


In [15]:
# Preview the first five rows of the combined frame as a quick sanity check.
combined_data.head(n=5)

StateAb,DivisionId,DivisionNm,PPId,PPNm,CountNum,BallotPosition,CandidateId,Surname,GivenNm,PartyAb,PartyNm,SittingMemberFl,CalculationType,CalculationValue
str,i64,str,i64,str,i64,i64,i64,str,str,str,str,str,str,f64
"""ACT""",101,"""Canberra""",0,"""ABSENT""",0,1,41390,"""MUDFORD""","""Isabel Meredith""","""GRN""","""Australian Greens""","""N""","""Preference Count""",322.0
"""ACT""",101,"""Canberra""",0,"""ABSENT""",0,1,41390,"""MUDFORD""","""Isabel Meredith""","""GRN""","""Australian Greens""","""N""","""Preference Percent""",22.24
"""ACT""",101,"""Canberra""",0,"""ABSENT""",0,1,41390,"""MUDFORD""","""Isabel Meredith""","""GRN""","""Australian Greens""","""N""","""Transfer Count""",0.0
"""ACT""",101,"""Canberra""",0,"""ABSENT""",0,1,41390,"""MUDFORD""","""Isabel Meredith""","""GRN""","""Australian Greens""","""N""","""Transfer Percent""",0.0
"""ACT""",101,"""Canberra""",0,"""ABSENT""",0,2,41742,"""LIDDICOAT""","""Mary-Jane Robyn""","""IMO""","""Health Environment Accountabil…","""N""","""Preference Count""",31.0


In [None]:
# Follow-up step, run manually from a shell (not executed by this notebook):
# dbt run-operation stage_external_sources --args '{"select": "au_raw_data.election_distributions_by_poll_place"}'