In [None]:
import requests
import time
import random
import csv
import logging
from pathlib import Path

In [None]:
# --- Configuration ---
# Endpoint used for both the street-name and the house-number lookups.
BASE_URL = "https://services.geohub.sa.gov.au/sappa/Filter/ParcelAddress"
# Every scraped address shares the same state.
STATE = "SA"
# All CSV output lands in this directory.
OUTPUT_DIR = Path("scraped_addresses")
FULL_RESULTS_FILE = OUTPUT_DIR.joinpath("sa_addresses_full.csv")
PARTIAL_RESULTS_FILE = OUTPUT_DIR.joinpath("sa_addresses_partial.csv")

In [None]:
# Suburbs to scrape; add further names to this list to widen the run.
SUBURBS = [
    "Aberfoyle Park",
]

In [None]:
# Root logger: INFO and above, each record stamped with time and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Create the output directory (and any missing parents); a no-op if present.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
def make_request_with_retry(url, params=None, max_retries=5, backoff_factor=2):
    """
    Makes an HTTP GET request with retries and a random delay.

    Each attempt is preceded by a random pause of 1-1.5 seconds. A failed
    attempt (timeout, connection error, HTTP error status, other request
    error, or a body that is not valid JSON) is logged and, unless it was the
    last attempt, followed by an exponential backoff of
    ``backoff_factor ** attempt`` seconds plus 0.5-1.5 seconds of jitter.

    Args:
        url: Address to request.
        params: Optional query parameters handed to ``requests.get``.
        max_retries: Total number of attempts before giving up.
        backoff_factor: Base of the exponential backoff between attempts.

    Returns:
        The decoded JSON body of the first successful response, or ``None``
        when every attempt failed (or ``max_retries`` is not positive).
    """
    for attempt in range(max_retries):
        delay = random.uniform(1, 1.5)  # Random delay between 1 and 1.5 seconds
        logging.info(f"Waiting for {delay:.2f} seconds before requesting: {url} with params {params}")
        time.sleep(delay)

        try:
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
        except requests.exceptions.Timeout:
            logging.warning(f"Timeout occurred for {url}. Attempt {attempt + 1}/{max_retries}. Retrying...")
        except requests.exceptions.ConnectionError:
            logging.warning(f"Connection error occurred for {url}. Attempt {attempt + 1}/{max_retries}. Retrying...")
        except requests.exceptions.HTTPError as e:
            logging.error(f"HTTP error occurred for {url}: {e}. Attempt {attempt + 1}/{max_retries}. Retrying...")
        except requests.exceptions.RequestException as e:
            logging.error(f"An unexpected request error occurred for {url}: {e}. Attempt {attempt + 1}/{max_retries}. Retrying...")
        else:
            # Decode in a separate handler: JSON decode failures and some
            # request errors share base classes (ValueError /
            # RequestException), so one combined except chain would log
            # them under the wrong message.
            try:
                return response.json()
            except ValueError as e:
                logging.error(f"Failed to parse JSON response from {url}: {e}. Attempt {attempt + 1}/{max_retries}. Retrying...")

        # Exponential backoff with randomness for retries; skipped after the
        # final attempt because nothing is left to wait for.
        if attempt + 1 < max_retries:
            sleep_time = backoff_factor ** attempt + random.uniform(0.5, 1.5)
            logging.info(f"Retrying in {sleep_time:.2f} seconds...")
            time.sleep(sleep_time)

    logging.error(f"Failed to fetch data from {url} after {max_retries} attempts.")
    return None


In [None]:
def save_to_csv(data, filename, mode='w', write_header=True):
    """
    Writes address dictionaries to a CSV file.

    The columns are fixed: street number, street name, suburb, state and
    full address. A header row is emitted only when ``write_header`` is true
    and the file is opened with mode ``'w'``. When ``data`` is empty nothing
    is written and a warning is logged. I/O failures are logged rather than
    raised.
    """
    if data:
        columns = ["street number", "street name", "suburb", "state", "full address"]
        wants_header = mode == 'w' and write_header
        try:
            with open(filename, mode, newline='', encoding='utf-8') as handle:
                out = csv.DictWriter(handle, fieldnames=columns)
                if wants_header:
                    out.writeheader()
                out.writerows(data)
            logging.info(f"Successfully saved data to {filename}")
        except OSError as exc:  # IOError is an alias of OSError
            logging.error(f"Error saving data to {filename}: {exc}")
    else:
        logging.warning(f"No data to save to {filename}")


In [None]:
def main():
    """
    Scrapes every address of each suburb in SUBURBS and writes them to CSV.

    For each suburb the street names are fetched first, then the house
    numbers of every street. Each (number, street) pair becomes one row.
    The accumulated rows are rewritten to PARTIAL_RESULTS_FILE after every
    street, and written to FULL_RESULTS_FILE once all suburbs are done.
    Suburbs or streets whose lookup fails are logged and skipped.
    """
    all_addresses = []

    for suburb in SUBURBS:
        logging.info(f"--- Processing suburb: {suburb} ---")

        # Part 2: Determine the Street Names within a given suburb
        # Query values go through `params` so they are URL-encoded; names
        # containing characters such as '&' or '#' would otherwise corrupt
        # a hand-built query string.
        street_data = make_request_with_retry(
            BASE_URL,
            params={"field": "streetName", "suburb": suburb},
        )

        if not street_data or 'items' not in street_data:
            logging.warning(f"Could not retrieve street names for {suburb}. Skipping.")
            continue

        # NOTE(review): assumes every item carries a 'name' key - confirm
        # against the service's response format.
        street_names = [item['name'] for item in street_data['items']]
        logging.info(f"Found {len(street_names)} streets in {suburb}.")

        for street_name in street_names:
            # Part 3: Get the valid street numbers for a given street
            number_data = make_request_with_retry(
                BASE_URL,
                params={"field": "houseNo", "suburb": suburb, "streetName": street_name},
            )

            if not number_data or 'items' not in number_data:
                logging.warning(f"Could not retrieve street numbers for {street_name}, {suburb}. Skipping.")
                continue

            street_numbers = [item['name'] for item in number_data['items']]
            logging.info(f"Found {len(street_numbers)} numbers for {street_name}, {suburb}.")

            all_addresses.extend(
                {
                    "street number": house_no,
                    "street name": street_name,
                    "suburb": suburb,
                    "state": STATE,
                    "full address": f"{house_no} {street_name}, {suburb}, {STATE}",
                }
                for house_no in street_numbers
            )

            # Save partial results after each street for resilience
            save_to_csv(all_addresses, PARTIAL_RESULTS_FILE, mode='w')  # Overwrite with current progress

    # Save full results at the end
    save_to_csv(all_addresses, FULL_RESULTS_FILE, mode='w')
    logging.info("Scraping complete. All addresses saved.")

In [None]:
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()