## Downloading the Data

Begin by downloading the provided historical data.

In [None]:
# import libraries
import os
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession, functions as F


In [2]:
def create_data_folder(output_dir, stages=('landing', 'raw', 'curated', 'analysis')):
    """
    Create folders for each stage of the ETL pipeline
    :param output_dir: The base directory where the folders will be created
    :param stages: Names of the sub-folders to create inside ``output_dir``;
        defaults to the four pipeline stages (landing, raw, curated, analysis)
    """
    # exist_ok=True turns the call into a no-op for folders that are already
    # there, so no separate exists() check (and no gap between the check and
    # the creation) is needed
    os.makedirs(output_dir, exist_ok=True)

    # create folders for each stage of the ETL pipeline
    for stage in stages:
        os.makedirs(os.path.join(output_dir, stage), exist_ok=True)


In [22]:
def download_file(url, output_path, file_type):
    """
    Download a file from a URL to a specified output path
    :param url: The URL of the file to download
    :param output_path: The local path where the file will be saved, WITHOUT
        the extension (``.{file_type}`` is appended)
    :param file_type: The file extension/type (e.g., 'csv', 'json', 'xlsx')
    :raises OSError: if the download or the local write fails (urllib's
        URLError/HTTPError are OSError subclasses); no output file is left behind
    """
    # generate output file path
    output_file_path = f"{output_path}.{file_type}"

    # nothing to do when a previous run already fetched the file
    if os.path.exists(output_file_path):
        print(f"File already exists at {output_file_path}")
        return

    # dirname is '' for a bare file name, and os.makedirs('') raises
    parent_dir = os.path.dirname(output_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Download to a side file and move it into place only once complete, so an
    # interrupted transfer is never mistaken for a finished download next run.
    partial_path = f"{output_file_path}.part"
    try:
        urlretrieve(url, partial_path)
        os.replace(partial_path, output_file_path)
    finally:
        # only still present when the download or the move failed
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"File downloaded and saved to {output_file_path}")

In [3]:
# create ../data/ together with its landing, raw, curated and analysis stage folders
create_data_folder('../data/')

In [6]:
# import os
import os, zipfile

**Download moving annual rent by suburb from DFFH (March quarter 2023)**

In [24]:
# output path stem, not a directory: download_file appends '.<file_type>' to it
directory = '../data/landing/rent/rent_by_suburb'

# moving annual rents by suburb, March quarter 2023 (Excel) on dffh.vic.gov.au;
# a fixed URL despite the variable name, nothing is substituted into it
URL_TEMPLATE = "https://www.dffh.vic.gov.au/moving-annual-rents-suburb-march-quarter-2023-excel"

In [25]:
# writes f"{directory}.xlsx"; download_file skips the request when that file already exists
download_file(URL_TEMPLATE, directory, 'xlsx')

File downloaded and saved to ../data/landing/rent/rent_by_suburb.xlsx


**Download Public Transport Lines and Stops from VIC Gov open data (public transport)**

In [4]:
# output path stem for the stops file (download_file appends the extension)
directory = '../data/landing/ptv/public_transport_stops'

# public transport stops GeoJSON on opendata.transport.vic.gov.au
URL_TEMPLATE = "https://opendata.transport.vic.gov.au/dataset/6d36dfd9-8693-4552-8a03-05eb29a391fd/resource/afa7b823-0c8b-47a1-bc40-ada565f684c7/download/public_transport_stops.geojson"

In [7]:
# fetch the stops GeoJSON to f"{directory}.geojson" (no-op if it is already on disk)
download_file(URL_TEMPLATE, directory, 'geojson')

File downloaded and saved to ../data/landing/ptv/public_transport_stops.geojson


In [8]:
# output path stem for the lines file (download_file appends the extension)
directory = '../data/landing/ptv/public_transport_lines'

# public transport lines GeoJSON on opendata.transport.vic.gov.au
URL_TEMPLATE = "https://opendata.transport.vic.gov.au/dataset/6d36dfd9-8693-4552-8a03-05eb29a391fd/resource/52e5173e-b5d5-4b65-9b98-89f225fc529c/download/public_transport_lines.geojson"

In [9]:
# fetch the lines GeoJSON to f"{directory}.geojson" (no-op if it is already on disk)
download_file(URL_TEMPLATE, directory, 'geojson')

File downloaded and saved to ../data/landing/ptv/public_transport_lines.geojson


**Download School Locations Data**

We download the school locations datasets for 2023 to 2025. \
Each year is a separate CSV file.

In [23]:
# school location CSVs on education.vic.gov.au, one URL per year (2023-2025);
# note the file naming differs from year to year, so the URLs cannot be templated
schools_23 = "https://www.education.vic.gov.au/Documents/about/research/datavic/dv346-schoollocations2023.csv"
schools_24 = "https://www.education.vic.gov.au/Documents/about/research/datavic/dv378_DataVic-SchoolLocations-2024.csv"
schools_25 = "https://www.education.vic.gov.au/Documents/about/research/datavic/dv402-SchoolLocations2025.csv"

In [24]:
# One download per year instead of three copy-pasted call pairs.
# Each file lands at ../data/landing/schools/school_locations_<year>.csv
school_urls_by_year = {2023: schools_23, 2024: schools_24, 2025: schools_25}

for school_year, school_url in school_urls_by_year.items():
    # path stem only; download_file appends '.csv'
    directory = f'../data/landing/schools/school_locations_{school_year}'
    download_file(school_url, directory, 'csv')

File downloaded and saved to ../data/landing/schools/school_locations_2023.csv
File downloaded and saved to ../data/landing/schools/school_locations_2024.csv
File downloaded and saved to ../data/landing/schools/school_locations_2025.csv


**Download Open Space Data**

In [12]:
# output path stem for the open space file (download_file appends the extension)
directory = '../data/landing/open_space/open_space'

# CSV export of the open space dataset on opendata.arcgis.com
URL_TEMPLATE = "https://opendata.arcgis.com/datasets/da1c06e3ab6948fcb56de4bb3c722449_0.csv"

In [13]:
# fetch the open space CSV to f"{directory}.csv" (no-op if it is already on disk)
download_file(URL_TEMPLATE, directory, 'csv')

File downloaded and saved to ../data/landing/open_space/open_space.csv


**Download Moving Annual Rent by Suburb from DFFH (Latest File)**


In [11]:
# Latest moving annual rent workbook (March quarter 2025).
# Per the notebook author, this file also carries the earlier quarters and years.
latest_url = "https://www.dffh.vic.gov.au/moving-annual-rent-suburb-march-quarter-2025-excel"
filename = "moving_annual_median_weekly_rent_by_suburb"

# announce what is about to be fetched, one item per line
print(
    f"Downloading latest moving annual rent file: {filename}",
    f"URL: {latest_url}",
    sep="\n",
)


Downloading latest moving annual rent file: moving_annual_median_weekly_rent_by_suburb
URL: https://www.dffh.vic.gov.au/moving-annual-rent-suburb-march-quarter-2025-excel


In [12]:
# Target path stem for the workbook; download_file adds the '.xlsx' suffix
directory = f'../data/landing/moving_annual_rent/{filename}'
try:
    download_file(latest_url, directory, 'xlsx')
except Exception as err:
    # best-effort: report the failure and let the rest of the notebook run
    print(f"Error downloading {filename}: {err}")
else:
    print("Successfully downloaded latest moving annual rent file!")


File downloaded and saved to ../data/landing/moving_annual_rent/moving_annual_median_weekly_rent_by_suburb.xlsx
Successfully downloaded latest moving annual rent file!


**Download Victorian Unemployment Rate Data**

Scrape monthly unemployment rate data from the Victorian labour market website and aggregate by quarter.


In [1]:
def _parse_unemployment_rows(table):
    """
    Pull (date, rate) records out of the scraped HTML table
    :param table: BeautifulSoup ``<table>`` element
    :return: ``(records, skipped)``: ``records`` is a list of
        ``{'date': str, 'unemployment_rate': float}`` dicts and ``skipped`` is
        the number of rows whose rate cell was not numeric
    """
    # NOTE(review): assumes column 1 is the date and column 2 the rate;
    # confirm against the live page
    records = []
    skipped = 0

    for row in table.find_all('tr')[1:]:  # Skip header row
        cells = row.find_all('td')
        if len(cells) < 2:
            continue

        date_str = cells[0].get_text(strip=True)
        rate_str = cells[1].get_text(strip=True)

        # Skip rows with missing data
        if not date_str or not rate_str:
            continue

        try:
            # tolerate a trailing percent sign such as '4.3%'
            rate = float(rate_str.rstrip('%'))
        except ValueError:
            # one placeholder/footnote cell must not discard the whole table
            skipped += 1
            continue

        records.append({'date': date_str, 'unemployment_rate': rate})

    return records, skipped


def _aggregate_unemployment_by_quarter(monthly):
    """
    Average the unemployment rate per calendar quarter
    :param monthly: DataFrame with a parseable 'date' column and a numeric
        'unemployment_rate' column
    :return: DataFrame with columns date, year, quarter, avg_unemployment_rate,
        sorted by date; 'date' is the first day of the quarter's last month
    """
    dates = pd.to_datetime(monthly['date'])
    keyed = monthly.assign(year=dates.dt.year, quarter=dates.dt.quarter)

    # Group by year and quarter and calculate average unemployment rate
    quarterly = (
        keyed.groupby(['year', 'quarter'])['unemployment_rate']
        .mean()
        .reset_index()
    )

    # Assemble the quarter date from numeric parts (quarter * 3 is the last
    # month of the quarter) rather than formatting and re-parsing a string
    quarterly['quarter_date'] = pd.to_datetime(pd.DataFrame({
        'year': quarterly['year'],
        'month': quarterly['quarter'] * 3,
        'day': 1,
    }))

    quarterly = quarterly.sort_values('quarter_date')

    # Select relevant columns for final output
    return quarterly[['quarter_date', 'year', 'quarter', 'unemployment_rate']].rename(
        columns={
            'quarter_date': 'date',
            'unemployment_rate': 'avg_unemployment_rate',
        }
    )


def scrape_unemployment_data(timeout=30):
    """
    Scrape unemployment data from the Victorian labour market website
    and aggregate by quarter taking the average unemployment rate
    :param timeout: Seconds to wait for the web server before giving up
    :return: DataFrame with columns date, year, quarter, avg_unemployment_rate,
        or None when the page could not be fetched or parsed (the error is printed)
    """
    url = "https://djsir-data.github.io/djprecodash/tables/djsir_labour_market"

    try:
        # Without a timeout requests waits indefinitely on an unresponsive server
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the table containing unemployment data (first table on the page)
        table = soup.find('table')
        if table is None:
            raise ValueError("No table found on the webpage")

        records, skipped = _parse_unemployment_rows(table)
        if skipped:
            print(f"Skipped {skipped} row(s) with a non-numeric unemployment rate")

        df = pd.DataFrame(records)
        if df.empty:
            raise ValueError("No data found in the table")

        return _aggregate_unemployment_by_quarter(df)

    except Exception as e:
        # Deliberately broad: callers rely on the "None means failure" contract
        print(f"Error scraping unemployment data: {e}")
        return None

# Landing folder for the scraped unemployment data (created on first run)
unemployment_dir = '../data/landing/unemployment_rate'
os.makedirs(unemployment_dir, exist_ok=True)

print("Scraping unemployment data from Victorian labour market website...")
unemployment_df = scrape_unemployment_data()

# scrape_unemployment_data signals failure by returning None
if unemployment_df is None:
    print("Failed to scrape unemployment data")
else:
    # Persist the quarterly table as CSV, without the pandas index column
    output_path = os.path.join(unemployment_dir, 'quarterly_unemployment_rate.csv')
    unemployment_df.to_csv(output_path, index=False)

    # Short summary so the reader can sanity-check coverage
    print(f"Successfully scraped and saved unemployment data to {output_path}")
    print(f"Data contains {len(unemployment_df)} quarterly records")
    print(f"Date range: {unemployment_df['date'].min()} to {unemployment_df['date'].max()}")

    print("\nFirst 5 records:")
    print(unemployment_df.head())

    print("\nLast 5 records:")
    print(unemployment_df.tail())


NameError: name 'os' is not defined