## Downloading the Data

Begin by downloading the provided historical data.

In [4]:
# import libraries
from urllib.request import urlretrieve
from pyspark.sql import SparkSession, functions as F
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import os, zipfile

In [3]:
def create_data_folder(output_dir):
    """
    Create the base data directory and one sub-folder per ETL stage.

    The stage folders created under ``output_dir`` are ``landing``, ``raw``,
    ``curated`` and ``analysis``. Folders that already exist are left as they
    are, so the function can be called repeatedly.

    :param output_dir: The base directory where the folders will be created
    """
    # exist_ok=True makes the creation idempotent and removes the
    # check-then-create race of a separate os.path.exists() test.
    # makedirs also creates output_dir itself as the parent of each stage.
    for stage in ('landing', 'raw', 'curated', 'analysis'):
        os.makedirs(os.path.join(output_dir, stage), exist_ok=True)


In [22]:
def download_file(url, output_path, file_type):
    """
    Download a file from a URL to ``<output_path>.<file_type>`` unless it already exists.

    The download is written to a temporary ``.part`` file and renamed into
    place only after it completes, so an interrupted transfer never leaves a
    truncated file that later runs would mistake for a finished download.

    :param url: The URL of the file to download
    :param output_path: The local path (without extension) where the file will be saved
    :param file_type: The file extension/type (e.g., 'csv', 'json', 'xlsx')
    """
    # generate output file path
    output_file_path = f"{output_path}.{file_type}"

    # nothing to do when a previous run already fetched the file
    if os.path.exists(output_file_path):
        print(f"File already exists at {output_file_path}")
        return

    # os.makedirs('') raises, so only create the parent when the path has one
    parent_dir = os.path.dirname(output_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    partial_path = f"{output_file_path}.part"
    try:
        # download to the temporary path, then move it into place in one step
        urlretrieve(url, partial_path)
        os.replace(partial_path, output_file_path)
    finally:
        # only present here if the download or the rename failed
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"File downloaded and saved to {output_file_path}")

In [3]:
# Create the ../data base folder together with its landing/raw/curated/analysis stage sub-folders
create_data_folder('../data/')

**Download moving annual rent by suburb from DFFH (March quarter 2023)**

In [24]:
# Output path stem for the rent-by-suburb workbook; this is a file path without
# its extension (download_file appends ".<file_type>"), not a directory
directory = '../data/landing/rent/rent_by_suburb'

# Source URL (a fixed URL despite the name; nothing is substituted into it)
URL_TEMPLATE = "https://www.dffh.vic.gov.au/moving-annual-rents-suburb-march-quarter-2023-excel"

In [25]:
# Saves to "<directory>.xlsx"; the download is skipped when that file already exists
download_file(URL_TEMPLATE, directory, 'xlsx')

File downloaded and saved to ../data/landing/rent/rent_by_suburb.xlsx


**Download Public Transport Lines and Stops from VIC Gov open data (public transport)**

In [4]:
# Output path stem for the public transport stops file (extension is appended by download_file)
directory = '../data/landing/ptv/public_transport_stops'

# Source URL (a fixed URL despite the name; nothing is substituted into it)
URL_TEMPLATE = "https://opendata.transport.vic.gov.au/dataset/6d36dfd9-8693-4552-8a03-05eb29a391fd/resource/afa7b823-0c8b-47a1-bc40-ada565f684c7/download/public_transport_stops.geojson"

In [7]:
# Saves to "<directory>.geojson"; the download is skipped when that file already exists
download_file(URL_TEMPLATE, directory, 'geojson')

File downloaded and saved to ../data/landing/ptv/public_transport_stops.geojson


In [8]:
# Output path stem for the public transport lines file (extension is appended by download_file)
directory = '../data/landing/ptv/public_transport_lines'

# Source URL (a fixed URL despite the name; nothing is substituted into it)
URL_TEMPLATE = "https://opendata.transport.vic.gov.au/dataset/6d36dfd9-8693-4552-8a03-05eb29a391fd/resource/52e5173e-b5d5-4b65-9b98-89f225fc529c/download/public_transport_lines.geojson"

In [9]:
# Saves to "<directory>.geojson"; the download is skipped when that file already exists
download_file(URL_TEMPLATE, directory, 'geojson')

File downloaded and saved to ../data/landing/ptv/public_transport_lines.geojson


**Download School Locations Data**

We will download school locations from 2023 to 2025. \
There will be one dataset for each of the years we scraped.

In [23]:
# Source URLs for the school-location CSVs, one per year (2023, 2024, 2025).
# The file naming differs from year to year, so each URL is spelled out in full.
schools_23 = "https://www.education.vic.gov.au/Documents/about/research/datavic/dv346-schoollocations2023.csv"
schools_24 = "https://www.education.vic.gov.au/Documents/about/research/datavic/dv378_DataVic-SchoolLocations-2024.csv"
schools_25 = "https://www.education.vic.gov.au/Documents/about/research/datavic/dv402-SchoolLocations2025.csv"

In [24]:
# Download each year's school locations to
# ../data/landing/schools/school_locations_<year>.csv (existing files are skipped)
for school_year, school_url in (
    ('2023', schools_23),
    ('2024', schools_24),
    ('2025', schools_25),
):
    directory = f'../data/landing/schools/school_locations_{school_year}'
    download_file(school_url, directory, 'csv')

File downloaded and saved to ../data/landing/schools/school_locations_2023.csv
File downloaded and saved to ../data/landing/schools/school_locations_2024.csv
File downloaded and saved to ../data/landing/schools/school_locations_2025.csv


**Download Open Space Data**

In [12]:
# Output path stem for the open space dataset (extension is appended by download_file)
directory = '../data/landing/open_space/open_space'

# Source URL (a fixed URL despite the name; nothing is substituted into it)
URL_TEMPLATE = "https://opendata.arcgis.com/datasets/da1c06e3ab6948fcb56de4bb3c722449_0.csv"

In [13]:
# Saves to "<directory>.csv"; the download is skipped when that file already exists
download_file(URL_TEMPLATE, directory, 'csv')

File downloaded and saved to ../data/landing/open_space/open_space.csv


**Download Moving Annual Rent by Suburb from DFFH (Latest File)**


In [11]:
# Source URL and output file stem for the latest moving annual rent file (March quarter 2025).
# Per the notebook author, this file contains all historical data from previous quarters and years.
latest_url = "https://www.dffh.vic.gov.au/moving-annual-rent-suburb-march-quarter-2025-excel"
filename = "moving_annual_median_weekly_rent_by_suburb"

# Echo what is about to be fetched; the download itself happens in the next cell
print(f"Downloading latest moving annual rent file: {filename}")
print(f"URL: {latest_url}")


Downloading latest moving annual rent file: moving_annual_median_weekly_rent_by_suburb
URL: https://www.dffh.vic.gov.au/moving-annual-rent-suburb-march-quarter-2025-excel


In [12]:
# Fetch the latest moving annual rent workbook; a failure is reported rather than raised
directory = f'../data/landing/moving_annual_rent/{filename}'
try:
    download_file(latest_url, directory, 'xlsx')
except Exception as download_error:
    print(f"Error downloading {filename}: {download_error}")
else:
    print("Successfully downloaded latest moving annual rent file!")


File downloaded and saved to ../data/landing/moving_annual_rent/moving_annual_median_weekly_rent_by_suburb.xlsx
Successfully downloaded latest moving annual rent file!


**Download Victorian Macroeconomic Data from RBA**

Scrape time series tables (unemployment rate, interest rates, prices and economic activity) from the DJSIR dashboard tables at djsir-data.github.io/djprecodash and aggregate each series by quarter.


In [6]:
def _extract_table_records(table, value_columns=None):
    """Convert a parsed HTML table into a list of ``{'date': str, <header>: float | None}`` dicts."""
    # The first <tr> is treated as the header row and supplies the column names
    header_row = table.find('tr')
    if header_row is None:
        raise ValueError("Table has no rows")
    headers = [cell.get_text(strip=True) for cell in header_row.find_all(['th', 'td'])]

    # If no value_columns specified, use all columns after the first (date) column
    if value_columns is None:
        value_columns = list(range(1, len(headers)))

    records = []
    for row in table.find_all('tr')[1:]:  # Skip header row
        cells = row.find_all('td')
        if not cells:
            continue

        # Skip rows with missing date
        date_str = cells[0].get_text(strip=True)
        if not date_str:
            continue

        record = {'date': date_str}
        for col_idx in value_columns:
            # Short rows simply lack the trailing columns
            if col_idx >= len(cells):
                continue

            value_str = cells[col_idx].get_text(strip=True)
            col_name = headers[col_idx] if col_idx < len(headers) else f'value_{col_idx}'

            # Blank cells are kept as missing values, so a row whose requested
            # cells are all blank still becomes an (all-missing) record
            if not value_str:
                record[col_name] = None
                continue

            try:
                record[col_name] = float(value_str)
            except ValueError:
                # A non-numeric value discards the whole row
                record = None
                break

        if record is not None:
            records.append(record)

    return records


def _aggregate_by_quarter(records, aggregate_method):
    """Aggregate dated records to one row per (year, quarter) with ``aggregate_method``."""
    df = pd.DataFrame(records)
    df['date'] = pd.to_datetime(df['date'])

    # 'first' and 'last' pick values by row position, so order the rows
    # chronologically instead of relying on the order of the HTML table
    df = df.sort_values('date', kind='stable')

    df['year'] = df['date'].dt.year
    df['quarter'] = df['date'].dt.quarter

    value_cols = [col for col in df.columns if col not in ('date', 'year', 'quarter')]
    if not value_cols:
        raise ValueError("No value columns were extracted from the table")

    quarterly = (
        df.groupby(['year', 'quarter'])
        .agg({col: aggregate_method for col in value_cols})
        .reset_index()
    )

    # Each quarter is labelled with the first day of its final month
    # (Q1 -> 1 March, Q2 -> 1 June, ...), assembled from numeric parts
    # rather than by formatting and re-parsing strings
    quarterly['quarter_date'] = pd.to_datetime(pd.DataFrame({
        'year': quarterly['year'],
        'month': quarterly['quarter'] * 3,
        'day': 1,
    }))
    quarterly = quarterly.sort_values('quarter_date')

    final_columns = ['quarter_date', 'year', 'quarter'] + value_cols
    return quarterly[final_columns].rename(columns={'quarter_date': 'date'})


def scrape_time_series_data(url, data_name, value_columns=None, aggregate_method='mean', timeout=30):
    """
    Scrape the first HTML table on a page as a time series and aggregate it by quarter.

    The first column of the table is read as the date; the selected value
    columns are parsed as floats. Rows with an empty date or a non-numeric
    value are skipped, and blank value cells become missing values.

    Parameters:
    - url: URL of the webpage containing the time series table
    - data_name: Name for the dataset (used in the error message)
    - value_columns: List of column indices to extract (default: all columns after date)
    - aggregate_method: Method for aggregation ('mean', 'last', 'first')
    - timeout: Seconds to wait for the HTTP request before giving up (default: 30)

    Returns:
    - DataFrame with columns ``date``, ``year``, ``quarter`` and one column per
      extracted series, sorted by date; or None if anything fails (the error
      is printed rather than raised)
    """
    try:
        # Fail fast on a bad method instead of after the network round trip
        if aggregate_method not in ('mean', 'last', 'first'):
            raise ValueError(
                f"Unsupported aggregate_method {aggregate_method!r}; "
                "expected 'mean', 'last' or 'first'"
            )

        # Without a timeout, requests can wait indefinitely on an unresponsive server
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Parse the HTML content and locate the first table
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table')
        if table is None:
            raise ValueError("No table found on the webpage")

        records = _extract_table_records(table, value_columns)
        if not records:
            raise ValueError("No valid data found in the table")

        return _aggregate_by_quarter(records, aggregate_method)

    except Exception as e:
        # Best-effort by design: callers check for None instead of catching
        print(f"Error scraping {data_name} data: {e}")
        return None

def save_time_series_data(data, data_name, output_dir):
    """
    Write quarterly data to ``<output_dir>/quarterly_<data_name>.csv`` and print a summary.

    Returns the path of the written CSV. When ``data`` is None nothing is
    written, a failure message is printed and None is returned.
    """
    # Nothing to save when the scrape step produced no data
    if data is None:
        print(f"Failed to scrape {data_name} data")
        return None

    # Make sure the target folder exists, then write the CSV without the index
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'quarterly_{data_name}.csv')
    data.to_csv(output_path, index=False)

    # Summary report: location, size, date coverage and a peek at both ends
    earliest = data['date'].min()
    latest = data['date'].max()
    report = (
        f"Successfully scraped and saved {data_name} data to {output_path}",
        f"Data contains {len(data)} quarterly records",
        f"Date range: {earliest} to {latest}",
        "\nFirst 5 records:",
        data.head(),
        "\nLast 5 records:",
        data.tail(),
        "\n" + "=" * 50 + "\n",
    )
    for entry in report:
        print(entry)

    return output_path


In [8]:
# Scrape unemployment rate data
print("=== SCRAPING UNEMPLOYMENT RATE DATA ===")
unemployment_url = "https://djsir-data.github.io/djprecodash/tables/djsir_labour_market"
# Result is a quarterly DataFrame, or None if scraping failed (the error is printed, not raised)
unemployment_data = scrape_time_series_data(
    url=unemployment_url,
    data_name="unemployment_rate",
    value_columns=[1],  # Only unemployment rate column
    aggregate_method='mean'
)
# Writes quarterly_unemployment_rate.csv into the landing folder and prints a summary;
# the returned value is the CSV path (None when there was no data to save)
unemployment_output = save_time_series_data(
    unemployment_data, 
    "unemployment_rate", 
    "../data/landing/unemployment_rate"
)

=== SCRAPING UNEMPLOYMENT RATE DATA ===
Successfully scraped and saved unemployment_rate data to ../data/landing/unemployment_rate/quarterly_unemployment_rate.csv
Data contains 191 quarterly records
Date range: 1978-03-01 00:00:00 to 2025-09-01 00:00:00

First 5 records:
        date  year  quarter  Unemployment rate (%)
0 1978-03-01  1978        1               5.842250
1 1978-06-01  1978        2               5.552600
2 1978-09-01  1978        3               5.550233
3 1978-12-01  1978        4               5.575700
4 1979-03-01  1979        1               5.600167

Last 5 records:
          date  year  quarter  Unemployment rate (%)
186 2024-09-01  2024        3               4.473833
187 2024-12-01  2024        4               4.378333
188 2025-03-01  2025        1               4.544433
189 2025-06-01  2025        2               4.381900
190 2025-09-01  2025        3               4.472050




In [9]:
# Scrape interest rates data
print("=== SCRAPING INTEREST RATES DATA ===")
interest_rates_url = "https://djsir-data.github.io/djprecodash/tables/djsir_interest_rates"
# Result is a quarterly DataFrame, or None if scraping failed (the error is printed, not raised)
# NOTE(review): the saved output runs to 2035 with NaN values in the last quarters;
# presumably the source table has future-dated rows with blank cells - confirm and filter downstream
interest_rates_data = scrape_time_series_data(
    url=interest_rates_url,
    data_name="interest_rates",
    value_columns=[1, 2, 3],  # Mortgage rates, Savings rates, Cash rate
    aggregate_method='mean'
)
# Writes quarterly_interest_rates.csv into the landing folder and prints a summary;
# the returned value is the CSV path (None when there was no data to save)
interest_rates_output = save_time_series_data(
    interest_rates_data, 
    "interest_rates", 
    "../data/landing/interest_rates"
)

=== SCRAPING INTEREST RATES DATA ===
Successfully scraped and saved interest_rates data to ../data/landing/interest_rates/quarterly_interest_rates.csv
Data contains 183 quarterly records
Date range: 1990-03-01 00:00:00 to 2035-09-01 00:00:00

First 5 records:
        date  year  quarter  Mortgage rates (%)  Savings rates (%)  \
0 1990-03-01  1990        1                 NaN                NaN   
1 1990-06-01  1990        2                 NaN                NaN   
2 1990-09-01  1990        3                 NaN                NaN   
3 1990-12-01  1990        4                 NaN                NaN   
4 1991-03-01  1991        1                 NaN                NaN   

   Cash rate (%)  
0      16.666667  
1      15.000000  
2      14.333333  
3      12.666667  
4      12.000000  

Last 5 records:
          date  year  quarter  Mortgage rates (%)  Savings rates (%)  \
178 2034-09-01  2034        3                 NaN                NaN   
179 2034-12-01  2034        4               

In [11]:
# Scrape price data
print("=== SCRAPING PRICE DATA ===")
price_url = "https://djsir-data.github.io/djprecodash/tables/djsir_prices"
# Result is a quarterly DataFrame, or None if scraping failed (the error is printed, not raised)
price_data = scrape_time_series_data(
    url=price_url,
    data_name="price_data",
    value_columns=[1, 2, 3],  # CPI (%/y), WPI (%/y), "PPI, Final Demand (%/y)"; an AWE (%/y) column is NOT selected by these indices
    aggregate_method='mean'
)
# Writes quarterly_price_data.csv into the landing folder and prints a summary;
# the returned value is the CSV path (None when there was no data to save)
price_output = save_time_series_data(
    price_data, 
    "price_data", 
    "../data/landing/price_data"
)


=== SCRAPING PRICE DATA ===
Successfully scraped and saved price_data data to ../data/landing/price_data/quarterly_price_data.csv
Data contains 304 quarterly records
Date range: 1949-09-01 00:00:00 to 2025-06-01 00:00:00

First 5 records:
        date  year  quarter  CPI (%/y)  WPI (%/y)  PPI, Final Demand (%/y)
0 1949-09-01  1949        3        7.9        NaN                      NaN
1 1949-12-01  1949        4       10.5        NaN                      NaN
2 1950-03-01  1950        1       10.3        NaN                      NaN
3 1950-06-01  1950        2       10.0        NaN                      NaN
4 1950-09-01  1950        3        9.8        NaN                      NaN

Last 5 records:
          date  year  quarter  CPI (%/y)  WPI (%/y)  PPI, Final Demand (%/y)
299 2024-06-01  2024        2        3.7        3.3                      4.8
300 2024-09-01  2024        3        3.0        3.2                      3.9
301 2024-12-01  2024        4        2.5        3.2            

In [12]:
# Scrape economic activity data
print("=== SCRAPING ECONOMIC ACTIVITY DATA ===")
economic_activity_url = "https://djsir-data.github.io/djprecodash/tables/djsir_economic_activity"
# Result is a quarterly DataFrame, or None if scraping failed (the error is printed, not raised)
economic_activity_data = scrape_time_series_data(
    url=economic_activity_url,
    data_name="economic_activity",
    value_columns=[2, 3],  # SFD (%/y), GSP quarterly components (%/y); column 1 is deliberately not extracted
    aggregate_method='mean'
)
# Writes quarterly_economic_activity.csv into the landing folder and prints a summary;
# the returned value is the CSV path (None when there was no data to save)
economic_activity_output = save_time_series_data(
    economic_activity_data, 
    "economic_activity", 
    "../data/landing/economic_activity"
)


=== SCRAPING ECONOMIC ACTIVITY DATA ===
Successfully scraped and saved economic_activity data to ../data/landing/economic_activity/quarterly_economic_activity.csv
Data contains 156 quarterly records
Date range: 1986-09-01 00:00:00 to 2025-06-01 00:00:00

First 5 records:
        date  year  quarter  SFD (%/y)  GSP quarterly components (%/y)
0 1986-09-01  1986        3     2.8931                             NaN
1 1986-12-01  1986        4     2.5560                             NaN
2 1987-03-01  1987        1     1.1749                             NaN
3 1987-06-01  1987        2     1.7426                             NaN
4 1987-09-01  1987        3     2.3537                             NaN

Last 5 records:
          date  year  quarter  SFD (%/y)  GSP quarterly components (%/y)
151 2024-06-01  2024        2     1.4268                          2.0788
152 2024-09-01  2024        3     1.3878                          1.6343
153 2024-12-01  2024        4     2.4225                          