## Downloading the Data

Begin by downloading the provided historical data.

In [3]:
# import libraries
from urllib.request import urlretrieve
from pyspark.sql import SparkSession, functions as F


In [2]:
def create_data_folder(output_dir, stages=('landing', 'raw', 'curated', 'analysis')):
    """
    Create the base data directory and one sub-folder per ETL pipeline stage.

    Safe to call repeatedly: folders that already exist are left untouched.

    :param output_dir: The base directory where the folders will be created
    :param stages: Names of the sub-folders to create inside ``output_dir``;
        defaults to the four pipeline stages landing, raw, curated and analysis
    :raises FileExistsError: If ``output_dir`` or a stage path already exists
        but is not a directory
    """
    # imported locally so the function does not depend on a module-level
    # `import os` having been executed before it is called
    import os

    # exist_ok=True replaces a separate exists() check, so there is no gap
    # between "check" and "create" in which the folder could appear
    os.makedirs(output_dir, exist_ok=True)

    # create one folder per pipeline stage underneath the base directory
    for stage in stages:
        os.makedirs(os.path.join(output_dir, stage), exist_ok=True)


In [3]:
# create the base data directory together with its per-stage sub-folders
# (landing, raw, curated, analysis), relative to the notebook's working directory
create_data_folder('../data/')

In [6]:
# import os
import os, zipfile

**Download moving annual rent by suburb from DFFH (Victorian Department of Families, Fairness and Housing)**

In [19]:
# path stem (no file extension) for the rent-by-suburb download;
# nothing is created on disk here, this only records where the file will go
directory = '../data/landing/rent/rent_by_suburb'

In [21]:
# source URL of the moving annual rent by suburb Excel file (March quarter 2023);
# despite the variable name this is a fixed URL with no placeholders
URL_TEMPLATE = "https://www.dffh.vic.gov.au/moving-annual-rents-suburb-march-quarter-2023-excel"

# final location of the download: the path stem plus the .xlsx extension
output_file_path = f"{directory}.xlsx"

# the file on disk acts as a cache, so the download only happens when it is missing
if not os.path.exists(output_file_path):
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    # download to a temporary sibling and move it into place only after the
    # transfer has finished; writing straight to the final path would leave a
    # truncated file behind on failure, which the exists() check above would
    # then treat as a complete download on every later run
    partial_file_path = f"{output_file_path}.part"
    try:
        urlretrieve(URL_TEMPLATE, partial_file_path)
        os.replace(partial_file_path, output_file_path)
    finally:
        # the temporary file is only still present if the download or move failed
        if os.path.exists(partial_file_path):
            os.remove(partial_file_path)
    print(f"File downloaded and saved to {output_file_path}")
else:
    print(f"File already exists at {output_file_path}")

File downloaded and saved to ../data/landing/rent/rent_by_suburb.xlsx


**Download Public Transport Lines and Stops from VIC Gov open data (public transport)**

In [15]:
# path stem (no file extension) for the public transport stops download;
# nothing is created on disk here, this only records where the file will go
directory = '../data/landing/ptv/public_transport_stops'

In [16]:
# source URL of the public transport stops GeoJSON file;
# despite the variable name this is a fixed URL with no placeholders
URL_TEMPLATE = "https://opendata.transport.vic.gov.au/dataset/6d36dfd9-8693-4552-8a03-05eb29a391fd/resource/afa7b823-0c8b-47a1-bc40-ada565f684c7/download/public_transport_stops.geojson"

# final location of the download: the path stem plus the .geojson extension
output_file_path = f"{directory}.geojson"

# the file on disk acts as a cache, so the download only happens when it is missing
if not os.path.exists(output_file_path):
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    # download to a temporary sibling and move it into place only after the
    # transfer has finished; writing straight to the final path would leave a
    # truncated file behind on failure, which the exists() check above would
    # then treat as a complete download on every later run
    partial_file_path = f"{output_file_path}.part"
    try:
        urlretrieve(URL_TEMPLATE, partial_file_path)
        os.replace(partial_file_path, output_file_path)
    finally:
        # the temporary file is only still present if the download or move failed
        if os.path.exists(partial_file_path):
            os.remove(partial_file_path)
    print(f"File downloaded and saved to {output_file_path}")
else:
    print(f"File already exists at {output_file_path}")

File downloaded and saved to ../data/landing/ptv/public_transport_stops.geojson


In [17]:
# path stem (no file extension) for the public transport lines download;
# nothing is created on disk here, this only records where the file will go
directory = '../data/landing/ptv/public_transport_lines'

In [18]:
# source URL of the public transport lines GeoJSON file;
# despite the variable name this is a fixed URL with no placeholders
URL_TEMPLATE = "https://opendata.transport.vic.gov.au/dataset/6d36dfd9-8693-4552-8a03-05eb29a391fd/resource/52e5173e-b5d5-4b65-9b98-89f225fc529c/download/public_transport_lines.geojson"

# final location of the download: the path stem plus the .geojson extension
output_file_path = f"{directory}.geojson"

# the file on disk acts as a cache, so the download only happens when it is missing
if not os.path.exists(output_file_path):
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    # download to a temporary sibling and move it into place only after the
    # transfer has finished; writing straight to the final path would leave a
    # truncated file behind on failure, which the exists() check above would
    # then treat as a complete download on every later run
    partial_file_path = f"{output_file_path}.part"
    try:
        urlretrieve(URL_TEMPLATE, partial_file_path)
        os.replace(partial_file_path, output_file_path)
    finally:
        # the temporary file is only still present if the download or move failed
        if os.path.exists(partial_file_path):
            os.remove(partial_file_path)
    print(f"File downloaded and saved to {output_file_path}")
else:
    print(f"File already exists at {output_file_path}")

File downloaded and saved to ../data/landing/ptv/public_transport_lines.geojson
