**Find Minimum Distance from a rental property to any point of interest**

In [22]:
# Import necessary libraries
import os

import geopandas as gpd
import pandas as pd
from pyspark.sql import SparkSession, functions as F
from shapely import wkt
from shapely.geometry import Point

# Spark session for this notebook; settings are applied in the order listed below
spark_conf = {
    "spark.sql.repl.eagerEval.enabled": True,     # eagerly render DataFrames in the notebook/REPL
    "spark.sql.parquet.cacheMetadata": "true",    # cache parquet metadata
    "spark.sql.session.timeZone": "Etc/UTC",      # session timezone is UTC
    # memory settings - intended to make crashes less likely
    "spark.driver.memory": "4g",                  # driver memory
    "spark.executor.memory": "4g",                # executor memory
}

builder = SparkSession.builder.appName('Weekly Rental Listings Preprocessing')
for conf_key, conf_value in spark_conf.items():
    builder = builder.config(conf_key, conf_value)

spark = builder.getOrCreate()

In [25]:
# Load the locality polygons (the .shp file is the entry point for the shapefile bundle)
# and reproject them to lat/lon (WGS84, EPSG:4326).
sf = gpd.read_file("../data/geo/shpfile/LOCALITY_POLYGON.shp").to_crs(epsg=4326)

# Take an explicit copy of the two columns we need: assigning new columns onto a
# column-subset slice otherwise raises SettingWithCopyWarning on pandas < 3.
sf = sf[["LOCALITY", "geometry"]].copy()
sf["LOCALITY"] = sf["LOCALITY"].str.strip()   # remove surrounding whitespace
sf["suburb"] = sf["LOCALITY"].str.lower()     # lower-case key; LOCALITY is already stripped
sf.head()

Unnamed: 0,LOCALITY,geometry,suburb
0,MOLLONGGHIP,"POLYGON ((144.06544 -37.48382, 144.06438 -37.4...",mollongghip
1,NORTH BLACKWOOD,"POLYGON ((144.38037 -37.42376, 144.38126 -37.4...",north blackwood
2,BASALT,"POLYGON ((144.1184 -37.31148, 144.1184 -37.311...",basalt
3,LLANELLY,"POLYGON ((143.81737 -36.75048, 143.81679 -36.7...",llanelly
4,MURRABIT WEST,"POLYGON ((143.87075 -35.49319, 143.86939 -35.4...",murrabit west


In [26]:
# Function to get suburb name from lon/lat
def get_suburb(lon, lat, sf):
    """Return the lower-case suburb name whose polygon covers the point (lon, lat).

    Parameters
    ----------
    lon, lat : float
        Longitude and latitude of the query point, e.g. (144.9631, -37.8136).
    sf : GeoDataFrame
        Locality table with a ``geometry`` column of polygons and a ``suburb``
        column holding the lower-case locality name.

    Returns
    -------
    str or None
        ``suburb`` of the first row (in table order) whose geometry covers the
        point, or ``None`` when no polygon covers it.
    """
    pt = Point(lon, lat)
    # One vectorised predicate over the whole geometry column instead of a
    # Python-level iterrows() scan; rows with a missing geometry evaluate to False.
    covering = sf.loc[sf["geometry"].covers(pt), "suburb"]
    if covering.empty:
        return None
    return covering.iloc[0]

# Smoke-test the lookup with a single (lon, lat) pair
test_lon, test_lat = 144.9961565, -37.796893
sub = get_suburb(test_lon, test_lat, sf)
print(sub)

abbotsford


In [27]:
# School locations CSV from the landing zone: first row is the header, column types are inferred
school_locations_sf = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('../data/landing/schools/school_locations_2025.csv')
)
school_locations_sf.show(5)

# Number of rows for each distinct School_Type
school_type_counts = school_locations_sf.groupBy("School_Type").count()
school_type_counts.show()

+----------------+-----------+---------+--------------------+-----------+-------------+--------------------+--------------+--------------+-------------+----------------+---------------------+---------------------+--------------+------------+---------------+-------------+--------------------+--------------------+------+----------------+---------+----------+----------+
|Education_Sector|Entity_Type|School_No|         School_Name|School_Type|School_Status|      Address_Line_1|Address_Line_2|  Address_Town|Address_State|Address_Postcode|Postal_Address_Line_1|Postal_Address_Line_2|   Postal_Town|Postal_State|Postal_Postcode|Full_Phone_No|              Region|                Area|LGA_ID|        LGA_Name| LGA_TYPE|         X|         Y|
+----------------+-----------+---------+--------------------+-----------+-------------+--------------------+--------------+--------------+-------------+----------------+---------------------+---------------------+--------------+------------+---------------+---

In [30]:
# Public transport stops: read the GeoJSON with geopandas and reproject to lat/lon (WGS84)
ptv_stops_gdf = (
    gpd.read_file("../data/landing/ptv/public_transport_stops.geojson")
    .to_crs(epsg=4326)
)

ptv_stops_gdf

Unnamed: 0,STOP_ID,STOP_NAME,MODE,geometry
0,17204,Wallan Station,REGIONAL TRAIN,POINT (145.00537 -37.41686)
1,19980,Melton Station,REGIONAL TRAIN,POINT (144.57222 -37.70336)
2,19981,Rockbank Station,REGIONAL TRAIN,POINT (144.65071 -37.72919)
3,19982,Deer Park Station,REGIONAL TRAIN,POINT (144.77083 -37.77727)
4,19998,Sunbury Station,REGIONAL TRAIN,POINT (144.72803 -37.57915)
...,...,...,...,...
29197,6586,Northcott St/Exford Rd,REGIONAL BUS,POINT (144.5743 -37.70666)
29198,6642,Northcott St/Exford Rd,REGIONAL BUS,POINT (144.57443 -37.70668)
29199,6644,Exford Rd/Staughton St,REGIONAL BUS,POINT (144.56982 -37.7024)
29200,7009,Harrison St/Marengo Crst,REGIONAL BUS,POINT (143.66323 -38.77712)


In [None]:
import openrouteservice as ors
import numpy as np
from sklearn.neighbors import BallTree
import requests
import time

# Pull the first 100 listings (address + coordinates) from Spark into pandas.
# NOTE(review): `sdf` is not defined in any cell visible here - confirm which cell creates it.
rentals_pdf = sdf.select("full_address", "longitude", "latitude").limit(100).toPandas()

# One shapely point per listing, built as (longitude, latitude)
rental_points = [
    Point(lon, lat)
    for lon, lat in zip(rentals_pdf.longitude, rentals_pdf.latitude)
]
rentals = gpd.GeoDataFrame(rentals_pdf, geometry=rental_points, crs="EPSG:4326")

rentals

Unnamed: 0,full_address,longitude,latitude,geometry
0,"14 Federation Lane, Abbotsford VIC 3067",144.9961565,-37.796893,POINT (144.99616 -37.79689)
1,"106/609 Victoria Street, Abbotsford VIC 3067",145.0076834,-37.8110653,POINT (145.00768 -37.81107)
2,Abbotsford VIC 3067,145.0019064,-37.80210950000001,POINT (145.00191 -37.80211)
3,"4/2 Princes Street, Abbotsford VIC 3067",144.999856,-37.8092053,POINT (144.99986 -37.80921)
4,"45 Park St, Abbotsford VIC 3067",144.9939399,-37.8080424,POINT (144.99394 -37.80804)
...,...,...,...,...
95,"5/2-4 Watt St, Airport West VIC 3042",144.8799126,-37.7314592,POINT (144.87991 -37.73146)
96,"9 Thomas St, Airport West VIC 3042",144.876504,-37.7206421,POINT (144.8765 -37.72064)
97,"2/128 Bowes Avenue, Airport West VIC 3042",144.8863906,-37.7238332,POINT (144.88639 -37.72383)
98,"34A Walters Ave, Airport West VIC 3042",144.8864192,-37.73188030000001,POINT (144.88642 -37.73188)


**PTV Stops Matrix Pulls**

In [None]:
# Read the curated listings CSV and turn it into a GeoDataFrame.
# `pd` and `wkt` come from the import cell at the top of the notebook, so this cell
# no longer depends on a later cell having been run first.
cleaned_listings = pd.read_csv(
    "../data/curated/rent_features/cleaned_listings.csv", low_memory=False
)

# The "coordinates" column is parsed as WKT text into shapely geometries
cleaned_listings["geometry"] = cleaned_listings["coordinates"].apply(wkt.loads)
cleaned_listings_gdf = gpd.GeoDataFrame(
    cleaned_listings, geometry="geometry", crs="EPSG:4326"
)

# Work queue for the routing loop: starts as the full listing table
gdf_tmp = cleaned_listings_gdf


In [None]:
# Plan for this cell: issue 500 API calls per run and continue from the last index.
# NOTE(review): the original note also mentions saving results to a CSV, but no write happens in this cell.
import pandas as pd

# Stop coordinates as (lat, lon) pairs in radians, the layout used with the haversine metric
stop_lat = ptv_stops_gdf.geometry.y
stop_lon = ptv_stops_gdf.geometry.x
poi_rad = np.deg2rad(np.column_stack([stop_lat, stop_lon]))
tree = BallTree(poi_rad, metric="haversine")

def shortlist(point, k=6, max_km=2.0):
    """Return the PTV stops among the ``k`` nearest to ``point`` that lie within ``max_km``.

    ``point`` must expose ``x`` (longitude) and ``y`` (latitude). The module-level
    ``tree`` is queried for the ``k`` nearest neighbours; the returned arc lengths are
    scaled by an Earth radius of 6371.0088 km and neighbours farther than ``max_km``
    are dropped. The result is the matching rows of ``ptv_stops_gdf`` (possibly empty).
    """
    earth_radius_km = 6371.0088
    query_rad = np.radians([[point.y, point.x]])
    arc_rad, neighbour_pos = tree.query(query_rad, k=k)
    within_range = (arc_rad[0] * earth_radius_km) <= max_km
    nearby_positions = neighbour_pos[0][within_range]
    return ptv_stops_gdf.iloc[nearby_positions]

# --- routing helper (OpenRouteService matrix API, driving-car profile) ---
# The key is read from the ORS_API_KEY environment variable so a real key never has to be
# pasted into (and committed with) the notebook; the placeholder remains as the fallback.
API_KEY = os.environ.get("ORS_API_KEY", "YOUR_API_KEY")
# NOTE(review): this is the driving-car profile; switch the last path segment if
# walking distance to the stop is what is actually wanted.
BASE_URL = "https://api.openrouteservice.org/v2/matrix/driving-car"

def routed_distance(source, targets, max_attempts=5, base_wait=1.0):
    """Query the ORS matrix endpoint for routes from ``source`` to every target.

    Parameters
    ----------
    source : object with ``x`` / ``y`` attributes
        Origin point; sent as ``[x, y]``.
    targets : iterable of objects with ``x`` / ``y`` attributes
        Destination points, sent after the source in the same order.
    max_attempts : int
        Maximum number of POST attempts when the service answers HTTP 429.
    base_wait : float
        Seconds to wait after the first 429; the wait doubles on every further 429.

    Returns
    -------
    (distances, durations) : tuple of lists
        One entry per target, in target order, taken from the first row of the
        response's ``distances`` and ``durations`` matrices with the leading
        source-to-source entry removed. Presumably metres and seconds (the caller
        stores them in ``*_m`` / ``*_s`` columns) - confirm against the ORS docs.
        Both lists are empty when ``targets`` is empty; no request is sent then.

    Raises
    ------
    requests.HTTPError
        For any non-429 error status (via ``raise_for_status``).
    RuntimeError
        When every attempt was answered with HTTP 429.
    """
    # Materialise once: works for any iterable (list, GeoSeries, generator) and lets us
    # skip the network round trip when there is nothing to route to.
    target_points = list(targets)
    if not target_points:
        return [], []

    coords = [[source.x, source.y]] + [[pt.x, pt.y] for pt in target_points]
    body = {"locations": coords, "sources": [0], "metrics": ["distance", "duration"]}

    for attempt in range(1, max_attempts + 1):
        resp = requests.post(
            BASE_URL,
            json=body,
            headers={"Authorization": API_KEY},
            timeout=30,
        )

        if resp.status_code == 429:
            # Exponential back-off (1, 2, 4, 8, ... x base_wait), but do not sleep after
            # the final attempt - we are about to raise anyway.
            if attempt < max_attempts:
                time.sleep(base_wait * (2 ** (attempt - 1)))
            continue

        resp.raise_for_status()
        data = resp.json()
        # Row 0 belongs to the single source; column 0 is the source itself, so drop it.
        return data["distances"][0][1:], data["durations"][0][1:]

    raise RuntimeError(f"ORS matrix still returning 429 after {max_attempts} retries")
# --- main loop ---
# For each listing in the current batch: shortlist nearby stops by straight-line distance,
# ask ORS for routed distance/duration to each, and record the stop with the shortest duration.
BATCH_SIZE = 500  # listings processed per run of this cell

rows = []  # not used in this cell; retained in case another cell reads it
for listing_idx, listing in gdf_tmp.head(BATCH_SIZE).iterrows():
    candidates = shortlist(listing.geometry, k=10, max_km=3)
    if candidates.empty:
        continue

    distances, durations = routed_distance(
        listing.geometry, candidates.geometry, max_attempts=5, base_wait=1.0
    )

    # A duration may come back as null when no route exists; such entries cannot be ranked.
    routable = [pos for pos, dur in enumerate(durations) if dur is not None]
    if not routable:
        continue

    # Position (within `candidates`) of the quickest stop; use `distances` here to rank by metres.
    best = min(routable, key=lambda pos: durations[pos])
    chosen = candidates.iloc[best]

    # Write back under the listing's own index label. It is kept in a separate variable
    # from `best` so the candidate position can never overwrite the row being updated.
    cleaned_listings_gdf.loc[listing_idx, "StationID"] = chosen["STOP_ID"]
    cleaned_listings_gdf.loc[listing_idx, "min_route_dist_m"] = distances[best]
    cleaned_listings_gdf.loc[listing_idx, "min_route_dur_s"] = durations[best]


# Advance the work queue past the batch just processed, so re-running this cell
# continues with the next BATCH_SIZE listings instead of repeating the same slice.
gdf_tmp = gdf_tmp.iloc[BATCH_SIZE:]