In [None]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import math as m
import pyspark.sql.functions as F
import geopandas as gpd
import folium
import matplotlib.pyplot as plt
from shapely import wkt  
from matplotlib import colors as mcolors
from folium.plugins import MarkerCluster
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
from PIL import Image


# Spark session that runs the Spark jobs for this notebook.
# The builder is configured one option at a time, then materialised.
_spark_builder = SparkSession.builder.appName("MAST30034 Tutorial 1")
_spark_builder = _spark_builder.config("spark.sql.repl.eagerEval.enabled", True)
_spark_builder = _spark_builder.config("spark.sql.parquet.cacheMetadata", "true")
_spark_builder = _spark_builder.config("spark.sql.session.timeZone", "Etc/UTC")
_spark_builder = _spark_builder.config("spark.driver.memory", "6g")
spark = _spark_builder.getOrCreate()

In [57]:
# Load the locality polygons (the .shp file is the entry point of the shapefile set)
sf = gpd.read_file("../data/geo/shpfile/LOCALITY_POLYGON.shp")
sf = sf.to_crs(epsg=4326)  # Convert to lat/lon (WGS84)
sf = sf[["LOCALITY", "geometry"]].copy()  # copy so the assignment below never targets a slice
sf["LOCALITY"] = sf["LOCALITY"].str.strip()  # Remove whitespace

# Map centre = centroid of all localities merged together.
# The union is expensive, so it is computed once. `union_all()` is used when the
# installed geopandas provides it, because `unary_union` is deprecated there.
merged_localities = (
    sf.geometry.union_all()
    if hasattr(sf.geometry, "union_all")
    else sf.geometry.unary_union
)
merged_centroid = merged_localities.centroid
center = [merged_centroid.y, merged_centroid.x]  # [lat, lon] order for folium

# NOTE(review): `m` rebinds the `math as m` alias from the import cell; the name is
# kept here so existing references to the map keep working.
m = folium.Map(location=center, zoom_start=6, tiles="CartoDB positron")

# Add the locality polygons, with the locality name shown on hover
folium.GeoJson(
    sf.__geo_interface__,
    name="VicGov Regions",
    style_function=lambda f: {"fillOpacity": 0.2, "weight": 1},
    tooltip=folium.GeoJsonTooltip(fields=["LOCALITY"], aliases=["Suburb"])
).add_to(m)

folium.LayerControl().add_to(m)

m.save("vicgov_regions.html")  # Save to file (open in browser to view)

  center = [sf.geometry.unary_union.centroid.y, sf.geometry.unary_union.centroid.x]


In [58]:
# Show the locality GeoDataFrame (LOCALITY name + polygon geometry) as the cell output.
sf

Unnamed: 0,LOCALITY,geometry
0,MOLLONGGHIP,"POLYGON ((144.06544 -37.48382, 144.06438 -37.4..."
1,NORTH BLACKWOOD,"POLYGON ((144.38037 -37.42376, 144.38126 -37.4..."
2,BASALT,"POLYGON ((144.1184 -37.31148, 144.1184 -37.311..."
3,LLANELLY,"POLYGON ((143.81737 -36.75048, 143.81679 -36.7..."
4,MURRABIT WEST,"POLYGON ((143.87075 -35.49319, 143.86939 -35.4..."
...,...,...
2968,CHRISTMAS HILLS,"POLYGON ((145.31596 -37.70112, 145.31607 -37.7..."
2969,YARRA GLEN,"POLYGON ((145.40036 -37.67188, 145.40022 -37.6..."
2970,HEIDELBERG WEST,"POLYGON ((145.0566 -37.73521, 145.05548 -37.73..."
2971,RESERVOIR,"POLYGON ((145.00146 -37.72995, 145.00116 -37.7..."


In [59]:
# The header spans two sheet rows (0-based): row 4 = 'Age group (years)', row 5 = 'no.',
# so the frame is read with a two-level column index.
df = pd.read_excel(
    "32350DS0005_2001-24.xlsx",
    sheet_name="Table 3",
    header=[4, 5],
)

def clean(x):
    """Normalise one header cell to a stripped string.

    Returns "" for None and for labels that start with "Unnamed" or
    "Missing value"; otherwise returns ``str(x)`` with surrounding
    whitespace removed.
    """
    if x is None:
        return ""
    text = str(x).strip()
    # str.startswith accepts a tuple of prefixes, covering both placeholder kinds.
    return "" if text.startswith(("Unnamed", "Missing value")) else text

def pick_name(col):
    """Collapse a two-level header tuple ``(level0, level1)`` into one label.

    Columns under the "Age group (years)" banner take their level-1 label
    (e.g. '0–4', '5–9', ...). Every other column prefers its cleaned level-0
    label and falls back to level 1 when level 0 is blank.
    """
    top, bottom = (clean(level) for level in (col[0], col[1]))
    if top != "Age group (years)":
        return top or bottom
    return bottom

# Flatten every (level0, level1) header tuple into a single label.
cols = list(map(pick_name, df.columns))

# Leading identifier columns whose label came out blank get their expected name,
# matched purely by position.
id_cols = [
    "Year",
    "S/T code", "S/T name",
    "GCCSA code", "GCCSA name",
    "SA4 code", "SA4 name",
    "SA3 code", "SA3 name",
    "SA2 code", "SA2 name",
]
# zip() stops at the shorter sequence, so `i` never runs past the end of `cols`.
for i, name in zip(range(len(cols)), id_cols):
    if cols[i] == "":
        cols[i] = name

df.columns = cols

In [60]:
# Discard the first data row (presumably a header remnant from the sheet — TODO confirm),
# then tidy the state name so the equality filter below matches.
df = df.iloc[1:].reset_index(drop=True)
df["S/T name"] = df["S/T name"].str.strip()

# Keep Victorian rows; Year is cast to int before the 2024 selection.
df_vic = df.loc[df["S/T name"] == "Victoria"].copy()
df_vic["Year"] = df_vic["Year"].astype(int)

# Identifier columns for the levels above SA2 are dropped from the result.
above_sa2_cols = [
    "S/T code", "S/T name",
    "GCCSA code", "GCCSA name",
    "SA4 code", "SA4 name",
    "SA3 code", "SA3 name",
]
df_vic = (
    df_vic.loc[df_vic["Year"] == 2024]
    .reset_index(drop=True)
    .drop(columns=above_sa2_cols)
)
df_vic

Unnamed: 0,Year,SA2 code,SA2 name,0–4,5–9,10–14,15–19,20–24,25–29,30–34,...,45–49,50–54,55–59,60–64,65–69,70–74,75–79,80–84,85 and over,Total persons
0,2024,201011001,Alfredton,1306,1615,1685,1601,1264,1131,1345,...,1329,1223,955,885,788,724,640,353,250,20130
1,2024,201011002,Ballarat,484,586,641,830,763,557,596,...,723,827,761,828,761,696,626,403,368,11773
2,2024,201011005,Buninyong,304,459,603,621,532,260,290,...,523,526,421,500,481,454,289,138,128,7358
3,2024,201011006,Delacombe,1183,1069,953,867,1028,1451,1464,...,698,661,563,580,533,437,365,205,220,14271
4,2024,201011007,Smythes Creek,185,270,337,381,260,114,216,...,289,329,348,296,294,188,159,52,51,4278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,2024,217031476,Otway,158,161,183,188,96,125,197,...,207,305,285,409,401,353,240,127,86,3948
518,2024,217041477,Moyne - East,447,451,471,488,336,382,391,...,399,467,511,568,481,356,276,142,134,7145
519,2024,217041478,Moyne - West,520,598,671,691,409,448,470,...,593,640,680,741,772,766,553,280,262,10215
520,2024,217041479,Warrnambool - North,1304,1348,1509,1560,1298,1338,1506,...,1405,1414,1393,1404,1343,1156,1007,636,534,22916


In [61]:
# Public transport stops (GeoJSON), reprojected to EPSG:4326.
pt_stops = gpd.read_file("../data/landing/ptv/public_transport_stops.geojson").to_crs(epsg=4326)

# One row per (STOP_NAME, MODE) pair: the number of stops sharing that name and mode,
# plus the geometry of the first stop in the group.
stops_by_name_mode = pt_stops.groupby(["STOP_NAME", "MODE"]).agg(
    num_stops=("STOP_NAME", "count"),
    geometry=("geometry", "first"),
)

# Rebuild a GeoDataFrame from the aggregate and drop groups without a geometry.
pt_stops_sub = gpd.GeoDataFrame(
    stops_by_name_mode.reset_index(),
    geometry="geometry",
    crs=pt_stops.crs,
)
pt_stops_sub = pt_stops_sub.to_crs(epsg=4326).dropna(subset=["geometry"])
pt_stops_sub

Unnamed: 0,STOP_NAME,MODE,num_stops,geometry
0,10 Jarrah Dr,METRO BUS,1,POINT (145.11072 -38.00284)
1,10 Oban Rd,METRO BUS,1,POINT (145.25205 -37.79634)
2,10 Queens Pde,METRO BUS,1,POINT (144.97125 -37.71958)
3,100 Kent Rd,REGIONAL BUS,1,POINT (142.01692 -37.72773)
4,100 South Gippsland Hwy,METRO BUS,1,POINT (145.22923 -38.009)
...,...,...,...,...
20589,lift,METRO TRAIN,1,POINT (145.09741 -37.62259)
20590,opp 12 Main Rd,REGIONAL BUS,1,POINT (144.20394 -37.07956)
20591,opp 154 Princes Hwy,REGIONAL BUS,1,POINT (147.6562 -37.81944)
20592,opp 34 The Elms Bvd,REGIONAL BUS,1,POINT (144.95935 -37.31601)


In [62]:
# Show the un-aggregated stops frame (one row per stop) as the cell output.
pt_stops

Unnamed: 0,STOP_ID,STOP_NAME,MODE,geometry
0,17204,Wallan Station,REGIONAL TRAIN,POINT (145.00537 -37.41686)
1,19980,Melton Station,REGIONAL TRAIN,POINT (144.57222 -37.70336)
2,19981,Rockbank Station,REGIONAL TRAIN,POINT (144.65071 -37.72919)
3,19982,Deer Park Station,REGIONAL TRAIN,POINT (144.77083 -37.77727)
4,19998,Sunbury Station,REGIONAL TRAIN,POINT (144.72803 -37.57915)
...,...,...,...,...
29197,6586,Northcott St/Exford Rd,REGIONAL BUS,POINT (144.5743 -37.70666)
29198,6642,Northcott St/Exford Rd,REGIONAL BUS,POINT (144.57443 -37.70668)
29199,6644,Exford Rd/Staughton St,REGIONAL BUS,POINT (144.56982 -37.7024)
29200,7009,Harrison St/Marengo Crst,REGIONAL BUS,POINT (143.66323 -38.77712)


In [63]:
# Distinct values of num_stops, i.e. how many stops share one (STOP_NAME, MODE) pair.
pt_stops_sub["num_stops"].unique()

array([ 1,  2,  3,  4,  8,  6,  7, 13, 10,  5, 12,  9, 11])

In [64]:
# Plot the aggregated PTV stops as points coloured by num_stops
# (how many stops share the same STOP_NAME and MODE).
m3 = folium.Map(location=center, zoom_start=6, tiles="CartoDB positron")

# Colour scale: viridis across the observed range of num_stops.
cmap = plt.cm.viridis
norm = mcolors.Normalize(vmin=pt_stops_sub["num_stops"].min(), vmax=pt_stops_sub["num_stops"].max())

# The features are Points. Without an explicit `marker`, folium draws default pin
# markers, on which the fill/colour options returned by `style_function` have no
# visible effect. Passing a CircleMarker makes the style apply to every point.
folium.GeoJson(
    pt_stops_sub,  # GeoDataFrame of (STOP_NAME, MODE) groups
    name="PTV Stops (by STOP_NAME count)",
    marker=folium.CircleMarker(radius=4, fill=True),
    style_function=lambda feature: {
        "fillColor": mcolors.to_hex(
            cmap(norm(feature["properties"]["num_stops"]))
        ),
        "color": "black",
        "weight": 0.2,
        "fillOpacity": 0.7,
    },
    tooltip=folium.GeoJsonTooltip(
        fields=["STOP_NAME", "num_stops"],
        aliases=["PTV Stop Name", "Number of Stops"]
    )
).add_to(m3)

folium.LayerControl().add_to(m3)
m3.save("vicgov_ptv_stops.html")  # Save to file (open in browser to view)

In [65]:
# Colour scale: viridis across the observed range of num_stops.
cmap = plt.cm.viridis
norm = mcolors.Normalize(pt_stops_sub["num_stops"].min(), pt_stops_sub["num_stops"].max())

# Fresh map with a marker cluster holding one circle per (STOP_NAME, MODE) group.
m3 = folium.Map(location=center, zoom_start=6, tiles="CartoDB positron")
cluster = MarkerCluster(name="PTV Stops (by STOP_NAME count)").add_to(m3)

# Iterate the three needed columns directly instead of `iterrows()`, which builds a
# full Series for every one of the ~20k rows.
for stop_name, num_stops, point in zip(
    pt_stops_sub["STOP_NAME"], pt_stops_sub["num_stops"], pt_stops_sub.geometry
):
    color = mcolors.to_hex(cmap(norm(num_stops)))
    folium.CircleMarker(
        location=[point.y, point.x],        # folium takes [lat, lon]
        radius=4 + 0.6 * (num_stops - 1),   # grows with the number of stops sharing the name
        color=color,
        fill=True,
        fill_color=color,                   # documented snake_case keyword for the fill colour
        fill_opacity=0.9,
        weight=0,
        tooltip=f"PTV Stop Name: {stop_name}<br>Number of Stops: {int(num_stops)}"
    ).add_to(cluster)

# Fit the view to the extent of the stops; total_bounds is (minx, miny, maxx, maxy).
minx, miny, maxx, maxy = pt_stops_sub.total_bounds
m3.fit_bounds([[miny, minx], [maxy, maxx]])

folium.LayerControl().add_to(m3)
m3.save("vicgov_ptv_stops.html")

In [66]:
# Show the locality GeoDataFrame again as the cell output (same frame as displayed earlier).
sf

Unnamed: 0,LOCALITY,geometry
0,MOLLONGGHIP,"POLYGON ((144.06544 -37.48382, 144.06438 -37.4..."
1,NORTH BLACKWOOD,"POLYGON ((144.38037 -37.42376, 144.38126 -37.4..."
2,BASALT,"POLYGON ((144.1184 -37.31148, 144.1184 -37.311..."
3,LLANELLY,"POLYGON ((143.81737 -36.75048, 143.81679 -36.7..."
4,MURRABIT WEST,"POLYGON ((143.87075 -35.49319, 143.86939 -35.4..."
...,...,...
2968,CHRISTMAS HILLS,"POLYGON ((145.31596 -37.70112, 145.31607 -37.7..."
2969,YARRA GLEN,"POLYGON ((145.40036 -37.67188, 145.40022 -37.6..."
2970,HEIDELBERG WEST,"POLYGON ((145.0566 -37.73521, 145.05548 -37.73..."
2971,RESERVOIR,"POLYGON ((145.00146 -37.72995, 145.00116 -37.7..."


**Visualizing Isochrones**

In [67]:
# Curated listings table carrying the isochrone and best-school columns.
csv_path = "../data/curated/rent_features/cleaned_listings_isochrones_added_with_best_schools.csv"

# Read the CSV and parse the WKT text in `coordinates` into shapely geometries.
df = pd.read_csv(csv_path, low_memory=False).assign(
    coordinates=lambda frame: frame["coordinates"].map(wkt.loads)
)
df.head()

Unnamed: 0,bathrooms,bedrooms,car_spaces,closest_ptv_station_id,closest_ptv_station_id_imputed,count_atm_imputed,count_bank_imputed,count_bar_imputed,count_biergarten_imputed,count_brothel_imputed,...,best_school_name_walking_10min,best_school_coord_walking_10min,best_score_walking_10min,best_dist_km_walking_10min,n_schools_walking_10min,best_school_name_walking_15min,best_school_coord_walking_15min,best_score_walking_15min,best_dist_km_walking_15min,n_schools_walking_15min
0,1,2,1,28713,28713,0.0,5.0,7.0,0.0,0.0,...,,,3e-08,16.218622,0.0,,,3e-08,6.245875,0.0
1,1,2,1,5102,5102,7.0,6.0,0.0,0.0,0.0,...,Our Lady of Sion College,POINT (145.12992 -37.81835),-9.213487e-05,0.426827,2.0,Box Hill High School,POINT (145.13721 -37.82),0.04135461,1.093638,3.0
2,2,2,1,47722,47722,39.0,37.0,169.0,0.0,4.0,...,Holmes Grammar School,POINT (144.97227 -37.8103),-8.181772e-05,1.111145,2.0,Holmes Grammar School,POINT (144.97227 -37.8103),-8.181772e-05,1.111145,2.0
3,1,3,1,7378,7378,0.0,0.0,1.0,0.0,0.0,...,Penola Catholic College,POINT (144.91826 -37.68786),-5.823501e-05,3.5859,10.0,,,3e-08,6.245875,1.0
4,1,2,1,6053,6053,32.0,21.0,142.0,0.0,3.0,...,,,3e-08,16.218622,0.0,,,3e-08,6.245875,0.0


In [68]:

# Listings as a GeoDataFrame, using the parsed `coordinates` column as geometry.
listings_sampled_gdf = gpd.GeoDataFrame(df, geometry="coordinates", crs="EPSG:4326")

# True for rows whose last six columns are all populated.
# NOTE(review): the columns are picked by position, so this silently changes meaning
# if the CSV column order changes — consider naming them explicitly.
mask = listings_sampled_gdf.iloc[:, -6:].notna().all(axis=1)
if not mask.any():
    # Same exception type the positional lookup used to raise, with a clearer message.
    raise IndexError("no listing has all of its last six columns populated")

# `first_index` is an index LABEL, so it must be resolved with .loc. Using it with
# .iloc is only correct while the frame has a default 0..n-1 index.
first_index = mask.idxmax()
first_row = listings_sampled_gdf.loc[first_index]

row = first_row

row.keys()

Index(['bathrooms', 'bedrooms', 'car_spaces', 'closest_ptv_station_id',
       'closest_ptv_station_id_imputed', 'count_atm_imputed',
       'count_bank_imputed', 'count_bar_imputed', 'count_biergarten_imputed',
       'count_brothel_imputed',
       ...
       'best_school_name_walking_10min', 'best_school_coord_walking_10min',
       'best_score_walking_10min', 'best_dist_km_walking_10min',
       'n_schools_walking_10min', 'best_school_name_walking_15min',
       'best_school_coord_walking_15min', 'best_score_walking_15min',
       'best_dist_km_walking_15min', 'n_schools_walking_15min'],
      dtype='object', length=257)

In [69]:
from shapely import wkt, ops
from shapely.geometry import mapping
from collections import defaultdict
import textwrap
from folium import Popup


# Isochrone column names: every travel mode combined with every time budget,
# ordered walking first, then driving, each from 5 to 15 minutes.
iso_cols = [
    f"{mode}_{minutes}min"
    for mode in ("walking", "driving")
    for minutes in (5, 10, 15)
]

# Popup label -> value read from the selected listing; `.get` is used so a column
# that is absent from `row` does not raise.
fields = {
    label: row.get(column)
    for label, column in (
        ("Property ID", "property_id"),
        ("Suburb", "suburb"),
    )
}

def swap_axes(geom):
    """Return `geom` with the x and y of every coordinate exchanged.

    Any z value passed by the transform is ignored; the result is 2D.
    """
    def _flip(x, y, z=None):
        return (y, x)

    return ops.transform(_flip, geom)


def wkt_to_geojson(value):
    """Convert a WKT string or geometry object to a GeoJSON-like mapping.

    Parameters
    ----------
    value : str, geometry object, float or None
        A WKT string, an already-parsed geometry, or a missing-value marker.

    Returns
    -------
    dict or None
        ``mapping(geometry)``, or None when `value` is missing: None, a float
        NaN (how pandas represents an empty CSV cell), or a blank / "None" /
        "nan" string.
    """
    if value is None:
        return None
    # NaN is the only float that is not equal to itself. Without this check a
    # missing cell would reach mapping() and fail.
    if isinstance(value, float) and value != value:
        return None
    if isinstance(value, str):
        if value.strip() in ("", "None", "nan"):
            return None
        return mapping(wkt.loads(value))
    return mapping(value)



# Re-select the first listing that passes `mask` so `row` is a record with data,
# then convert each of its isochrone columns to GeoJSON (None where the value is missing).
row = listings_sampled_gdf.loc[mask].iloc[0]
geo_layers = dict(zip(iso_cols, (wkt_to_geojson(row[layer]) for layer in iso_cols)))


# Fill / outline colours per isochrone layer, listed as (layer, fill, outline).
# Walking layers use teal shades and driving layers use purple shades.
palette = {
    layer: {"fillColor": fill, "color": outline}
    for layer, fill, outline in (
        ("walking_5min", "#d1f3f0", "#007a74"),
        ("walking_10min", "#6ed0c4", "#007a74"),
        ("walking_15min", "#007a74", "#00514d"),
        ("driving_5min", "#f2d5f9", "#8c2be2"),
        ("driving_10min", "#d191f1", "#8c2be2"),
        ("driving_15min", "#8c2be2", "#5b0f9f"),
    )
}


def add_iso_layer(map_obj, geom, style, name):
    """Add one isochrone geometry to `map_obj` as a named GeoJson layer.

    Parameters
    ----------
    map_obj : the folium map (or layer) the GeoJson is added to.
    geom : GeoJSON-like geometry, or None, in which case nothing is added.
    style : dict providing the "fillColor" and "color" entries for the layer.
    name : layer name passed to folium.GeoJson.
    """
    if geom is not None:
        def _layer_style(_feature):
            # Same style for every feature; outline weight and fill opacity are fixed.
            return {
                "fillColor": style["fillColor"],
                "color": style["color"],
                "weight": 2,
                "fillOpacity": 0.2,
            }

        folium.GeoJson(geom, name=name, style_function=_layer_style).add_to(map_obj)

# Listing location. The stored point is axis-swapped first; per the original note,
# after the swap pt.y is the latitude and pt.x the longitude.
pt = swap_axes(row["coordinates"])
center_latlon = [pt.y, pt.x]

m4 = folium.Map(location=center_latlon, zoom_start=13, tiles="CartoDB positron")

# Popup content: one "<label>: value" line per field that has a value.
details = "<br>".join(
    f"<strong>{label}:</strong> {value}"
    for label, value in fields.items()
    if pd.notna(value)
)
# A single listing marker. Previously two identical red markers were stacked on the
# same coordinates, each with its own popup. The plain "Listing" text is now the
# fallback when no field has a value.
popup_body = details or "Listing"

folium.Marker(
    location=center_latlon,
    icon=folium.Icon(color="red", icon="home"),
    popup=Popup(textwrap.dedent(f"""
        <div style="min-width:180px">
            {popup_body}
        </div>
    """), max_width=250)
).add_to(m4)

# School markers: group the isochrone labels by school location so a school that is
# "best" for several isochrones gets one marker listing all of them.
marker_groups = defaultdict(list)

for col in iso_cols:
    coord_col = f"best_school_coord_{col}"
    geom_str = row.get(coord_col)  # None when the column does not exist
    if isinstance(geom_str, str) and geom_str not in ("None", "nan"):
        geom = wkt.loads(geom_str)
        key = (round(geom.x, 7), round(geom.y, 7))  # (lon, lat) rounded to avoid tiny diffs
        marker_groups[key].append(col.replace("_", " ").title())

for (lon, lat), labels in marker_groups.items():
    label_text = ", ".join(labels)
    folium.Marker(
        location=[lat, lon],
        icon=folium.Icon(color="green", icon="graduation-cap", prefix="fa"),
        tooltip=f"{label_text} best school"
    ).add_to(m4)

# Isochrone polygons, one toggleable layer each (missing geometries are skipped).
for col in iso_cols:
    add_iso_layer(m4, geo_layers[col], palette[col], col.replace("_", " ").title())

folium.LayerControl().add_to(m4)
m4.save("isochrones.html")

In [70]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
from PIL import Image

from pathlib import Path

from selenium.webdriver.common.by import By

HTML_PATH = "isochrones.html"  # local file created above
PNG_OUT   = "isochrones.png"
TILE_WAIT_SECONDS = 10         # upper bound on waiting for map tiles

# -- Configure headless Chrome
opts = Options()
opts.add_argument("--headless=new")
opts.add_argument("--window-size=1920,1080")
# allow file:// access & cross-origin requests for local HTML
# NOTE(review): these flags switch off browser security checks; acceptable only
# because the page is a locally generated file — do not reuse for remote pages.
opts.add_argument("--allow-file-access-from-files")
opts.add_argument("--disable-web-security")
opts.add_argument("--disable-site-isolation-trials")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
try:
    # Everything after the driver is created runs inside try/finally so the browser
    # process is closed even if loading or the screenshot fails.
    # as_uri() builds a valid, percent-encoded file:// URL on every platform.
    driver.get(Path(HTML_PATH).resolve().as_uri())

    # --- Wait for Leaflet tiles: poll how many tile images are still incomplete.
    # Polling also sees tiles added after the page loaded, and `img.complete` is
    # true for failed tiles too, so one broken tile cannot stall the wait.
    deadline = time.time() + TILE_WAIT_SECONDS
    while time.time() < deadline:
        tiles_left = driver.execute_script("""
            const tiles = document.querySelectorAll('img.leaflet-tile');
            if (tiles.length === 0) { return -1; }  // map has not created tiles yet
            return Array.from(tiles).filter(img => !img.complete).length;
        """)
        if tiles_left == 0:
            break
        time.sleep(0.25)
    time.sleep(0.5)  # tiny buffer

    # --- Option A: full-window screenshot (stays in place if the map element is missing)
    driver.save_screenshot(PNG_OUT)

    # --- Option B: just the map area. Capturing the element directly replaces the
    # intermediate "_full.png" file and the manual crop of it.
    el = driver.find_element(By.CSS_SELECTOR, "div.leaflet-container")
    el.screenshot(PNG_OUT)
finally:
    driver.quit()

print("Saved:", PNG_OUT)

Saved: isochrones.png
