In [2]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import math as m
import pyspark.sql.functions as F
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LassoCV, ElasticNetCV
from sklearn.metrics import root_mean_squared_error, r2_score
import geopandas as gpd
import folium
import matplotlib.pyplot as plt
from utilsforecast.plotting import plot_series
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import SGDRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor


# Start (or reuse) the Spark session that will run this notebook's Spark jobs.
# Settings are applied in the order listed.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "6g",
}

session_builder = SparkSession.builder.appName("MAST30034 Tutorial 1")
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/17 20:23:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the locality polygons (the .shp file is the entry point of the shapefile set)
sf = gpd.read_file("VMADMIN/LOCALITY_POLYGON.shp")
sf = sf.to_crs(epsg=4326)  # Convert to lat/lon (WGS84)
sf = sf[["LOCALITY", "geometry"]].copy()  # explicit copy before assigning a column below
sf["LOCALITY"] = sf["LOCALITY"].str.strip()  # Remove whitespace

# Map centre = centroid of all localities merged into one shape.
# The merge is expensive, so it is computed once; union_all() replaces the
# deprecated `unary_union` attribute.
localities_centroid = sf.geometry.union_all().centroid
center = [localities_centroid.y, localities_centroid.x]  # folium expects [lat, lon]

# NOTE: `m` rebinds the `math as m` alias imported at the top of the notebook.
m = folium.Map(location=center, zoom_start=6, tiles="CartoDB positron")

# Add the locality outlines; hovering shows the LOCALITY name
folium.GeoJson(
    sf.__geo_interface__,
    name="VicGov Regions",
    style_function=lambda f: {"fillOpacity": 0.2, "weight": 1},
    tooltip=folium.GeoJsonTooltip(fields=["LOCALITY"], aliases=["Suburb"])
).add_to(m)

folium.LayerControl().add_to(m)

m.save("vicgov_regions.html")  # Save to file (open in browser to view)

  center = [sf.geometry.unary_union.centroid.y, sf.geometry.unary_union.centroid.x]


In [74]:
# Display the locality GeoDataFrame (LOCALITY name and geometry columns).
sf

Unnamed: 0,LOCALITY,geometry
0,MOLLONGGHIP,"POLYGON ((144.06544 -37.48382, 144.06438 -37.4..."
1,NORTH BLACKWOOD,"POLYGON ((144.38037 -37.42376, 144.38126 -37.4..."
2,BASALT,"POLYGON ((144.1184 -37.31148, 144.1184 -37.311..."
3,LLANELLY,"POLYGON ((143.81737 -36.75048, 143.81679 -36.7..."
4,MURRABIT WEST,"POLYGON ((143.87075 -35.49319, 143.86939 -35.4..."
...,...,...
2968,CHRISTMAS HILLS,"POLYGON ((145.31596 -37.70112, 145.31607 -37.7..."
2969,YARRA GLEN,"POLYGON ((145.40036 -37.67188, 145.40022 -37.6..."
2970,HEIDELBERG WEST,"POLYGON ((145.0566 -37.73521, 145.05548 -37.73..."
2971,RESERVOIR,"POLYGON ((145.00146 -37.72995, 145.00116 -37.7..."


In [4]:
# The header spans two sheet rows (0-based): row 4 = 'Age group (years)', row 5 = 'no.'
df = pd.read_excel(
    "32350DS0005_2001-24.xlsx",
    sheet_name="Table 3",
    header=[4, 5],
)

def clean(x):
    """Normalise one header cell to a stripped string.

    Returns "" for None and for placeholder labels, i.e. text that (after
    stripping) starts with "Unnamed" or "Missing value"; otherwise returns
    the stripped string form of ``x``.
    """
    if x is None:
        return ""
    text = str(x).strip()
    placeholder_prefixes = ("Unnamed", "Missing value")
    return "" if text.startswith(placeholder_prefixes) else text

def pick_name(col):
    """Flatten a two-level header tuple ``(level0, level1)`` into one label.

    Both levels are passed through ``clean``. Columns under the
    'Age group (years)' banner take the level-1 label (e.g. '0–4', '5–9');
    every other column prefers level 0 and falls back to level 1 when
    level 0 is blank.
    """
    top = clean(col[0])
    sub = clean(col[1])
    if top and top != "Age group (years)":
        return top
    return sub

# Collapse every two-level header tuple into a single flat label.
cols = list(map(pick_name, df.columns))

# Leading identifier columns that are still blank get their expected label, by position.
id_cols = [
    "Year", "S/T code", "S/T name", "GCCSA code", "GCCSA name",
    "SA4 code", "SA4 name", "SA3 code", "SA3 name", "SA2 code", "SA2 name",
]
# zip() stops at the shorter sequence, so positions beyond len(cols) are never touched.
for position, fallback_label in zip(range(len(cols)), id_cols):
    if cols[position] == "":
        cols[position] = fallback_label

df.columns = cols

In [6]:
# Drop the first row of the loaded sheet and renumber.
# NOTE(review): presumably a leftover header/units row — confirm against the workbook.
# NOTE: this rebinds `df`, so re-running the cell drops one more row each time.
df = df.iloc[1:].reset_index(drop=True)
df["S/T name"] = df["S/T name"].str.strip()

# Geography levels above SA2 are not needed downstream.
coarser_geo_cols = [
    "S/T code", "S/T name", "GCCSA code", "GCCSA name",
    "SA4 code", "SA4 name", "SA3 code", "SA3 name",
]

# Victoria only -> integer Year -> 2024 only -> SA2-level columns
in_victoria = df["S/T name"] == "Victoria"
df_vic = (
    df.loc[in_victoria]
    .assign(Year=lambda frame: frame["Year"].astype(int))
    .loc[lambda frame: frame["Year"] == 2024]
    .reset_index(drop=True)
    .drop(columns=coarser_geo_cols)
)
df_vic

Unnamed: 0,Year,SA2 code,SA2 name,0–4,5–9,10–14,15–19,20–24,25–29,30–34,...,45–49,50–54,55–59,60–64,65–69,70–74,75–79,80–84,85 and over,Total persons
0,2024,201011001,Alfredton,1306,1615,1685,1601,1264,1131,1345,...,1329,1223,955,885,788,724,640,353,250,20130
1,2024,201011002,Ballarat,484,586,641,830,763,557,596,...,723,827,761,828,761,696,626,403,368,11773
2,2024,201011005,Buninyong,304,459,603,621,532,260,290,...,523,526,421,500,481,454,289,138,128,7358
3,2024,201011006,Delacombe,1183,1069,953,867,1028,1451,1464,...,698,661,563,580,533,437,365,205,220,14271
4,2024,201011007,Smythes Creek,185,270,337,381,260,114,216,...,289,329,348,296,294,188,159,52,51,4278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,2024,217031476,Otway,158,161,183,188,96,125,197,...,207,305,285,409,401,353,240,127,86,3948
518,2024,217041477,Moyne - East,447,451,471,488,336,382,391,...,399,467,511,568,481,356,276,142,134,7145
519,2024,217041478,Moyne - West,520,598,671,691,409,448,470,...,593,640,680,741,772,766,553,280,262,10215
520,2024,217041479,Warrnambool - North,1304,1348,1509,1560,1298,1338,1506,...,1405,1414,1393,1404,1343,1156,1007,636,534,22916


In [21]:
# SA2 boundaries, reprojected to lat/lon (WGS84)
sa2_sf = gpd.read_file("SA2_2021_AUST_SHP_GDA2020/SA2_2021_AUST_GDA2020.shp").to_crs(epsg=4326)

# Keep Victoria only; name and code are both checked, just in case
is_vic_name = sa2_sf["STE_NAME21"] == "Victoria"
is_vic_code = sa2_sf["STE_CODE21"] == "2"
sa2_sf_vic = sa2_sf.loc[is_vic_name & is_vic_code].copy()

# Join keys on both sides become 9-character, zero-padded strings
sa2_sf_vic["SA2_CODE21"] = sa2_sf_vic["SA2_CODE21"].astype(str).str.zfill(9)
df_vic["SA2 code"] = df_vic["SA2 code"].astype(str).str.zfill(9)

# Attach the population columns to each SA2 polygon, then drop incomplete rows.
# NOTE: how="any" removes a row when ANY column is missing, not only the population columns.
vic_merged = (
    sa2_sf_vic.merge(df_vic, left_on="SA2_CODE21", right_on="SA2 code", how="left")
    .dropna(axis=0, how="any")
    .reset_index(drop=True)
)
vic_merged

Unnamed: 0,SA2_CODE21,SA2_NAME21,CHG_FLAG21,CHG_LBL21,SA3_CODE21,SA3_NAME21,SA4_CODE21,SA4_NAME21,GCC_CODE21,GCC_NAME21,...,45–49,50–54,55–59,60–64,65–69,70–74,75–79,80–84,85 and over,Total persons
0,201011001,Alfredton,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,...,1329,1223,955,885,788,724,640,353,250,20130
1,201011002,Ballarat,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,...,723,827,761,828,761,696,626,403,368,11773
2,201011005,Buninyong,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,...,523,526,421,500,481,454,289,138,128,7358
3,201011006,Delacombe,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,...,698,661,563,580,533,437,365,205,220,14271
4,201011007,Smythes Creek,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,...,289,329,348,296,294,188,159,52,51,4278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,217031476,Otway,0,No change,21703,Colac - Corangamite,217,Warrnambool and South West,2RVIC,Rest of Vic.,...,207,305,285,409,401,353,240,127,86,3948
518,217041477,Moyne - East,0,No change,21704,Warrnambool,217,Warrnambool and South West,2RVIC,Rest of Vic.,...,399,467,511,568,481,356,276,142,134,7145
519,217041478,Moyne - West,0,No change,21704,Warrnambool,217,Warrnambool and South West,2RVIC,Rest of Vic.,...,593,640,680,741,772,766,553,280,262,10215
520,217041479,Warrnambool - North,0,No change,21704,Warrnambool,217,Warrnambool and South West,2RVIC,Rest of Vic.,...,1405,1414,1393,1404,1343,1156,1007,636,534,22916


In [None]:
# SA2 outline map; hovering a region shows its name and total population
m2 = folium.Map(location=center, zoom_start=6, tiles="CartoDB positron")


def _outline_style(_feature):
    """Same light fill and thin border for every region."""
    return {"fillOpacity": 0.2, "weight": 1}


population_tooltip = folium.GeoJsonTooltip(
    fields=["SA2_NAME21", "Total persons"],
    aliases=["Suburb", "Total Persons Population"],
)
regions_layer = folium.GeoJson(
    vic_merged.__geo_interface__,
    name="VicGov Regions",
    style_function=_outline_style,
    tooltip=population_tooltip,
)
regions_layer.add_to(m2)

folium.LayerControl().add_to(m2)
m2.save("vicgov_population.html")  # Save to file (open in browser to view)


In [29]:
import matplotlib.colors as mcolors

In [None]:

# SA2 population choropleth on a fresh map
m2 = folium.Map(location=center, zoom_start=6, tiles="CartoDB positron")

# Linear colour scale over the observed population range
# (viridis: dark purple = low, bright yellow = high)
cmap = plt.cm.viridis
total_persons = vic_merged["Total persons"]
norm = mcolors.Normalize(vmin=total_persons.min(), vmax=total_persons.max())


def _population_style(feature):
    """Fill an SA2 polygon with the colour matching its 'Total persons' value."""
    population = feature["properties"]["Total persons"]
    return {
        "fillColor": mcolors.to_hex(cmap(norm(population))),
        "color": "black",
        "weight": 0.2,
        "fillOpacity": 0.7,
    }


population_layer = folium.GeoJson(
    vic_merged,  # merged SA2 geometry + population GeoDataFrame
    name="SA2 Population",
    style_function=_population_style,
    tooltip=folium.GeoJsonTooltip(
        fields=["SA2_NAME21", "Total persons"],
        aliases=["SA2", "Total Persons Population"],
    ),
)
population_layer.add_to(m2)

folium.LayerControl().add_to(m2)
m2.save("vicgov_population2_sa2.html")  # Save to file (open in browser to view)

In [34]:
# Reload the locality polygons (the .shp file is the entry point of the shapefile set)
sf = gpd.read_file("VMADMIN/LOCALITY_POLYGON.shp")
sf = sf.to_crs(epsg=4326)  # Convert to lat/lon (WGS84)
sf = sf[["LOCALITY", "geometry"]].copy()  # explicit copy before assigning a column below
sf["LOCALITY"] = sf["LOCALITY"].str.strip()  # Remove whitespace

# Map centre = centroid of all localities merged into one shape.
# The merge is expensive, so it is computed once; union_all() replaces the
# deprecated `unary_union` attribute.
localities_centroid = sf.geometry.union_all().centroid
center = [localities_centroid.y, localities_centroid.x]  # folium expects [lat, lon]

# NOTE: `m` rebinds the `math as m` alias imported at the top of the notebook.
m = folium.Map(location=center, zoom_start=6, tiles="CartoDB positron")

# Layer 1: locality outlines; hovering shows the LOCALITY name
folium.GeoJson(
    sf.__geo_interface__,
    name="VicGov Regions",
    style_function=lambda f: {"fillOpacity": 0.4, "weight": 1},
    tooltip=folium.GeoJsonTooltip(fields=["LOCALITY"], aliases=["Suburb"])
).add_to(m)

# Layer 2: SA2 population choropleth, coloured through the `cmap`/`norm` pair
# defined in an earlier cell (viridis: dark purple = low, bright yellow = high)
folium.GeoJson(
    vic_merged,  # merged SA2 geometry + population GeoDataFrame
    name="SA2 Population",
    style_function=lambda feature: {
    "fillColor": mcolors.to_hex(
        cmap(norm(feature["properties"]["Total persons"]))
    ),
    "color": "black",
    "weight": 0.2,
    "fillOpacity": 0.7,
    },
    tooltip=folium.GeoJsonTooltip(
    fields=["SA2_NAME21", "Total persons"],
    aliases=["SA2", "Total Persons Population"]
    )
).add_to(m)

folium.LayerControl().add_to(m)

m.save("vicgov_pop_sa2_suburb.html")  # Save to file (open in browser to view)


  center = [sf.geometry.unary_union.centroid.y, sf.geometry.unary_union.centroid.x]


In [73]:
# read in public transport stops geojson
# Public transport stops (GeoJSON), reprojected to lat/lon (WGS84)
pt_stops = gpd.read_file("../data/landing/ptv/public_transport_stops.geojson").to_crs(epsg=4326)

# One row per (STOP_NAME, MODE): how many stops share that name, plus the first geometry seen
stops_by_name_mode = (
    pt_stops.groupby(["STOP_NAME", "MODE"])
    .agg(num_stops=("STOP_NAME", "count"), geometry=("geometry", "first"))
    .reset_index()
)

# Rebuild a GeoDataFrame from the aggregate and discard rows without a geometry
pt_stops_sub = (
    gpd.GeoDataFrame(stops_by_name_mode, geometry="geometry", crs=pt_stops.crs)
    .to_crs(epsg=4326)
    .dropna(subset=["geometry"])
)
pt_stops_sub

Unnamed: 0,STOP_NAME,MODE,num_stops,geometry
0,10 Jarrah Dr,METRO BUS,1,POINT (145.11072 -38.00284)
1,10 Oban Rd,METRO BUS,1,POINT (145.25205 -37.79634)
2,10 Queens Pde,METRO BUS,1,POINT (144.97125 -37.71958)
3,100 Kent Rd,REGIONAL BUS,1,POINT (142.01692 -37.72773)
4,100 South Gippsland Hwy,METRO BUS,1,POINT (145.22923 -38.009)
...,...,...,...,...
20589,lift,METRO TRAIN,1,POINT (145.09741 -37.62259)
20590,opp 12 Main Rd,REGIONAL BUS,1,POINT (144.20394 -37.07956)
20591,opp 154 Princes Hwy,REGIONAL BUS,1,POINT (147.6562 -37.81944)
20592,opp 34 The Elms Bvd,REGIONAL BUS,1,POINT (144.95935 -37.31601)


In [None]:
# Display the public transport stops GeoDataFrame loaded above.
pt_stops

In [None]:
# Preview the per-(STOP_NAME, MODE) stop counts.
pt_stops_sub.head()

In [68]:
# Distinct values taken by the per-name stop count.
pt_stops_sub["num_stops"].unique()

array([ 1,  2,  4,  3, 10, 11,  6,  7, 13,  5,  8, 12, 14,  9])

In [69]:
# Plot the aggregated public transport stops, coloured by how many stops share each name
m3 = folium.Map(location=center, zoom_start=6, tiles="CartoDB positron")

# Linear colour scale over the observed stop counts
# (viridis: dark purple = few stops, bright yellow = many stops)
cmap = plt.cm.viridis
norm = mcolors.Normalize(vmin=pt_stops_sub["num_stops"].min(), vmax=pt_stops_sub["num_stops"].max())

# The features are Points, not polygons. Without `marker`, folium draws its default
# markers and the fill/stroke keys returned by style_function have no visible effect.
# Supplying a CircleMarker lets the per-feature style be applied to each circle.
folium.GeoJson(
    pt_stops_sub,  # one Point per (STOP_NAME, MODE)
    name="PTV Stops per suburb",
    marker=folium.CircleMarker(radius=4, fill=True),
    style_function=lambda feature: {
    "fillColor": mcolors.to_hex(
        cmap(norm(feature["properties"]["num_stops"]))
    ),
    "color": "black",
    "weight": 0.2,
    "fillOpacity": 0.7,
    },
    tooltip=folium.GeoJsonTooltip(
    fields=["STOP_NAME", "num_stops"],
    aliases=["PTV Stop Name", "Number of Stops"]
    )
).add_to(m3)

folium.LayerControl().add_to(m3)
m3.save("vicgov_ptv_stops.html")  # Save to file (open in browser to view)

In [64]:
from folium.plugins import MarkerCluster

In [71]:
# Colour scale: viridis stretched over the observed range of stops-per-name
cmap = plt.cm.viridis
stop_counts = pt_stops_sub["num_stops"]
norm = mcolors.Normalize(stop_counts.min(), stop_counts.max())

# Base map plus a cluster layer that groups nearby markers
m3 = folium.Map(location=center, zoom_start=6, tiles="CartoDB positron")
cluster = MarkerCluster(name="PTV Stops (by STOP_NAME count)").add_to(m3)

# One circle per aggregated stop; both colour and radius grow with the stop count
for _, r in pt_stops_sub.iterrows():
    n_stops = r["num_stops"]
    stop_point = r.geometry
    marker_colour = mcolors.to_hex(cmap(norm(n_stops)))
    stop_marker = folium.CircleMarker(
        location=[stop_point.y, stop_point.x],  # folium expects [lat, lon]
        radius=4 + 0.6 * (n_stops - 1),
        color=marker_colour,
        fillColor=marker_colour,
        fill=True,
        fill_opacity=0.9,
        weight=0,
        tooltip=f"PTV Stop Name: {r['STOP_NAME']}<br>Number of Stops: {int(n_stops)}",
    )
    stop_marker.add_to(cluster)

# Zoom the map to the bounding box of the stops
west, south, east, north = pt_stops_sub.total_bounds
m3.fit_bounds([[south, west], [north, east]])

folium.LayerControl().add_to(m3)
m3.save("vicgov_ptv_stops.html")

In [81]:
def _normalise_suburb(names):
    """Lower-case then strip a string column so both sides of the join share one key format."""
    return names.str.lower().str.strip()


# Postcode -> suburb lookup with a normalised join key
postcode_df = pd.read_csv("../data/geo/vic_suburbs_postcodes.csv")
postcode_df["suburb"] = _normalise_suburb(postcode_df["suburb"])

# Same key on the locality polygons (adds a 'suburb' column to sf in place)
sf["suburb"] = _normalise_suburb(sf["LOCALITY"])

# Attach the locality polygon to every postcode row; rows with any missing value are dropped
postcode_df_cord = (
    postcode_df.merge(sf, left_on="suburb", right_on="suburb", how="left")
    .dropna(how="any")
)
postcode_df_cord

Unnamed: 0,postcode,suburb,LOCALITY,geometry
0,3000,melbourne,MELBOURNE,"POLYGON ((144.97798 -37.83865, 144.97805 -37.8..."
1,3001,melbourne,MELBOURNE,"POLYGON ((144.97798 -37.83865, 144.97805 -37.8..."
2,3002,east melbourne,EAST MELBOURNE,"POLYGON ((144.97862 -37.81858, 144.97868 -37.8..."
3,3003,west melbourne,WEST MELBOURNE,"POLYGON ((144.93263 -37.81787, 144.93256 -37.8..."
4,3004,melbourne,MELBOURNE,"POLYGON ((144.97798 -37.83865, 144.97805 -37.8..."
...,...,...,...,...
3181,8396,melbourne,MELBOURNE,"POLYGON ((144.97798 -37.83865, 144.97805 -37.8..."
3182,8399,melbourne,MELBOURNE,"POLYGON ((144.97798 -37.83865, 144.97805 -37.8..."
3183,8576,ivanhoe,IVANHOE,"POLYGON ((145.05731 -37.78139, 145.05699 -37.7..."
3184,8627,camberwell,CAMBERWELL,"POLYGON ((145.05426 -37.84174, 145.054 -37.841..."


In [88]:
# Display sf again; it now also carries the lower-cased 'suburb' column.
sf

Unnamed: 0,LOCALITY,geometry,suburb
0,MOLLONGGHIP,"POLYGON ((144.06544 -37.48382, 144.06438 -37.4...",mollongghip
1,NORTH BLACKWOOD,"POLYGON ((144.38037 -37.42376, 144.38126 -37.4...",north blackwood
2,BASALT,"POLYGON ((144.1184 -37.31148, 144.1184 -37.311...",basalt
3,LLANELLY,"POLYGON ((143.81737 -36.75048, 143.81679 -36.7...",llanelly
4,MURRABIT WEST,"POLYGON ((143.87075 -35.49319, 143.86939 -35.4...",murrabit west
...,...,...,...
2968,CHRISTMAS HILLS,"POLYGON ((145.31596 -37.70112, 145.31607 -37.7...",christmas hills
2969,YARRA GLEN,"POLYGON ((145.40036 -37.67188, 145.40022 -37.6...",yarra glen
2970,HEIDELBERG WEST,"POLYGON ((145.0566 -37.73521, 145.05548 -37.73...",heidelberg west
2971,RESERVOIR,"POLYGON ((145.00146 -37.72995, 145.00116 -37.7...",reservoir


In [93]:
from shapely.geometry import Point, Polygon

def get_suburb(lon, lat, sf):
    """Return the suburb whose polygon covers the point (lon, lat), or None.

    Parameters
    ----------
    lon, lat
        Coordinates handed to ``Point(lon, lat)``, e.g. (144.9631, -37.8136).
    sf
        Table with a 'geometry' column (shapes exposing ``covers``) and a
        'suburb' column.

    Returns
    -------
    The 'suburb' value of the first row, in table order, whose geometry
    covers the point; None when no row does.
    """
    target = Point(lon, lat)
    # Lazy scan: stops at the first covering polygon.
    covering_suburbs = (
        suburb
        for shape, suburb in zip(sf["geometry"], sf["suburb"])
        if shape.covers(target)
    )
    return next(covering_suburbs, None)

# get_suburb returns None when no polygon covers the point;
# such rows are to be removed from the dataframe.

# Smoke test with a point in Melbourne
melbourne_lon, melbourne_lat = 144.9631, -37.8136
suburb_name = get_suburb(melbourne_lon, melbourne_lat, sf)
print(suburb_name)  # should print 'melbourne'


melbourne
