# Greater Sydney SA4 Analysis (Unified)

This notebook performs the same analysis across three SA4 regions in Greater Sydney:
- Parramatta
- Inner South West
- Northern Beaches

It uses the same pipeline of data loading, filtering, PostgreSQL ingestion, scoring, and summary reporting.

In [92]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
from shapely.geometry import Point
from sqlalchemy import create_engine
import time
import requests
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import numpy as np
from IPython.display import display

# PostgreSQL connection (from original notebook)
engine = create_engine("postgresql://postgres:0111@localhost:5432/project2")


In [93]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def analyze_sa4(sa4_name):
    print(f"\n--- Analyzing SA4 Region: {sa4_name} ---\n")

    # 1. Load SA2 shapefile and filter to the selected SA4
    sa2 = gpd.read_file("../data/SA2_2021_AUST_SHP_GDA2020/SA2_2021_AUST_GDA2020.shp")
    sa2_gsyd = sa2[sa2['GCC_NAME21'] == 'Greater Sydney']
    sa2_filtered = sa2_gsyd[sa2_gsyd['SA4_NAME21'] == sa4_name].copy()

    # 2. Load and filter population data using SA2 names
    population = pd.read_csv("../data/Population.csv")
    population = population[population['sa2_name'].isin(sa2_filtered['SA2_NAME21'])]

    # Compute young people population (ages 0-19)
    young_cols = ['0-4_people', '5-9_people', '10-14_people', '15-19_people']
    population['young_people'] = population[young_cols].sum(axis=1)

    # 3. Load and aggregate businesses data by SA2
    businesses = pd.read_csv("../data/Businesses.csv")
    biz_summary = businesses.groupby('sa2_name')['total_businesses'].sum().reset_index()
    sa2_filtered = sa2_filtered.merge(biz_summary, left_on='SA2_NAME21', right_on='sa2_name', how='left')
    sa2_filtered['total_businesses'] = sa2_filtered['total_businesses'].fillna(0)

    # 4. Load and spatially join stops data using lat/lon
    stops = pd.read_csv("../data/Stops.txt")
    stops = stops.dropna(subset=['stop_lat', 'stop_lon'])
    stops['geometry'] = stops.apply(lambda row: Point(row['stop_lon'], row['stop_lat']), axis=1)
    stops_gdf = gpd.GeoDataFrame(stops, geometry='geometry', crs="EPSG:4326")
    stops_gdf = stops_gdf.to_crs(sa2_filtered.crs)
    stop_counts = gpd.sjoin(stops_gdf, sa2_filtered, how="inner", predicate="intersects") \
                    .groupby("SA2_NAME21").size().reset_index(name="stop_count")
    sa2_filtered = sa2_filtered.merge(stop_counts, on="SA2_NAME21", how="left")
    sa2_filtered["stop_count"] = sa2_filtered["stop_count"].fillna(0)

    # 5. Load and spatially join school catchments (primary + secondary)
    primary = gpd.read_file("../data/catchments/catchments/catchments_primary.shp")
    secondary = gpd.read_file("../data/catchments/catchments/catchments_secondary.shp")
    schools = pd.concat([primary, secondary], ignore_index=True)
    schools = schools.to_crs(sa2_filtered.crs)
    school_counts = gpd.sjoin(sa2_filtered, schools, how="left", predicate="intersects") \
                      .groupby("SA2_NAME21").size().reset_index(name="school_count")
    sa2_filtered = sa2_filtered.merge(school_counts, on="SA2_NAME21", how="left")
    sa2_filtered["school_count"] = sa2_filtered["school_count"].fillna(0)

    # 6. Load and filter income data
    income = pd.read_csv("../data/Income.csv")
    income = income[income['sa2_name'].isin(sa2_filtered['SA2_NAME21'])]

    # 7. Dummy POI count (random for now)
    sa2_filtered['POI_count'] = np.random.randint(10, 100, sa2_filtered.shape[0])

    # 8. Compute z-scores and final score using sigmoid
    df = pd.DataFrame()
    df['SA2_NAME'] = sa2_filtered['SA2_NAME21'].values
    df['z_business'] = (sa2_filtered['total_businesses'] - sa2_filtered['total_businesses'].mean()) / sa2_filtered['total_businesses'].std()
    df['z_stops'] = (sa2_filtered['stop_count'] - sa2_filtered['stop_count'].mean()) / sa2_filtered['stop_count'].std()
    df['z_schools'] = (sa2_filtered['school_count'] - sa2_filtered['school_count'].mean()) / sa2_filtered['school_count'].std()
    df['z_POI'] = (sa2_filtered['POI_count'] - sa2_filtered['POI_count'].mean()) / sa2_filtered['POI_count'].std()
    df['score'] = sigmoid(df[['z_business', 'z_stops', 'z_schools', 'z_POI']].sum(axis=1))
    df['median_income'] = income['median_income'].values[:len(df)]

    # 9. Display formatted summary table
    display(df.style.set_table_styles(
        [{'selector': 'table', 'props': [('border', '1px solid black')]}]
    ).set_properties(**{'border': '1px solid black'}))

    return df

In [94]:
df_parramatta = analyze_sa4('Sydney - Parramatta')


--- Analyzing SA4 Region: Sydney - Parramatta ---



Unnamed: 0,SA2_NAME,z_business,z_stops,z_schools,z_POI,score,median_income
0,Rookwood Cemetery,-1.974155,-1.412118,-1.038873,1.237861,0.039647,83172
1,Auburn - Central,1.012002,-0.022192,0.073621,-0.298794,0.68236,38824
2,Auburn - North,-0.072305,-0.829673,-0.760749,0.682958,0.272937,39571
3,Auburn - South,-0.594337,-0.789961,0.629868,-0.853697,0.166849,41555
4,Berala,-0.995634,-0.617875,-1.316996,-1.067121,0.018028,43527
5,Lidcombe,1.428248,0.507304,0.907991,0.768327,0.973709,43794
6,Regents Park,-1.154313,-1.014997,-0.760749,-1.4086,0.012886,44166
7,Silverwater - Newington,0.392234,-0.829673,-1.038873,-0.640273,0.107495,58967
8,Wentworth Point - Sydney Olympic Park,0.796981,-0.94881,-0.204503,-1.152491,0.181113,59389
9,Ermington - Rydalmere,1.089042,2.003129,2.576732,-0.512218,0.994272,56447


In [95]:
df_inner_south_west = analyze_sa4('Sydney - Inner South West')


--- Analyzing SA4 Region: Sydney - Inner South West ---



Unnamed: 0,SA2_NAME,z_business,z_stops,z_schools,z_POI,score,median_income
0,Bass Hill - Georges Hall,0.91276,3.082984,0.645597,1.00187,0.996471,45954
1,Chullora,-1.456292,-1.353197,-1.119035,-1.412115,0.00477,55658
2,Condell Park,1.908875,1.369005,0.939702,-0.750216,0.969745,45799
3,Padstow,0.149289,0.245557,1.527913,-0.516604,0.803159,54688
4,Revesby,-0.389727,1.008926,0.645597,-0.905956,0.588759,54418
5,Yagoona - Birrong,0.494981,1.196167,0.645597,-0.166187,0.897574,45361
6,Bankstown - North,1.045466,-0.186539,0.057386,-0.438734,0.617176,44021
7,Bankstown - South,1.749956,0.346379,2.116123,-1.217438,0.952349,42430
8,Greenacre - North,0.401595,1.152957,-1.41314,-1.489985,0.206104,43915
9,Greenacre - South,0.05918,-0.388183,-2.00135,0.417842,0.128699,43470


In [96]:
df_northern_beaches = analyze_sa4('Sydney - Northern Beaches')


--- Analyzing SA4 Region: Sydney - Northern Beaches ---



Unnamed: 0,SA2_NAME,z_business,z_stops,z_schools,z_POI,score,median_income
0,Balgowlah - Clontarf - Seaforth,0.858228,1.881288,-0.095763,1.176327,0.978544,73271
1,Manly - Fairlight,1.703212,0.506501,-0.615617,-0.571998,0.735381,77750
2,Avalon - Palm Beach,0.130816,1.612308,-1.395399,-0.004974,0.584859,56676
3,Bayview - Elanora Heights,-0.273546,0.431784,0.164165,-0.477494,0.461305,59217
4,Newport - Bilgola,0.167077,-0.644137,-0.875545,0.609303,0.322282,61478
5,Mona Vale - Warriewood (North),1.00327,0.10303,-0.615617,0.08953,0.641117,59292
6,North Narrabeen - Warriewood (South),-0.603188,-0.405043,-1.395399,1.412587,0.270706,63033
7,Beacon Hill - Narraweena,-0.590003,-0.16595,-0.35569,-0.193982,0.21322,57919
8,Cromer,-0.815258,-0.688967,0.424092,-0.855511,0.126127,58745
9,Forestville - Killarney Heights,-0.529568,-0.74874,0.164165,-0.855511,0.122426,63005


In [97]:
summary = pd.DataFrame({
    'SA4': ['Parramatta', 'Inner South West', 'Northern Beaches'],
    'Average Score': [
        df_parramatta['score'].mean(),
        df_inner_south_west['score'].mean(),
        df_northern_beaches['score'].mean()
    ]
})
display(summary.style.set_properties(**{'border': '1px solid black'}))


Unnamed: 0,SA4,Average Score
0,Parramatta,0.483049
1,Inner South West,0.511106
2,Northern Beaches,0.458899


## Summary Comparison Across SA4 Regions

In [98]:

summary = pd.DataFrame({
    'SA4': ['Parramatta', 'Inner South West', 'Northern Beaches'],
    'Average Score': [
        df_parramatta['score'].mean(),
        df_inner_south_west['score'].mean(),
        df_northern_beaches['score'].mean()
    ]
})
display(summary.style.set_properties(**{'border': '1px solid black'}))


Unnamed: 0,SA4,Average Score
0,Parramatta,0.483049
1,Inner South West,0.511106
2,Northern Beaches,0.458899
