# SA2 and SAL geo data preprocessing 

This notebook downloads and preprocesses Victoria's SA2 and SAL geo data and saves the SA2 and SAL data as two separate GeoJSON files in the folder `victoria_region_gdf`, which is created in the raw data folder.

In [1]:
import os
import zipfile
import pandas as pd
import geopandas as gpd
import re




## Download SA2 and SAL data to local landing folder 

In [2]:
# TODO: the download step is not implemented in this cell. The next cell expects
# SA2_2021_AUST_SHP_GDA2020.zip and SAL_2021_AUST_GDA2020_SHP.zip to already exist in ../data/landing.

## Read and Extract SAL and SA2 zip to landing 
### Create a new folder named `shapefile` in landing to store the extracted SAL and SA2 data



In [3]:
# Source archives in the landing folder
SA2_zip_file_path = '../data/landing/SA2_2021_AUST_SHP_GDA2020.zip'
SAL_zip_file_path = '../data/landing/SAL_2021_AUST_GDA2020_SHP.zip'

# Destination folders, one per geography
SA2_extraction_path = '../data/landing/shapefile/SA2'
SAL_extraction_path = '../data/landing/shapefile/SAL'

# Make sure both destination folders exist before anything is unpacked
for target_dir in (SA2_extraction_path, SAL_extraction_path):
    os.makedirs(target_dir, exist_ok=True)

# Unpack every member of each archive into its destination folder (SAL first, then SA2)
for archive_path, target_dir in (
    (SAL_zip_file_path, SAL_extraction_path),
    (SA2_zip_file_path, SA2_extraction_path),
):
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(target_dir)

## Read shapefiles in Geopandas

In [4]:
# Locations of the extracted .shp files
# (note the SAL path has an extra SAL_2021_AUST_GDA2020_SHP sub-folder)
SA2_shapefile_path = "../data/landing/shapefile/SA2/SA2_2021_AUST_GDA2020.shp"
SAL_shapefile_path = "../data/landing/shapefile/SAL/SAL_2021_AUST_GDA2020_SHP/SAL_2021_AUST_GDA2020.shp"

# Load each shapefile into its own GeoDataFrame
SA2_gdf = gpd.read_file(SA2_shapefile_path)
SAL_gdf = gpd.read_file(SAL_shapefile_path)

In [5]:
# Preview the SAL GeoDataFrame as loaded from the shapefile (not yet filtered by state)
SAL_gdf

Unnamed: 0,SAL_CODE21,SAL_NAME21,STE_CODE21,STE_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,SHAPE_Leng,SHAPE_Area,geometry
0,10001,Aarons Pass,1,New South Wales,AUS,Australia,82.7639,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.554241,0.007975,"POLYGON ((149.82477 -32.84384, 149.83271 -32.8..."
1,10002,Abbotsbury,1,New South Wales,AUS,Australia,4.9788,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.123051,0.000485,"POLYGON ((150.86523 -33.88264, 150.86479 -33.8..."
2,10003,Abbotsford (NSW),1,New South Wales,AUS,Australia,1.0180,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.053423,0.000099,"POLYGON ((151.13472 -33.85492, 151.13445 -33.8..."
3,10004,Abercrombie,1,New South Wales,AUS,Australia,2.9775,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.097338,0.000289,"POLYGON ((149.55192 -33.39280, 149.55148 -33.3..."
4,10005,Abercrombie River,1,New South Wales,AUS,Australia,127.1701,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.848903,0.012397,"POLYGON ((149.25562 -33.96535, 149.25563 -33.9..."
...,...,...,...,...,...,...,...,...,...,...,...
15348,90004,Norfolk Island,9,Other Territories,AUS,Australia,38.6510,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.629774,0.003580,"MULTIPOLYGON (((167.94051 -29.06260, 167.94046..."
15349,90005,West Island,9,Other Territories,AUS,Australia,5.9276,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.416115,0.000492,"MULTIPOLYGON (((96.82779 -12.17627, 96.82773 -..."
15350,99494,No usual address (OT),9,Other Territories,AUS,Australia,0.0000,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.000000,0.000000,
15351,99797,Migratory - Offshore - Shipping (OT),9,Other Territories,AUS,Australia,0.0000,http://linked.data.gov.au/dataset/asgsed3/SAL/...,0.000000,0.000000,


In [6]:
# Preview the SA2 GeoDataFrame as loaded from the shapefile (not yet filtered by state)
SA2_gdf

Unnamed: 0,SA2_CODE21,SA2_NAME21,CHG_FLAG21,CHG_LBL21,SA3_CODE21,SA3_NAME21,SA4_CODE21,SA4_NAME21,GCC_CODE21,GCC_NAME21,STE_CODE21,STE_NAME21,AUS_CODE21,AUS_NAME21,AREASQKM21,LOCI_URI21,geometry
0,101021007,Braidwood,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,3418.3525,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.58424 -35.44426, 149.58444 -35.4..."
1,101021008,Karabar,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,6.9825,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.21899 -35.36738, 149.21800 -35.3..."
2,101021009,Queanbeyan,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,4.7620,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.21326 -35.34325, 149.21619 -35.3..."
3,101021010,Queanbeyan - East,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.0032,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.24034 -35.34781, 149.24024 -35.3..."
4,101021012,Queanbeyan West - Jerrabomberra,0,No change,10102,Queanbeyan,101,Capital Region,1RNSW,Rest of NSW,1,New South Wales,AUS,Australia,13.6748,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((149.19572 -35.36126, 149.19970 -35.3..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2468,901031003,Jervis Bay,0,No change,90103,Jervis Bay,901,Other Territories,9OTER,Other Territories,9,Other Territories,AUS,Australia,67.2296,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"MULTIPOLYGON (((150.69567 -35.18295, 150.69556..."
2469,901041004,Norfolk Island,0,No change,90104,Norfolk Island,901,Other Territories,9OTER,Other Territories,9,Other Territories,AUS,Australia,38.6510,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"MULTIPOLYGON (((167.96325 -29.07212, 167.96326..."
2470,997979799,Migratory - Offshore - Shipping (OT),0,No change,99797,Migratory - Offshore - Shipping (OT),997,Migratory - Offshore - Shipping (OT),99799,Migratory - Offshore - Shipping (OT),9,Other Territories,AUS,Australia,,http://linked.data.gov.au/dataset/asgsed3/SA2/...,
2471,999999499,No usual address (OT),0,No change,99999,No usual address (OT),999,No usual address (OT),99499,No usual address (OT),9,Other Territories,AUS,Australia,,http://linked.data.gov.au/dataset/asgsed3/SA2/...,


In [7]:
# Restrict both geographies to rows whose state name is Victoria
is_victoria_SA2 = SA2_gdf['STE_NAME21'] == 'Victoria'
is_victoria_SAL = SAL_gdf['STE_NAME21'] == 'Victoria'

victoria_SA2_gdf = SA2_gdf.loc[is_victoria_SA2]
victoria_SAL_gdf = SAL_gdf.loc[is_victoria_SAL]

### Feature selection

In [8]:
# Columns kept for each geography: name, code and geometry (plus SHAPE_Area for SAL)
SA2_key_columns = ['SA2_NAME21', 'SA2_CODE21', 'geometry']
SAL_key_columns = ['SAL_NAME21', 'SAL_CODE21', 'SHAPE_Area', 'geometry']

victoria_SA2_gdf = victoria_SA2_gdf.loc[:, SA2_key_columns]
victoria_SAL_gdf = victoria_SAL_gdf.loc[:, SAL_key_columns]


In [9]:
# Preview the Victorian SA2 rows after column selection
victoria_SA2_gdf

Unnamed: 0,SA2_NAME21,SA2_CODE21,geometry
644,Alfredton,201011001,"POLYGON ((143.78282 -37.56666, 143.75558 -37.5..."
645,Ballarat,201011002,"POLYGON ((143.81896 -37.55582, 143.81644 -37.5..."
646,Buninyong,201011005,"POLYGON ((143.84171 -37.61596, 143.84176 -37.6..."
647,Delacombe,201011006,"POLYGON ((143.75050 -37.59119, 143.75044 -37.5..."
648,Smythes Creek,201011007,"POLYGON ((143.73296 -37.62333, 143.73263 -37.6..."
...,...,...,...
1163,Moyne - West,217041478,"MULTIPOLYGON (((142.00870 -38.41715, 142.00876..."
1164,Warrnambool - North,217041479,"POLYGON ((142.43668 -38.35544, 142.43658 -38.3..."
1165,Warrnambool - South,217041480,"POLYGON ((142.45281 -38.39126, 142.45230 -38.3..."
1166,Migratory - Offshore - Shipping (Vic.),297979799,


In [10]:
# Preview the Victorian SAL rows after column selection
victoria_SAL_gdf

Unnamed: 0,SAL_NAME21,SAL_CODE21,SHAPE_Area,geometry
4544,Abbeyard,20001,0.033162,"POLYGON ((146.89824 -37.04602, 146.89947 -37.0..."
4545,Abbotsford (Vic.),20002,0.000178,"POLYGON ((145.00195 -37.79665, 145.00190 -37.7..."
4546,Aberfeldie,20003,0.000159,"POLYGON ((144.89576 -37.76514, 144.89547 -37.7..."
4547,Aberfeldy,20004,0.001107,"POLYGON ((146.38814 -37.72232, 146.38808 -37.7..."
4548,Acheron,20005,0.007381,"POLYGON ((145.76731 -37.25433, 145.76757 -37.2..."
...,...,...,...,...
7485,Yuulong,22942,0.005404,"POLYGON ((143.32185 -38.68969, 143.32203 -38.6..."
7486,Zeerust,22943,0.001808,"POLYGON ((145.40454 -36.25294, 145.40479 -36.2..."
7487,Zumsteins,22944,0.026011,"POLYGON ((142.48512 -37.15733, 142.48443 -37.1..."
7488,No usual address (Vic.),29494,0.000000,


### Remove unnecessary symbols in the string value (Cleaning)

`SA2_NAME21` and `SAL_NAME21` contain some symbols (for example `-`, `/`, parentheses and the `(Vic.)` suffix). We remove them so that later string comparison is easier.

In [11]:
# Substrings to strip from the region names. Order matters inside the regex alternation:
# '(Vic.)' is listed before '(' so the whole state suffix is removed in one match.
symbols_to_remove = ['-', '_', '(Vic.)', '(', ')', '/', ]
symbol_pattern = '|'.join(map(re.escape, symbols_to_remove))


def clean_region_name(names):
    """Return region names with the unwanted symbols removed and whitespace normalised.

    Deleting a symbol leaves its surrounding spaces behind (e.g. 'Moyne - West'
    becomes 'Moyne  West' and 'Abbotsford (Vic.)' becomes 'Abbotsford '), which
    would break exact string comparison later. Runs of whitespace are therefore
    collapsed to a single space and leading/trailing whitespace is stripped.

    Parameters
    ----------
    names : pandas.Series
        String column holding the region names.

    Returns
    -------
    pandas.Series
        Cleaned names, aligned on the same index as ``names``.
    """
    return (
        names.str.replace(symbol_pattern, '', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )


# Build new frames rather than assigning into a frame derived from a filtered selection
victoria_SA2_gdf = victoria_SA2_gdf.assign(
    SA2_NAME21=clean_region_name(victoria_SA2_gdf["SA2_NAME21"])
)
victoria_SAL_gdf = victoria_SAL_gdf.assign(
    SAL_NAME21=clean_region_name(victoria_SAL_gdf["SAL_NAME21"])
)

In [12]:
# Preview the SA2 names after symbol removal
victoria_SA2_gdf

Unnamed: 0,SA2_NAME21,SA2_CODE21,geometry
644,Alfredton,201011001,"POLYGON ((143.78282 -37.56666, 143.75558 -37.5..."
645,Ballarat,201011002,"POLYGON ((143.81896 -37.55582, 143.81644 -37.5..."
646,Buninyong,201011005,"POLYGON ((143.84171 -37.61596, 143.84176 -37.6..."
647,Delacombe,201011006,"POLYGON ((143.75050 -37.59119, 143.75044 -37.5..."
648,Smythes Creek,201011007,"POLYGON ((143.73296 -37.62333, 143.73263 -37.6..."
...,...,...,...
1163,Moyne West,217041478,"MULTIPOLYGON (((142.00870 -38.41715, 142.00876..."
1164,Warrnambool North,217041479,"POLYGON ((142.43668 -38.35544, 142.43658 -38.3..."
1165,Warrnambool South,217041480,"POLYGON ((142.45281 -38.39126, 142.45230 -38.3..."
1166,Migratory Offshore Shipping,297979799,


In [13]:
# Preview the SAL names after symbol removal
victoria_SAL_gdf

Unnamed: 0,SAL_NAME21,SAL_CODE21,SHAPE_Area,geometry
4544,Abbeyard,20001,0.033162,"POLYGON ((146.89824 -37.04602, 146.89947 -37.0..."
4545,Abbotsford,20002,0.000178,"POLYGON ((145.00195 -37.79665, 145.00190 -37.7..."
4546,Aberfeldie,20003,0.000159,"POLYGON ((144.89576 -37.76514, 144.89547 -37.7..."
4547,Aberfeldy,20004,0.001107,"POLYGON ((146.38814 -37.72232, 146.38808 -37.7..."
4548,Acheron,20005,0.007381,"POLYGON ((145.76731 -37.25433, 145.76757 -37.2..."
...,...,...,...,...
7485,Yuulong,22942,0.005404,"POLYGON ((143.32185 -38.68969, 143.32203 -38.6..."
7486,Zeerust,22943,0.001808,"POLYGON ((145.40454 -36.25294, 145.40479 -36.2..."
7487,Zumsteins,22944,0.026011,"POLYGON ((142.48512 -37.15733, 142.48443 -37.1..."
7488,No usual address,29494,0.000000,


### Drop regions with no geometry 

In [14]:
# Show the rows that have no geometry.
# .isna() is the supported missing-value test; an elementwise `== None` comparison
# relies on how the geometry column happens to implement equality.
display(victoria_SA2_gdf[victoria_SA2_gdf['geometry'].isna()])
display(victoria_SAL_gdf[victoria_SAL_gdf['geometry'].isna()])

Unnamed: 0,SA2_NAME21,SA2_CODE21,geometry
1166,Migratory Offshore Shipping,297979799,
1167,No usual address,299999499,


Unnamed: 0,SAL_NAME21,SAL_CODE21,SHAPE_Area,geometry
7488,No usual address,29494,0.0,
7489,Migratory Offshore Shipping,29797,0.0,


In [15]:
## Drop these rows
# Restrict dropna to the geometry column: a bare dropna() would also discard any
# region that merely has a missing value in one of the other columns.
victoria_SA2_gdf = victoria_SA2_gdf.dropna(subset=['geometry'])
victoria_SAL_gdf = victoria_SAL_gdf.dropna(subset=['geometry'])

## Check: both frames displayed below should now be empty
display(victoria_SA2_gdf[victoria_SA2_gdf['geometry'].isna()])
display(victoria_SAL_gdf[victoria_SAL_gdf['geometry'].isna()])

Unnamed: 0,SA2_NAME21,SA2_CODE21,geometry


Unnamed: 0,SAL_NAME21,SAL_CODE21,SHAPE_Area,geometry


### Save the SA2 and SAL GeoDataFrames to raw data as GeoJSON

We will first create a new folder named `victoria_region_gdf` in `../data/raw`.

In [16]:
## Prepare the victoria_region_gdf output folder inside the raw data folder

# Output location and the file names used when saving
output_directory = "../data/raw"
new_folder_name = "victoria_region_gdf"
SA2_output_filename = "SA2_region_gdf.geojson"
SAL_output_filename = "SAL_region_gdf.geojson"

new_folder_path = os.path.join(output_directory, new_folder_name)

# Report whether the folder was already there; create it only when it is missing
if os.path.exists(new_folder_path):
    print(f"Folder already exists at: {new_folder_path}")
else:
    os.makedirs(new_folder_path)
    print(f"New folder created at: {new_folder_path}")

Folder already exists at: ../data/raw/victoria_region_gdf


In [17]:
# Full output paths inside ../data/raw/victoria_region_gdf
SA2_output_path = os.path.join(new_folder_path, SA2_output_filename)
SAL_output_path = os.path.join(new_folder_path, SAL_output_filename)

# Write each GeoDataFrame to its own GeoJSON file (SA2 first, then SAL)
for region_gdf, geojson_path in (
    (victoria_SA2_gdf, SA2_output_path),
    (victoria_SAL_gdf, SAL_output_path),
):
    region_gdf.to_file(geojson_path, driver="GeoJSON")


#### Check by reading 

In [18]:
# Read the saved files back in to confirm they load
SA2_file_path, SAL_file_path = (
    "../data/raw/victoria_region_gdf/SA2_region_gdf.geojson",
    "../data/raw/victoria_region_gdf/SAL_region_gdf.geojson",
)

# One GeoDataFrame per saved GeoJSON file (SA2 is read first, then SAL)
SA2_region_gdf, SAL_region_gdf = (
    gpd.read_file(geojson_path) for geojson_path in (SA2_file_path, SAL_file_path)
)

In [19]:
# Preview the SAL GeoDataFrame read back from the saved GeoJSON file
SAL_region_gdf

Unnamed: 0,SAL_NAME21,SAL_CODE21,SHAPE_Area,geometry
0,Abbeyard,20001,0.033162,"POLYGON ((146.89824 -37.04602, 146.89947 -37.0..."
1,Abbotsford,20002,0.000178,"POLYGON ((145.00195 -37.79665, 145.00190 -37.7..."
2,Aberfeldie,20003,0.000159,"POLYGON ((144.89576 -37.76514, 144.89547 -37.7..."
3,Aberfeldy,20004,0.001107,"POLYGON ((146.38814 -37.72232, 146.38808 -37.7..."
4,Acheron,20005,0.007381,"POLYGON ((145.76731 -37.25433, 145.76757 -37.2..."
...,...,...,...,...
2939,Yundool,22940,0.003174,"POLYGON ((145.86040 -36.28432, 145.86038 -36.2..."
2940,Yuroke,22941,0.000906,"POLYGON ((144.85250 -37.55800, 144.85303 -37.5..."
2941,Yuulong,22942,0.005404,"POLYGON ((143.32185 -38.68969, 143.32203 -38.6..."
2942,Zeerust,22943,0.001808,"POLYGON ((145.40454 -36.25294, 145.40479 -36.2..."


In [20]:
# Preview the SA2 GeoDataFrame read back from the saved GeoJSON file
SA2_region_gdf

Unnamed: 0,SA2_NAME21,SA2_CODE21,geometry
0,Alfredton,201011001,"POLYGON ((143.78282 -37.56666, 143.75558 -37.5..."
1,Ballarat,201011002,"POLYGON ((143.81896 -37.55582, 143.81644 -37.5..."
2,Buninyong,201011005,"POLYGON ((143.84171 -37.61596, 143.84176 -37.6..."
3,Delacombe,201011006,"POLYGON ((143.75050 -37.59119, 143.75044 -37.5..."
4,Smythes Creek,201011007,"POLYGON ((143.73296 -37.62333, 143.73263 -37.6..."
...,...,...,...
517,Otway,217031476,"MULTIPOLYGON (((143.40263 -38.78152, 143.40252..."
518,Moyne East,217041477,"POLYGON ((142.41438 -38.09303, 142.41400 -38.0..."
519,Moyne West,217041478,"MULTIPOLYGON (((142.00870 -38.41715, 142.00876..."
520,Warrnambool North,217041479,"POLYGON ((142.43668 -38.35544, 142.43658 -38.3..."
