# Suburb coordinate cleaning


In [1]:
import os
import numpy as np
import pandas as pd

from geopy.geocoders import GoogleV3  # for determining uni campus coordinates
from geopy.distance import geodesic  # for calculating distance between coordinates

# make all output interactive
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import helper

In [2]:
# Define file structure constants
# ================================
DATA_PATH = os.path.join(os.getcwd(), "../data/")
RAW_DATA_PATH = os.path.join(DATA_PATH, "raw")
DERIVED_DATA_PATH = os.path.join(DATA_PATH, "derived")
CORR_DATA_PATH = os.path.join(DATA_PATH, "correspondence")


## Coordinates

The data provided in [`australian_postcodes.csv`](../data/raw/australian_postcodes.csv) is fairly accurate; however, various discripancies have been noted in the coordinates for the suburbs. This data will be cleaned using `geopy` and the `GoogleV3` engine to determine the coordinates for each suburb.

In order for the `GoogleV3` code to work, VSCode must be opened from the terminal using the following code to ensure the environment variable for the Google Maps API key is set correctly and inherited by the session:

``` bash
> source ~/.bash-profile
> code .
```

In [3]:
geolocator = GoogleV3(api_key=os.getenv("GMAPS_APIKEY"))

We collect the 'raw' data using the `getSuburbsMetadata()` function defined in [`helper.py`](helper.py). Since we are recalculating the coordinates for each suburb, we only need the name and postcode (we also use postcode just to be safe).

In [4]:
suburb_data = helper.getSuburbsMetadata("raw")
suburb_details = suburb_data[["postcode", "locality"]].to_numpy()

In [5]:
# each coordinate in coordinates_list corresponds to each row in the suburb_data
# dataframe, and so can be directly "joined" to the dataframe
coordinates_details = []
failed_details = []
previous_message_length = 0

for index, details in enumerate(suburb_details):
    search_string = f"{details[1]} VIC, {details[0]}"
    location = geolocator.geocode(search_string)
    percentage_progress = round(((index + 1) / len(suburb_details)) * 100, 2)

    if location is not None:  # geocode succeeded -> store coordinates
        # message to user
        message = f"\r({str(percentage_progress).ljust(5,'0')}%) {str('[' + str(index + 1) + '/' + str(len(suburb_details)) + ']').ljust(15)}Geocoded: {details[1]}"
        print(" " * previous_message_length, end="\r")  # clear line
        print(message, end="\r")  # print message
        previous_message_length = len(message)  # record message length for next clear
        # record coordinate point
        point = location.point
        tup = (point.latitude, point.longitude)
        coordinates_details.append([details[0], details[1], tup])

    else:  # geocode failed -> record failure to remove suburb from dataset
        print(f"({str(percentage_progress).ljust(5,'0')}%) {str('[' + str(index + 1) + '/' + str(len(suburb_details)) + ']').ljust(15)}Failed: {details[1]}")
        failed_details.append(details)

(25.16%) [840/3338]     Failed: Lyonserton                 
(25.82%) [862/3338]     Failed: Heath Field        
(25.85%) [863/3338]     Failed: Puralka
(26.78%) [894/3338]     Failed: Cullajar       
(26.99%) [901/3338]     Failed: Parkwood          
(27.29%) [911/3338]     Failed: Jallakinpe      
(34.18%) [1141/3338]    Failed: Cabbage Tree                 
(35.23%) [1176/3338]    Failed: Cross Roads      
(35.44%) [1183/3338]    Failed: Shirleyn         
(36.85%) [1230/3338]    Failed: Bridge Innn        
(37.24%) [1243/3338]    Failed: GermaniaCreek      
(37.42%) [1249/3338]    Failed: Kanyal South   
(37.66%) [1257/3338]    Failed: Rostronal       
(37.69%) [1258/3338]    Failed: Wal Wal
(38.41%) [1282/3338]    Failed: Ailsaep Hills       
(38.92%) [1299/3338]    Failed: Yartoevale      
(39.36%) [1314/3338]    Failed: Dahlenangorach     
(39.42%) [1316/3338]    Failed: Douglas        
(39.84%) [1330/3338]    Failed: Mitrea Lake      
(40.56%) [1354/3338]    Failed: Vaseyeon Pond

In the process of adding our new `coordinates` column, we also remove the suburbs whose geocoding failed since we perform an inner join which, of course, is not completed for the suburbs whose coordinates weren't added to the `coordinates_details` array.

In [85]:
# convert array into dataframe
coordinates_df = pd.DataFrame(coordinates_details, columns=["postcode", "locality", "coordinates"])
# peform inner join
new_suburb_data = suburb_data.merge(coordinates_df[["locality", "coordinates"]], how="inner", on="locality")

new_suburb_data.shape
new_suburb_data.head()

(3282, 37)

Unnamed: 0,id,postcode,locality,state,long,lat,dc,type,status,sa3,...,MMM_2019,ced,altitude,chargezone,phn_code,phn_name,lgaregion,electorate,electoraterating,coordinates
0,4746,3000,Melbourne,VIC,144.956776,-37.817403,CITY DELIVERY CENTRE,Delivery Area,Updated 6-Feb-2020,20604.0,...,1.0,Melbourne,15.244751,V1,PHN201,North Western Melbourne,Melbourne,Melbourne,Inner Metropolitan,"(-37.8152065, 144.963937)"
1,4748,3002,East Melbourne,VIC,144.982207,-37.818517,CITY DELIVERY CENTRE,Delivery Area,Updated 6-Feb-2020,20604.0,...,1.0,Melbourne,14.315022,V1,PHN201,North Western Melbourne,Yarra,Melbourne,Inner Metropolitan,"(-37.8161444, 144.9804594)"
2,4749,3003,West Melbourne,VIC,144.949592,-37.810871,CITY DELIVERY CENTRE,Delivery Area,Updated 6-Feb-2020,20604.0,...,1.0,Melbourne,15.316063,V1,PHN201,North Western Melbourne,Melbourne,Melbourne,Inner Metropolitan,"(-37.8114504, 144.9253974)"
3,4751,3004,St Kilda Road Central,VIC,144.970161,-37.844246,Melbourne Metro,,Updated 6-Feb-2020,20605.0,...,1.0,Macnamara,-3492.0,V1,PHN203,South Eastern Melbourne,Yarra,Macnamara,Inner Metropolitan,"(-37.8367638, 144.9756445)"
4,22851,3004,St Kilda Road Melbourne,VIC,144.976,-37.8368,Melbourne Metro,,Updated 25-Jan-2020,,...,1.0,Macnamara,-3492.0,V1,PHN203,South Eastern Melbourne,Yarra,Macnamara,Inner Metropolitan,"(-37.8367638, 144.9756445)"


In [86]:
print("Number of unique locality names:", len(new_suburb_data.locality.unique()))

Number of unique locality names: 3282


## 2016 to 2021 conversion of ABS structure codes/names

All columns specifying the ABS structure codes and names are from 2016, so to match the new data (for showing details on WebApp) we also need to include the new codes and names from 2021.

This step also removes any suburbs from the raw suburbs dataset which are not considered in the ABS census statistics.

In [87]:
sa1_corr = pd.read_csv(os.path.join(CORR_DATA_PATH, "CG_SA1_2016_SA1_2021.csv"))
sa2_corr = pd.read_csv(os.path.join(CORR_DATA_PATH, "CG_SA2_2016_SA2_2021.csv"))
sa3_corr = pd.read_csv(os.path.join(CORR_DATA_PATH, "CG_SA3_2016_SA3_2021.csv"))
sa4_corr = pd.read_csv(os.path.join(CORR_DATA_PATH, "CG_SA4_2016_SA4_2021.csv"))

In [88]:
new_suburb_data = new_suburb_data\
    .merge(sa1_corr, on="SA1_MAINCODE_2016", suffixes=[None, "_SA1"])\
    .merge(sa2_corr, on="SA2_MAINCODE_2016", suffixes=[None, "_SA2"])\
    .merge(sa3_corr, on="SA3_CODE_2016", suffixes=[None, "_SA3"])\
    .merge(sa4_corr, on="SA4_CODE_2016", suffixes=[None, "_SA4"])

new_suburb_data.shape
new_suburb_data.head()

(4917, 63)

Unnamed: 0,id,postcode,locality,state,long,lat,dc,type,status,sa3,...,INDIV_TO_REGION_QLTY_INDICATOR_SA3,OVERALL_QUALITY_INDICATOR_SA3,BMOS_NULL_FLAG_SA3,SA4_NAME_2016_SA4,SA4_CODE_2021,SA4_NAME_2021,RATIO_FROM_TO_SA4,INDIV_TO_REGION_QLTY_INDICATOR_SA4,OVERALL_QUALITY_INDICATOR_SA4,BMOS_NULL_FLAG_SA4
0,4746,3000,Melbourne,VIC,144.956776,-37.817403,CITY DELIVERY CENTRE,Delivery Area,Updated 6-Feb-2020,20604.0,...,Good,Good,0,Melbourne - Inner,206,Melbourne - Inner,1.0,Good,Good,0
1,4746,3000,Melbourne,VIC,144.956776,-37.817403,CITY DELIVERY CENTRE,Delivery Area,Updated 6-Feb-2020,20604.0,...,Good,Good,0,Melbourne - Inner,206,Melbourne - Inner,1.0,Good,Good,0
2,4746,3000,Melbourne,VIC,144.956776,-37.817403,CITY DELIVERY CENTRE,Delivery Area,Updated 6-Feb-2020,20604.0,...,Good,Good,0,Melbourne - Inner,206,Melbourne - Inner,1.0,Good,Good,0
3,4748,3002,East Melbourne,VIC,144.982207,-37.818517,CITY DELIVERY CENTRE,Delivery Area,Updated 6-Feb-2020,20604.0,...,Good,Good,0,Melbourne - Inner,206,Melbourne - Inner,1.0,Good,Good,0
4,4749,3003,West Melbourne,VIC,144.949592,-37.810871,CITY DELIVERY CENTRE,Delivery Area,Updated 6-Feb-2020,20604.0,...,Good,Good,0,Melbourne - Inner,206,Melbourne - Inner,1.0,Good,Good,0


However, this has introduced some duplicate entries for suburbs. These are removed using the `drop_duplicates()` method on the `locality` column.

In [89]:
new_suburb_data = new_suburb_data.drop_duplicates("locality")
new_suburb_data.shape

(3282, 63)

## Remove unnecessary columns

Quite a few columns aren't necessary for our purposes, so these columns can be removed.

In [90]:
new_suburb_data.columns

Index(['id', 'postcode', 'locality', 'state', 'long', 'lat', 'dc', 'type',
       'status', 'sa3', 'sa3name', 'sa4', 'sa4name', 'region', 'Lat_precise',
       'Long_precise', 'SA1_MAINCODE_2011', 'SA1_MAINCODE_2016',
       'SA2_MAINCODE_2016', 'SA2_NAME_2016', 'SA3_CODE_2016', 'SA3_NAME_2016',
       'SA4_CODE_2016', 'SA4_NAME_2016', 'RA_2011', 'RA_2016', 'MMM_2015',
       'MMM_2019', 'ced', 'altitude', 'chargezone', 'phn_code', 'phn_name',
       'lgaregion', 'electorate', 'electoraterating', 'coordinates',
       'SA1_CODE_2021', 'RATIO_FROM_TO', 'INDIV_TO_REGION_QLTY_INDICATOR',
       'OVERALL_QUALITY_INDICATOR', 'BMOS_NULL_FLAG', 'SA2_NAME_2016_SA2',
       'SA2_CODE_2021', 'SA2_NAME_2021', 'RATIO_FROM_TO_SA2',
       'INDIV_TO_REGION_QLTY_INDICATOR_SA2', 'OVERALL_QUALITY_INDICATOR_SA2',
       'BMOS_NULL_FLAG_SA2', 'SA3_NAME_2016_SA3', 'SA3_CODE_2021',
       'SA3_NAME_2021', 'RATIO_FROM_TO_SA3',
       'INDIV_TO_REGION_QLTY_INDICATOR_SA3', 'OVERALL_QUALITY_INDICATOR_SA3',
   

In [91]:
new_suburb_data = new_suburb_data[[
    "postcode", "locality", "coordinates", "lgaregion", "SA1_MAINCODE_2016", 
    "SA1_CODE_2021", "SA2_MAINCODE_2016", "SA2_NAME_2016_SA2", "SA2_CODE_2021", 
    "SA2_NAME_2021", "SA3_CODE_2016", "SA3_NAME_2016_SA3", "SA3_CODE_2021", 
    "SA3_NAME_2021", "SA4_CODE_2016", "SA4_NAME_2016_SA4", "SA4_CODE_2021", 
    "SA4_NAME_2021"
]]
new_suburb_data.columns = [
    "postcode", "locality", "coordinates", "lgaregion", "sa1_code_2016",
    "sa1_code_2021", "sa2_code_2016", "sa2_name_2016", "sa2_code_2021",
    "sa2_name_2021", "sa3_code_2016", "sa3_name_2016", "sa3_code_2021",
    "sa3_name_2021", "sa4_code_2016", "sa4_name_2016", "sa4_code_2021",
    "sa4_name_2021"
]

new_suburb_data.shape
new_suburb_data.head()

(3282, 18)

Unnamed: 0,postcode,locality,coordinates,lgaregion,sa1_code_2016,sa1_code_2021,sa2_code_2016,sa2_name_2016,sa2_code_2021,sa2_name_2021,sa3_code_2016,sa3_name_2016,sa3_code_2021,sa3_name_2021,sa4_code_2016,sa4_name_2016,sa4_code_2021,sa4_name_2021
0,3000,Melbourne,"(-37.8152065, 144.963937)",Melbourne,20604110000.0,20604150327,206041122.0,Melbourne,206041503,Melbourne CBD - East,20604.0,Melbourne City,20604,Melbourne City,206.0,Melbourne - Inner,206,Melbourne - Inner
3,3002,East Melbourne,"(-37.8161444, 144.9804594)",Yarra,20604110000.0,20604111914,206041119.0,East Melbourne,206041119,East Melbourne,20604.0,Melbourne City,20604,Melbourne City,206.0,Melbourne - Inner,206,Melbourne - Inner
4,3003,West Melbourne,"(-37.8114504, 144.9253974)",Melbourne,20604110000.0,20604112701,206041127.0,West Melbourne,206041127,West Melbourne - Industrial,20604.0,Melbourne City,20604,Melbourne City,206.0,Melbourne - Inner,206,Melbourne - Inner
5,3004,St Kilda Road Central,"(-37.8367638, 144.9756445)",Yarra,20604110000.0,20604112506,206041125.0,South Yarra - West,206041125,South Yarra - West,20604.0,Melbourne City,20604,Melbourne City,206.0,Melbourne - Inner,206,Melbourne - Inner
6,3004,St Kilda Road Melbourne,"(-37.8367638, 144.9756445)",Yarra,20604110000.0,20604112506,206041125.0,South Yarra - West,206041125,South Yarra - West,20604.0,Melbourne City,20604,Melbourne City,206.0,Melbourne - Inner,206,Melbourne - Inner


## Remove 'bad' rows/suburbs

There are some suburbs whose geocoded coordinates are outside the Victorian state boundaries.

Victoria's longitude can range from 140.9–150, so any rows with coordinates outside this range has been incorrectly geocoded. We will remove these rows since there is no point in correcting since these will just be an issue later in the application when we try to display these suburbs.

No coordinates have been geocoded too 'low' (i.e., less than -39.2), but the north border of Victoria is roughly divided into 5 linear segments of a piece-wise function. Luckily, no coordinates are above this line but also within the longitude range specified above (i.e., no action needs to be taken).

In [92]:
bad_localities = pd.concat([new_suburb_data.pipe(lambda x: x[x["coordinates"].str[1] < 140.9]).sort_values("coordinates"), new_suburb_data.pipe(lambda x: x[x["coordinates"].str[1] > 150]).sort_values("coordinates")], ignore_index=True).locality
new_suburb_data = new_suburb_data[~new_suburb_data.locality.isin(bad_localities)]

new_suburb_data.shape
new_suburb_data.head()

(3268, 18)

Unnamed: 0,postcode,locality,coordinates,lgaregion,sa1_code_2016,sa1_code_2021,sa2_code_2016,sa2_name_2016,sa2_code_2021,sa2_name_2021,sa3_code_2016,sa3_name_2016,sa3_code_2021,sa3_name_2021,sa4_code_2016,sa4_name_2016,sa4_code_2021,sa4_name_2021
0,3000,Melbourne,"(-37.8152065, 144.963937)",Melbourne,20604110000.0,20604150327,206041122.0,Melbourne,206041503,Melbourne CBD - East,20604.0,Melbourne City,20604,Melbourne City,206.0,Melbourne - Inner,206,Melbourne - Inner
3,3002,East Melbourne,"(-37.8161444, 144.9804594)",Yarra,20604110000.0,20604111914,206041119.0,East Melbourne,206041119,East Melbourne,20604.0,Melbourne City,20604,Melbourne City,206.0,Melbourne - Inner,206,Melbourne - Inner
4,3003,West Melbourne,"(-37.8114504, 144.9253974)",Melbourne,20604110000.0,20604112701,206041127.0,West Melbourne,206041127,West Melbourne - Industrial,20604.0,Melbourne City,20604,Melbourne City,206.0,Melbourne - Inner,206,Melbourne - Inner
5,3004,St Kilda Road Central,"(-37.8367638, 144.9756445)",Yarra,20604110000.0,20604112506,206041125.0,South Yarra - West,206041125,South Yarra - West,20604.0,Melbourne City,20604,Melbourne City,206.0,Melbourne - Inner,206,Melbourne - Inner
6,3004,St Kilda Road Melbourne,"(-37.8367638, 144.9756445)",Yarra,20604110000.0,20604112506,206041125.0,South Yarra - West,206041125,South Yarra - West,20604.0,Melbourne City,20604,Melbourne City,206.0,Melbourne - Inner,206,Melbourne - Inner


## Save to new file

Now save the dataset to a new file for use in other files.

In [84]:
new_suburb_data.to_csv(os.path.join(DERIVED_DATA_PATH, "SuburbMetadata.csv"), index=False)