# Preprocessing Postcode data

In this notebook we download and preprocess the Australian postcode data.

In [12]:
import pandas as pd

## Download the postcode data to the local landing folder

In [13]:
# Execute the project's download script inside this kernel. On the recorded
# run it reported saving ../data/landing/postcode/australian_postcodes.csv.
%run ../scripts/Postcode_Download.py

File downloaded and saved as: ../data/landing/postcode/australian_postcodes.csv


In [14]:
# Load the downloaded postcode table from the landing folder.
# Postcodes are identifiers, not quantities: read the column as text so that
# any leading zero stored in the file (e.g. 0200, 0800) is not stripped by
# integer parsing.
postcode_df = pd.read_csv(
    '../data/landing/postcode/australian_postcodes.csv',
    dtype={'postcode': str},
)
# List the available columns before choosing which ones to keep.
print(postcode_df.columns)

Index(['id', 'postcode', 'locality', 'state', 'long', 'lat', 'dc', 'type',
       'status', 'sa3', 'sa3name', 'sa4', 'sa4name', 'region', 'Lat_precise',
       'Long_precise', 'SA1_MAINCODE_2011', 'SA1_MAINCODE_2016',
       'SA2_MAINCODE_2016', 'SA2_NAME_2016', 'SA3_CODE_2016', 'SA3_NAME_2016',
       'SA4_CODE_2016', 'SA4_NAME_2016', 'RA_2011', 'RA_2016', 'MMM_2015',
       'MMM_2019', 'ced', 'altitude', 'chargezone', 'phn_code', 'phn_name',
       'lgaregion', 'lgacode', 'electorate', 'electoraterating'],
      dtype='object')


### Feature selection

In [15]:
# Columns to keep: postcode and locality identifiers, the state, the 2016 SA2
# code and name, and the precise latitude/longitude.
columns_to_retain = [
    'postcode',
    'locality',
    'state',
    'SA2_MAINCODE_2016',
    'SA2_NAME_2016',
    'Lat_precise',
    'Long_precise',
]

# Narrow the frame to exactly those columns, in the order listed above.
postcode_df = postcode_df.loc[:, columns_to_retain]

In [16]:
# Preview the frame after column selection.
postcode_df

Unnamed: 0,postcode,locality,state,SA2_MAINCODE_2016,SA2_NAME_2016,Lat_precise,Long_precise
0,200,ANU,ACT,801051049.0,Acton,-35.277700,149.119000
1,200,Australian National University,ACT,801051049.0,Acton,-35.277700,149.118527
2,800,DARWIN,NT,701011002.0,Darwin City,-12.393279,130.776661
3,800,DARWIN CITY,NT,701011002.0,Darwin City,-12.393279,130.776661
4,801,DARWIN,NT,701011002.0,Darwin City,-12.463440,130.845642
...,...,...,...,...,...,...,...
18508,9013,BRISBANE,QLD,305011105.0,Brisbane City,-27.469771,153.025124
18509,9015,BRISBANE,QLD,305011105.0,Brisbane City,-27.469771,153.025124
18510,9464,NORTHGATE MC,QLD,302031038.0,Northgate - Virginia,-27.390000,153.066000
18511,9726,GOLD COAST MC,QLD,309101268.0,Bundall,-28.016700,153.400000


### Select Victoria region

In [17]:
# Keep only the Victorian rows. After the filter every row has the same
# 'state' value, so that column is dropped.
vic_postcode = (
    postcode_df
    .loc[postcode_df['state'].eq('VIC')]
    .drop(columns='state')
)

In [18]:
# Preview the Victorian subset (the 'state' column has been removed).
vic_postcode

Unnamed: 0,postcode,locality,SA2_MAINCODE_2016,SA2_NAME_2016,Lat_precise,Long_precise
6185,3000,MELBOURNE,206041122.0,Melbourne,-37.815207,144.963937
6186,3001,MELBOURNE,206041122.0,Melbourne,-37.813628,144.963058
6187,3002,EAST MELBOURNE,206041119.0,East Melbourne,-37.816144,144.980459
6188,3003,WEST MELBOURNE,206041127.0,West Melbourne,-37.811450,144.925397
6189,3004,MELBOURNE,206041126.0,Southbank,-37.830158,144.980459
...,...,...,...,...,...,...
18497,8438,SUNSHINE WEST,,,0.000000,0.000000
18498,8511,SUNSHINE WEST,,,0.000000,0.000000
18499,8785,DANDENONG,212041311.0,Dandenong,-37.984781,145.213991
18500,8785,DANDENONG SOUTH,212041311.0,Dandenong,-37.984781,145.213991


Note that some rows contain 0 in `Lat_precise` and `Long_precise`.

Here we assume that any row containing NaN is invalid and therefore needs to be removed.

In [19]:
# Remove rows that cannot serve as a location record:
#   * a latitude or longitude of exactly 0 is treated as a placeholder for a
#     missing coordinate (no Victorian locality lies on the equator or the
#     prime meridian); dropna() alone would keep such a row whenever its SA2
#     fields happen to be populated;
#   * any row still containing NaN is considered invalid.
coordinate_columns = ['Lat_precise', 'Long_precise']
has_coordinates = vic_postcode[coordinate_columns].ne(0).all(axis=1)
vic_postcode = vic_postcode[has_coordinates].dropna()
vic_postcode

Unnamed: 0,postcode,locality,SA2_MAINCODE_2016,SA2_NAME_2016,Lat_precise,Long_precise
6185,3000,MELBOURNE,206041122.0,Melbourne,-37.815207,144.963937
6186,3001,MELBOURNE,206041122.0,Melbourne,-37.813628,144.963058
6187,3002,EAST MELBOURNE,206041119.0,East Melbourne,-37.816144,144.980459
6188,3003,WEST MELBOURNE,206041127.0,West Melbourne,-37.811450,144.925397
6189,3004,MELBOURNE,206041126.0,Southbank,-37.830158,144.980459
...,...,...,...,...,...,...
18495,8120,MELBOURNE,206041122.0,Melbourne,-37.813628,144.963058
18496,8205,MELBOURNE,206041122.0,Melbourne,-37.813628,144.963058
18499,8785,DANDENONG,212041311.0,Dandenong,-37.984781,145.213991
18500,8785,DANDENONG SOUTH,212041311.0,Dandenong,-37.984781,145.213991


### Save the files

In [20]:
# Persist the cleaned Victorian postcode table to the raw data folder,
# omitting the DataFrame index so only the data columns are written.
vic_postcode.to_csv(path_or_buf='../data/raw/location_postcode.csv', index=False)