In [255]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import geocoder # import geocoder
from geopy.geocoders import Nominatim

# Scraping suburbs and postcodes

### Scraping Sydney suburbs and postcodes

In [228]:
tables = pd.read_html("https://www.intosydneydirectory.com.au/sydney-postcodes.php", header=1)
df_pctab = tables[0] # get the first table containing postal code.
df_pctab.head(5)

Unnamed: 0,Abbotsford,NSW,2046
0,Agnes Banks,NSW,2753
1,Airds,NSW,2560
2,Alexandria,NSW,2015
3,Alfords Point,NSW,2234
4,Allambie Heights,NSW,2100


In [229]:
# Create new row
new_row = pd.DataFrame({'Abbotsford':'Abbotsford', 'NSW':'NSW', '2046':'2046'}, index=[0])

# Concatenate both dataframes
syd_pc = pd.concat([new_row, df_pctab]).reset_index(drop=True)

# Rename heading
syd_pc.columns = ['Suburb', 'State', 'Postcode']

# Group dataframe by Postcode and State
syd_pc= syd_pc.groupby(["Postcode","State"]).agg({'Suburb':', '.join}).reset_index()

syd_pc.head(5)

Unnamed: 0,Postcode,State,Suburb
0,2000,NSW,"Barangaroo, Dawes Point, Haymarket, Millers Po..."
1,2006,NSW,The University Of Sydney
2,2007,NSW,"Broadway, Ultimo"
3,2008,NSW,"Chippendale, Darlington"
4,2009,NSW,Pyrmont


### Scraping Melbourne suburbs and postcodes

In [171]:
# Function to scrape suburbs table from Wikipedia
def get_sub_pc_wiki(link_string, state_string):
    tables = pd.read_html(link_string, header=1)
    df_pctab = tables[0] # get the first table containing postal code.    
    # Drop columns after postcodes
    df_pctab = df_pctab.iloc[:,0:2]
    df_pctab.head(5)    
    # Get header names
    headers = list(df_pctab)
    # Create new row
    new_row = pd.DataFrame({headers[0]:headers[0], headers[1]:headers[1]}, index=[0])
    # Concatenate both dataframes
    city_pc = pd.concat([new_row, df_pctab]).reset_index(drop=True)
    # Rename heading
    city_pc.columns = ['Suburb', 'Postcode']
    # Add State column
    vic_list = [state_string] * city_pc.shape[0]
    city_pc['State'] = vic_list    
    # Remove rows that don't have a Postcode
    city_pc['Postcode'].replace('-', np.nan, inplace=True)
    city_pc.dropna(subset=['Postcode'], inplace=True)    
    
    return city_pc

In [172]:
melb_pc = get_sub_pc_wiki('https://en.wikipedia.org/wiki/List_of_Melbourne_suburbs', 'VIC')

# Aggerate Neighbourhoods by Postalcode and Borough
melb_pc = melb_pc.groupby(["Postcode","State"]).agg({'Suburb':', '.join}).reset_index()

In [173]:
melb_pc.tail(5)

Unnamed: 0,Postcode,State,Suburb
274,3980,VIC,"Blind Bight, Tooradin, Warneet"
275,3981,VIC,"Bayles, Catani, Dalmore, Heath Hill, Koo Wee Rup"
276,3984,VIC,"Caldermeade, Lang Lang, Monomeith"
277,3987,VIC,Nyora
278,3081,VIC,Bellfield


### Scraping Hobart suburbs and postcodes

In [174]:
# Scrape suburbs and postcodes of Tasmania from wikipedia
tas_pc = get_sub_pc_wiki('https://en.wikipedia.org/wiki/List_of_localities_in_Tasmania', 'TAS')

In [176]:
tas_pc.tail(10)

Unnamed: 0,Suburb,Postcode,State
915,Wynyard,7325,TAS
916,Yambacoona,7256,TAS
917,Yarra Creek,7256,TAS
918,Yolla,7325,TAS
919,York Plains,7120,TAS
920,York Town,7270,TAS
921,York Town,7270,TAS
922,Youngtown,7249,TAS
923,Youngtown,7249,TAS
924,Zeehan,7469,TAS


In [110]:
# Scrape suburb names of Hobart
with open("view-source_https___en.wikipedia.org_wiki_List_of_Hobart_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

In [206]:
# Section that contains the list of Hobart suburbs
hob_suburbs_section = soup.find('div',attrs={'class':'mw-content-ltr','id':'mw-content-text'})
hob_suburbs_section = hob_suburbs_section.find("tr", class_='')

# Loop through subsections and extend to list
hob_suburbs = []
for hob_suburb in hob_suburbs_section.find_all("li", class_=''):
    suburb = hob_suburb.text.split('\n')
    hob_suburbs.append(suburb[0])

#print(hob_suburbs)
    
# Convert list to dataframe
hob_suburbs_dict = dict({'Suburb':hob_suburbs})
hob_suburbs_df = pd.DataFrame(hob_suburbs_dict)
hob_suburbs_df.head(5)

Unnamed: 0,Suburb
0,Bridgewater
1,Green Point
2,Gagebrook
3,Old Beach
4,Brighton


In [246]:
# Join dataframes based on partial string match between the 'Suburb' columns
isin_suburb_list = hob_suburbs_df['Suburb']
hob_pc = tas_pc[tas_pc['Suburb'].isin(isin_suburb_list)]

# Aggerate Neighbourhoods by Postalcode and Borough
hob_pc = hob_pc.groupby(["Postcode","State"]).agg({'Suburb':', '.join}).reset_index()

hob_pc = hob_pc[['Postcode', 'State', 'Suburb']]

In [247]:
hob_pc.head(5)

Unnamed: 0,Postcode,State,Suburb
0,7000,TAS,"Glebe, Hobart, Mount Stuart, North Hobart, Wes..."
1,7004,TAS,"Battery Point, South Hobart"
2,7005,TAS,"Dynnyrne, Sandy Bay"
3,7007,TAS,"Mount Nelson, Mount Nelson, Tolmans Hill"
4,7008,TAS,"Lenah Valley, Lenah Valley, New Town, New Town"


### Concatenate all dataframes

In [250]:
suburbs_pc = pd.concat([syd_pc, melb_pc, hob_pc], ignore_index=True)
suburbs_pc.shape

(531, 3)

In [253]:
# Double check if the length of the dataframe is correct
print(hob_pc.shape[0] + melb_pc.shape[0] + syd_pc.shape[0])

531


# Get geographical coordinates for each suburb

### Geographical coordinates for Sydney

In [256]:
postal_code = '2030'

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Sydney, New South Wales'.format(postal_code))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

KeyboardInterrupt: 

### Geographical coordinates for Melbourne

### Geographical coordinates for Hobart