In [969]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import geocoder
from geopy.geocoders import Nominatim
import unittest
#!conda install -c conda-forge folium=0.5.0 --yes # Library for Map - Folium
import folium
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import re

# Scraping suburb information

### Scrape information about each Sydney suburb from Wikipedia

Here I use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to scrape the suburbs from Sydney as listed on [wikipedia](https://en.wikipedia.org/wiki/List_of_Sydney_suburbs). For each suburb, I identify the wikipage url for the indexed suburb, request access to the webpage, and then collect the following from their information box:
* **Postcode**: Postcode
* **Density**: Population density
* **Area**: Area size of suburb
* **LGA**: Local government assocation (council)
* **Location**: Distance from the city

In [410]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Sydney_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

syd_suburbs_section = soup.find('div',attrs={'class':'mw-parser-output'})
syd_suburbs_section = syd_suburbs_section.findAll('a', href=True)

In [875]:
# Scrape url for each suburb
suburb_list = {}

for i in range(len(syd_suburbs_section)):
    url = syd_suburbs_section[i]['href']
    if ',_New_South_Wales' in url:
        
        if any(x in url for x in ['File:', 'List_of_']):
            continue        

        else:
            suburb = syd_suburbs_section[i]['title'].replace(', New South Wales','')
            
            if suburb in suburb_list:
                continue           

            else:
                suburb_list[suburb] = 'https://en.wikipedia.org{}'.format(url)
    
    else:
        continue

In [892]:
# Scrape information box for each suburb from their wikipage.
# If an information box doesn't exist a try/catch error for the AttributeError will pass it.
suburbs_infobox = {}

for key,value in suburb_list.items():
    
    try:
        page = requests.get(value)
        soup_page = BeautifulSoup(page.text, 'html.parser')
        infobox = soup_page.find('table', class_='infobox vcard')
        suburbs_infobox[key] = infobox.find_all('tr', class_='')
    
    except AttributeError:
        suburbs_infobox[key] = None
        print("{}'s wikipage does not have an information box".format(key))
        pass

#list(suburb_list.items())[0]


East Gordon's wikipage does not have an information box
North St Ives's wikipage does not have an information box
Scotland Island's wikipage does not have an information box
Singletons Mill's wikipage does not have an information box
West Killara's wikipage does not have an information box
West Lindfield's wikipage does not have an information box


In [1066]:
check_strings = ('Density', 'Postcode', 'Area', 'Location')
check_strings_2 = ('LGA(s)')

sydney_suburbs = {}

for key,value in suburbs_infobox.items():
    
    try:
        infobox_val = value
        items = {}
        
        for val in infobox_val:        
            
            if any(s in val.text for s in check_strings):
                # Find check_strings in infobox list of strings
                keyword = [s for s in check_strings if s in val.text]
                keyword = keyword[0]

                infobox_split = val.text.split(' ')
                #infobox_split = re.split('  |\ ', val.text)
                
                if len(infobox_split) > 1:
                    infobox_item = [s for s in infobox_split if str(keyword) in s]
                    # Remove substrings from string
                    info = infobox_item[0]
                    info = info.replace("Density","").replace('/km2',' ').replace('\xa0(4,000/sq\xa0mi)','')\
                               .replace("Area","").replace('\xa0km2',' ')\
                               .replace("Location","").replace('\xa0km',' ')
                    
                    items[keyword] = info

                elif len(infobox_split) == 1:
                    info = infobox_split[0]
                    info = info.replace("Postcode(s)","")
                    items[keyword] = info

            elif 'LGA' in val.text:
                info = val.text.replace("LGA(s)","")
                items['LGA'] = info

            else:
                continue

        sydney_suburbs[key] = items

    except TypeError:
        sydney_suburbs[key] = None


In [1078]:
# Convert Sydney suburbs nested dictionary to a dataframe
sydney_suburbs_df = pd.DataFrame(sydney_suburbs).T
sydney_suburbs_df['Suburb'] = sydney_suburbs_df.index
sydney_suburbs_df.index = range(sydney_suburbs_df.shape[0])
sydney_suburbs_df.tail(5)

Unnamed: 0,Area,Density,LGA,Location,Postcode,Suburb
681,,,Sutherland Shire,32.0,2233.0,Yarrawarrah
682,,,City of Fairfield Cumberland Council,29.0,2161.0,Yennora
683,,,Sutherland Shire,24.0,2228.0,Yowie Bay
684,0.8,12600.0,City of Sydney,4.0,2017.0,Zetland
685,,,"Wollondilly, Campbelltown, Camden",,,Macarthur


Not every suburb has information about it's area size, population density and location distance from the city's cbd. Therefore these columns will be removed for the time being and the dataframe will be grouped according to their postcode.

In [1082]:
syd_suburbs_grouped = sydney_suburbs_df.groupby(["Postcode","LGA"]).agg({'Suburb':', '.join}).reset_index()
syd_suburbs_grouped.head(15)

Unnamed: 0,Postcode,LGA,Suburb
0,2000,City of Sydney,"The Rocks, Barangaroo, Dawes Point, Haymarket,..."
1,2007,City of Sydney,Ultimo
2,2008,City of Sydney,"Chippendale, Darlington"
3,2009,City of Sydney,Pyrmont
4,2010,City of Sydney,"Darlinghurst, Surry Hills"
5,2011,City of Sydney,"Elizabeth Bay, Potts Point, Rushcutters Bay, W..."
6,2015,City of Sydney,"Alexandria, Beaconsfield, Eveleigh"
7,2016,City of Sydney,Redfern
8,2017,City of Sydney,"Waterloo, Zetland"
9,2018,City of Sydney Bayside Council,Rosebery



### Scrape information about each Brisbane suburb from Wikipedia

In [1105]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Brisbane_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

bri_suburbs_section = soup.find('div',attrs={'id':'mw-content-text', 'class':'mw-content-ltr'})
bri_suburbs_section = bri_suburbs_section.find_all('p')

In [1110]:
bri_suburbs_section[1].a['href']

'/wiki/Suburbs_and_localities_(Australia)'

In [1122]:
bri_suburbs_section[2].find_all('a')

[<a href="/wiki/Bowen_Hills,_Queensland" title="Bowen Hills, Queensland">Bowen Hills</a>,
 <a href="/wiki/Brisbane_central_business_district" title="Brisbane central business district">Brisbane</a>,
 <a href="/wiki/East_Brisbane,_Queensland" title="East Brisbane, Queensland">East Brisbane</a>,
 <a href="/wiki/Fortitude_Valley,_Queensland" title="Fortitude Valley, Queensland">Fortitude Valley</a>,
 <a href="/wiki/Herston,_Queensland" title="Herston, Queensland">Herston</a>,
 <a href="/wiki/Highgate_Hill,_Queensland" title="Highgate Hill, Queensland">Highgate Hill</a>,
 <a href="/wiki/Kangaroo_Point,_Queensland" title="Kangaroo Point, Queensland">Kangaroo Point</a>,
 <a href="/wiki/Kelvin_Grove,_Queensland" title="Kelvin Grove, Queensland">Kelvin Grove</a>,
 <a href="/wiki/New_Farm,_Queensland" title="New Farm, Queensland">New Farm</a>,
 <a href="/wiki/Newstead,_Queensland" title="Newstead, Queensland">Newstead</a>,
 <a href="/wiki/Paddington,_Queensland" title="Paddington, Queensland">P

### Scraping Melbourne suburbs and postcodes

In [171]:
# Function to scrape suburbs table from Wikipedia
def get_sub_pc_wiki(link_string, state_string):
    tables = pd.read_html(link_string, header=1)
    df_pctab = tables[0] # get the first table containing postal code.    
    # Drop columns after postcodes
    df_pctab = df_pctab.iloc[:,0:2]
    df_pctab.head(5)    
    # Get header names
    headers = list(df_pctab)
    # Create new row
    new_row = pd.DataFrame({headers[0]:headers[0], headers[1]:headers[1]}, index=[0])
    # Concatenate both dataframes
    city_pc = pd.concat([new_row, df_pctab]).reset_index(drop=True)
    # Rename heading
    city_pc.columns = ['Suburb', 'Postcode']
    # Add State column
    vic_list = [state_string] * city_pc.shape[0]
    city_pc['State'] = vic_list    
    # Remove rows that don't have a Postcode
    city_pc['Postcode'].replace('-', np.nan, inplace=True)
    city_pc.dropna(subset=['Postcode'], inplace=True)    
    
    return city_pc

In [296]:
melb_pc = get_sub_pc_wiki('https://en.wikipedia.org/wiki/List_of_Melbourne_suburbs', 'VIC')

# Aggerate Neighbourhoods by Postalcode and Borough
melb_pc = melb_pc.groupby(["Postcode","State"]).agg({'Suburb':', '.join}).reset_index()

In [297]:
melb_pc.tail(5)

Unnamed: 0,Postcode,State,Suburb
274,3980,VIC,"Blind Bight, Tooradin, Warneet"
275,3981,VIC,"Bayles, Catani, Dalmore, Heath Hill, Koo Wee Rup"
276,3984,VIC,"Caldermeade, Lang Lang, Monomeith"
277,3987,VIC,Nyora
278,3081,VIC,Bellfield


### Scraping Hobart suburbs and postcodes

In [290]:
# Scrape suburbs and postcodes of Tasmania from wikipedia
tas_pc = get_sub_pc_wiki('https://en.wikipedia.org/wiki/List_of_localities_in_Tasmania', 'TAS')

In [291]:
tas_pc.tail(10)

Unnamed: 0,Suburb,Postcode,State
915,Wynyard,7325,TAS
916,Yambacoona,7256,TAS
917,Yarra Creek,7256,TAS
918,Yolla,7325,TAS
919,York Plains,7120,TAS
920,York Town,7270,TAS
921,York Town,7270,TAS
922,Youngtown,7249,TAS
923,Youngtown,7249,TAS
924,Zeehan,7469,TAS


In [292]:
# Scrape suburb names of Hobart
with open("view-source_https___en.wikipedia.org_wiki_List_of_Hobart_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

In [293]:
# Section that contains the list of Hobart suburbs
hob_suburbs_section = soup.find('div',attrs={'class':'mw-content-ltr','id':'mw-content-text'})
hob_suburbs_section = hob_suburbs_section.find("tr", class_='')

# Loop through subsections and extend to list
hob_suburbs = []
for hob_suburb in hob_suburbs_section.find_all("li", class_=''):
    suburb = hob_suburb.text.split('\n')
    hob_suburbs.append(suburb[0])

#print(hob_suburbs)
    
# Convert list to dataframe
hob_suburbs_dict = dict({'Suburb':hob_suburbs})
hob_suburbs_df = pd.DataFrame(hob_suburbs_dict)
hob_suburbs_df.head(5)

Unnamed: 0,Suburb
0,Bridgewater
1,Green Point
2,Gagebrook
3,Old Beach
4,Brighton


In [294]:
# Join dataframes based on partial string match between the 'Suburb' columns
isin_suburb_list = hob_suburbs_df['Suburb']
hob_pc = tas_pc[tas_pc['Suburb'].isin(isin_suburb_list)]

# Aggerate Neighbourhoods by Postalcode and Borough
hob_pc = hob_pc.groupby(["Postcode","State"]).agg({'Suburb':', '.join}).reset_index()

hob_pc = hob_pc[['Postcode', 'State', 'Suburb']]

In [404]:
hob_pc.shape

(26, 3)

In [295]:
hob_pc.head(5)

Unnamed: 0,Postcode,State,Suburb
0,7000,TAS,"Glebe, Hobart, Mount Stuart, North Hobart, Wes..."
1,7004,TAS,"Battery Point, South Hobart"
2,7005,TAS,"Dynnyrne, Sandy Bay"
3,7007,TAS,"Mount Nelson, Mount Nelson, Tolmans Hill"
4,7008,TAS,"Lenah Valley, Lenah Valley, New Town, New Town"


### Concatenate all dataframes

In [250]:
suburbs_pc = pd.concat([syd_pc, melb_pc, hob_pc], ignore_index=True)
suburbs_pc.shape

(531, 3)

In [253]:
# Double check if the length of the dataframe is correct
print(hob_pc.shape[0] + melb_pc.shape[0] + syd_pc.shape[0])

531


# Get geographical coordinates for each suburb

In [354]:
def get_latlong(df_pc):
    geolocator = Nominatim(user_agent="specify_your_app_name_here", timeout=3)
    
    lat = []
    long = []

    for index,row in df_pc.iterrows():
        postcode = row.Postcode
        state = row.State
        location = geolocator.geocode("{} {} Australia".format(postcode, state))
        lat.append(location.latitude)
        long.append(location.longitude)

    latlong = dict({'lat':lat, 'long':long})
    latlong_df = pd.DataFrame(latlong)
    
    return latlong_df

In [355]:
# Geospatial coordinates for Sydney suburbs
latlong_syd = get_latlong(syd_pc)

In [356]:
# Geospatial coordinates for Melbourne suburbs
latlong_melb = get_latlong(melb_pc)

In [357]:
# Geospatial coordinates for Hobart suburbs
latlong_hob = get_latlong(hob_pc)

In [383]:
latlong_all = pd.concat([latlong_syd, latlong_melb, latlong_hob], axis=0, sort=False, ignore_index=True)

print(latlong_all.shape)
print(suburbs_pc.shape)

(531, 2)
(531, 3)


### Merge the latlong dataframe with the suburbs_pc dataframe

In [386]:
aus_suburbs = pd.concat([suburbs_pc, latlong_all], axis=1)
aus_suburbs.shape

(531, 5)

In [387]:
aus_suburbs.head(5)

Unnamed: 0,Postcode,State,Suburb,lat,long
0,2000,NSW,"Barangaroo, Dawes Point, Haymarket, Millers Po...",-33.870826,151.205152
1,2006,NSW,The University Of Sydney,-33.888992,151.190037
2,2007,NSW,"Broadway, Ultimo",-33.879549,151.197541
3,2008,NSW,"Chippendale, Darlington",-33.887252,151.197847
4,2009,NSW,Pyrmont,-33.874611,151.196408


# Plotting Sydney, Melbourne, and Hobart suburb maps

### Plotting Sydney suburb map

In [389]:
address = "Sydney, New South Wales"
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Sydney, NSW are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Sydney, NSW are -33.8548157, 151.2164539.


In [397]:
# create map of Toronto using latitude and longitude values
map_sydney = folium.Map(location=[latitude, longitude], zoom_start=9.5)
# add markers to map
for lat, lng, state, suburb in zip(aus_suburbs['lat'], aus_suburbs['long'], aus_suburbs['State'], aus_suburbs['Suburb']):
    #print(lat, lng)
    label = '{}, {}'.format(suburb, state)
    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_sydney)  
    
map_sydney

### Plotting Melbourne suburb map

In [399]:
address = "Melbourne, Victoria"
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Melbourne, Victoria are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Melbourne, Victoria are -37.8142176, 144.9631608.


In [400]:
# create map of Toronto using latitude and longitude values
map_melbourne = folium.Map(location=[latitude, longitude], zoom_start=9.5)
# add markers to map
for lat, lng, state, suburb in zip(aus_suburbs['lat'], aus_suburbs['long'], aus_suburbs['State'], aus_suburbs['Suburb']):
    #print(lat, lng)
    label = '{}, {}'.format(suburb, state)
    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_melbourne)  
    
map_melbourne

### Plotting Hobart suburb map

In [402]:
address = "Hobart, Tasmania"
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Hobart, Tasmania are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Hobart, Tasmania are -42.8825088, 147.3281233.


In [403]:
# create map of Toronto using latitude and longitude values
map_hobart = folium.Map(location=[latitude, longitude], zoom_start=9.5)
# add markers to map
for lat, lng, state, suburb in zip(aus_suburbs['lat'], aus_suburbs['long'], aus_suburbs['State'], aus_suburbs['Suburb']):
    #print(lat, lng)
    label = '{}, {}'.format(suburb, state)
    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_hobart)  
    
map_hobart