In [969]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import geocoder
from geopy.geocoders import Nominatim
import unittest
#!conda install -c conda-forge folium=0.5.0 --yes # Library for Map - Folium
import folium
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import re

# Scraping suburb information

### Scrape information about each Sydney suburb from Wikipedia

Here I use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to scrape the suburbs from Sydney as listed on [wikipedia](https://en.wikipedia.org/wiki/List_of_Sydney_suburbs). For each suburb, I identify the wikipage url for the indexed suburb, request access to the webpage, and then collect the following from their information box:
* **Postcode**: Postcode
* **Density**: Population density
* **Area**: Area size of suburb
* **LGA**: Local government assocation (council)
* **Location**: Distance from the city

In [410]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Sydney_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

syd_suburbs_section = soup.find('div',attrs={'class':'mw-parser-output'})
syd_suburbs_section = syd_suburbs_section.findAll('a', href=True)

In [1234]:
# Scrape url for each suburb
def get_wiki_urls(html_suburbs_section, wiki_link_extension, wiki_state_name, state):
    url_list = {}
    for i in range(len(html_suburbs_section)):
        url = html_suburbs_section[i]['href']
        if wiki_link_extension in url:

            if any(x in url for x in ['File:', 'List_of_']):
                continue        

            else:
                suburb = html_suburbs_section[i]['title'].replace(wiki_state_name,'')

                if suburb in url_list:
                    continue           

                else:
                    url_list[suburb] = 'https://en.wikipedia.org{}'.format(url)

        elif '{}_CBD'.format(state) in url:
            suburb = html_suburbs_section[i]['title']#.replace(wiki_state_name,'')
            url_list[suburb] = 'https://en.wikipedia.org{}'.format(url)
            
        else:
            continue
    
    return url_list

In [1235]:
wiki_state_name = ', New South Wales'
wiki_link_extension = ',_New_South_Wales'
state = 'Sydney'
syd_wiki_urls = get_wiki_urls(syd_suburbs_section, wiki_link_extension, wiki_state_name, state)

In [1236]:
syd_wiki_urls

{'Bankstown': 'https://en.wikipedia.org/wiki/Bankstown,_New_South_Wales',
 'Bondi Beach': 'https://en.wikipedia.org/wiki/Bondi_Beach,_New_South_Wales',
 'Chatswood': 'https://en.wikipedia.org/wiki/Chatswood,_New_South_Wales',
 'Cronulla': 'https://en.wikipedia.org/wiki/Cronulla,_New_South_Wales',
 'Liverpool': 'https://en.wikipedia.org/wiki/Liverpool,_New_South_Wales',
 'Manly': 'https://en.wikipedia.org/wiki/Manly,_New_South_Wales',
 'Milsons Point': 'https://en.wikipedia.org/wiki/Milsons_Point,_New_South_Wales',
 'Mosman': 'https://en.wikipedia.org/wiki/Mosman,_New_South_Wales',
 'Newtown': 'https://en.wikipedia.org/wiki/Newtown,_New_South_Wales',
 'Parramatta': 'https://en.wikipedia.org/wiki/Parramatta,_New_South_Wales',
 'The Rocks': 'https://en.wikipedia.org/wiki/The_Rocks,_New_South_Wales',
 'Abbotsbury': 'https://en.wikipedia.org/wiki/Abbotsbury,_New_South_Wales',
 'Abbotsford': 'https://en.wikipedia.org/wiki/Abbotsford,_New_South_Wales',
 'Acacia Gardens': 'https://en.wikipedia

In [1237]:
# Scrape information box for each suburb from their wikipage.
# If an information box doesn't exist a try/catch error for the AttributeError will pass it.

def get_suburb_wiki_infobox(wiki_urls):
    
    suburbs_infobox = {}
    for key,value in wiki_urls.items():

        try:
            page = requests.get(value)
            soup_page = BeautifulSoup(page.text, 'html.parser')
            infobox = soup_page.find('table', class_='infobox vcard')
            suburbs_infobox[key] = infobox.find_all('tr', class_='')

        except AttributeError:
            suburbs_infobox[key] = None
            #print("{}'s wikipage does not have an information box".format(key))
            pass
    
    return suburbs_infobox

In [1238]:
syd_suburb_infobox = get_suburb_wiki_infobox(syd_wiki_urls)

In [1387]:
def get_suburb_info(suburb_infobox):
    check_strings = ('Density', 'Postcode', 'Area', 'Location')
    check_strings_2 = ('LGA(s)')

    suburbs = {}

    for key,value in suburb_infobox.items():

        try:
            infobox_val = value
            items = {}

            for val in infobox_val:        

                if any(s in val.text for s in check_strings):
                    # Find check_strings in infobox list of strings
                    keyword = [s for s in check_strings if s in val.text]
                    keyword = keyword[0]

                    infobox_split = val.text.split(' ')
                    #infobox_split = re.split('  |\ ', val.text)

                    if len(infobox_split) > 1:
                        infobox_item = [s for s in infobox_split if str(keyword) in s]
                        # Remove substrings from string
                        info = infobox_item[0]
                        info = info.replace("Density","").replace('/km2',' ').replace(',','')\
                                   .replace("Area","").replace('\xa0km2',' ').replace(',','')\
                                   .replace("Location","").replace('\xa0km',' ').replace(',','')

                        info = info.split('\xa0(')
                        info = info[0]

                        # Convert Area, Density, and Location columns from string to numeric
                        try:
                            info = float(info)
                            items[keyword] = info
                        except:
                            info = None 
                            items[keyword] = info

                    elif len(infobox_split) == 1:
                        info = infobox_split[0]
                        info = info.replace("Postcode(s)","")
                        info = info.split('[')
                        info = info[0]
                        items[keyword] = info

                elif 'LGA' in val.text:
                    info = val.text.replace("LGA(s)","").replace(";",")")
                    info = info.split('[')
                    info = info[0]
                    items['LGA'] = info

                else:
                    continue

            suburbs[key] = items

        except TypeError:
            suburbs[key] = None
    
    return suburbs

In [1388]:
sydney_suburbs = get_suburb_info(syd_suburb_infobox)

In [1389]:
# Convert Sydney suburbs nested dictionary to a dataframe
sydney_suburbs_df = pd.DataFrame(sydney_suburbs).T
sydney_suburbs_df['Suburb'] = sydney_suburbs_df.index
sydney_suburbs_df.index = range(sydney_suburbs_df.shape[0])

sydney_suburbs_df.head(10)

Unnamed: 0,Area,Density,LGA,Location,Postcode,Suburb
0,,,Canterbury-Bankstown Council,16,2200,Bankstown
1,1.22,9550.0,Waverley Council,7,2026,Bondi Beach
2,2.9,8590.0,City of Willoughby,10,2067,Chatswood
3,,,Sutherland Shire,26,2230,Cronulla
4,6.3,4300.0,City of Liverpool,27,2170,Liverpool
5,5.6,2833.0,Northern Beaches Council,17,2095,Manly
6,0.2,10800.0,North Sydney Council,3,2061,Milsons Point
7,8.7,3273.0,Mosman Council,8,2088,Mosman
8,1.6,9390.0,City of Sydney Inner West Council,4,2042,Newtown
9,5.3,4870.0,City of Parramatta,24,2150,Parramatta


Not every suburb has information about it's area size, population density and location distance from the city's cbd. Therefore these columns will be removed for the time being and the dataframe will be grouped according to their postcode.

In [1398]:
syd_suburbs_grouped = sydney_suburbs_df.groupby(["Postcode"]).agg({'Suburb':', '.join,\
                                                                         'Area': lambda x: x.sum(skipna=True),\
                                                                         'Density': lambda x: x.mean(skipna=True),\
                                                                         'Location': lambda x: x.mean(skipna=True)}).reset_index()

syd_suburbs_grouped.rename(columns={'Area': 'Sum Area(km2)', 'Density':'Avg Pop Density (/km2)', 'Location':'Avg distance from cbd(km)'}, inplace=True)
syd_suburbs_grouped.head(10)
#Convert density column from float to int

Unnamed: 0,Postcode,Suburb,Sum Area(km2),Avg Pop Density (/km2),Avg distance from cbd(km)
0,2000,"The Rocks, Barangaroo, Sydney CBD, Dawes Point...",4.22,5979.833333,1.0
1,2007,Ultimo,0.6,14700.0,2.0
2,2008,"Chippendale, Darlington",1.2,8393.0,2.5
3,2009,Pyrmont,1.0,13000.0,2.0
4,2010,"Darlinghurst, Surry Hills",2.0,12585.0,1.0
5,2011,"Elizabeth Bay, Potts Point, Rushcutters Bay, W...",1.7,12748.325,2.5
6,2015,"Alexandria, Beaconsfield, Eveleigh",4.0,2442.5,4.0
7,2016,Redfern,1.2,11000.0,3.0
8,2017,"Waterloo, Zetland",1.9,12950.0,3.5
9,2018,"Eastlakes, Rosebery",2.7,6254.75,7.0



### Scrape information about each Brisbane suburb from Wikipedia

In [1386]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Brisbane_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

bri_suburbs_section = soup.find('div',attrs={'id':'mw-content-text', 'class':'mw-content-ltr'})
bri_suburbs_section = bri_suburbs_section.find_all('a', href=True)

In [1342]:
wiki_state_name = ', Queensland'
wiki_link_extension = ',_Queensland'
state = 'Brisbane'
bne_wiki_urls = get_wiki_urls(bri_suburbs_section, wiki_link_extension, wiki_state_name, state)

In [1344]:
bne_suburb_infobox = get_suburb_wiki_infobox(bne_wiki_urls)

In [1391]:
brisbane_suburbs = get_suburb_info(bne_suburb_infobox)

In [1393]:
# Convert Brisbane suburbs nested dictionary to a dataframe
brisbane_suburbs_df = pd.DataFrame(brisbane_suburbs).T
brisbane_suburbs_df['Suburb'] = brisbane_suburbs_df.index
brisbane_suburbs_df.index = range(brisbane_suburbs_df.shape[0])

brisbane_suburbs_df.head(10)

Unnamed: 0,Area,Density,LGA,Location,Postcode,Suburb
0,1.7,1900.0,City of Brisbane(Central Ward),,4006,Bowen Hills
1,2.0,3000.0,City of Brisbane (The Gabba Ward),2.5,4169,East Brisbane
2,1.4,4980.0,City of Brisbane(Central Ward),1.0,4006,Fortitude Valley
3,1.7,1300.0,City of Brisbane(Central Ward),3.0,4006,Herston
4,1.2,5160.0,City of Brisbane (The Gabba Ward),2.0,4101,Highgate Hill
5,1.3,6200.0,City of Brisbane (The Gabba Ward),,4169,Kangaroo Point
6,,,City of Brisbane(Paddington Ward),,4059,Kelvin Grove
7,2.0,6300.0,City of Brisbane(Central Ward),2.0,4005,New Farm
8,1.3,1690.0,City of Brisbane(Central Ward),3.0,4006,Newstead
9,2.4,3570.0,City of Brisbane(Paddington Ward),,4064,Paddington


In [1399]:
bne_suburbs_grouped = brisbane_suburbs_df.groupby(["Postcode"]).agg({'Suburb':', '.join,\
                                                                         'Area': lambda x: x.sum(skipna=True),\
                                                                         'Density': lambda x: x.mean(skipna=True),\
                                                                         'Location': lambda x: x.mean(skipna=True)}).reset_index()

bne_suburbs_grouped.rename(columns={'Area': 'Sum Area(km2)', 'Density':'Avg Pop Density (/km2)', 'Location':'Avg distance from cbd(km)'}, inplace=True)
bne_suburbs_grouped.head(10)
#Convert density column from float to int

Unnamed: 0,Postcode,Suburb,Sum Area(km2),Avg Pop Density (/km2),Avg distance from cbd(km)
0,4000,"Petrie Terrace, Spring Hill",1.2,4980.0,2.0
1,4005,"New Farm, Teneriffe",3.0,5820.0,2.25
2,4006,"Bowen Hills, Fortitude Valley, Herston, Newstead",6.1,2467.5,2.333333
3,4007,"Ascot, Hamilton",4.4,3125.0,7.5
4,4008,Pinkenba,0.0,,9.0
5,4009,Eagle Farm,0.0,,9.0
6,4010,Albion,1.4,1640.0,6.0
7,4011,"Clayfield, Hendra",5.6,2675.0,8.0
8,4012,"Nundah, Wavell Heights",8.2,2737.5,11.9
9,4013,Northgate,3.1,1388.0,


### Concatenate all Sydney and Brisbane dataframes

In [1400]:
suburbs_pc = pd.concat([syd_suburbs_grouped, bne_suburbs_grouped], ignore_index=True)
suburbs_pc.shape

(358, 5)

In [1403]:
# Double check if the length of the dataframe is correct
print(syd_suburbs_grouped.shape[0] + bne_suburbs_grouped.shape[0])

358


# Get geographical coordinates for each suburb

In [1406]:
def get_latlong(df_pc, state):
    geolocator = Nominatim(user_agent="specify_your_app_name_here", timeout=3)
    
    lat = []
    long = []

    for index,row in df_pc.iterrows():
        postcode = row.Postcode
        location = geolocator.geocode("{} {} Australia".format(postcode, state))
        lat.append(location.latitude)
        long.append(location.longitude)

    latlong = dict({'lat':lat, 'long':long})
    latlong_df = pd.DataFrame(latlong)
    
    return latlong_df

In [1408]:
# Geospatial coordinates for Sydney suburbs
latlong_syd = get_latlong(syd_suburbs_grouped, 'New South Wales')

In [1409]:
# Geospatial coordinates for Brisbane suburbs
latlong_bne = get_latlong(bne_suburbs_grouped, 'Queensland')

In [1410]:
latlong_all = pd.concat([latlong_syd, latlong_bne], axis=0, sort=False, ignore_index=True)

print(latlong_all.shape)
print(suburbs_pc.shape)

(358, 2)
(358, 5)


### Merge the latlong dataframe with the suburbs_pc dataframe

In [1412]:

syd_suburbs_coord = pd.concat([syd_suburbs_grouped, latlong_syd], axis=1)
bne_suburbs_coord = pd.concat([bne_suburbs_grouped, latlong_bne], axis=1)

syd_bne_suburbs = pd.concat([syd_suburbs_coord, bne_suburbs_coord], axis=0)
syd_bne_suburbs.shape

(358, 7)

In [1420]:
bne_suburbs_coord.head(5)

Unnamed: 0,Postcode,Suburb,Sum Area(km2),Avg Pop Density (/km2),Avg distance from cbd(km),lat,long
0,4000,"Petrie Terrace, Spring Hill",1.2,4980.0,2.0,-27.467224,153.02774
1,4005,"New Farm, Teneriffe",3.0,5820.0,2.25,-27.464018,153.045491
2,4006,"Bowen Hills, Fortitude Valley, Herston, Newstead",6.1,2467.5,2.333333,-27.459367,153.038752
3,4007,"Ascot, Hamilton",4.4,3125.0,7.5,-27.433788,153.063037
4,4008,Pinkenba,0.0,,9.0,-27.374745,153.122608


# Plotting Sydney, Melbourne, and Hobart suburb maps

### Plotting Sydney suburb map

In [1415]:
address = "Sydney, New South Wales"
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Sydney, NSW are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Sydney, NSW are -33.8548157, 151.2164539.


In [1416]:
# create map of Toronto using latitude and longitude values
map_sydney = folium.Map(location=[latitude, longitude], zoom_start=9.5)
# add markers to map
for lat, lng, postcode, suburb in zip(syd_suburbs_coord['lat'],\
                                      syd_suburbs_coord['long'],\
                                      syd_suburbs_coord['Postcode'],\
                                      syd_suburbs_coord['Suburb']):
    #print(lat, lng)
    label = '{}: {}'.format(postcode, suburb)
    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_sydney)  
    
map_sydney

### Plotting Brisbane suburb map

In [1422]:
address = "Brisbane, Queensland"
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Brisbane, Queensland are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Brisbane, Queensland are -27.4689682, 153.0234991.


In [1424]:
# create map of Toronto using latitude and longitude values
map_brisbane = folium.Map(location=[latitude, longitude], zoom_start=9.5)
# add markers to map
for lat, lng, postcode, suburb in zip(bne_suburbs_coord['lat'],\
                                      bne_suburbs_coord['long'],\
                                      bne_suburbs_coord['Postcode'],\
                                      bne_suburbs_coord['Suburb']):
    #print(lat, lng)
    label = '{}: {}'.format(postcode, state)
    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_brisbane)  
    
map_brisbane

# Search for local venues in Sydney and Brisbane suburbs with Foursquare API