In [1]:
from area import area
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim # converts an address into latitude and longitude values
from math import sin, cos, sqrt, atan2, radians
from scipy.ndimage.filters import gaussian_filter1d
from sklearn import preprocessing
from shapely.geometry import shape
from sklearn.cluster import KMeans 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


import folium
import geocoder
import googlemaps
import itertools
import json
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import requests
import unittest
import re
import seaborn as sns
%matplotlib inline

In [2]:
GOOGLE_MAPS_API = 'AIzaSyBxqCRTDqaRrDHDh_LBPO2QFOWqGDFeY0w'

# Scrape Sydney suburbs from Wikipedia

Here I use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to scrape the suburbs from Sydney as listed on [wikipedia](https://en.wikipedia.org/wiki/List_of_Sydney_suburbs). For each suburb, I identify the wikipage url for the indexed suburb, request access to the webpage, and then collect the following from their information box:
* **Postcode**: Postcode
* **Density**: Population density
* **Area**: Area size of suburb
* **LGA**: Local government assocation (council)
* **Location**: Distance from the city

In [3]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Sydney_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file,
                         "lxml"
                        )

syd_suburbs_section = soup.find('div',
                                attrs={'class':'mw-parser-output'}
                               )

syd_suburbs_section = syd_suburbs_section.findAll('a',
                                                  href=True
                                                 )

In [4]:
# Scrape url for each suburb
def get_wiki_urls(html_suburbs_section,
                  wiki_link_extension,
                  wiki_state_name,
                  city,
                 ):
    
    url_list = {}
    for i in range(len(html_suburbs_section)):
        url = html_suburbs_section[i]['href']
        # Sydney cbd
        if 'Sydney_CBD' in url:
            suburb = 'Sydney central business district'
            url_list[suburb] = 'https://en.wikipedia.org/wiki/Sydney_central_business_district'

        # Brisbane cbd
        elif 'Brisbane_central_business_district' in url:
            suburb = 'Brisbane central business district'
            url_list[suburb] = 'https://en.wikipedia.org/wiki/Brisbane_central_business_district'
            
        # Melbourne and Adelaide cbd
        elif '{}_city_centre'.format(city) in url:
            suburb = '{} central business district'.format(city)
            url_list[suburb] = 'https://en.wikipedia.org{}'.format(url)
        
        # Canberra cbd
        elif 'City,_Australian_Capital_Territory' in url:
            suburb = 'Civic central business district'
            url_list[suburb] = 'https://en.wikipedia.org/wiki/Civic,_Australian_Capital_Territory'
        
        # Hobart cbd
        elif 'Hobart_city_centre' in url:
            suburb = 'Hobart central business district'
            url_list[suburb] = 'https://en.wikipedia.org/wiki/Hobart_City_Centre'
        
        # Darwin cbd
        elif 'Darwin_City' in url:
            suburb = 'Darwin central business district'
            url_list[suburb] = 'https://en.wikipedia.org/wiki/Darwin_City,_Northern_Territory'
        
        # Perth cbd
        elif 'Perth_(suburb)' in url:
            suburb = 'Perth central business district'
            url_list[suburb] = 'https://en.wikipedia.org/wiki/Perth_(suburb)'
            
        elif wiki_link_extension in url:
            
            check_list = []
            for x in ['/File:', '/List_of_', '/City_of_', '/commons.', '/Categories:']:
                
                if x in url:
                    check_list.append(1)
            
            if check_list:
                continue
            elif not check_list:
                suburb = html_suburbs_section[i]['title'].replace(wiki_state_name,'')

                if suburb in url_list:
                    continue           

                else:
                    url_list[suburb] = 'https://en.wikipedia.org{}'.format(url)
            
        else:
            continue
    
    return url_list

In [5]:
wiki_state_name = ', New South Wales'
wiki_link_extension = ',_New_South_Wales'
city = 'Sydney'
syd_wiki_urls = get_wiki_urls(syd_suburbs_section,
                              wiki_link_extension,
                              wiki_state_name,
                              city,
                             )

In [6]:
# Scrape information box for each suburb from their wikipage.
# If an information box doesn't exist a try/catch error for the AttributeError will pass it.

def get_suburb_wiki_infobox(wiki_urls):
    
    suburbs_infobox = {}
    for key,value in wiki_urls.items():
        try:
            page = requests.get(value)
            soup_page = BeautifulSoup(page.text, 'html.parser')

            try:
#                 print('Check redirection: {}'.format(key))
                soup_redirect = soup_page.find('span', class_='mw-redirectedfrom').a['href']
                soup_redirect_title = soup_redirect.replace(' ','_')
                soup_redirect_url = 'https://en.wikipedia.org{}'.format(soup_redirect)

                page = requests.get(soup_redirect_url)
                soup_redirect_page = BeautifulSoup(page.text, 'html.parser')
                soup_redirect_page_title = soup_redirect_page.find('ul', class_='redirectText').a['title']
                
                if key not in soup_redirect_page_title:
#                     print('Incorrect Redirection exists: {}\n'.format(key))
                    # Link redirects to wrong page
                    # Go to next suburb.
                    # Suburb not added to list
                    continue
                elif key in soup_redirect_page_title:
#                     print('Correct Redirection exists: {}'.format(key))
                    pass
                
            except:
                pass
            # Scrape information box from suburb wiki page
#             print('Scraping infobox: {}\n'.format(key))  
            infobox = soup_page.find('table', class_='infobox vcard')
            suburbs_infobox[key] = infobox.find_all('tr', class_='')

        except AttributeError:
            # print("No infobox: {}".format(key))
            suburbs_infobox[key] = None
            continue
    
    return suburbs_infobox


In [7]:
syd_suburb_infobox = get_suburb_wiki_infobox(syd_wiki_urls)

Function that scrapes data from wikipedia htmls of different suburbs in Australia. Here I apply it Sydney and Brisbane.

In [8]:
def get_suburb_info(suburb_infobox):
    CHECK_STRINGS = (
                     'Postcode',
                     'Population',
                     'LGA',
                     'District'
                    )

    suburbs = {}

    for key,value in suburb_infobox.items():
        key = key.split('(')
        key = key[0]
        print(key)
        
        if value:
            items = {}

            for val in value:        

                check_list = []
                for check_sting in CHECK_STRINGS:
                    if check_sting in val.text:
                        break
                                        
#                 if 'LGA' in val.text:
                if any(s in val.text for s in ['LGA',
                                               'District']):
                    val_td = val.find('td', class_='')
                    lga_list = val_td.findAll('a', href=True)

                    dummy_list = []
                    for lga in lga_list:
                        try:
                            lga_title = lga['title']
                            if '(New South Wales)' in lga_title:
                                lga_title = lga_title.replace('(New South Wales)','')
                            elif '(Queensland)' in lga_title:
                                lga_title = lga_title.replace('(Queensland)','')
                            elif '(Brisbane City)' in lga_title:
                                lga_title = lga_title.replace('(Brisbane City)','')
                            elif '(City of Brisbane)' in lga_title:
                                lga_title = lga_title.replace('(City of Brisbane)','')
                            elif ' (district)' in lga_title:
                                lga_title = lga_title.replace(' (district)','')
                            elif 'District of' in lga_title:
                                lga_title = lga_title.replace('District of ','')
                            elif '(Tasmania)' in lga_title:
                                lga_title = lga_title.replace(' (Tasmania)','')
                            elif ' (page does not exist)' in lga_title:
                                lga_title = lga_title.replace(' (page does not exist)','')
                                
                            dummy_list.append(lga_title)
                                
                        except(KeyError):
                            pass

                    if len(dummy_list) == 1:
                        items['LGA_1'] = ', '.join(dummy_list)
                        suburbs[key] = items
                    elif len(dummy_list) > 1:
                        for index, lga in enumerate(dummy_list):
                            items['LGA_{}'.format(index + 1)] = lga
                            suburbs[key] = items
                            
#                 if any(s in check_list for s in ['Population', 'Area', 'Density', 'Postcode']):
                elif any(s in val.text for s in ['Population', 'Postcode']):
                    keyword = check_sting
                    infobox_split = val.text.split(' ')

                    if len(infobox_split) > 1:
                        infobox_item = [s for s in infobox_split if str(keyword) in s]
                        if infobox_item[0] == 'Population':
                            info = infobox_split[1]
                        else:
                            info = infobox_item[0]
                        # Remove substrings from string
                        # info = infobox_item[0]
                        info = info.replace('Population','')\
                                   .replace(',','')\
                                   .replace('Postcode(s)','')\
                                   .replace('District','')\
                        
                        if '[' in info:
                            info = info.split('[')
                            info = info[0]
                        if '\xa0(' in info:
                            info = info.split('\xa0(')
                            info = info[0]

                        try:
                            if keyword == 'Population':
                                info = int(info)
                            items[keyword] = info
                        except(ValueError):
                            pass
                            items[keyword] = float('nan')

                        suburbs[key] = items

                    elif len(infobox_split) == 1:
                        keyword = check_sting
                        info = infobox_split[0]
                        info = info.replace('Postcode(s)','')\
                                   .replace('Population','')\
                                   .replace('District','')\
                                   .replace(',','')\
                                   .replace('\xa0(2016)','')\
                                   .replace('.','')

                        
                        if '[' in info:
                            info = info.split('[')
                            info = info[0]
                        if '\xa0(' in info:
                            info = info.split('\xa0(')
                            info = info[0]

                        if keyword == 'Population':
                            info = int(info)
                        items[keyword] = info

                        suburbs[key] = items

                else:
                    continue
        else:
            # For suburbs that don't have an information box on their Wikipedia page.
            continue
            
    return suburbs

In [9]:
sydney_suburbs_dict = get_suburb_info(syd_suburb_infobox)

Bankstown
Bondi Beach
Chatswood
Cronulla
Liverpool
Manly
Milsons Point
Mosman
Newtown
Parramatta
The Rocks
Abbotsbury
Abbotsford
Acacia Gardens
Agnes Banks
Airds
Alexandria
Alfords Point
Allambie Heights
Allawah
Ambarvale
Annandale
Annangrove
Arcadia
Arncliffe
Arndell Park
Artarmon
Ashbury
Ashcroft
Ashfield
Asquith
Auburn
Austral
Avalon Beach
Badgerys Creek
Balgowlah
Balgowlah Heights
Balmain
Balmain East
Bangor
Banksia
Banksmeadow
Bankstown Airport
Barangaroo
Barden Ridge
Bardia
Bardwell Park
Bardwell Valley
Bass Hill
Baulkham Hills
Bayview
Beacon Hill
Beaconsfield
Beaumont Hills
Beecroft
Belfield
Bella Vista
Bellevue Hill
Belmore
Belrose
Berala
Berkshire Park
Berowra
Berowra Creek
Berowra Heights
Berowra Waters
Berrilee
Beverley Park
Beverly Hills
Bexley
Bexley North
Bickley Vale
Bidwill
Bilgola Beach
Bilgola Plateau
Birchgrove
Birrong
Blackett
Blacktown
Blair Athol
Blairmount
Blakehurst
Bligh Park
Bondi
Bondi Junction
Bonnet Bay
Bonnyrigg
Bonnyrigg Heights
Bossley Park
Botany
Bow Bo

In [10]:
# Convert Sydney suburbs nested dictionary to a dataframe
sydney_suburbs_df = pd.DataFrame(sydney_suburbs_dict).T
sydney_suburbs_df['Suburb'] = sydney_suburbs_df.index
sydney_suburbs_df.index = range(sydney_suburbs_df.shape[0])
# sydney_suburbs_df

In [11]:
sydney_suburbs_df['Population'] = pd.to_numeric(sydney_suburbs_df['Population'], downcast='integer')

In [12]:
df = pd.DataFrame(columns=['LGA',
                           'Population',
                           'Postcode',
                           'Suburb'])

for row in sydney_suburbs_df.iterrows():
    for col in range(0,4):
        if row[1][col] == row[1][col]:
            # 4 = Population
            # 5 = Postcode
            # 6 = Suburb
            df.loc[len(df)] = [row[1][col],
                               row[1][4],
                               row[1][5],
                               row[1][6]]
        else:
            continue

sydney_suburbs_df = df
sydney_suburbs_df.sort_values(by='Suburb', ascending=True, inplace=True)
sydney_suburbs_df.reset_index(inplace=True, drop=True)
# sydney_suburbs_df

### Suburbs that didn't have a postcode on their wikipage.

In [13]:
missing_postcodes = sydney_suburbs_df[sydney_suburbs_df['Postcode'].isnull()]
missing_postcodes.reset_index(inplace=True, drop=True)
# missing_postcodes

Two things that stick out:

* Macarther is a south-west NSW region made up of three local LGA's and therefore is considered as as a suburb.
* Bankstown Airport is an airport not a suburb.
* Royal National Park is a coastline park.
* Ku-ring-gai Chase is a park in the northern region of sydney.

Therefore, Macarthur and Bankstown Airport will be removed from the dataframes.

In [14]:
# Drop Macarthur row.
to_drop = ['Bankstown Airport',
           'Macarthur',
           'Royal National Park',
           'Ku-ring-gai Chase']

sydney_suburbs_df = sydney_suburbs_df[~sydney_suburbs_df['Suburb'].isin(to_drop)]
sydney_suburbs_df.reset_index(inplace=True, drop=True)
missing_postcodes = missing_postcodes[~missing_postcodes['Suburb'].isin(to_drop)]
missing_postcodes.reset_index(inplace=True, drop=True)

Function to get postcodes with geopy

In [15]:
def get_missing_postcodes_geopy(missing_pc_df, city_str, state_str):
    geolocator = Nominatim(user_agent="specify_your_app_name_here", timeout=3)
    
    geo_pc = {}
    
    for index,row in missing_pc_df.iterrows():
        suburb = row['Suburb']
        location = geolocator.geocode("{}, {}, {}, Australia".format(suburb, city_str, state_str), geometry='geojson')
        
        display_name = location.raw['display_name']
        pc = re.findall(r'\b\d+\b', display_name)
        
        try:
            geo_pc[suburb] = pc[0]
        except(IndexError):
            # Suburbs that don't have postcodes. Need to find and fill in manually.
            geo_pc[suburb] = 'nan'

    return geo_pc

In [16]:
geo_pc_dict = get_missing_postcodes_geopy(missing_postcodes, 'Sydney', 'New South Wales')

Fill nan postcodes from sydney_suburbs_df with postcodes from dictionary

In [17]:
sydney_suburbs_df.Postcode = sydney_suburbs_df.Postcode.fillna(sydney_suburbs_df.Suburb.map(geo_pc_dict))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Group suburbs by LGA in dataframe

In [18]:
sydney_suburbs_df = sydney_suburbs_df.groupby(['Suburb', 'Postcode']).agg({'LGA' :', '.join, 'Population':'mean'}).reset_index().reindex(columns=sydney_suburbs_df.columns)

# Get Geographical Coordinates

Function to get the list of geographical coordiantes for each suburb in the city

In [19]:
gmaps_key = googlemaps.Client(key = GOOGLE_MAPS_API)

# Create Geocode result object
sydney_suburbs_df['LAT'] = None
sydney_suburbs_df['LON'] = None

for index, row in sydney_suburbs_df.iterrows():
    geocode_result = gmaps_key.geocode("{}, {}, New South Wales".format(row['Suburb'], row['Postcode']))
    try:
        lat = geocode_result[0]['geometry']['location']['lat']
        lon = geocode_result[0]['geometry']['location']['lng']
                
        sydney_suburbs_df.iat[index, sydney_suburbs_df.columns.get_loc("LAT")] = lat
        sydney_suburbs_df.iat[index, sydney_suburbs_df.columns.get_loc("LON")] = lon
    except:
        lat = None
        lon = None

# Calculate geographical distance to Sydney CBD

Function that calculates the geographical distance between two sets of latitude and longitude coordinates and outputs hte list as a dataframe

In [20]:
def calc_geo_dist(lat1, lon1, lat2, lon2):
    R = 6373.0
    
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)
    
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

In [21]:
sydney_suburbs_df['distance'] = None

CBD_LAT = sydney_suburbs_df[sydney_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LAT']
CBD_LON = sydney_suburbs_df[sydney_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LON']

for index, row in sydney_suburbs_df.iterrows():
    
    row_lat = row['LAT']
    row_lon = row['LON']
    distance = calc_geo_dist(CBD_LAT, CBD_LON, row_lat, row_lon)
    
    sydney_suburbs_df.iat[index, sydney_suburbs_df.columns.get_loc("distance")] = distance

# Get geographical area

In [22]:
with open('suburb_boundaries_nsw.json') as f:
    d = json.load(f)

In [23]:
geo_boundary = pd.DataFrame.from_dict(d['features'])
geo_boundary.drop(['geometry_name', 'id', 'type'], axis=1, inplace=True)

geo_boundary['Postcode'] = None

for index, row in geo_boundary.iterrows():

    suburb = row['properties']['nsw_loca_2']
    postcode = row['properties']['nsw_loca_4']
    geo_boundary.iat[index, geo_boundary.columns.get_loc('properties')] = suburb.title()
    
    try:
        if postcode == None:
            postcode = sydney_suburbs_df.loc[sydney_suburbs_df['Suburb'] == suburb.title()]['Postcode'].values[0]
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
        else:
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
    except(IndexError):
        continue
    
geo_boundary.rename(columns={'properties':'Suburb'}, inplace=True)

# Calculate geographical area
geo_boundary['area'] = geo_boundary['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

In [24]:
# Add polygons to dataframe
sydney_suburbs_df = sydney_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])

# Drop duplicate rows
sydney_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)

# Calculate population density

In [25]:
sydney_suburbs_df['density'] = (sydney_suburbs_df['Population'] / sydney_suburbs_df['area'])

# Investigate missing data

In [26]:
print('Total number of missing values in Area column: {}'.format(sydney_suburbs_df.shape[0]-sydney_suburbs_df['Population'].count()))
print('Total number of missing values in Density column: {}'.format(sydney_suburbs_df.shape[0]-sydney_suburbs_df['density'].count()))

Total number of missing values in Area column: 21
Total number of missing values in Density column: 21


In [27]:
print('Percentage of missing data in Population column: {} %'.format(round(((sydney_suburbs_df.shape[0]-sydney_suburbs_df['Population'].count())/sydney_suburbs_df.shape[0])*100, 2)))
print('Percentage of missing data in Density column: {} %'.format(round(((sydney_suburbs_df.shape[0]-sydney_suburbs_df['density'].count())/sydney_suburbs_df.shape[0])*100, 2)))

Percentage of missing data in Population column: 3.15 %
Percentage of missing data in Density column: 3.15 %


Number of missing values for both columns is approximately the same. However, I will inspect the suburbs that only have area data but not density data.

In [28]:
# Split dataframe into full data vs missing data
full_data = sydney_suburbs_df[sydney_suburbs_df['Population'].notnull() & (sydney_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = sydney_suburbs_df[sydney_suburbs_df['Population'].isnull() | (sydney_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)
# missing_data

In [29]:
list(np.unique(missing_data['Suburb'].values))

['Bickley Vale',
 'Camellia',
 'Chullora',
 'Clyde',
 'Cornwallis',
 'Currawong Beach',
 'Forest Glen',
 'Huntingwood',
 'Len Waters Estate',
 'Lucas Heights',
 'Macquarie Links',
 'Minchinbury',
 'North Kellyville',
 'Norwest',
 'Picnic Point',
 'Pitt Town Bottoms',
 'Pleasure Point',
 'Port Botany',
 'Rookwood',
 'South Windsor',
 'Wisemans Ferry']

Here we see that the data that is mainly missing is the suburb's population (and density, however density is calculated from the population value).

I will search for the population information individually for the suburbs listed above on the Australian Bureau Statistics. If the suburb does not contain any population information, I will either remove the suburb from the list or replace with some constant value outside a fixed value range (-999).

#### Suburbs with population statistics gathered from Australian Bureau Statistics (ABS) 
* **Cattai** population of [790](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC10859?opendocument).
* **Cornwallis** population of [53](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC11078?opendocument).
* **Forest Glen** population of [65](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC11542?opendocument).
* **Macquarie Links** population of [1360](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC12435?opendocument).
* **Minchinbury** population of [5619](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC12633?opendocument).
* **Pleasure Point** population of [528](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC13229?opendocument).
* **Picnic Point** population of [6160](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC13199?opendocument)
* **Pitt Town Bottoms** population of [102](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC13227?opendocument)
* **South Windsor** population of [5892](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2011/quickstat/SSC12119).
* **Wisemans Ferry** population of [220](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC14344?opendocument).

In [30]:
ABS_population = {'Cattai': 790,
                  'Cornwallis': 53,
                  'Forest Glen': 65,
                  'Macquarie Links': 1360,
                  'Minchinbury': 5619,
                  'Pleasure Point': 528,
                  'Picnic Point': 6160,
                  'Pitt Town Bottoms': 102,
                  'South Windsor': 5892,
                  'Wisemans Ferry': 220}

In [31]:
missing_data.Population = missing_data.Population.fillna(missing_data.Suburb.map(ABS_population))

#### Suburbs to be removed from the dataframe
Ports, industrial and commercial suburbs, and suburbs that specically state it does not contain a residential area will be removed from the dataframe. These suburbs include:
* **Camellia** is predominately an industrial suburb and no information on ABS.
* **Chullora** is predominately an industrial area and no information on ABS.
* **Clyde** is exclusively an industrial and commercial area. Wikipedia states, 'Clyde has no permanent population'. And no information on ABS.
* **Huntingwood** is predominately an industrial suburb and no information on ABS.
* **Lucas Heights** 'does not contain a residential area' according to Wikipedia and no information on ABS.
* **Port Botany** is a seaport suburb dominated by traide in containerised manufactured products. Therefore no residence. And no information on ABS.
* **Bickley Vale**, no information on Australian Bureau Statistics (ABS).
* **Currawang Beach**, no information on ABS.
* **McCarrs Creek**, new suburb since 2012 and no information on ABS.
* **Rookwood**, no information on ABS.


In [32]:
missing_data = missing_data[~missing_data['Suburb'].isin(['Bickley Vale',
                                                          'Camellia',
                                                          'Chullora',
                                                          'Clyde',
                                                          'Currawong Beach',
                                                          'Huntingwood',
                                                          'Len Waters Estate',
                                                          'Lucas Heights',
                                                          'McCarrs Creek',
                                                          'Port Botany',
                                                          'Rookwood'])]

#### Suburbs where the population will be interpolated
* **North Kellyville** was officially proclaimed a suburb on 29th June 2018 and therefore has no information on ABS. However, since it was [previously part of Kellyville](https://en.wikipedia.org/wiki/North_Kellyville,_New_South_Wales) I will use Kellyville's population density for North Kellyville and back calculate the population.
* **Norwest** was officially proclaimed a suburb on 29th June 2018 and therefore has no information on ABS. However, since it was [previously part of Kellyville and Baulkham Hills](https://en.wikipedia.org/wiki/Norwest,_New_South_Wales) I will use the mean population density of Kellyville and Baulkham Hills as the density of Norwest and back calculate the population.

In [33]:
# Get population density for Kellyville
kellyville_density = sydney_suburbs_df.loc[sydney_suburbs_df['Suburb'] == 'Kellyville']['density'].values[0]

# Get index for North Kellyville
index = missing_data.loc[missing_data['Suburb'] == 'North Kellyville'].index.values[0]

# Replace density of North Kellyville with Kellyville density
missing_data.at[index, 'density'] = kellyville_density

# Get North Kellyville area
north_kellyville_area = sydney_suburbs_df.loc[sydney_suburbs_df['Suburb'] == 'North Kellyville']['area'].values[0]

# Calculate population of North Kellyville with North Kellyville area with Kellyville density
missing_data.at[index, 'Population'] = round(north_kellyville_area * kellyville_density, 0)

In [34]:
# Get population for Baulkham Hills
BaulkhamHills_density = sydney_suburbs_df.loc[sydney_suburbs_df['Suburb'] == 'Baulkham Hills']['density'].values[0]

# Calculate mean density of Kellyville and Baulkham Hills
mean_density = round(np.mean([kellyville_density, BaulkhamHills_density]), 0)

# Replace density of Norwest
missing_data.at[index, 'density'] = mean_density

# Get Norwest area
norwest_area = sydney_suburbs_df.loc[sydney_suburbs_df['Suburb'] == 'Norwest']['area'].values[0]

# Get index for Norwest
index = missing_data.loc[missing_data['Suburb'] == 'Norwest'].index.values[0]

# Calculate population of Norwest with Norwest area with mean density
missing_data.at[index, 'Population'] = round(norwest_area * mean_density, 0)

# Calculate missing population densities

In [35]:
missing_data['density'] = round(missing_data['Population'] / missing_data['area'], 0)

# Combine the full data dataframe with the missing data dataframe

In [36]:
sydney_suburbs = pd.concat([full_data, missing_data], ignore_index=True)

In [37]:
sydney_suburbs['State'] = 'New South Wales'

In [38]:
sydney_suburbs.to_csv('sydney_suburbs.csv')

# Scrape Brisbane suburbs from Wikipedia

In [None]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Brisbane_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

bri_suburbs_section = soup.find('div',attrs={'id':'mw-content-text', 'class':'mw-content-ltr'})
bri_suburbs_section = bri_suburbs_section.find_all('a', href=True)

In [None]:
wiki_state_name = ', Queensland'
wiki_link_extension = ',_Queensland'
city = 'Brisbane'
bne_wiki_urls = get_wiki_urls(bri_suburbs_section, wiki_link_extension, wiki_state_name, state)

In [None]:
bne_suburb_infobox = get_suburb_wiki_infobox(bne_wiki_urls)

In [None]:
brisbane_suburbs_dict = get_suburb_info(bne_suburb_infobox)

In [None]:
# Convert Brisbane suburbs nested dictionary to a dataframe
brisbane_suburbs_df = pd.DataFrame(brisbane_suburbs_dict).T
brisbane_suburbs_df['Suburb'] = brisbane_suburbs_df.index
brisbane_suburbs_df.index = range(brisbane_suburbs_df.shape[0])

In [None]:
brisbane_suburbs_df['Population'] = pd.to_numeric(brisbane_suburbs_df['Population'], downcast='integer')

In [None]:
df = pd.DataFrame(columns=['LGA', 'Population', 'Postcode', 'Suburb'])

for row in brisbane_suburbs_df.iterrows():
    for col in range(0,4):
        if row[1][col] == row[1][col]:
            # 4 = Population
            # 5 = Postcode
            # 6 = Suburb
            df.loc[len(df)] = [row[1][col],
                               row[1][4],
                               row[1][5],
                               row[1][6]]
        else:
            continue

brisbane_suburbs_df = df
brisbane_suburbs_df.sort_values(by='Suburb', ascending=True, inplace=True)
brisbane_suburbs_df.reset_index(inplace=True, drop=True)

### Suburbs that didn't have a postcode on their wikipage.

In [None]:
missing_postcodes = brisbane_suburbs_df[brisbane_suburbs_df['Postcode'].isnull()]
missing_postcodes.reset_index(inplace=True, drop=True)
missing_postcodes

Kalinga does not have a postcode in the information box on its Wikipedia page.

The postcode for Kalinga is [4030](https://www.australiapostcode.com/qld-kalinga.html).

In [None]:
# Get index for Kalinga
index = brisbane_suburbs_df.loc[brisbane_suburbs_df['Suburb'] == 'Kalinga'].index.values[0]

# Replace density of North Kellyville with Kellyville density
brisbane_suburbs_df.at[index, 'Postcode'] = 4030

Group suburbs by LGA in dataframe

In [None]:
brisbane_suburbs_df = brisbane_suburbs_df.groupby(['Suburb', 'Postcode']).agg({'LGA' : ', '.join,
                                        'Population': 'mean'}).reset_index().reindex(columns=brisbane_suburbs_df.columns)

# Get Geographical Coordinates

In [None]:
# Set Google Maps API key
gmaps_key = googlemaps.Client(key = GOOGLE_MAPS_API)

In [None]:
# Create Geocode result object
brisbane_suburbs_df['LAT'] = None
brisbane_suburbs_df['LON'] = None

for index, row in brisbane_suburbs_df.iterrows():
    geocode_result = gmaps_key.geocode("{}, {}, Queensland".format(row['Suburb'], row['Postcode']))
    try:
        lat = geocode_result[0]['geometry']['location']['lat']
        lon = geocode_result[0]['geometry']['location']['lng']
                
        brisbane_suburbs_df.iat[index, brisbane_suburbs_df.columns.get_loc("LAT")] = lat
        brisbane_suburbs_df.iat[index, brisbane_suburbs_df.columns.get_loc("LON")] = lon
    except:
        lat = None
        lon = None

# Calculate geographical distance

In [None]:
brisbane_suburbs_df['distance'] = None

CBD_LAT = brisbane_suburbs_df[brisbane_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LAT']
CBD_LON = brisbane_suburbs_df[brisbane_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LON']

for index, row in brisbane_suburbs_df.iterrows():
    
    row_lat = row['LAT']
    row_lon = row['LON']
    distance = calc_geo_dist(CBD_LAT, CBD_LON, row_lat, row_lon)
    
    brisbane_suburbs_df.iat[index, brisbane_suburbs_df.columns.get_loc("distance")] = distance

In [None]:
brisbane_suburbs_df.head()

# Get Geographical Boundary

In [None]:
with open('suburb_boundaries_qld.json') as f:
    d = json.load(f)

In [None]:
geo_boundary = pd.DataFrame.from_dict(d['features'])
geo_boundary.drop(['geometry_name', 'id', 'type'], axis=1, inplace=True)

geo_boundary['Postcode'] = None

for index, row in geo_boundary.iterrows():

    suburb = row['properties']['qld_loca_2']
    postcode = row['properties']['qld_loca_4']
    geo_boundary.iat[index, geo_boundary.columns.get_loc('properties')] = suburb.title()
    
    try:
        if postcode == None:
            postcode = brisbane_suburbs_df.loc[brisbane_suburbs_df['Suburb'] == suburb.title()]['Postcode'].values[0]
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
        else:
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
    except(IndexError):
        continue
    
geo_boundary.rename(columns={'properties':'Suburb'}, inplace=True)

# Calculate geographical area
geo_boundary['area'] = geo_boundary['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

# Merge area and polygons to dataframe
brisbane_suburbs_df = brisbane_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])

# Drop duplicate rows
brisbane_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)

# Calculate population density

In [None]:
brisbane_suburbs_df['density'] = (brisbane_suburbs_df['Population'] / brisbane_suburbs_df['area'])

# Investigate missing data

In [None]:
print('Total number of missing values in Population column: {}'.format(brisbane_suburbs_df.shape[0]-brisbane_suburbs_df['Population'].count()))

In [None]:
print('Percentage of missing data in Population column: {} %'.format(round(((brisbane_suburbs_df.shape[0]-brisbane_suburbs_df['Population'].count())/brisbane_suburbs_df.shape[0])*100, 2)))

In [None]:
# Split dataframe into full data vs missing data
full_data = brisbane_suburbs_df[brisbane_suburbs_df['Population'].notnull() & (brisbane_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = brisbane_suburbs_df[brisbane_suburbs_df['Population'].isnull() | (brisbane_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)
# missing_data

In [None]:
list(np.unique(missing_data['Suburb'].values))

#### Suburbs with population statistics gathered from Australian Bureau Statistics (ABS) 
* Bellthorpe population of [124](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC30225?opendocument).
* Blacksoil population of [104](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC30280?opendocument).
* Campbells Pocket population of [80](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC30512?opendocument).
* Jeebropilly population of [7](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31450?opendocument).
* Jollys Lookout population of [76](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31464?opendocument).
* Kagaru population of [13](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31481?opendocument).
* Kalinga population of [2126](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31487?opendocument).
* Lyons population of [32](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31725?opendocument).
* Mount Forbes population of [263](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31979?opendocument).
* Mutdapilly population of [308](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32111?opendocument).
* New Chum population of ...
* Samford Valley population of [3068](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32529?opendocument).
* Samford Village population of [796](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32530?opendocument).
* South Maclean population of [1362](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32620?opendocument).
* Stones Corner population of ...
* Undullah population of [45](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32927?opendocument).
* Veresdale population of [392](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32966?opendocument).
* Woodhill population of [723](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC33164?opendocument).



#### Population information gathered from other sources
* New Chum population of [3074](https://profile.id.com.au/ipswich/population?WebID=260).
* Stones Corner population of [9720](https://www.brisbane.qld.gov.au/sites/default/files/20170512-stones_corner_local_business_and_destination_plan.pdf).

In [None]:
ABS_population = {'Bellthorpe': 124,
                  'Blacksoil': 104,
                  'Campbells Pocket': 80,
                  'Jeebropilly': 7,
                  'Jollys Lookout': 76,
                  'Kagaru': 13,
                  'Kalinga': 2126,
                  'Lyons': 32,
                  'Mount Forbes': 263,
                  'Mutdapilly': 308,
                  'New Chum': 3074,
                  'Samford Valley': 3068,
                  'Samford Village': 796,
                  'South Maclean': 1362,
                  'Stones Corner': 9720,
                  'Undullah': 45,
                  'Veresdale': 392,
                  'Woodhill': 723}

In [None]:
missing_data.Population = missing_data.Population.fillna(missing_data.Suburb.map(ABS_population))

Swanbank is predominately an industrial area and has no information on ABS.

In [None]:
missing_data = missing_data[~missing_data['Suburb'].isin(['Swanbank'])]

# Calculating missing population densities

In [None]:
missing_data['density'] = round(missing_data['Population'] / missing_data['area'], 0)

# Combine full dataframe with missing dataframe

In [None]:
brisbane_suburbs = pd.concat([full_data, missing_data], ignore_index=True)

In [None]:
brisbane_suburbs['State'] = 'Queensland'

In [None]:
brisbane_suburbs.to_csv('brisbane_suburbs.csv')

# Scrape Melbourne suburbs from Wikipedia

In [None]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Melbourne_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

melb_suburbs_section = soup.find('div',attrs={'id':'mw-content-text', 'class':'mw-content-ltr'})
melb_suburbs_section = melb_suburbs_section.find_all('a', href=True)

In [None]:
wiki_state_name = ', Victoria'
wiki_link_extension = ',_Victoria'
city = 'Melbourne'
melb_wiki_urls = get_wiki_urls(melb_suburbs_section, wiki_link_extension, wiki_state_name, state)

In [None]:
melb_suburb_infobox = get_suburb_wiki_infobox(melb_wiki_urls)

In [None]:
melbourne_suburbs_dict = get_suburb_info(melb_suburb_infobox)

In [None]:
# Convert Melbourne suburbs nested dictionary to a dataframe
melbourne_suburbs_df = pd.DataFrame(melbourne_suburbs_dict).T
melbourne_suburbs_df['Suburb'] = melbourne_suburbs_df.index
melbourne_suburbs_df.index = range(melbourne_suburbs_df.shape[0])

# Convert Melbourne city centre to Melbourne central business district
index = melbourne_suburbs_df.loc[melbourne_suburbs_df['Suburb'] == 'Melbourne city centre'].index[0]
melbourne_suburbs_df.at[index, 'Suburb'] = 'Melbourne central business district'

# Convert population to an integer
melbourne_suburbs_df['Population'] = pd.to_numeric(melbourne_suburbs_df['Population'], downcast='integer')

melbourne_suburbs_df.head()

In [None]:
df = pd.DataFrame(columns=['LGA', 'Population', 'Postcode', 'Suburb'])

for row in melbourne_suburbs_df.iterrows():
    for col in range(0,3):
        if row[1][col] == row[1][col]:
            # 3 = Population
            # 4 = Postcode
            # 5 = Suburb
            df.loc[len(df)] = [row[1][col],
                               row[1][3],
                               row[1][4],
                               row[1][5]]
        else:
            continue

melbourne_suburbs_df = df
melbourne_suburbs_df.sort_values(by='Suburb', ascending=True, inplace=True)
melbourne_suburbs_df.reset_index(inplace=True, drop=True)

Group suburbs by LGA in dataframe

In [None]:
melbourne_suburbs_df = melbourne_suburbs_df.groupby(['Suburb', 'Postcode']).agg({'LGA' : ', '.join,
                                        'Population': 'mean'}).reset_index().reindex(columns=melbourne_suburbs_df.columns)

# Get geographical coordinates

In [None]:
# Set Google Maps API key
gmaps_key = googlemaps.Client(key = GOOGLE_MAPS_API)

# Create Geocode result object
melbourne_suburbs_df['LAT'] = None
melbourne_suburbs_df['LON'] = None

for index, row in melbourne_suburbs_df.iterrows():
    geocode_result = gmaps_key.geocode("{}, {}, Victoria".format(row['Suburb'], row['Postcode']))
    try:
        lat = geocode_result[0]['geometry']['location']['lat']
        lon = geocode_result[0]['geometry']['location']['lng']
                
        melbourne_suburbs_df.iat[index, melbourne_suburbs_df.columns.get_loc("LAT")] = lat
        melbourne_suburbs_df.iat[index, melbourne_suburbs_df.columns.get_loc("LON")] = lon
    except:
        lat = None
        lon = None

# Calculate geographical distance

In [None]:
melbourne_suburbs_df['distance'] = None

CBD_LAT = melbourne_suburbs_df[melbourne_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LAT']
CBD_LON = melbourne_suburbs_df[melbourne_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LON']

for index, row in melbourne_suburbs_df.iterrows():
    
    row_lat = row['LAT']
    row_lon = row['LON']
    distance = calc_geo_dist(CBD_LAT, CBD_LON, row_lat, row_lon)
    
    melbourne_suburbs_df.iat[index, melbourne_suburbs_df.columns.get_loc("distance")] = distance

# Calculate geographical area

In [None]:
with open('suburb_boundaries_vic.json') as f:
    d = json.load(f)

In [None]:
geo_boundary = pd.DataFrame.from_dict(d['features'])
geo_boundary.drop(['geometry_name', 'id', 'type'], axis=1, inplace=True)

geo_boundary['Postcode'] = None

for index, row in geo_boundary.iterrows():

    suburb = row['properties']['vic_loca_2']
    postcode = row['properties']['vic_loca_4']
    geo_boundary.iat[index, geo_boundary.columns.get_loc('properties')] = suburb.title()
    
    try:
        if postcode == None:
            postcode = melbourne_suburbs_df.loc[melbourne_suburbs_df['Suburb'] == suburb.title()]['Postcode'].values[0]
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
        else:
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
    except(IndexError):
        continue
    
geo_boundary.rename(columns={'properties':'Suburb'}, inplace=True)

# Calculate geographical area
geo_boundary['area'] = geo_boundary['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

# Merge area and polygons to dataframe
melbourne_suburbs_df = melbourne_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])

# Drop duplicate rows
melbourne_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)

# Calculate population density

In [None]:
melbourne_suburbs_df['density'] = (melbourne_suburbs_df['Population'] / melbourne_suburbs_df['area'])

# Investigate missing data

In [None]:
print('Total number of missing values in Population column: {}'.format(melbourne_suburbs_df.shape[0]-melbourne_suburbs_df['Population'].count()))

In [None]:
print('Percentage of missing data in Population column: {} %'.format(round(((melbourne_suburbs_df.shape[0]-melbourne_suburbs_df['Population'].count())/melbourne_suburbs_df.shape[0])*100, 2)))

In [None]:
# Split dataframe into full data vs missing data
full_data = melbourne_suburbs_df[melbourne_suburbs_df['Population'].notnull() & (melbourne_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = melbourne_suburbs_df[melbourne_suburbs_df['Population'].isnull() | (melbourne_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)
missing_data

In [None]:
print(missing_data['Suburb'].values)

#### Suburb population for the following suburbs on ABS:
   * Beenak - [25](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC20186?opendocument)
   * Big Pats Creek - [73](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC20236?opendocument)
   * Gilderoy - [65](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC20988?opendocument)
   * Mount Toolebewong - [140](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC21802?opendocument)
   * Reefton - [59](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC22147?opendocument)
   * Tarrawarra - [78](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC22445?opendocument)
   * The Patch - [1065](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC22489?opendocument)
   * Wandin North - [3051](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC22660?opendocument)
   * Yering - [115](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC22915?opendocument)

#### Suburb population from other sources:
   * Manor Lakes - [8667](https://www.wyndham.vic.gov.au/sites/default/files/2017-07/Manor%20Lakes%20suburb%20profile.docx)
   * Somerton - [5288](https://profile.id.com.au/s_campbellfield-somerton/population-of-campbellfield-somerton)

#### The following suburbs are removed from the list:
   * Aintree - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Aintree,_Victoria) therefore no population information.
   * Bonnie Brook - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Aintree,_Victoria) therefore no population information.
   * Calder Park - primarily a [race track](https://en.wikipedia.org/wiki/Calder_Park,_Victoria) therefore no residence
   * Camabarville - much of its [area is part of the Yarra Ranges National Park](https://en.wikipedia.org/wiki/Cambarville,_Victoria)
   * Cobblebank - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Aintree,_Victoria) therefore no population information.
   * Cocoroc - primarily a [treatment plant](https://en.wikipedia.org/wiki/Cocoroc,_Victoria)
   * Deanside - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Aintree,_Victoria) therefore no population information.
   * Essendon Fields - is primarily an [airport and commercial area](https://en.wikipedia.org/wiki/Essendon_Fields,_Victoria)
   * Fernshaw - much of its [area is part of the Yarra Ranges National Park](https://en.wikipedia.org/wiki/Fernshaw,_Victoria)
   * Fieldstone - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Fieldstone,_Victoria) therefore no population information.
   * Fraser Rise - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Fraser_Rise,_Victoria) therefore no population information.
   * Grangefields - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Grangefields,_Victoria) therefore no population information.
   * Harkness - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Harkness,_Victoria) therefore no population information.
   * Matlock - no information
   * Quandong - [no population information](https://www.realestateinvestar.com.au/property/quandong)
   * Strathtulloh - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Strathtulloh,_Victoria) therefore no population information.
   * Thornhill Park - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Thornhill_Park,_Victoria) therefore no population information.
   * Toorongo - much of its [area is part of the Yarra Ranges National Park](https://en.wikipedia.org/wiki/Toorongo,_Victoria)
   * Weir Views - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Weir_Views,_Victoria) therefore no population information.

In [None]:
ABS_population = {'Beenak': 25,
                  'Big Pats Creek': 73,
                  'Gilderoy': 65,
                  'Mount Toolebewong': 140,
                  'Reefton': 59,
                  'Tarrawarra': 78,
                  'The Patch': 1065,
                  'Wandin North': 3051,
                  'Yering': 115,
                  'Manor Lakes': 8667,
                  'Matlock': 4,
                  'Somerton': 5288}

missing_data.Population = missing_data.Population.fillna(missing_data.Suburb.map(ABS_population))

Drop the following suburbs.

In [None]:
missing_data = missing_data[~missing_data['Suburb'].isin(['Aintree',
                                                          'Bonnie Brook',
                                                          'Calder Park',
                                                          'Cambarville',
                                                          'Cobblebank',
                                                          'Cocoroc',
                                                          'Deanside',
                                                          'Essendon Fields',
                                                          'Fernshaw',
                                                          'Fieldstone',
                                                          'Fraser Rise',
                                                          'Grangefields',
                                                          'Harkness',
                                                          'Matlock',
                                                          'Quandong',
                                                          'Strathtulloh',
                                                          'Thornhill Park',
                                                          'Toorongo',
                                                          'Weir Views'])]

# Calculating missing population densities

In [None]:
missing_data['density'] = round(missing_data['Population'] / missing_data['area'], 0)

In [None]:
melbourne_suburbs = pd.concat([full_data, missing_data], ignore_index=True)

In [None]:
melbourne_suburbs['State'] = 'Melbourne'

In [None]:
melbourne_suburbs.to_csv('melbourne_suburbs.csv')

# Scrape Canberra suburbs from Wikipedia

In [None]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Canberra_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

cbr_suburbs_section = soup.find('div',attrs={'id':'mw-content-text', 'class':'mw-content-ltr'})
cbr_suburbs_section = cbr_suburbs_section.find_all('a', href=True)

In [None]:
wiki_state_name = ', Australian Capital Territory'
wiki_link_extension = ',_Australian_Capital_Territory'
city = 'Canberra'
cbr_wiki_urls = get_wiki_urls(cbr_suburbs_section, wiki_link_extension, wiki_state_name, state)

In [None]:
cbr_suburb_infobox = get_suburb_wiki_infobox(cbr_wiki_urls)

In [None]:
canberra_suburbs_dict = get_suburb_info(cbr_suburb_infobox)

In [None]:
# Convert Canberra suburbs nested dictionary to a dataframe
canberra_suburbs_df = pd.DataFrame(canberra_suburbs_dict).T
canberra_suburbs_df['Suburb'] = canberra_suburbs_df.index
canberra_suburbs_df.index = range(canberra_suburbs_df.shape[0])

In [None]:
canberra_suburbs_df['Population'] = pd.to_numeric(canberra_suburbs_df['Population'], downcast='integer')

In [None]:
df = pd.DataFrame(columns=['LGA', 'Population', 'Postcode', 'Suburb'])

for row in canberra_suburbs_df.iterrows():
    for col in range(0,2):
        if row[1][col] == row[1][col]:
            # 3 = Population
            # 4 = Postcode
            # 5 = Suburb
            df.loc[len(df)] = [row[1][col],
                               row[1][2],
                               row[1][3],
                               row[1][4]]
        else:
            continue

canberra_suburbs_df = df
canberra_suburbs_df.sort_values(by='Suburb', ascending=True, inplace=True)
canberra_suburbs_df.reset_index(inplace=True, drop=True)

Check for suburbs with missing postcodes

In [None]:
canberra_suburbs_df[canberra_suburbs_df['Postcode'].isnull()]

In [None]:
canberra_suburbs_df.at[117, 'Postcode'] = '2615'

Group suburbs by LGA in dataframe

In [None]:
canberra_suburbs_df = canberra_suburbs_df.groupby(['Suburb', 'Postcode']).agg({'LGA' : ', '.join,
                                        'Population': 'mean'}).reset_index().reindex(columns=canberra_suburbs_df.columns)

# Get geographical coordinates

In [None]:
# Set Google Maps API key
gmaps_key = googlemaps.Client(key = GOOGLE_MAPS_API)

# Create Geocode result object
canberra_suburbs_df['LAT'] = None
canberra_suburbs_df['LON'] = None

for index, row in canberra_suburbs_df.iterrows():
    geocode_result = gmaps_key.geocode("{}, {}, Australian Capital Territory".format(row['Suburb'], row['Postcode']))
    try:
        lat = geocode_result[0]['geometry']['location']['lat']
        lon = geocode_result[0]['geometry']['location']['lng']
                
        canberra_suburbs_df.iat[index, canberra_suburbs_df.columns.get_loc("LAT")] = lat
        canberra_suburbs_df.iat[index, canberra_suburbs_df.columns.get_loc("LON")] = lon
    except:
        lat = None
        lon = None

# Calculate geographical distance

In [None]:
canberra_suburbs_df['distance'] = None

CBD_LAT = canberra_suburbs_df[canberra_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LAT']
CBD_LON = canberra_suburbs_df[canberra_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LON']

for index, row in canberra_suburbs_df.iterrows():
    
    row_lat = row['LAT']
    row_lon = row['LON']
    distance = calc_geo_dist(CBD_LAT, CBD_LON, row_lat, row_lon)
    
    canberra_suburbs_df.iat[index, canberra_suburbs_df.columns.get_loc("distance")] = distance

# Calculate geographical area

In [None]:
with open('suburb_boundaries_act.json') as f:
    d = json.load(f)

In [None]:
geo_boundary = pd.DataFrame.from_dict(d['features'])
geo_boundary.drop(['geometry_name', 'id', 'type'], axis=1, inplace=True)

geo_boundary['Postcode'] = None

for index, row in geo_boundary.iterrows():

    suburb = row['properties']['act_loca_2']
    postcode = row['properties']['act_loca_4']
    geo_boundary.iat[index, geo_boundary.columns.get_loc('properties')] = suburb.title()
    
    try:
        if postcode == None:
            postcode = canberra_suburbs_df.loc[canberra_suburbs_df['Suburb'] == suburb.title()]['Postcode'].values[0]
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
        else:
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
    except(IndexError):
        continue
    
geo_boundary.rename(columns={'properties':'Suburb'}, inplace=True)

# Calculate geographical area
geo_boundary['area'] = geo_boundary['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

# Merge area and polygons to dataframe
canberra_suburbs_df = canberra_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])

# Drop duplicate rows
canberra_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)

# Calculate population density

In [None]:
canberra_suburbs_df['density'] = (canberra_suburbs_df['Population'] / canberra_suburbs_df['area'])

# Investigate missing data

In [None]:
print('Total number of missing values in Population column: {}'.format(canberra_suburbs_df.shape[0]-canberra_suburbs_df['Population'].count()))
print('')
print('Percentage of missing data in Population column: {} %'.format(round(((canberra_suburbs_df.shape[0]-canberra_suburbs_df['Population'].count())/canberra_suburbs_df.shape[0])*100, 2)))

In [None]:
# Split dataframe into full data vs missing data
full_data = canberra_suburbs_df[canberra_suburbs_df['Population'].notnull() & (canberra_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = canberra_suburbs_df[canberra_suburbs_df['Population'].isnull() | (canberra_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)
missing_data

In [None]:
print(missing_data['Suburb'].values)

There was no residential population information on ABS or else where.
* Beard - no information.
* Capital Hill - site of the Parliament house.
* Denman Prospect - no information.
* Macnamara - no information.
* Molonglo - [suburb underdevelopment](https://en.wikipedia.org/wiki/Molonglo,_Australian_Capital_Territory)
* Russell - comprised of [government offices and no residence](https://en.wikipedia.org/wiki/Russell,_Australian_Capital_Territory)
* Strathnairn - no information.
* Taylor - no information.
* Throsby - no information.
* Whitlam - future suburb in [2020](https://en.wikipedia.org/wiki/Whitlam,_Australian_Capital_Territory).

Therefore missing_data table is discarded entirely.

In [None]:
canberra_suburbs = full_data

In [None]:
canberra_suburbs['State'] = 'Australian Capital Territory'

In [None]:
canberra_suburbs.to_csv('canberra_suburbs.csv')

# Scrape Hobart suburbs from Wikipedia

In [None]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Hobart_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

hob_suburbs_section = soup.find('div',attrs={'id':'mw-content-text', 'class':'mw-content-ltr'})
hob_suburbs_section = hob_suburbs_section.find_all('a', href=True)

In [None]:
wiki_state_name = ', Tasmania'
wiki_link_extension = ',_Tasmania'
city = 'Hobart'
hob_wiki_urls = get_wiki_urls(hob_suburbs_section, wiki_link_extension, wiki_state_name, city)

In [None]:
hob_suburb_infobox = get_suburb_wiki_infobox(hob_wiki_urls)

In [None]:
hobart_suburbs_dict = get_suburb_info(hob_suburb_infobox)

In [None]:
# Convert Canberra suburbs nested dictionary to a dataframe
hobart_suburbs_df = pd.DataFrame(hobart_suburbs_dict).T
hobart_suburbs_df['Suburb'] = hobart_suburbs_df.index
hobart_suburbs_df.index = range(hobart_suburbs_df.shape[0])

In [None]:
hobart_suburbs_df['Population'] = pd.to_numeric(hobart_suburbs_df['Population'], downcast='integer')

In [None]:
hobart_suburbs_df.rename(columns={'LGA_1': 'LGA'}, inplace=True)

# Get geographical coordinates

In [None]:
# Set Google Maps API key
gmaps_key = googlemaps.Client(key = GOOGLE_MAPS_API)

# Create Geocode result object
hobart_suburbs_df['LAT'] = None
hobart_suburbs_df['LON'] = None

for index, row in hobart_suburbs_df.iterrows():
    geocode_result = gmaps_key.geocode("{}, {}, Tasmania".format(row['Suburb'], row['Postcode']))
    try:
        lat = geocode_result[0]['geometry']['location']['lat']
        lon = geocode_result[0]['geometry']['location']['lng']
                
        hobart_suburbs_df.iat[index, hobart_suburbs_df.columns.get_loc("LAT")] = lat
        hobart_suburbs_df.iat[index, hobart_suburbs_df.columns.get_loc("LON")] = lon
    except:
        lat = None
        lon = None

# Calculate geographical distance

In [None]:
hobart_suburbs_df['distance'] = None

CBD_LAT = hobart_suburbs_df[hobart_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LAT']
CBD_LON = hobart_suburbs_df[hobart_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LON']

for index, row in hobart_suburbs_df.iterrows():
    
    row_lat = row['LAT']
    row_lon = row['LON']
    distance = calc_geo_dist(CBD_LAT, CBD_LON, row_lat, row_lon)
    
    hobart_suburbs_df.iat[index, hobart_suburbs_df.columns.get_loc("distance")] = distance

# Calculate geographical area

In [None]:
with open('suburb_boundaries_tas.json') as f:
    d = json.load(f)

In [None]:
geo_boundary = pd.DataFrame.from_dict(d['features'])
geo_boundary.drop(['geometry_name', 'id', 'type'], axis=1, inplace=True)

geo_boundary['Postcode'] = None

for index, row in geo_boundary.iterrows():

    suburb = row['properties']['tas_loca_2']
    postcode = row['properties']['tas_loca_4']
    geo_boundary.iat[index, geo_boundary.columns.get_loc('properties')] = suburb.title()
    
    try:
        if postcode == None:
            postcode = hobart_suburbs_df.loc[hobart_suburbs_df['Suburb'] == suburb.title()]['Postcode'].values[0]
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
        else:
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
    except(IndexError):
        continue
    
geo_boundary.rename(columns={'properties':'Suburb'}, inplace=True)

# Calculate geographical area
geo_boundary['area'] = geo_boundary['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

# Merge area and polygons to dataframe
hobart_suburbs_df = hobart_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])

# Drop duplicate rows
hobart_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)

# Calculate population density

In [None]:
hobart_suburbs_df['density'] = (hobart_suburbs_df['Population'] / hobart_suburbs_df['area'])

# Investigate missing data

In [None]:
print('Total number of missing values in Population column: {}'.format(hobart_suburbs_df.shape[0]-hobart_suburbs_df['Population'].count()))
print('')
print('Percentage of missing data in Population column: {} %'.format(round(((hobart_suburbs_df.shape[0]-hobart_suburbs_df['Population'].count())/hobart_suburbs_df.shape[0])*100, 2)))

In [None]:
# Split dataframe into full data vs missing data
full_data = hobart_suburbs_df[hobart_suburbs_df['Population'].notnull() & (hobart_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = hobart_suburbs_df[hobart_suburbs_df['Population'].isnull() | (hobart_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)
missing_data

Suburb population from ABS:
* Midway Point - [2859](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC60399?opendocument)
* Acton Park - [2078](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC60006?opendocument)
* Lauderdale - [2411](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC60318?opendocument)
* Chigwell - [2002](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC60110?opendocument)
* Glenlusk - [200](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC60231?opendocument)
* Mount Nelson - [2495](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC60425?opendocument)
* Ridgeway - [175](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC60547?opendocument)





In [None]:
ABS_population = {'Midway Point': 2859,
                  'Acton Park': 2078,
                  'Lauderdale': 2411,
                  'Chigwell': 2022,
                  'Glenlusk': 200,
                  'Mount Nelson': 2495,
                  'Ridgeway': 175}

# Add new population data to the main dataframe
missing_data.Population = missing_data.Population.fillna(missing_data.Suburb.map(ABS_population))

# Calculate missing population densities

In [None]:
missing_data['density'] = round(missing_data['Population'] / missing_data['area'], 0)

In [None]:
hobart_suburbs = pd.concat([full_data, missing_data], ignore_index=True)

In [None]:
hobart_suburbs['State'] = 'Tasmania'

In [None]:
hobart_suburbs.to_csv('hobart_suburbs.csv')

# Scrape Adelaide suburbs from wikipedia

In [None]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Adelaide_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

adl_suburbs_section = soup.find('div',attrs={'id':'mw-content-text', 'class':'mw-content-ltr'})
adl_suburbs_section = adl_suburbs_section.find_all('a', href=True)

In [None]:
wiki_state_name = ', South Australia'
wiki_link_extension = ',_South_Australia'
city = 'Adelaide'
adl_wiki_urls = get_wiki_urls(adl_suburbs_section, wiki_link_extension, wiki_state_name, city)

In [None]:
adl_suburb_infobox = get_suburb_wiki_infobox(adl_wiki_urls)

In [None]:
adelaide_suburbs_dict = get_suburb_info(adl_suburb_infobox)

In [None]:
# Convert Canberra suburbs nested dictionary to a dataframe
adelaide_suburbs_df = pd.DataFrame(adelaide_suburbs_dict).T
adelaide_suburbs_df['Suburb'] = adelaide_suburbs_df.index
adelaide_suburbs_df.index = range(adelaide_suburbs_df.shape[0])

# Assign local government area to Adelaide central business district
adelaide_suburbs_df.at[1, 'LGA_1'] = 'City of Adelaide'

# Convert population to an integer
adelaide_suburbs_df['Population'] = pd.to_numeric(adelaide_suburbs_df['Population'], downcast='integer')

# adelaide_suburbs_df.head()

In [None]:
# Group LGA's into one column
df = pd.DataFrame(columns=['LGA', 'Population', 'Postcode', 'Suburb'])

for row in adelaide_suburbs_df.iterrows():
    for col in range(0,3):
        if row[1][col] == row[1][col]:
            # 9 = Population
            # 10 = Postcode
            # 11 = Suburb
            df.loc[len(df)] = [row[1][col],
                               row[1][3],
                               row[1][4],
                               row[1][5]]
        else:
            continue

adelaide_suburbs_df = df
adelaide_suburbs_df.sort_values(by='Suburb', ascending=True, inplace=True)
adelaide_suburbs_df.reset_index(inplace=True, drop=True)

Group suburbs by LGA

In [None]:
adelaide_suburbs_df = adelaide_suburbs_df.groupby(['Suburb', 'Postcode']).agg({'LGA' : ', '.join,
                                        'Population': 'mean'}).reset_index().reindex(columns=adelaide_suburbs_df.columns)

Adjust Gawler [population](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40469?opendocument)

In [None]:
index = adelaide_suburbs_df.loc[adelaide_suburbs_df.Suburb == 'Gawler'].index[0]
adelaide_suburbs_df.at[index, 'Population'] = 650

# Get geographical coordinates

In [None]:
# Set Google Maps API key
gmaps_key = googlemaps.Client(key = GOOGLE_MAPS_API)

# Create Geocode result object
adelaide_suburbs_df['LAT'] = None
adelaide_suburbs_df['LON'] = None

for index, row in adelaide_suburbs_df.iterrows():
    geocode_result = gmaps_key.geocode("{}, {}, South Australia".format(row['Suburb'], row['Postcode']))
    try:
        lat = geocode_result[0]['geometry']['location']['lat']
        lon = geocode_result[0]['geometry']['location']['lng']
                
        adelaide_suburbs_df.iat[index, adelaide_suburbs_df.columns.get_loc("LAT")] = lat
        adelaide_suburbs_df.iat[index, adelaide_suburbs_df.columns.get_loc("LON")] = lon
    except:
        lat = None
        lon = None

# Calculate geographical distance

In [None]:
adelaide_suburbs_df['distance'] = None

CBD_LAT = adelaide_suburbs_df[adelaide_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LAT']
CBD_LON = adelaide_suburbs_df[adelaide_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LON']

for index, row in adelaide_suburbs_df.iterrows():
    
    row_lat = row['LAT']
    row_lon = row['LON']
    distance = calc_geo_dist(CBD_LAT, CBD_LON, row_lat, row_lon)
    
    adelaide_suburbs_df.iat[index, adelaide_suburbs_df.columns.get_loc("distance")] = distance

# Calculate geographical area

In [None]:
with open('suburb_boundaries_sa.json') as f:
    d = json.load(f)

In [None]:
geo_boundary = pd.DataFrame.from_dict(d['features'])
geo_boundary.drop(['geometry_name', 'id', 'type'], axis=1, inplace=True)

geo_boundary['Postcode'] = None

for index, row in geo_boundary.iterrows():
    suburb = row['properties']['sa_local_2']
    postcode = row['properties']['sa_local_4']

    geo_boundary.iat[index, geo_boundary.columns.get_loc('properties')] = suburb.title()
    
    try:
        if postcode == None:
            postcode = adelaide_suburbs_df.loc[adelaide_suburbs_df['Suburb'] == suburb.title()]['Postcode'].values[0]
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
        else:
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
    except(IndexError):
        continue
    
geo_boundary.rename(columns={'properties':'Suburb'}, inplace=True)

# Calculate geographical area
geo_boundary['area'] = geo_boundary['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

# Merge area and polygons to dataframe
adelaide_suburbs_df = adelaide_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])

# Drop duplicate rows
adelaide_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)

Adjust Hindmarsh [area](https://en.wikipedia.org/wiki/Hindmarsh,_South_Australia)

In [None]:
adelaide_suburbs_df.loc[adelaide_suburbs_df.Suburb == 'Hindmarsh']

In [None]:
index = adelaide_suburbs_df.loc[adelaide_suburbs_df.Suburb == 'Hindmarsh'].index[0]
adelaide_suburbs_df.at[index, 'area'] = 0.88

# Calculate population density

In [None]:
adelaide_suburbs_df['density'] = (adelaide_suburbs_df['Population'] / adelaide_suburbs_df['area'])

# Investigate missing data

In [None]:
print('Total number of missing values in Population column: {}'.format(adelaide_suburbs_df.shape[0]-adelaide_suburbs_df['Population'].count()))
print('')
print('Percentage of missing data in Population column: {} %'.format(round(((adelaide_suburbs_df.shape[0]-adelaide_suburbs_df['Population'].count())/adelaide_suburbs_df.shape[0])*100, 2)))

In [None]:
# Split dataframe into full data vs missing data
full_data = adelaide_suburbs_df[adelaide_suburbs_df['Population'].notnull() & (adelaide_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = adelaide_suburbs_df[adelaide_suburbs_df['Population'].isnull() | (adelaide_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)
missing_data

Suburb population from ABS:
* Brown Hill Creek: 50 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40170?opendocument)
* Buckland Park: 173 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40178?opendocument)
* Craigmore: 10895 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40322?opendocument)
* Edwardstown: 4328 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40391?opendocument)
* Elizabeth Downs: 5069 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40394?opendocument)
* Elizabeth North: 3463 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40397?opendocument)
* Elizabeth Park: 3861 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40398?opendocument)
* Evanston South: 341 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40420?opendocument)
* Eyre: 503 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/IARE402003?opendocument)
* Fairview Park: 3599 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40424?opendocument)
* Fitzroy: 781 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40437?opendocument)
* Gawler East: 5338 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40471?opendocument)
* Gawler West: 948 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40475?opendocument)
* Gould Creek: 242 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40521?opendocument)
* Greenwith: 8988 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40532?opendocument)
* Gulfview Heights: 3642 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40533?opendocument)
* Hillbank: 4610 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40585?opendocument)
* Leawood Gardens: 61 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40750?opendocument)
* Medindie Gardens: 340 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40846?opendocument)
* Munno Para Downs: 228 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40971?opendocument)
* Para Hills West: 3190 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41086?opendocument)
* Para Vista: 2904 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41087?opendocument)
* Parafield: 105 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41091?opendocument)
* Penfield Gardens: 335 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41121?opendocument)
* Pooraka: 7228 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41171?opendocument)
* Redwood Park: 5421 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41229?opendocument)
* Salisbury Downs: 5984 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41276?opendocument)
* Salisbury Park: 2164 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41280?opendocument)
* Salisbury South: 99 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41282?opendocument)
* Sampson Flat: 124 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41286?opendocument)
* Sefton Park: 1210 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41313?opendocument)
* Semaphore South: 1019 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41318?opendocument)
* Smithfield Plains: 2871 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41334?opendocument)
* St Agnes: 4134 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41353?opendocument)
* Taperoo: 3091 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41407?opendocument)
* Tea Tree Gully: 3242 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41418?opendocument)
* Uleybury: 292 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41468?opendocument)
* Upper Hermitage: 285 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41477?opendocument)
* Vista: 972 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41492?opendocument)
* Windsor Gardens: 5272 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41601?opendocument)
* Wingfield: 478 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41602?opendocument)
* Woodville South: 3179 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41632?opendocument)
* Yatala Vale: 251 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41664?opendocument)
* Yattalunga: 313 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41666?opendocument)

Suburb population information from other sources:
* Lightsview: 327 [link](https://profile.id.com.au/s_lightsview/population-of-lightsview)


* Direk: primarily an industrial area
* Dorset Vale: Reservoir in Adelaide
* Edinburgh North: primarily an industrial area


In [None]:
ABS_population = {'Brown Hill Creek': 50,
                  'Buckland Park': 173,
                  'Craigmore': 10895,
                  'Edwardstown': 4328,
                  'Elizabeth Downs': 5069,
                  'Elizabeth North': 3463,
                  'Elizabeth Park': 3861,
                  'Evanston South': 341,
                  'Eyre': 503,
                  'Fairview Park': 3599,
                  'Fitzroy': 781,
                  'Gawler East': 5338,
                  'Gawler West': 948,
                  'Gould Creek': 242,
                  'Greenwith': 8988,
                  'Gulfview Heights': 3642,
                  'Hillbank': 4610,
                  'Leawood Gardens': 61,
                  'Medindie Gardens': 340,
                  'Munno Para Downs': 228,
                  'Para Hills West': 3190,
                  'Para Vista': 2904,
                  'Parafield': 105,
                  'Penfield Gardens': 335,
                  'Pooraka': 7228,
                  'Redwood Park': 5421,
                  'Salisbury Downs': 5984,
                  'Salisbury Park': 2164,
                  'Salisbury South': 99,
                  'Sampson Flat': 124,
                  'Sefton Park': 1210,
                  'Semaphore South': 1019,
                  'Smithfield Plains': 2871,
                  'St Agnes': 4134,
                  'Taperoo': 3091,
                  'Tea Tree Gully': 3242,
                  'Uleybury': 292,
                  'Upper Hermitage': 285,
                  'Vista': 972,
                  'Windsor Gardens': 5272,
                  'Wingfield': 478,
                  'Woodville South': 3179,
                  'Yatala Vale': 251,
                  'Yattalunga': 313,
                  'Lightsview': 327}

In [None]:
# Add new population data to the main dataframe
missing_data.Population = missing_data.Population.fillna(missing_data.Suburb.map(ABS_population))

In [None]:
# Drop the following suburbs
missing_data = missing_data[~missing_data['Suburb'].isin(['Direk',
                                                          'Dorset Vale',
                                                          'Edinburgh North'])]

# Calculate missing population densities

In [None]:
missing_data['density'] = round(missing_data['Population'] / missing_data['area'], 0)

In [None]:
adelaide_suburbs = pd.concat([full_data, missing_data], ignore_index=True)

In [None]:
adelaide_suburbs['State'] = 'South Australia'

In [None]:
adelaide_suburbs.to_csv('adelaide_suburbs.csv')

# Scrape Darwin suburbs from wikipedia

In [None]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Darwin_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

drw_suburbs_section = soup.find('div',attrs={'id':'mw-content-text', 'class':'mw-content-ltr'})
drw_suburbs_section = drw_suburbs_section.find_all('a', href=True)

In [None]:
wiki_state_name = ', Northern Territory'
wiki_link_extension = ',_Northern_Territory'
city = 'Darwin'
drw_wiki_urls = get_wiki_urls(drw_suburbs_section, wiki_link_extension, wiki_state_name, city)

In [None]:
drw_suburb_infobox = get_suburb_wiki_infobox(drw_wiki_urls)

In [None]:
darwin_suburbs_dict = get_suburb_info(drw_suburb_infobox)

In [None]:
# Convert Canberra suburbs nested dictionary to a dataframe
darwin_suburbs_df = pd.DataFrame(darwin_suburbs_dict).T
darwin_suburbs_df['Suburb'] = darwin_suburbs_df.index
darwin_suburbs_df.index = range(darwin_suburbs_df.shape[0])

# Drop generic Darwin wikipedia page row
darwin_suburbs_df.drop(index=0, inplace=True)

# Convert population to an integer
darwin_suburbs_df['Population'] = pd.to_numeric(darwin_suburbs_df['Population'], downcast='integer')

darwin_suburbs_df.head()

In [None]:
# Group LGA's into one column
df = pd.DataFrame(columns=['LGA', 'Population', 'Postcode', 'Suburb'])

for row in darwin_suburbs_df.iterrows():
    for col in range(0,3):
        if row[1][col] == row[1][col]:
            # 9 = Population
            # 10 = Postcode
            # 11 = Suburb
            df.loc[len(df)] = [row[1][col],
                               row[1][3],
                               row[1][4],
                               row[1][5]]
        else:
            continue

darwin_suburbs_df = df
darwin_suburbs_df.sort_values(by='Suburb', ascending=True, inplace=True)
darwin_suburbs_df.reset_index(inplace=True, drop=True)

Find missing postcodes

In [None]:
darwin_suburbs_df[darwin_suburbs_df['Postcode'].isnull()]

Fill in missing postcodes

In [None]:
missing_postcodes = {'Black Jungle': '0822',
                     'Blackmore': '0822',
                     'Darwin River': '0841',
                     'Fly Creek': '0822',
                     'Freds Pass': '0822',
                     'Gunn Point': '0822',
                     'Hughes': '0822',
                     'Lloyd Creek': '0822',
                     'Manton': '0837',
                     'Murrumujuk': '0822'}

In [None]:
darwin_suburbs_df.Postcode = darwin_suburbs_df.Postcode.fillna(darwin_suburbs_df.Suburb.map(missing_postcodes))


Group suburbs by LGA

In [None]:
darwin_suburbs_df = darwin_suburbs_df.groupby(['Suburb','Postcode']).agg({'LGA' : ', '.join,
                                                                           'Population': 'mean'}).reset_index().reindex(columns=darwin_suburbs_df.columns)

# Get geographical coordinates

In [None]:
# Set Google Maps API key
gmaps_key = googlemaps.Client(key = GOOGLE_MAPS_API)

# Create Geocode result object
darwin_suburbs_df['LAT'] = None
darwin_suburbs_df['LON'] = None

for index, row in darwin_suburbs_df.iterrows():
    geocode_result = gmaps_key.geocode("{}, {}, Northern Territory".format(row['Suburb'], row['Postcode']))
    try:
        lat = geocode_result[0]['geometry']['location']['lat']
        lon = geocode_result[0]['geometry']['location']['lng']
                
        darwin_suburbs_df.iat[index, darwin_suburbs_df.columns.get_loc("LAT")] = lat
        darwin_suburbs_df.iat[index, darwin_suburbs_df.columns.get_loc("LON")] = lon
    except:
        lat = None
        lon = None

# Calculate geographical distance

In [None]:
darwin_suburbs_df['distance'] = None

CBD_LAT = darwin_suburbs_df[darwin_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LAT']
CBD_LON = darwin_suburbs_df[darwin_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LON']

for index, row in darwin_suburbs_df.iterrows():
    
    row_lat = row['LAT']
    row_lon = row['LON']
    distance = calc_geo_dist(CBD_LAT, CBD_LON, row_lat, row_lon)
    
    darwin_suburbs_df.iat[index, darwin_suburbs_df.columns.get_loc("distance")] = distance

# Get geographical area

In [None]:
with open('suburb_boundaries_nt.json') as f:
    d = json.load(f)
    
geo_boundary = pd.DataFrame.from_dict(d['features'])
geo_boundary.drop(['geometry_name', 'id', 'type'], axis=1, inplace=True)

geo_boundary['Postcode'] = None

for index, row in geo_boundary.iterrows():
    suburb = row['properties']['nt_local_2']
    postcode = row['properties']['nt_local_4']

    geo_boundary.iat[index, geo_boundary.columns.get_loc('properties')] = suburb.title()
    
    try:
        if postcode == None:
            postcode = darwin_suburbs_df.loc[darwin_suburbs_df['Suburb'] == suburb.title()]['Postcode'].values[0]
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
        else:
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
    except(IndexError):
        continue
    
geo_boundary.rename(columns={'properties':'Suburb'}, inplace=True)

# Calculate geographical area
geo_boundary['area'] = geo_boundary['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

# Merge area and polygons to dataframe
darwin_suburbs_df = darwin_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])

# Drop duplicate rows
darwin_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)    

# Calculate population density

In [None]:
darwin_suburbs_df['density'] = (darwin_suburbs_df['Population'] / darwin_suburbs_df['area'])

# Investigate missing data

In [None]:
# Split dataframe into full data vs missing data
full_data = darwin_suburbs_df[darwin_suburbs_df['Population'].notnull() & (darwin_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = darwin_suburbs_df[darwin_suburbs_df['Population'].isnull() | (darwin_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)

There's no missing population data

In [None]:
darwin_suburbs = full_data

In [None]:
darwin_suburbs['State'] = 'Northern Territory'

In [None]:
darwin_suburbs.to_csv('darwin_suburbs.csv')

# Scrape Perth suburbs from wikipedia

In [None]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Perth_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

per_suburbs_section = soup.find('div',attrs={'id':'mw-content-text', 'class':'mw-content-ltr'})
per_suburbs_section = per_suburbs_section.find_all('a', href=True)

In [None]:
wiki_state_name = ', Western Australia'
wiki_link_extension = ',_Western_Australia'
city = 'Perth'
per_wiki_urls = get_wiki_urls(per_suburbs_section, wiki_link_extension, wiki_state_name, city)

In [None]:
per_suburb_infobox = get_suburb_wiki_infobox(per_wiki_urls)

In [None]:
perth_suburbs_dict = get_suburb_info(per_suburb_infobox)

In [None]:
# Convert Canberra suburbs nested dictionary to a dataframe
perth_suburbs_df = pd.DataFrame(perth_suburbs_dict).T
perth_suburbs_df['Suburb'] = perth_suburbs_df.index
perth_suburbs_df.index = range(perth_suburbs_df.shape[0])

# Convert population to an integer
perth_suburbs_df['Population'] = pd.to_numeric(perth_suburbs_df['Population'], downcast='integer')

perth_suburbs_df.head()

In [None]:
# Group LGA's into one column
df = pd.DataFrame(columns=['LGA', 'Population', 'Postcode', 'Suburb'])

for row in perth_suburbs_df.iterrows():
    for col in range(0,3):
        if row[1][col] == row[1][col]:
            # 9 = Population
            # 10 = Postcode
            # 11 = Suburb
            df.loc[len(df)] = [row[1][col],
                               row[1][3],
                               row[1][4],
                               row[1][5]]
        else:
            continue

perth_suburbs_df = df
perth_suburbs_df.sort_values(by='Suburb', ascending=True, inplace=True)
perth_suburbs_df.reset_index(inplace=True, drop=True)

Check if missing postcodes

In [None]:
perth_suburbs_df[perth_suburbs_df['Postcode'].isnull()]

In [None]:
perth_suburbs_df = perth_suburbs_df.groupby(['Suburb','Postcode']).agg({'LGA' : ', '.join,
                                                                           'Population': 'mean'}).reset_index().reindex(columns=perth_suburbs_df.columns)

Adjust West Perth [area](https://profile.id.com.au/s_west-perth/suburb-boundaries)

# Get geographical coordinates

In [None]:
# Set Google Maps API key
gmaps_key = googlemaps.Client(key = GOOGLE_MAPS_API)

# Create Geocode result object
perth_suburbs_df['LAT'] = None
perth_suburbs_df['LON'] = None

for index, row in perth_suburbs_df.iterrows():
    geocode_result = gmaps_key.geocode("{}, {}, Western Australia".format(row['Suburb'], row['Postcode']))
    try:
        lat = geocode_result[0]['geometry']['location']['lat']
        lon = geocode_result[0]['geometry']['location']['lng']
                
        perth_suburbs_df.iat[index, perth_suburbs_df.columns.get_loc("LAT")] = lat
        perth_suburbs_df.iat[index, perth_suburbs_df.columns.get_loc("LON")] = lon
    except:
        lat = None
        lon = None

# Calculate geographical distance

In [None]:
perth_suburbs_df['distance'] = None

CBD_LAT = perth_suburbs_df[perth_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LAT']
CBD_LON = perth_suburbs_df[perth_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LON']

for index, row in perth_suburbs_df.iterrows():
    
    row_lat = row['LAT']
    row_lon = row['LON']
    distance = calc_geo_dist(CBD_LAT, CBD_LON, row_lat, row_lon)
    
    perth_suburbs_df.iat[index, perth_suburbs_df.columns.get_loc("distance")] = distance

# Get geographical area

In [None]:
with open('suburb_boundaries_wa.json') as f:
    d = json.load(f)
    
geo_boundary = pd.DataFrame.from_dict(d['features'])
geo_boundary.drop(['geometry_name', 'id', 'type'], axis=1, inplace=True)

geo_boundary['Postcode'] = None

for index, row in geo_boundary.iterrows():
    suburb = row['properties']['wa_local_2']
    postcode = row['properties']['wa_local_4']

    geo_boundary.iat[index, geo_boundary.columns.get_loc('properties')] = suburb.title()
    
    try:
        if postcode == None:
            postcode = perth_suburbs_df.loc[perth_suburbs_df['Suburb'] == suburb.title()]['Postcode'].values[0]
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
        else:
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
    except(IndexError):
        continue
    
geo_boundary.rename(columns={'properties':'Suburb'}, inplace=True)

# Calculate geographical area
geo_boundary['area'] = geo_boundary['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

# Merge area and polygons to dataframe
perth_suburbs_df = perth_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])

# Drop duplicate rows
perth_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)    

Adjust West Perth area

In [None]:
index = perth_suburbs_df.loc[perth_suburbs_df.Suburb == 'West Perth'].index[0]
perth_suburbs_df.at[index, 'area'] = 1.07

Adjust Cannington [population](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC50245)

In [None]:
index = perth_suburbs_df.loc[perth_suburbs_df.Suburb == 'Cannington'].index[0]
perth_suburbs_df.at[index, 'Population'] = 5929

# Calculate population density

In [None]:
perth_suburbs_df['density'] = (perth_suburbs_df['Population'] / perth_suburbs_df['area'])

# Investigate missing data

In [None]:
# Split dataframe into full data vs missing data
full_data = perth_suburbs_df[perth_suburbs_df['Population'].notnull() & (perth_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = perth_suburbs_df[perth_suburbs_df['Population'].isnull() | (perth_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)

In [None]:
missing_data['Suburb'].values

#### Suburb population information from [ABS](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC50033?opendocument):
* Ashendon: 15
* Balga: 12685
* Byford: 14908
* Henderson: 14
* Karrakatta: 5
* Karrakup: 190
* Kwinana Beach: 6
* Kwinana Town Centre: 249
* Lexia: 29
* Malaga: 6
* Mandogalup: 55
* O'Connor: 463
* Pinjar: 96
* Welshpool: 19
* Whiteman: 7

#### Suburb population information from other sources:
* Melaleuca: [4](https://homesales.com.au/location/melaleuca-wa/)

#### Following suburbs are dropped from the dataframe
* Herdsman: Lake and park
* Naval Base: predominately an [industrial suburb](https://en.wikipedia.org/wiki/Naval_Base,_Western_Australia) 
* Postans: area for waste water treatment plant 
* The Spectacles: [wetland reservce and bushland](https://en.wikipedia.org/wiki/The_Spectacles,_Western_Australia)


In [None]:
ABS_population = {'Ashendon': 15,
                  'Balga': 12685,
                  'Byford': 14908,
                  'Henderson': 14,
                  'Karrakatta': 5,
                  'Karrakup': 190,
                  'Kwinana Beach': 6,
                  'Kwinana Town Centre': 249,
                  'Lexia': 29,
                  'Malaga': 6,
                  'Mandogalup': 55,
                  '''O'Connor''': 463,
                  'Pinjar': 96,
                  'Welshpool': 19,
                  'Whiteman': 7,
                  'Melaleuca': 4}

In [None]:
missing_data.Population = missing_data.Population.fillna(missing_data.Suburb.map(ABS_population))

In [None]:
# Drop the following suburbs
missing_data = missing_data[~missing_data['Suburb'].isin(['Herdsman',
                                                          'Naval Base',
                                                          'Postans',
                                                          'The Spectacles'])]

# Calculate missing population densities

In [None]:
missing_data['density'] = round(missing_data['Population'] / missing_data['area'], 0)

In [None]:
perth_suburbs = pd.concat([full_data, missing_data], ignore_index=True)

In [None]:
perth_suburbs['State'] = 'Western Australia'

In [None]:
perth_suburbs.to_csv('perth_suburbs.csv')

# Plotting suburbs on maps

In [None]:
address = "Sydney, New South Wales"
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Sydney, NSW are {}, {}.'.format(latitude, longitude))

In [None]:
# create map of Sydney using latitude and longitude values
map_sydney = folium.Map(location=[latitude, longitude], zoom_start=9.5)
# add markers to map
for lat, lng, postcode, suburb in zip(sydney_suburbs['LAT'],\
                                      sydney_suburbs['LON'],\
                                      sydney_suburbs['Postcode'],\
                                      sydney_suburbs['Suburb']):
    #print(lat, lng)
    label = '{}: {}'.format(postcode, suburb)
    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_sydney)  
    
map_sydney