In [1]:
from area import area
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim # converts an address into latitude and longitude values
from math import sin, cos, sqrt, atan2, radians
from scipy.ndimage.filters import gaussian_filter1d
from shapely.geometry import shape
from sklearn.cluster import KMeans 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

import folium
import geocoder
import googlemaps
import itertools
import json
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import requests
import unittest
import re
import seaborn as sns
%matplotlib inline

# Scrape Sydney suburbs from Wikipedia

Here I use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to scrape the suburbs from Sydney as listed on [wikipedia](https://en.wikipedia.org/wiki/List_of_Sydney_suburbs). For each suburb, I identify the wikipage url for the indexed suburb, request access to the webpage, and then collect the following from their information box:
* **Postcode**: Postcode
* **Density**: Population density
* **Area**: Area size of suburb
* **LGA**: Local government assocation (council)
* **Location**: Distance from the city

In [2]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Sydney_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file,
                         "lxml"
                        )

syd_suburbs_section = soup.find('div',
                                attrs={'class':'mw-parser-output'}
                               )

syd_suburbs_section = syd_suburbs_section.findAll('a',
                                                  href=True
                                                 )

In [3]:
# Scrape url for each suburb
def get_wiki_urls(html_suburbs_section,
                  wiki_link_extension,
                  wiki_state_name,
                  state,
                 ):
    
    url_list = {}
    for i in range(len(html_suburbs_section)):
        #print(i)
        url = html_suburbs_section[i]['href']
        if wiki_link_extension in url:
            
            check_list = []
            for x in ['/File:',
                      '/List_of_']:
                
                if x in url:
                    check_list.append(1)
            
            if sum(check_list) >= 1:
                continue
            else:
                suburb = html_suburbs_section[i]['title'].replace(wiki_state_name,'')

                if suburb in url_list:
                    continue           

                else:
                    url_list[suburb] = 'https://en.wikipedia.org{}'.format(url)
        
        # Sydney cbd
        elif '{}_CBD'.format(state) in url:
            suburb = html_suburbs_section[i]['title'] 
            url_list[suburb] = 'https://en.wikipedia.org{}'.format(url)

        # Brisbane cbd
        elif '{}_central_business_district'.format(state) in url:
            suburb = html_suburbs_section[i]['title']
            url_list[suburb] = 'https://en.wikipedia.org{}'.format(url)
        else:
            continue
    
    return url_list

In [4]:
wiki_state_name = ', New South Wales'
wiki_link_extension = ',_New_South_Wales'
state = 'Sydney'
syd_wiki_urls = get_wiki_urls(syd_suburbs_section,
                              wiki_link_extension,
                              wiki_state_name,
                              state,
                             )

In [5]:
# Scrape information box for each suburb from their wikipage.
# If an information box doesn't exist a try/catch error for the AttributeError will pass it.

def get_suburb_wiki_infobox(wiki_urls):
    
    suburbs_infobox = {}
    for key,value in wiki_urls.items():

        try:
            page = requests.get(value)
            soup_page = BeautifulSoup(page.text, 'html.parser')

            try:
#                 print('Check redirection: {}'.format(key))
                soup_redirect = soup_page.find('span', class_='mw-redirectedfrom').a['href']
                soup_redirect_title = soup_redirect.replace(' ','_')
                soup_redirect_url = 'https://en.wikipedia.org{}'.format(soup_redirect)

                page = requests.get(soup_redirect_url)
                soup_redirect_page = BeautifulSoup(page.text, 'html.parser')
                soup_redirect_page_title = soup_redirect_page.find('ul', class_='redirectText').a['title']
                
                if key not in soup_redirect_page_title:
#                     print('Incorrect Redirection exists: {}\n'.format(key))
                    # Link redirects to wrong page
                    # Go to next suburb.
                    # Suburb not added to list
                    continue
                elif key in soup_redirect_page_title:
#                     print('Correct Redirection exists: {}'.format(key))
                    pass
                
            except:
                # If statement may fail because redirection doesn't exist in wikipage.
                # Therefore pass onto with next block of code.
#                 print('Redirection does not exists: {}'.format(key))               
                pass
            # Scrape information box from suburb wiki page
#             print('Scraping infobox: {}\n'.format(key))  
            infobox = soup_page.find('table', class_='infobox vcard')
            suburbs_infobox[key] = infobox.find_all('tr', class_='')

        except AttributeError:
            # print("No infobox: {}".format(key))
            suburbs_infobox[key] = None
            continue
    
    return suburbs_infobox

In [6]:
syd_suburb_infobox = get_suburb_wiki_infobox(syd_wiki_urls)

Function that scrapes data from wikipedia htmls of different suburbs in Australia. Here I apply it Sydney and Brisbane.

In [7]:
def get_suburb_info(suburb_infobox):
    CHECK_STRINGS = (
                     'Postcode',
                     'Population',
                     'LGA'
                    )

    suburbs = {}

    for key,value in suburb_infobox.items():
        key = key.split('(')
        key = key[0]
        print(key)
        
        if value:
            items = {}

            for val in value:        

                check_list = []
                for check_sting in CHECK_STRINGS:
                    if check_sting in val.text:
                        break
                                        
                if 'LGA' in val.text:
                    val_td = val.find('td', class_='')
                    lga_list = val_td.findAll('a', href=True)

                    dummy_list = []
                    for lga in lga_list:
                        try:
                            lga_title = lga['title']
                            if '(New South Wales)' in lga_title:
                                lga_title = lga_title.replace('(New South Wales)','')
                            elif '(Queensland)' in lga_title:
                                lga_title = lga_title.replace('(Queensland)','')
                            elif '(Brisbane City)' in lga_title:
                                lga_title = lga_title.replace('(Brisbane City)','')
                            elif '(City of Brisbane)' in lga_title:
                                lga_title = lga_title.replace('(City of Brisbane)','')
                            dummy_list.append(lga_title)
                        except(KeyError):
                            pass

                    if len(dummy_list) == 1:
                        items['LGA_1'] = ', '.join(dummy_list)
                        suburbs[key] = items
                    elif len(dummy_list) > 1:
                        for index, lga in enumerate(dummy_list):
                            items['LGA_{}'.format(index + 1)] = lga
                            suburbs[key] = items
                            
#                 if any(s in check_list for s in ['Population', 'Area', 'Density', 'Postcode']):
                elif any(s in val.text for s in ['Population', 'Postcode']):
                    keyword = check_sting
                    infobox_split = val.text.split(' ')

                    if len(infobox_split) > 1:
                        infobox_item = [s for s in infobox_split if str(keyword) in s]
                        # Remove substrings from string
                        info = infobox_item[0]
                        info = info.replace('Population','')\
                                   .replace(',','')\
                                   .replace('Postcode(s)','')
#                                    .replace('\xa0km2',' ')\
#                                    .replace('/km2','')\
#                                    .replace('km','')

                        info = info.split('\xa0(')
                        info = info[0]

                        try:
                            if keyword == 'Population':
                                info = int(info)
                            items[keyword] = info
                        except(ValueError):
                            pass
#                             items[keyword] = 'NaN'

                        suburbs[key] = items

                    elif len(infobox_split) == 1:
                        keyword = check_sting
                        info = infobox_split[0]
                        info = info.replace('Postcode(s)','')\
                                   .replace('Population','')\
                                   .replace(',','')\
                                   .replace('\xa0(2016)','')\
                                   .replace('\xa0(2006)','')\
                                   .replace('\xa0(2011)','')
                        if '[' in info:
                            info = info.split('[')
                            info = info[0]
                        if keyword == 'Population':
                            info = int(info)
                        items[keyword] = info

                        suburbs[key] = items

                else:
                    continue
        else:
            # For suburbs that don't have an information box on their Wikipedia page.
            continue
            
    return suburbs

In [8]:
sydney_suburbs_dict = get_suburb_info(syd_suburb_infobox)

Bankstown
Bondi Beach
Sydney central business district
Chatswood
Cronulla
Liverpool
Manly
Milsons Point
Mosman
Newtown
Parramatta
The Rocks
Abbotsbury
Abbotsford
Acacia Gardens
Agnes Banks
Airds
Alexandria
Alfords Point
Allambie Heights
Allawah
Ambarvale
Annandale
Annangrove
Arcadia
Arncliffe
Arndell Park
Artarmon
Ashbury
Ashcroft
Ashfield
Asquith
Auburn
Austral
Avalon Beach
Badgerys Creek
Balgowlah
Balgowlah Heights
Balmain
Balmain East
Bangor
Banksia
Banksmeadow
Bankstown Airport
Barangaroo
Barden Ridge
Bardia
Bardwell Park
Bardwell Valley
Bass Hill
Baulkham Hills
Bayview
Beacon Hill
Beaconsfield
Beaumont Hills
Beecroft
Belfield
Bella Vista
Bellevue Hill
Belmore
Belrose
Berala
Berkshire Park
Berowra
Berowra Creek
Berowra Heights
Berowra Waters
Berrilee
Beverley Park
Beverly Hills
Bexley
Bexley North
Bickley Vale
Bidwill
Bilgola Beach
Bilgola Plateau
Birchgrove
Birrong
Blackett
Blacktown
Blair Athol
Blairmount
Blakehurst
Bligh Park
Bondi
Bondi Junction
Bonnet Bay
Bonnyrigg
Bonnyrigg H

In [9]:
# Convert Sydney suburbs nested dictionary to a dataframe
sydney_suburbs_df = pd.DataFrame(sydney_suburbs_dict).T
sydney_suburbs_df['Suburb'] = sydney_suburbs_df.index
sydney_suburbs_df.index = range(sydney_suburbs_df.shape[0])
# sydney_suburbs_df

In [10]:
sydney_suburbs_df['Population'] = pd.to_numeric(sydney_suburbs_df['Population'], downcast='integer')

In [11]:
sydney_suburbs_df.head()

Unnamed: 0,LGA_1,LGA_2,LGA_3,LGA_4,Population,Postcode,Suburb
0,Canterbury-Bankstown Council,,,,32113.0,2200,Bankstown
1,Waverley Council,,,,11656.0,2026,Bondi Beach
2,City of Sydney,,,,17252.0,2000,Sydney central business district
3,City of Willoughby,,,,24913.0,2067,Chatswood
4,Sutherland Shire,,,,18070.0,2230,Cronulla


In [12]:
df = pd.DataFrame(columns=['LGA', 'Population', 'Postcode', 'Suburb'])

for row in sydney_suburbs_df.iterrows():
    for col in range(0,4):
        if row[1][col] == row[1][col]:
            # 4 = Population
            # 5 = Postcode
            # 6 = Suburb
            df.loc[len(df)] = [row[1][col],
                               row[1][4],
                               row[1][5],
                               row[1][6]]
        else:
            continue

sydney_suburbs_df = df
sydney_suburbs_df.sort_values(by='Suburb', ascending=True, inplace=True)
sydney_suburbs_df.reset_index(inplace=True, drop=True)
# sydney_suburbs_df

In [13]:
sydney_suburbs_df.head()

Unnamed: 0,LGA,Population,Postcode,Suburb
0,City of Fairfield,4253.0,2176,Abbotsbury
1,City of Canada Bay,5373.0,2046,Abbotsford
2,City of Blacktown,3798.0,2763,Acacia Gardens
3,City of Penrith,911.0,2753,Agnes Banks
4,City of Hawkesbury,911.0,2753,Agnes Banks


### Suburbs that didn't have a postcode on their wikipage.

In [14]:
missing_postcodes = sydney_suburbs_df[sydney_suburbs_df['Postcode'].isnull()]
missing_postcodes.reset_index(inplace=True, drop=True)
# missing_postcodes

Two things that stick out:

* Macarther is a south-west NSW region made up of three local LGA's and therefore is considered as as a suburb.
* Bankstown Airport is an airport not a suburb.
* Royal National Park is a coastline park.
* Ku-ring-gai Chase is a park in the northern region of sydney.

Therefore, Macarthur and Bankstown Airport will be removed from the dataframes.

In [15]:
# Drop Macarthur row.
to_drop = ['Bankstown Airport',
           'Macarthur',
           'Royal National Park',
           'Ku-ring-gai Chase'
          ]

sydney_suburbs_df = sydney_suburbs_df[~sydney_suburbs_df['Suburb'].isin(to_drop)]
sydney_suburbs_df.reset_index(inplace=True, drop=True)
missing_postcodes = missing_postcodes[~missing_postcodes['Suburb'].isin(to_drop)]
missing_postcodes.reset_index(inplace=True, drop=True)

Function to get postcodes with geopy

In [16]:
def get_missing_postcodes_geopy(missing_pc_df, city_str, state_str):
    geolocator = Nominatim(user_agent="specify_your_app_name_here", timeout=3)
    
    geo_pc = {}
    
    for index,row in missing_pc_df.iterrows():
        suburb = row['Suburb']
        location = geolocator.geocode("{}, {}, {}, Australia".format(suburb, city_str, state_str), geometry='geojson')
        
        display_name = location.raw['display_name']
        pc = re.findall(r'\b\d+\b', display_name)
        
        try:
            geo_pc[suburb] = pc[0]
        except(IndexError):
            # Suburbs that don't have postcodes. Need to find and fill in manually.
            geo_pc[suburb] = 'nan'

    return geo_pc

In [17]:
geo_pc_dict = get_missing_postcodes_geopy(missing_postcodes, 'Sydney', 'New South Wales')

Fill nan postcodes from sydney_suburbs_df with postcodes from dictionary

In [18]:
sydney_suburbs_df.Postcode = sydney_suburbs_df.Postcode.fillna(sydney_suburbs_df.Suburb.map(geo_pc_dict))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Group suburbs by LGA in dataframe

In [19]:
sydney_suburbs_df = sydney_suburbs_df.groupby(['Suburb', 'Postcode']).agg({'LGA' : ', '.join, 'Population': 'mean'}).reset_index().reindex(columns=sydney_suburbs_df.columns)

# Get Geographical Coordinates

Function to get the list of geographical coordiantes for each suburb in the city

In [20]:
gmaps_key = googlemaps.Client(key = 'AIzaSyBxqCRTDqaRrDHDh_LBPO2QFOWqGDFeY0w')

# Create Geocode result object
sydney_suburbs_df['LAT'] = None
sydney_suburbs_df['LON'] = None

for index, row in sydney_suburbs_df.iterrows():
    geocode_result = gmaps_key.geocode("{}, {}, New South Wales".format(row['Suburb'], row['Postcode']))
    try:
        lat = geocode_result[0]['geometry']['location']['lat']
        lon = geocode_result[0]['geometry']['location']['lng']
                
        sydney_suburbs_df.iat[index, sydney_suburbs_df.columns.get_loc("LAT")] = lat
        sydney_suburbs_df.iat[index, sydney_suburbs_df.columns.get_loc("LON")] = lon
    except:
        lat = None
        lon = None

# Calculate geographical distance to Sydney CBD

Function that calculates the geographical distance between two sets of latitude and longitude coordinates and outputs hte list as a dataframe

In [21]:
def calc_geo_dist(lat1, lon1, lat2, lon2):
    R = 6373.0
    
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)
    
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

In [22]:
sydney_suburbs_df['distance'] = None

CBD_LAT = sydney_suburbs_df[sydney_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LAT']
CBD_LON = sydney_suburbs_df[sydney_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LON']

for index, row in sydney_suburbs_df.iterrows():
    
    row_lat = row['LAT']
    row_lon = row['LON']
    distance = calc_geo_dist(CBD_LAT, CBD_LON, row_lat, row_lon)
    
    sydney_suburbs_df.iat[index, sydney_suburbs_df.columns.get_loc("distance")] = distance

# Get geographical area

In [23]:
with open('suburb_boundaries_nsw.json') as f:
    d = json.load(f)

In [24]:
geo_boundary = pd.DataFrame.from_dict(d['features'])
geo_boundary.drop(['geometry_name', 'id', 'type'], axis=1, inplace=True)

geo_boundary['Postcode'] = None

for index, row in geo_boundary.iterrows():

    suburb = row['properties']['nsw_loca_2']
    postcode = row['properties']['nsw_loca_4']
    geo_boundary.iat[index, geo_boundary.columns.get_loc('properties')] = suburb.title()
    
    try:
        if postcode == None:
            postcode = sydney_suburbs_df.loc[sydney_suburbs_df['Suburb'] == suburb.title()]['Postcode'].values[0]
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
        else:
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
    except(IndexError):
        continue
    
geo_boundary.rename(columns={'properties':'Suburb'}, inplace=True)

# Calculate geographical area

In [25]:
geo_boundary['area'] = geo_boundary['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

Add area and polygons to dataframe

In [26]:
sydney_suburbs_df = sydney_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])

In [27]:
sydney_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], 
                     keep = 'first', inplace = True)

# Calculate population density

In [28]:
sydney_suburbs_df['density'] = (sydney_suburbs_df['Population'] / sydney_suburbs_df['area'])

# Investigate missing data

In [29]:
print('Total number of missing values in Area column: {}'.format(sydney_suburbs_df.shape[0]-sydney_suburbs_df['Population'].count()))
print('Total number of missing values in Density column: {}'.format(sydney_suburbs_df.shape[0]-sydney_suburbs_df['density'].count()))

Total number of missing values in Area column: 22
Total number of missing values in Density column: 22


In [30]:
print('Percentage of missing data in Population column: {} %'.format(round(((sydney_suburbs_df.shape[0]-sydney_suburbs_df['Population'].count())/sydney_suburbs_df.shape[0])*100, 2)))
print('Percentage of missing data in Density column: {} %'.format(round(((sydney_suburbs_df.shape[0]-sydney_suburbs_df['density'].count())/sydney_suburbs_df.shape[0])*100, 2)))

Percentage of missing data in Population column: 3.3 %
Percentage of missing data in Density column: 3.3 %


Number of missing values for both columns is approximately the same. However, I will inspect the suburbs that only have area data but not density data.

In [31]:
# Split dataframe into full data vs missing data
full_data = sydney_suburbs_df[sydney_suburbs_df['Population'].notnull() & (sydney_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = sydney_suburbs_df[sydney_suburbs_df['Population'].isnull() | (sydney_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)
# missing_data

In [32]:
list(np.unique(missing_data['Suburb'].values))

['Bickley Vale',
 'Camellia',
 'Cattai',
 'Chullora',
 'Clyde',
 'Cornwallis',
 'Currawong Beach',
 'Forest Glen',
 'Huntingwood',
 'Len Waters Estate',
 'Lucas Heights',
 'Macquarie Links',
 'Minchinbury',
 'North Kellyville',
 'Norwest',
 'Picnic Point',
 'Pitt Town Bottoms',
 'Pleasure Point',
 'Port Botany',
 'Rookwood',
 'South Windsor',
 'Wisemans Ferry']

Here we see that the data that is mainly missing is the suburb's population (and density, however density is calculated from the population value).

I will search for the population information individually for the suburbs listed above on the Australian Bureau Statistics. If the suburb does not contain any population information, I will either remove the suburb from the list or replace with some constant value outside a fixed value range (-999).

#### Suburbs with population statistics gathered from Australian Bureau Statistics (ABS) 
* **Cattai** population of [790](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC10859?opendocument).
* **Cornwallis** population of [53](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC11078?opendocument).
* **Forest Glen** population of [65](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC11542?opendocument).
* **Macquarie Links** population of [1360](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC12435?opendocument).
* **Minchinbury** population of [5619](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC12633?opendocument).
* **Pleasure Point** population of [528](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC13229?opendocument).
* **Picnic Point** population of [6160](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC13199?opendocument)
* **Pitt Town Bottoms** population of [102](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC13227?opendocument)
* **South Windsor** population of [5892](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2011/quickstat/SSC12119).
* **Wisemans Ferry** population of [220](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC14344?opendocument).

In [33]:
ABS_population = {'Cattai': 790,
                  'Cornwallis': 53,
                  'Forest Glen': 65,
                  'Macquarie Links': 1360,
                  'Minchinbury': 5619,
                  'Pleasure Point': 528,
                  'Picnic Point': 6160,
                  'Pitt Town Bottoms': 102,
                  'South Windsor': 5892,
                  'Wisemans Ferry': 220}

In [34]:
missing_data.Population = missing_data.Population.fillna(missing_data.Suburb.map(ABS_population))

#### Suburbs to be removed from the dataframe
Ports, industrial and commercial suburbs, and suburbs that specically state it does not contain a residential area will be removed from the dataframe. These suburbs include:
* **Camellia** is predominately an industrial suburb and no information on ABS.
* **Chullora** is predominately an industrial area and no information on ABS.
* **Clyde** is exclusively an industrial and commercial area. Wikipedia states, 'Clyde has no permanent population'. And no information on ABS.
* **Huntingwood** is predominately an industrial suburb and no information on ABS.
* **Lucas Heights** 'does not contain a residential area' according to Wikipedia and no information on ABS.
* **Port Botany** is a seaport suburb dominated by traide in containerised manufactured products. Therefore no residence. And no information on ABS.

In [35]:
missing_data = missing_data[~missing_data['Suburb'].isin(['Camellia', 'Chullora', 'Clyde', 'Huntingwood', 'Lucas Heights', 'Port Botany'])]

#### Suburbs where the population will be interpolated
* **North Kellyville** was officially proclaimed a suburb on 29th June 2018 and therefore has no information on ABS. However, since it was [previously part of Kellyville](https://en.wikipedia.org/wiki/North_Kellyville,_New_South_Wales) I will use Kellyville's population density for North Kellyville and back calculate the population.
* **Norwest** was officially proclaimed a suburb on 29th June 2018 and therefore has no information on ABS. However, since it was [previously part of Kellyville and Baulkham Hills](https://en.wikipedia.org/wiki/Norwest,_New_South_Wales) I will use the mean population density of Kellyville and Baulkham Hills as the density of Norwest and back calculate the population.

In [36]:
# Get population density for Kellyville
kellyville_density = sydney_suburbs_df.loc[sydney_suburbs_df['Suburb'] == 'Kellyville']['density'].values[0]

# Get index for North Kellyville
index = missing_data.loc[missing_data['Suburb'] == 'North Kellyville'].index.values[0]

# Replace density of North Kellyville with Kellyville density
missing_data.at[index, 'density'] = kellyville_density

# Get North Kellyville area
north_kellyville_area = sydney_suburbs_df.loc[sydney_suburbs_df['Suburb'] == 'North Kellyville']['area'].values[0]

# Calculate population of North Kellyville with North Kellyville area with Kellyville density
missing_data.at[index, 'Population'] = round(north_kellyville_area * kellyville_density, 0)

In [37]:
# Get population for Baulkham Hills
BaulkhamHills_density = sydney_suburbs_df.loc[sydney_suburbs_df['Suburb'] == 'Baulkham Hills']['density'].values[0]

# Calculate mean density of Kellyville and Baulkham Hills
mean_density = round(np.mean([kellyville_density, BaulkhamHills_density]), 0)

# Replace density of Norwest
missing_data.at[index, 'density'] = mean_density

# Get Norwest area
norwest_area = sydney_suburbs_df.loc[sydney_suburbs_df['Suburb'] == 'Norwest']['area'].values[0]

# Get index for Norwest
index = missing_data.loc[missing_data['Suburb'] == 'Norwest'].index.values[0]

# Calculate population of Norwest with Norwest area with mean density
missing_data.at[index, 'Population'] = round(norwest_area * mean_density, 0)

# Calculate missing population densities

In [38]:
missing_data['density'] = round(missing_data['Population'] / missing_data['area'], 0)

#### Suburbs that will be assigned a constant value outside a fixed value range (-999)
Since we cannot find any population information on the following suburbs we will assign them with a fixed value.
* **Bickley Vale**, no information on Australian Bureau Statistics (ABS).
* **Currawang Beach**, no information on ABS.
* **McCarrs Creek**, new suburb since 2012 and no information on ABS.
* **Rookwood**, no information on ABS.



In [39]:
missing_data['Population'].fillna(-999, inplace=True)
missing_data['density'].fillna(-999, inplace=True)

# Combine the full data dataframe with the missing data dataframe

In [40]:
sydney_suburbs = pd.concat([full_data, missing_data], ignore_index=True)

In [41]:
sydney_suburbs['State'] = 'New South Wales'

In [42]:
sydney_suburbs.to_csv('sydney_suburbs.csv')

# Scrape Brisbane suburbs from Wikipedia

In [43]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Brisbane_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

bri_suburbs_section = soup.find('div',attrs={'id':'mw-content-text', 'class':'mw-content-ltr'})
bri_suburbs_section = bri_suburbs_section.find_all('a', href=True)

In [44]:
wiki_state_name = ', Queensland'
wiki_link_extension = ',_Queensland'
state = 'Brisbane'
bne_wiki_urls = get_wiki_urls(bri_suburbs_section, wiki_link_extension, wiki_state_name, state)

In [45]:
bne_suburb_infobox = get_suburb_wiki_infobox(bne_wiki_urls)

In [46]:
brisbane_suburbs_dict = get_suburb_info(bne_suburb_infobox)

Brisbane central business district
Bowen Hills
East Brisbane
Fortitude Valley
Herston
Highgate Hill
Kangaroo Point
Kelvin Grove
New Farm
Newstead
Paddington
Petrie Terrace
Red Hill
South Brisbane
Spring Hill
Teneriffe
West End
Albion
Alderley
Ascot
Aspley
Bald Hills
Banyo
Bracken Ridge
Bridgeman Downs
Brighton
Carseldine
Chermside
Chermside West
Clayfield
Deagon
Eagle Farm
Everton Park
Ferny Grove
Fitzgibbon
Gaythorne
Geebung
Gordon Park
Grange
Hamilton
Hendra
Kalinga
Kedron
Keperra
Lutwyche
McDowall
Mitchelton
Myrtletown
Newmarket
Northgate
Nudgee
Nudgee Beach
Nundah
Pinkenba
Sandgate
Shorncliffe
Stafford
Stafford Heights
Taigum
Virginia
Wavell Heights
Wilston
Windsor
Wooloowin
Zillmere
Acacia Ridge
Algester
Annerley
Archerfield
Burbank
Calamvale
Coopers Plains
Darra
Doolandella
Drewvale
Durack
Dutton Park
Eight Mile Plains
Ellen Grove
Fairfield
Forest Lake
Greenslopes
Heathwood
Holland Park
Holland Park West
Inala
Karawatha
Kuraby
Larapinta
MacGregor
Mackenzie
Mansfield
Moorooka
Moun

In [47]:
# Convert Brisbane suburbs nested dictionary to a dataframe
brisbane_suburbs_df = pd.DataFrame(brisbane_suburbs_dict).T
brisbane_suburbs_df['Suburb'] = brisbane_suburbs_df.index
brisbane_suburbs_df.index = range(brisbane_suburbs_df.shape[0])

brisbane_suburbs_df.head(10)

Unnamed: 0,LGA_1,LGA_2,LGA_3,LGA_4,Population,Postcode,Suburb
0,City of Brisbane,Central Ward,,,9460,4000,Brisbane central business district
1,City of Brisbane,Central Ward,,,3226,4006,Bowen Hills
2,City of Brisbane,The Gabba Ward,,,5934,4169,East Brisbane
3,City of Brisbane,Central Ward,,,6978,4006,Fortitude Valley
4,City of Brisbane,Central Ward,,,2215,4006,Herston
5,City of Brisbane,The Gabba Ward,,,6194,4101,Highgate Hill
6,City of Brisbane,The Gabba Ward,,,8063,4169,Kangaroo Point
7,City of Brisbane,Paddington Ward,,,7927,4059,Kelvin Grove
8,City of Brisbane,Central Ward,,,12542,4005,New Farm
9,City of Brisbane,Central Ward,,,2193,4006,Newstead


In [48]:
brisbane_suburbs_df['Population'] = pd.to_numeric(brisbane_suburbs_df['Population'], downcast='integer')

In [49]:
df = pd.DataFrame(columns=['LGA', 'Population', 'Postcode', 'Suburb'])

for row in brisbane_suburbs_df.iterrows():
    for col in range(0,4):
        if row[1][col] == row[1][col]:
            # 4 = Population
            # 5 = Postcode
            # 6 = Suburb
            df.loc[len(df)] = [row[1][col],
                               row[1][4],
                               row[1][5],
                               row[1][6]]
        else:
            continue

brisbane_suburbs_df = df
brisbane_suburbs_df.sort_values(by='Suburb', ascending=True, inplace=True)
brisbane_suburbs_df.reset_index(inplace=True, drop=True)

### Suburbs that didn't have a postcode on their wikipage.

In [50]:
missing_postcodes = brisbane_suburbs_df[brisbane_suburbs_df['Postcode'].isnull()]
missing_postcodes.reset_index(inplace=True, drop=True)
missing_postcodes

Unnamed: 0,LGA,Population,Postcode,Suburb
0,City of Brisbane,,,Kalinga


Kalinga does not have a postcode in the information box on its Wikipedia page.

The postcode for Kalinga is [4030](https://www.australiapostcode.com/qld-kalinga.html).

In [51]:
# Get index for Kalinga
index = brisbane_suburbs_df.loc[brisbane_suburbs_df['Suburb'] == 'Kalinga'].index.values[0]

# Replace density of North Kellyville with Kellyville density
brisbane_suburbs_df.at[index, 'Postcode'] = 4030

Group suburbs by LGA in dataframe

In [52]:
brisbane_suburbs_df = brisbane_suburbs_df.groupby(['Suburb', 'Postcode']).agg({'LGA' : ', '.join,
                                        'Population': 'mean'}).reset_index().reindex(columns=brisbane_suburbs_df.columns)

# Get Geographical Coordinates

In [53]:
# Set Google Maps API key
gmaps_key = googlemaps.Client(key = 'AIzaSyBxqCRTDqaRrDHDh_LBPO2QFOWqGDFeY0w')

In [54]:
# Create Geocode result object
brisbane_suburbs_df['LAT'] = None
brisbane_suburbs_df['LON'] = None

for index, row in brisbane_suburbs_df.iterrows():
    geocode_result = gmaps_key.geocode("{}, {}, Queensland".format(row['Suburb'], row['Postcode']))
    try:
        lat = geocode_result[0]['geometry']['location']['lat']
        lon = geocode_result[0]['geometry']['location']['lng']
                
        brisbane_suburbs_df.iat[index, brisbane_suburbs_df.columns.get_loc("LAT")] = lat
        brisbane_suburbs_df.iat[index, brisbane_suburbs_df.columns.get_loc("LON")] = lon
    except:
        lat = None
        lon = None

# Calculate geographical distance

In [55]:
brisbane_suburbs_df['distance'] = None

CBD_LAT = brisbane_suburbs_df[brisbane_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LAT']
CBD_LON = brisbane_suburbs_df[brisbane_suburbs_df['Suburb'].str.contains("central business district")].iloc[0]['LON']

for index, row in brisbane_suburbs_df.iterrows():
    
    row_lat = row['LAT']
    row_lon = row['LON']
    distance = calc_geo_dist(CBD_LAT, CBD_LON, row_lat, row_lon)
    
    brisbane_suburbs_df.iat[index, brisbane_suburbs_df.columns.get_loc("distance")] = distance

In [56]:
brisbane_suburbs_df.head()

Unnamed: 0,LGA,Population,Postcode,Suburb,LAT,LON,distance
0,"City of Brisbane, Moorooka Ward",7429.0,4110,Acacia Ridge,-27.5833,153.033,12.6574
1,Moreton Bay Region,15769.0,4035,Albany Creek,-27.3469,152.968,14.7785
2,"Hamilton Ward , City of Brisbane",2296.0,4010,Albion,-27.4307,153.043,4.67542
3,"City of Brisbane, Enoggera Ward, Marchant Ward",6117.0,4051,Alderley,-27.4239,153.005,5.47636
4,Redland City,16728.0,4161,Alexandra Hills,-27.523,153.221,20.1726


# Get Geographical Boundary

In [57]:
with open('suburb_boundaries_qld.json') as f:
    d = json.load(f)

In [58]:
geo_boundary = pd.DataFrame.from_dict(d['features'])
geo_boundary.drop(['geometry_name', 'id', 'type'], axis=1, inplace=True)

geo_boundary['Postcode'] = None

for index, row in geo_boundary.iterrows():

    suburb = row['properties']['qld_loca_2']
    postcode = row['properties']['qld_loca_4']
    geo_boundary.iat[index, geo_boundary.columns.get_loc('properties')] = suburb.title()
    
    try:
        if postcode == None:
            postcode = brisbane_suburbs_df.loc[brisbane_suburbs_df['Suburb'] == suburb.title()]['Postcode'].values[0]
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
        else:
            geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
    except(IndexError):
        continue
    
geo_boundary.rename(columns={'properties':'Suburb'}, inplace=True)

# Calculate geographical area

In [59]:
geo_boundary['area'] = geo_boundary['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

In [60]:
brisbane_suburbs_df = geo_boundary.merge(brisbane_suburbs_df, how = 'inner', on = ['Suburb', 'Postcode'])

In [61]:
brisbane_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], 
                     keep = 'first', inplace = True)

In [62]:
brisbane_suburbs_df.head()

Unnamed: 0,geometry,Suburb,Postcode,area,LGA,Population,LAT,LON,distance
0,"{'type': 'MultiPolygon', 'coordinates': [[[[15...",Wamuran,4512,60.448,Moreton Bay Region,3196.0,-27.0359,152.863,50.8619
1,"{'type': 'MultiPolygon', 'coordinates': [[[[15...",Highvale,4520,22.384,Moreton Bay Region,1545.0,-27.379,152.812,23.3385
2,"{'type': 'MultiPolygon', 'coordinates': [[[[15...",Lawnton,4501,7.881,Moreton Bay Region,5658.0,-27.2839,152.982,21.103
3,"{'type': 'MultiPolygon', 'coordinates': [[[[15...",Mount Gravatt East,4122,4.207,City of Brisbane,11838.0,-27.532,153.084,9.03622
4,"{'type': 'MultiPolygon', 'coordinates': [[[[15...",Chandler,4155,11.886,"City of Brisbane, Chandler Ward",1442.0,-27.514,153.156,13.8151


# Calculate population density

In [63]:
brisbane_suburbs_df['density'] = (brisbane_suburbs_df['Population'] / brisbane_suburbs_df['area'])

# Investigate missing data

In [64]:
print('Total number of missing values in Population column: {}'.format(brisbane_suburbs_df.shape[0]-brisbane_suburbs_df['Population'].count()))

Total number of missing values in Population column: 19


In [65]:
print('Percentage of missing data in Population column: {} %'.format(round(((brisbane_suburbs_df.shape[0]-brisbane_suburbs_df['Population'].count())/brisbane_suburbs_df.shape[0])*100, 2)))

Percentage of missing data in Population column: 4.61 %


In [66]:
# Split dataframe into full data vs missing data
full_data = brisbane_suburbs_df[brisbane_suburbs_df['Population'].notnull() & (brisbane_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = brisbane_suburbs_df[brisbane_suburbs_df['Population'].isnull() | (brisbane_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)
# missing_data

In [67]:
list(np.unique(missing_data['Suburb'].values))

['Bellthorpe',
 'Blacksoil',
 'Campbells Pocket',
 'Jeebropilly',
 'Jollys Lookout',
 'Kagaru',
 'Kalinga',
 'Lyons',
 'Mount Forbes',
 'Mutdapilly',
 'New Chum',
 'Samford Valley',
 'Samford Village',
 'South Maclean',
 'Stones Corner',
 'Swanbank',
 'Undullah',
 'Veresdale',
 'Woodhill']

#### Suburbs with population statistics gathered from Australian Bureau Statistics (ABS) 
* Bellthorpe population of [124](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC30225?opendocument).
* Blacksoil population of [104](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC30280?opendocument).
* Campbells Pocket population of [80](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC30512?opendocument).
* Jeebropilly population of [7](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31450?opendocument).
* Jollys Lookout population of [76](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31464?opendocument).
* Kagaru population of [13](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31481?opendocument).
* Kalinga population of [2126](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31487?opendocument).
* Lyons population of [32](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31725?opendocument).
* Mount Forbes population of [263](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31979?opendocument).
* Mutdapilly population of [308](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32111?opendocument).
* New Chum population of ...
* Samford Valley population of [3068](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32529?opendocument).
* Samford Village population of [796](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32530?opendocument).
* South Maclean population of [1362](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32620?opendocument).
* Stones Corner population of ...
* Undullah population of [45](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32927?opendocument).
* Veresdale population of [392](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32966?opendocument).
* Woodhill population of [723](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC33164?opendocument).



#### Population information gathered from other sources
* New Chum population of [3074](https://profile.id.com.au/ipswich/population?WebID=260).
* Stones Corner population of [9720](https://www.brisbane.qld.gov.au/sites/default/files/20170512-stones_corner_local_business_and_destination_plan.pdf).

In [68]:
ABS_population = {'Bellthorpe': 124,
                  'Blacksoil': 104,
                  'Campbells Pocket': 80,
                  'Jeebropilly': 7,
                  'Jollys Lookout': 76,
                  'Kagaru': 13,
                  'Kalinga': 2126,
                  'Lyons': 32,
                  'Mount Forbes': 263,
                  'Mutdapilly': 308,
                  'New Chum': 3074,
                  'Samford Valley': 3068,
                  'Samford Village': 796,
                  'South Maclean': 1362,
                  'Stones Corner': 9720,
                  'Undullah': 45,
                  'Veresdale': 392,
                  'Woodhill': 723}

In [69]:
missing_data.Population = missing_data.Population.fillna(missing_data.Suburb.map(ABS_population))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


# Calculating missing population densities

In [70]:
missing_data['density'] = round(missing_data['Population'] / missing_data['area'], 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# Combine full dataframe with missing dataframe

In [71]:
brisbane_suburbs = pd.concat([full_data, missing_data], ignore_index=True)

In [72]:
brisbane_suburbs['State'] = 'Queensland'

In [73]:
brisbane_suburbs.to_csv('brisbane_suburbs.csv')

# Plotting Sydney and Brisbane suburbs on maps

### Plotting Sydney suburb map

In [74]:
address = "Sydney, New South Wales"
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Sydney, NSW are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Sydney, NSW are -33.8548157, 151.2164539.


In [75]:
# create map of Sydney using latitude and longitude values
map_sydney = folium.Map(location=[latitude, longitude], zoom_start=9.5)
# add markers to map
for lat, lng, postcode, suburb in zip(sydney_suburbs['LAT'],\
                                      sydney_suburbs['LON'],\
                                      sydney_suburbs['Postcode'],\
                                      sydney_suburbs['Suburb']):
    #print(lat, lng)
    label = '{}: {}'.format(postcode, suburb)
    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_sydney)  
    
map_sydney

### Plotting Brisbane suburb map

In [76]:
address = "Brisbane, Queensland"
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Brisbane, Queensland are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Brisbane, Queensland are -27.4689682, 153.0234991.


In [77]:
# create map of Brisbane using latitude and longitude values
map_brisbane = folium.Map(location=[latitude, longitude], zoom_start=9.5)
# add markers to map
for lat, lng, postcode, suburb in zip(brisbane_suburbs['LAT'],\
                                      brisbane_suburbs['LON'],\
                                      brisbane_suburbs['Postcode'],\
                                      brisbane_suburbs['Suburb']):
    #print(lat, lng)
    label = '{}: {}'.format(postcode, suburb)
    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_brisbane)  
    
map_brisbane

# Search for local venues in Sydney and Brisbane suburbs with Foursquare API

### Define Foursquare credentials and version

In [78]:
CLIENT_ID = 'JVA5H5NJXBQTUPSMHXXD0V0NKNP0OVJO0GKU1WJGLER5Q0DU' # Foursquare ID
CLIENT_SECRET = 'DMZV42OBBRZNSPQSGEUD3PE3N5EHUKRZYRTMSCLSORKPAO2W' # Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Credentails:
CLIENT_ID: JVA5H5NJXBQTUPSMHXXD0V0NKNP0OVJO0GKU1WJGLER5Q0DU
CLIENT_SECRET:DMZV42OBBRZNSPQSGEUD3PE3N5EHUKRZYRTMSCLSORKPAO2W


# Explore suburbs of Sydney

Function that finds top 100 venues within 500m radius of the neighbourhood

In [171]:
def getNearbyVenues(names, postcode, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    index = 0
    for name, postcode, lat, lng in zip(names, postcode, latitudes, longitudes):
        index += 1
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        if results:
            print('{}. RESULTS {}'.format(index, name))
            venues_list.append([(
                                 name,
                                 postcode,
                                 lat, 
                                 lng, 
                                 v['venue']['name'], 
                                 v['venue']['location']['lat'], 
                                 v['venue']['location']['lng'],  
                                 v['venue']['categories'][0]['name']
                                ) for v in results])
        elif not results:
            print('{}. NONE    {}'.format(index, name))
            venues_list.append([(
                                 name,
                                 postcode,
                                 lat, 
                                 lng, 
                                 None, 
                                 None, 
                                 None,  
                                 None
                                )])
    
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = [
                             'Suburb',
                             'Postcode',
                             'Suburb Latitude', 
                             'Suburb Longitude', 
                             'Venue', 
                             'Venue Latitude', 
                             'Venue Longitude', 
                             'Venue Category'
                            ]
    
    return(nearby_venues)

Here, I use the function to create a new dataframe that includes the venues.

In [137]:
while True:
    try:
        top_venues_sydney = getNearbyVenues(names=sydney_suburbs['Suburb'],
                                            postcode=sydney_suburbs['Postcode'],
                                            latitudes=sydney_suburbs['LAT'],
                                            longitudes=sydney_suburbs['LON'])
    except(KeyError):
        continue
    break

1. NONE    Abbotsbury
2. RESULTS Abbotsford
3. RESULTS Acacia Gardens
4. NONE    Agnes Banks
5. RESULTS Airds
6. RESULTS Alexandria
7. RESULTS Alfords Point
8. RESULTS Allambie Heights
9. RESULTS Allawah
10. RESULTS Ambarvale
11. RESULTS Annandale
12. RESULTS Annangrove
13. RESULTS Arcadia
14. RESULTS Arncliffe
15. RESULTS Arndell Park
16. RESULTS Artarmon
17. RESULTS Ashbury
18. NONE    Ashcroft
19. RESULTS Ashfield
20. RESULTS Asquith
21. RESULTS Auburn
22. NONE    Austral
23. RESULTS Avalon Beach
24. NONE    Badgerys Creek
25. RESULTS Balgowlah
26. RESULTS Balgowlah Heights
27. RESULTS Balmain
28. RESULTS Balmain East
29. RESULTS Bangor
30. RESULTS Banksia
31. RESULTS Banksmeadow
32. RESULTS Bankstown
33. RESULTS Barangaroo
34. RESULTS Barden Ridge
35. RESULTS Bardia
36. RESULTS Bardwell Park
37. RESULTS Bardwell Valley
38. RESULTS Bass Hill
39. RESULTS Baulkham Hills
40. RESULTS Bayview
41. RESULTS Beacon Hill
42. RESULTS Beaconsfield
43. RESULTS Beaumont Hills
44. RESULTS Beecroft

343. RESULTS Lansvale
344. NONE    Laughtondale
345. RESULTS Lavender Bay
346. NONE    Leets Vale
347. RESULTS Leichhardt
348. RESULTS Leonay
349. NONE    Leppington
350. NONE    Lethbridge Park
351. RESULTS Leumeah
352. RESULTS Lewisham
353. RESULTS Liberty Grove
354. RESULTS Lidcombe
355. RESULTS Lilli Pilli
356. RESULTS Lilyfield
357. RESULTS Lindfield
358. RESULTS Linley Point
359. RESULTS Little Bay
360. RESULTS Liverpool
361. RESULTS Llandilo
362. RESULTS Loftus
363. RESULTS Londonderry
364. NONE    Long Point
365. RESULTS Longueville
366. RESULTS Lovett Bay
367. NONE    Lower Portland
368. NONE    Luddenham
369. RESULTS Lugarno
370. RESULTS Lurnea
371. RESULTS Macquarie Fields
372. RESULTS Macquarie Park
373. RESULTS Malabar
374. RESULTS Manly
375. RESULTS Manly Vale
376. RESULTS Maraylya
377. NONE    Marayong
378. NONE    Maroota
379. RESULTS Maroubra
380. RESULTS Marrickville
381. NONE    Marsden Park
382. RESULTS Marsfield
383. RESULTS Mascot
384. RESULTS Matraville
385. RESU

# Explore suburbs of Brisbane

In [138]:
while True:
    try:
        top_venues_brisbane = getNearbyVenues(names=brisbane_suburbs['Suburb'],
                                              postcode=brisbane_suburbs['Postcode'],
                                              latitudes=brisbane_suburbs['LAT'],
                                              longitudes=brisbane_suburbs['LON'])
    except(KeyError):
        continue
    break

1. RESULTS Wamuran
2. RESULTS Highvale
3. RESULTS Lawnton
4. RESULTS Mount Gravatt East
5. RESULTS Chandler
6. RESULTS Clayfield
7. NONE    Warner
8. NONE    Tallegalla
9. RESULTS Hillcrest
10. RESULTS Parkinson
11. NONE    Carole Park
12. RESULTS East Ipswich
13. RESULTS Thorneside
14. NONE    Karawatha
15. RESULTS Albany Creek
16. RESULTS Dinmore
17. NONE    Ocean View
18. RESULTS Neurum
19. NONE    Laceys Creek
20. RESULTS Auchenflower
21. RESULTS Bald Hills
22. NONE    Welsby
23. RESULTS Fairfield
24. RESULTS North Booval
25. RESULTS Camira
26. RESULTS Springfield
27. NONE    Kobble Creek
28. RESULTS Red Hill
29. RESULTS Newmarket
30. RESULTS Mitchelton
31. RESULTS Everton Park
32. RESULTS Kallangur
33. NONE    Caboolture South
34. RESULTS Stretton
35. RESULTS Kelvin Grove
36. RESULTS Grange
37. RESULTS Kangaroo Point
38. RESULTS East Brisbane
39. RESULTS Runcorn
40. RESULTS New Farm
41. RESULTS Rush Creek
42. RESULTS Churchill
43. NONE    Mount Marrow
44. RESULTS Teneriffe
45. NON

350. RESULTS Coopers Plains
351. RESULTS Upper Mount Gravatt
352. RESULTS Coorparoo
353. NONE    Lota
354. RESULTS Lytton
355. RESULTS Bracken Ridge
356. RESULTS Deagon
357. RESULTS Sandgate
358. NONE    Tennyson
359. RESULTS Yeerongpilly
360. NONE    Calvert
361. NONE    Park Ridge
362. RESULTS Yeronga
363. RESULTS Wacol
364. RESULTS Dayboro
365. RESULTS Paddington
366. RESULTS Stafford
367. RESULTS Stafford Heights
368. RESULTS Ningi
369. RESULTS Jamboree Heights
370. NONE    Woolshed
371. RESULTS Sherwood
372. RESULTS Alderley
373. RESULTS Wynnum
374. RESULTS Salisbury
375. RESULTS Greenslopes
376. RESULTS Wishart
377. RESULTS Taigum
378. RESULTS Richlands
379. NONE    Pallara
380. RESULTS Kenmore
381. RESULTS Oxley
382. RESULTS Bardon
383. RESULTS Heathwood
384. RESULTS Forest Lake
385. NONE    Redbank Plains
386. RESULTS Rocklea
387. NONE    Brookfield
388. RESULTS Indooroopilly
389. NONE    Willowbank
390. NONE    Moggill
391. NONE    Pinjarra Hills
392. RESULTS Larapinta
393. RE

Check size of dataframe

In [139]:
print(top_venues_sydney.shape)
print(top_venues_brisbane.shape)

(6104, 9)
(1862, 9)


The number of unique categories that can be curated from all of the returned venues

In [140]:
print('There are {} uniques categories in Sydney.'.format(len(top_venues_sydney['Venue Category'].unique())))
print('There are {} uniques categories in Brisbane.'.format(len(top_venues_brisbane['Venue Category'].unique())))

There are 333 uniques categories in Sydney.
There are 231 uniques categories in Brisbane.


In [141]:
print(len(top_venues_sydney['Suburb'].unique()))
print(len(top_venues_brisbane['Suburb'].unique()))

660
412


# Analysing each Suburb in Sydney and Brisbane

In [142]:
# Concatenate dataframes
top_venues = pd.concat([top_venues_sydney, top_venues_brisbane])
top_venues.shape

(7966, 9)

In [167]:
# one hot encoding
top_venues_onehot = pd.get_dummies(top_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
top_venues_onehot['Suburb'] = top_venues['Suburb'] 
top_venues_onehot['Postcode'] = top_venues['Postcode'] 
#top_venues_onehot['LGA'] = top_venues['LGA'] 

# move neighborhood column to the first column
fixed_columns = list(top_venues_onehot.columns[-2:]) + list(top_venues_onehot.columns[:-2])
top_venues_onehot = top_venues_onehot[fixed_columns]

top_venues_onehot.head()

Unnamed: 0,Suburb,Postcode,Accessories Store,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Antique Shop,Arcade,...,Volcano,Water Park,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,Abbotsbury,2176,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Abbotsford,2046,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Abbotsford,2046,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Abbotsford,2046,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Abbotsford,2046,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [168]:
top_venues_onehot.shape

(7966, 371)

Group suburbs and calculate the mean of the frequency of occurance for each category

In [169]:
top_venues_grouped = top_venues_onehot.groupby(['Suburb', 'Postcode']).mean().reset_index()
top_venues_grouped.head()

Unnamed: 0,Suburb,Postcode,Accessories Store,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Antique Shop,Arcade,...,Volcano,Water Park,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,Abbotsbury,2176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Abbotsford,2046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0
2,Acacia Gardens,2763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Acacia Ridge,4110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Agnes Banks,2753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Some suburbs were not returned because those suburbs didn't have 10 venues within 500m listed on Foursquare.


Function that sorts most frequent venues in descending order

In [192]:
# def return_most_common_venues(row, num_top_venues):
#     row_categories = row.iloc[1:]
#     row_categories_sorted = row_categories.sort_values(ascending=False)
    
#     return row_categories_sorted.index.values[0:num_top_venues]

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[2:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    top_ten = pd.DataFrame(row_categories_sorted.iloc[0:10])
    top_ten = top_ten.loc[(top_ten!=0).any(axis=1)]
    if len(top_ten) == 0:
        top_ten_list = [None] * num_top_venues
    else:
        top_ten_list = list(top_ten.index.values)
        if len(top_ten_list) < num_top_venues:
            n_empty_cells = num_top_venues - len(top_ten_list)
            top_ten_list.extend(itertools.repeat(None, n_empty_cells))
    
    return top_ten_list

Creating a new dataframe and display the top 10 venues for each neighbourhood

In [193]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Suburb', 'Postcode']
for index in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(index+1, indicators[index]))
    except:
        columns.append('{}th Most Common Venue'.format(index+1))

# create a new dataframe
top_venues_sorted = pd.DataFrame(columns=columns)
top_venues_sorted['Suburb'] = top_venues_grouped['Suburb']
top_venues_sorted['Postcode'] = top_venues_grouped['Postcode']

for index in np.arange(top_venues_grouped.shape[0]):
    top_venues_sorted.iloc[index, 2:] = return_most_common_venues(top_venues_grouped.iloc[index, :], num_top_venues)

In [194]:
top_venues_sorted.head()

Unnamed: 0,Suburb,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Abbotsbury,2176,,,,,,,,,,
1,Abbotsford,2046,Café,Pizza Place,Wine Shop,Gift Shop,Park,,,,,
2,Acacia Gardens,2763,Park,Home Service,Dry Cleaner,,,,,,,
3,Acacia Ridge,4110,,,,,,,,,,
4,Agnes Banks,2753,,,,,,,,,,


In [195]:
top_venues_sorted.shape

(1072, 12)

# Suburb Cluster Analysis

1. Extract and normalise data:
   * Area
   * Density
   * Distance

2. Combine data with local venues
3. Cluster analysis

In [97]:
print(sydney_suburbs.shape)
print(brisbane_suburbs.shape)
print('')
print(sydney_suburbs.shape[0] + brisbane_suburbs.shape[0])
print(top_venues_grouped.shape)


(660, 11)
(412, 11)

1072
(1058, 370)


In [None]:
# Split data from NSW to QLD
search_values = syd_suburbs_coord['Suburb'].values
syd_section = top_venues_grouped[top_venues_grouped.Suburb.str.contains('|'.join(search_values ))]
syd_section.index = pd.RangeIndex(len(syd_section.index))

In [None]:
syd_section.shape

In [None]:
search_values = bne_suburbs_coord['Suburb'].values
bne_section = top_venues_grouped[top_venues_grouped.Suburb.str.contains('|'.join(search_values ))]
bne_section.index = pd.RangeIndex(len(bne_section.index))

In [None]:
bne_section

Check shape of dataframe

In [None]:
print(syd_section.shape)
print(bne_section.shape)
print(top_venues_grouped.shape)

Perform Cluster analysis with Brisbane suburbs

In [None]:
# Drop suburb column for cluster analysis
venues_matrix_values = top_venues_sorted.drop('Suburb', 1)

Function that calculates euclidean distances for all points to their assigned cluster centroid.

In [None]:
def k_mean_distance(data, cantroid_matrix, i_centroid, cluster_labels):
    distances = [np.linalg.norm(x-cantroid_matrix) for x in data[cluster_labels == i_centroid]]
    return np.mean(distances)

In [None]:
k_optimisation = {}
nclusters = 20
clustering_matrix = bne_venues_matrix_values.values

for k in range(nclusters):
    k_means = KMeans(init = "k-means++", n_clusters = k+1, n_init = 100, random_state=10)
    k_means.fit(bne_venues_matrix_values)

    k_means_labels = k_means.labels_
    k_means_cluster_centers = k_means.cluster_centers_

    c_mean_distances = []
    for i, cent_features in enumerate(k_means_cluster_centers):
                mean_distance = k_mean_distance(clustering_matrix, cent_features, i, k_means_labels)
                c_mean_distances.append(mean_distance)

    k_optimisation[k+1] = np.mean(c_mean_distances)

Plot mean distances against number of k's

In [None]:
# sorted by key, return a list of tuples
lists = sorted(k_optimisation.items())
# unpack a list of pairs into two tuples
x, y = zip(*lists)
# Smooth data
ysmoothed = gaussian_filter1d(y, sigma=2)

x_ticks = x[1::2]
plt.plot(x, ysmoothed)
plt.xticks(x[1::2])
plt.xlabel('''Number of K's''')
plt.ylabel('Mean distance')
plt.title('Mean distance to centroid per K')
plt.show()

The inflexion point on the graph is approximately at 11. Therefore we will use 11 as our K value.

In [None]:
kclusters = 11
k_means = KMeans(init = "k-means++", n_clusters = kclusters, n_init = 100, random_state=10)
k_means.fit(bne_venues_matrix_values)

k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_

In [None]:
k_means_labels

Add cluster labels

In [None]:
# add clustering labels
search_values = bne_section['Suburb'].values
bne_venues_sorted = top_venues_sorted[top_venues_sorted.Suburb.str.contains('|'.join(search_values ))]

bne_suburbs_coord_update = bne_suburbs_coord[bne_suburbs_coord.Suburb.str.contains('|'.join(search_values ))]


bne_venues_sorted.insert(0, 'Cluster Labels', k_means_labels)
bne_merged = bne_suburbs_coord_update

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
bne_merged = bne_merged.join(bne_venues_sorted.set_index('Suburb'), on='Suburb')
bne_merged.index = pd.RangeIndex(len(bne_merged.index))
bne_merged.head(10) # check the last columns!

Plot clusters onto map

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, suburb, cluster in zip(bne_merged['lat'], bne_merged['long'], bne_merged['Suburb'], bne_merged['Cluster Labels']):
    label = folium.Popup(str(suburb) + ': Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Here the suburbs are clustered according to their top venues. Therefore suburbs that are north of Brisbane such as Burpengary (cluster 1) would be similar to suburbs close to the city such as East Brisbane and Kangaroo Point.
Although the suburbs are similar in regards to the local venues, they are vastly different in population density, area, and distance from the cbd. For example the Burpengary the population density is These attributes should also be taken into consideration when buyers from Sydney are looking for a home in Brisbane.
These attributes were conveniently scraped earlier from Wikipedia.

# KNN classification with Sydney suburbs and Brisbane clustered data

In [None]:
bne_merged.shape