In [1]:
from area import area
from bs4 import BeautifulSoup
from collections import defaultdict
from math import sin, cos, sqrt, atan2, radians
from sklearn import preprocessing
from shapely.geometry import shape
from sklearn.cluster import KMeans 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

import geocoder
import googlemaps
import itertools
import json
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import requests
import unittest
import re
import seaborn as sns
%matplotlib inline

In [2]:
GOOGLE_MAPS_API = 'AIzaSyBxqCRTDqaRrDHDh_LBPO2QFOWqGDFeY0w'

# Scrape Sydney suburbs from Wikipedia

Here I use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to scrape the suburbs from Sydney as listed on [wikipedia](https://en.wikipedia.org/wiki/List_of_Sydney_suburbs). For each suburb, I identify the wikipage url for the indexed suburb, request access to the webpage, and then collect the following from their information box:
* **Postcode**: Postcode
* **Density**: Population density
* **Area**: Area size of suburb
* **LGA**: Local government assocation (council)
* **Location**: Distance from the city

Function that gets the suburb information section from the wikipedia html page

In [619]:
def get_wiki_suburb_section(wiki_html):
    with open(wiki_html) as html_file:
        soup = BeautifulSoup(html_file, "lxml")

    suburb_section = soup.find('div', attrs={'class':'mw-parser-output'})

    suburb_section = suburb_section.findAll('a', href=True)
    
    return suburb_section

In [620]:
syd_suburbs_section = get_wiki_suburb_section("view-source_https___en.wikipedia.org_wiki_List_of_Sydney_suburbs.html")

Function that saves the urls for each suburb from the wikipedia html suburbs section

In [4]:
# Scrape url for each suburb
def get_wiki_urls(html_suburbs_section,
                  wiki_link_extension,
                  wiki_state_name,
                  city):
    
    url_list = {}
    for i in range(len(html_suburbs_section)):
        url = html_suburbs_section[i]['href']
        # Sydney cbd
        if 'Sydney_CBD' in url:
            suburb = 'Sydney central business district'
            url_list[suburb] = 'https://en.wikipedia.org/wiki/Sydney_central_business_district'

        # Brisbane cbd
        elif 'Brisbane_central_business_district' in url:
            suburb = 'Brisbane central business district'
            url_list[suburb] = 'https://en.wikipedia.org/wiki/Brisbane_central_business_district'
            
        # Melbourne and Adelaide cbd
        elif '{}_city_centre'.format(city) in url:
            suburb = '{} central business district'.format(city)
            url_list[suburb] = 'https://en.wikipedia.org{}'.format(url)
        
        # Canberra cbd
        elif 'City,_Australian_Capital_Territory' in url:
            suburb = 'Civic central business district'
            url_list[suburb] = 'https://en.wikipedia.org/wiki/Civic,_Australian_Capital_Territory'
        
        # Hobart cbd
        # elif 'Hobart_city_centre' in url:
        if url == '/wiki/Hobart_city_centre':
            suburb = 'Hobart central business district'
            url_list[suburb] = 'https://en.wikipedia.org/wiki/Hobart_City_Centre'

        # Darwin cbd
        elif 'Darwin_City' in url:
            suburb = 'Darwin central business district'
            url_list[suburb] = 'https://en.wikipedia.org/wiki/Darwin_City,_Northern_Territory'
        
        # Perth cbd
        elif 'Perth_(suburb)' in url:
            suburb = 'Perth central business district'
            url_list[suburb] = 'https://en.wikipedia.org/wiki/Perth_(suburb)'
            
        elif wiki_link_extension in url:
            
            check_list = []
            for x in ['/File:', '/List_of_', '/City_of_', '/commons.', '/Categories:']:
                
                if x in url:
                    check_list.append(1)
            
            if check_list:
                continue
            elif not check_list:
                suburb = html_suburbs_section[i]['title'].replace(wiki_state_name,'')

                if suburb in url_list:
                    continue           

                else:
                    url_list[suburb] = 'https://en.wikipedia.org{}'.format(url)
            
        else:
            continue
    
    return url_list

In [5]:
wiki_state_name = ', New South Wales'
wiki_link_extension = ',_New_South_Wales'
city = 'Sydney'
syd_wiki_urls = get_wiki_urls(syd_suburbs_section,
                              wiki_link_extension,
                              wiki_state_name,
                              city)

Fucntion that requests access to a suburbs wikipedia page and then saves its information box

In [6]:
def get_suburb_wiki_infobox(wiki_urls):
    
    suburbs_infobox = {}
    for key,value in wiki_urls.items():
        try:
            page = requests.get(value)
            soup_page = BeautifulSoup(page.text, 'html.parser')

            try:
                soup_redirect = soup_page.find('span', class_='mw-redirectedfrom').a['href']
                soup_redirect_title = soup_redirect.replace(' ','_')
                soup_redirect_url = 'https://en.wikipedia.org{}'.format(soup_redirect)

                page = requests.get(soup_redirect_url)
                soup_redirect_page = BeautifulSoup(page.text, 'html.parser')
                soup_redirect_page_title = soup_redirect_page.find('ul', class_='redirectText').a['title']
                
                if key not in soup_redirect_page_title:
                    continue
                elif key in soup_redirect_page_title:
                    pass
                
            except:
                pass
            infobox = soup_page.find('table', class_='infobox vcard')
            suburbs_infobox[key] = infobox.find_all('tr', class_='')

        except AttributeError:
            suburbs_infobox[key] = None
            continue
    
    return suburbs_infobox


In [205]:
syd_suburb_infobox = get_suburb_wiki_infobox(syd_wiki_urls)

Function that saves the suburbs name and used as a key later on.

In [534]:
def get_suburb_name(suburb_infobox):
    
    suburb = []
    
    for key, value in suburb_infobox.items():
        
        key = key.split('(')
        key = key[0]
        
        suburb += [key]
    
    return suburb

Function that saves suburbs local government area from wikipedia's information box.

In [535]:
def get_suburb_lga(suburb_infobox):
    
    LGA = {}
    
    for key,value in suburb_infobox.items():
        
        key = key.split('(')
        key = key[0]
        
        if value:
            items = {}
            
            for val in value:
                if any(s in val.text for s in ['LGA',
                                               'District']):
                    
                    val_td = val.find('td', class_='')
                    lga_list = val_td.findAll('a', href=True)

                    dummy_list = []
                    for lga in lga_list:
                        try:
                            lga_title = lga['title']
                            if '(New South Wales)' in lga_title:
                                lga_title = lga_title.replace('(New South Wales)','')
                            elif '(Queensland)' in lga_title:
                                lga_title = lga_title.replace('(Queensland)','')
                            elif '(Brisbane City)' in lga_title:
                                lga_title = lga_title.replace('(Brisbane City)','')
                            elif '(City of Brisbane)' in lga_title:
                                lga_title = lga_title.replace('(City of Brisbane)','')
                            elif ' (district)' in lga_title:
                                lga_title = lga_title.replace(' (district)','')
                            elif 'District of' in lga_title:
                                lga_title = lga_title.replace('District of ','')
                            elif '(Tasmania)' in lga_title:
                                lga_title = lga_title.replace(' (Tasmania)','')
                            elif ' (page does not exist)' in lga_title:
                                lga_title = lga_title.replace(' (page does not exist)','')
                                
                            dummy_list.append(lga_title)
                                
                        except(KeyError):
                            pass

                    if len(dummy_list) == 1:
                        items['LGA_1'] = ', '.join(dummy_list)
                    elif len(dummy_list) > 1:
                        for index, lga in enumerate(dummy_list):
                            items['LGA_{}'.format(index + 1)] = lga
        LGA[key] = items
    
    return LGA

Function that saves suburbs postcodes from wikipedia's information box.

In [564]:
def get_suburb_postcode(suburb_infobox):
    
    POSTCODE = {}
    
    for key,value in suburb_infobox.items():
        
        key = key.split('(')
        key = key[0]
        
        if value:
            items = {}
            
            for val in value:
                if 'Postcode' in val.text:

                    infobox_split = val.text.split(' ')
                    infobox_item = [s for s in infobox_split if 'Postcode' in s]

                    info = infobox_item[0]
                    info = info.replace('Postcode(s)','')

                    if '[' in info:
                        info = info.split('[')
                        info = info[0]
                    
#                     items['Postcode'] = info
        POSTCODE[key] = info
    
    return POSTCODE

Function that saves suburbs population value from wikipedia's information box.

In [565]:
def get_suburb_population(suburb_infobox):
    POPULATION = {}
    
    for key,value in suburb_infobox.items():
        
        key = key.split('(')
        key = key[0]
                
        if value:
            items = {}
            
            for val in value:
                if 'Population' in val.text:

                    infobox_split = val.text.split(' ')
                    
                    infobox_item = [s for s in infobox_split if 'Population' in s]
                    info = infobox_item[0]
                    info = info.replace('Population','').replace(',','')

                    info = info.split('\xa0(')
                    
                    info = info[0]
                    
                    try:
                        info = int(info)
                    except(ValueError):
                        info = float('nan')
                    
                    break
                    
        POPULATION[key] = info

    return POPULATION

Function that generates a dataframe from the dictionaries of suburb information.

In [612]:
def conv2df(suburb_infobox):
    
    suburb_name = get_suburb_name(syd_suburb_infobox)
    suburb_lga = get_suburb_lga(suburb_infobox)
    suburb_postcode = get_suburb_postcode(suburb_infobox)
    suburb_population = get_suburb_population(suburb_infobox)
    
    # Convert dictionary to dataframe
    lga_df = pd.DataFrame(suburb_lga).T

    postcode_df = pd.DataFrame.from_dict(suburb_postcode, orient='index')
    postcode_df.columns = ['Postcode']

    population_df = pd.DataFrame.from_dict(suburb_population, orient='index')
    population_df.columns = ['Population']

    # Merge dataframes
    dfs = [lga_df, population_df, postcode_df]
    df_final = reduce(lambda left,right: pd.merge(left, right, left_index=True,right_index=True), dfs)
    
    # Change index and add Suburb column 
    df_final['Suburb'] = df_final.index
    df_final.index = range(df_final.shape[0])
    
    df_final['Population'] = pd.to_numeric(df_final['Population'], downcast='integer')
    
    return df_final

In [613]:
sydney_suburbs_df = conv2df(syd_suburb_infobox)

Function that rearranges dataframe by grouping local government areas

In [12]:
def rearrange_lga(df):
    df_temp = pd.DataFrame(columns=['LGA',
                                    'Population',
                                    'Postcode',
                                    'Suburb'])

    for row in df.iterrows():
        for col in range(0,df.columns.get_loc("Population")):
            if row[1][col] == row[1][col]:
                df_temp.loc[len(df_temp)] = [row[1][col],
                                             row[1][df.columns.get_loc('Population')],
                                             row[1][df.columns.get_loc('Postcode')],
                                             row[1][df.columns.get_loc('Suburb')]]
            else:
                continue

    df = df_temp
    df.sort_values(by='Suburb', ascending=True, inplace=True)
    df.reset_index(inplace=True, drop=True)
    
    return df

In [13]:
sydney_suburbs_df = rearrange_lga(sydney_suburbs_df)

#### Suburbs that didn't have a postcode on their wikipage.

In [14]:
missing_postcodes = sydney_suburbs_df[sydney_suburbs_df['Postcode'].isnull()]
missing_postcodes.reset_index(inplace=True, drop=True)
# missing_postcodes

Function to get postcodes with geopy

In [15]:
def get_missing_postcodes_geopy(API_KEY, missing_pc_df, state_str):
    gmaps_key = googlemaps.Client(key = GOOGLE_MAPS_API)
    
    geo_postcode = {}
    
    for index,row in missing_pc_df.iterrows():
        suburb = row['Suburb']
        geocode_result = gmaps_key.geocode("{}, {}".format(row['Suburb'], state_str))
        postcode = geocode_result[0]['address_components'][-1]['long_name']
        
        try:
            geo_postcode[suburb] = postcode
        except(IndexError):
            # Suburbs that don't have postcodes. Need to find and fill in manually.
            geo_postcode[suburb] = 'nan'

    return geo_postcode

In [16]:
geo_postcode_dict = get_missing_postcodes_geopy(GOOGLE_MAPS_API, missing_postcodes, 'New South Wales')

# Fill nan postcodes from sydney_suburbs_df with postcodes from dictionary
sydney_suburbs_df.Postcode = sydney_suburbs_df.Postcode.fillna(sydney_suburbs_df.Suburb.map(geo_postcode_dict))

Group suburbs by LGA in dataframe

In [17]:
sydney_suburbs_df = sydney_suburbs_df.groupby(['Suburb', 'Postcode']).agg({'LGA' :', '.join, 'Population':'mean'}).reset_index().reindex(columns=sydney_suburbs_df.columns)

#### Get Geographical Coordinates

Function to get the list of geographical coordiantes for each suburb in the city

In [18]:
def get_geo_coords(API_KEY, df, state_string):
    gmaps_key = googlemaps.Client(key = GOOGLE_MAPS_API)

    # Create Geocode result object
    suburbs_geo = {}
    
    for index, row in df.iterrows():
        suburb = row['Suburb']
        item = {}
        geocode_result = gmaps_key.geocode("{}, {}, {}".format(row['Suburb'], row['Postcode'], state_string))
        try:
            item['LAT'] = geocode_result[0]['geometry']['location']['lat']
            item['LON'] = geocode_result[0]['geometry']['location']['lng']
            item['Postcode'] = row['Postcode']
            suburbs_geo[suburb] = item
        except:
            lat = None
            lon = None
    
    # Convert dictionary to dataframe
    df_geo = pd.DataFrame(suburbs_geo).T
    df_geo['Suburb'] = df_geo.index
    df_geo.index = range(df_geo.shape[0])
    
    return df_geo

In [19]:
geo_coords = get_geo_coords(GOOGLE_MAPS_API, sydney_suburbs_df, 'New South Wales')

Merge geographical coordinates with suburbs dataframe

In [20]:
sydney_suburbs_df = sydney_suburbs_df.merge(geo_coords, how = 'inner', on = ['Suburb', 'Postcode'])

#### Calculate geographical distance to Sydney CBD

Function that calculates the geographical distance between two sets of latitude and longitude coordinates and outputs hte list as a dataframe

In [21]:
def calc_geo_dist(lat1, lon1, lat2, lon2):
    R = 6373.0
    
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)
    
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

In [22]:
def calc_dist_to_cbd(df):
    suburbs_dist = {}
    
    CBD_LAT = df[df['Suburb'].str.contains('central business district')].iloc[0]['LAT']
    CBD_LON = df[df['Suburb'].str.contains('central business district')].iloc[0]['LON']

    for index, row in df.iterrows():
        suburb = row['Suburb']
        item = {}
        
        row_lat = row['LAT']
        row_lon = row['LON']
        item['distance'] = calc_geo_dist(CBD_LAT, CBD_LON, row_lat, row_lon)
        item['Postcode'] = row['Postcode']
        
        suburbs_dist[suburb] = item
    
    # Convert dictionary to dataframe
    df_dist = pd.DataFrame(suburbs_dist).T
    df_dist['Suburb'] = df_dist.index
    df_dist.index = range(df_dist.shape[0])
    
    return df_dist

In [23]:
geo_dist = calc_dist_to_cbd(sydney_suburbs_df)

Merge suburb distances to cbd with suburbs dataframe

In [24]:
sydney_suburbs_df = sydney_suburbs_df.merge(geo_dist, how = 'inner', on = ['Suburb', 'Postcode'])

#### Get geographical boundary cooridinates and area

In [25]:
with open('suburb_boundaries_nsw.json') as f:
    geo_boundary_dict = json.load(f)

Function that gets boundary polygon information from each suburb

In [26]:
def get_geo_boundary(geo_boundary_dict, state_string, df):
    
    state_string = state_string.lower()
    
    geo_boundary = pd.DataFrame.from_dict(geo_boundary_dict['features'])
    geo_boundary.drop(['geometry_name', 'id', 'type'], axis=1, inplace=True)

    geo_boundary['Postcode'] = None

    for index, row in geo_boundary.iterrows():
        try:
            suburb = row['properties']['{}_loca_2'.format(state_string)]
            postcode = row['properties']['{}_loca_4'.format(state_string)]
        except:
            suburb = row['properties']['{}_local_2'.format(state_string)]
            postcode = row['properties']['{}_local_4'.format(state_string)]
            
        geo_boundary.iat[index, geo_boundary.columns.get_loc('properties')] = suburb.title()

        try:
            if postcode == None:
                postcode = df.loc[df['Suburb'] == suburb.title()]['Postcode'].values[0]
                geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
            else:
                geo_boundary.iat[index, geo_boundary.columns.get_loc('Postcode')] = postcode
        except(IndexError):
            continue

    geo_boundary.rename(columns={'properties':'Suburb'}, inplace=True)
    
    return geo_boundary

In [27]:
geo_boundary = get_geo_boundary(geo_boundary_dict, 'NSW', sydney_suburbs_df)

sydney_suburbs_df = sydney_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])

#### Calculate geographical area

In [28]:
sydney_suburbs_df['area'] = sydney_suburbs_df['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

# Drop duplicate rows
sydney_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)

#### Calculate population density

In [29]:
sydney_suburbs_df['density'] = (sydney_suburbs_df['Population'] / sydney_suburbs_df['area'])

#### Investigate missing data

In [30]:
print('Total number of missing values in Area column: {}'.format(sydney_suburbs_df.shape[0]-sydney_suburbs_df['Population'].count()))
print('Total number of missing values in Density column: {}'.format(sydney_suburbs_df.shape[0]-sydney_suburbs_df['density'].count()))

Total number of missing values in Area column: 21
Total number of missing values in Density column: 21


In [31]:
print('Percentage of missing data in Population column: {} %'.format(round(((sydney_suburbs_df.shape[0]-sydney_suburbs_df['Population'].count())/sydney_suburbs_df.shape[0])*100, 2)))
print('Percentage of missing data in Density column: {} %'.format(round(((sydney_suburbs_df.shape[0]-sydney_suburbs_df['density'].count())/sydney_suburbs_df.shape[0])*100, 2)))

Percentage of missing data in Population column: 3.15 %
Percentage of missing data in Density column: 3.15 %


Number of missing values for both columns is approximately the same. However, I will inspect the suburbs that only have area data but not density data.

In [32]:
# Split dataframe into full data vs missing data
full_data = sydney_suburbs_df[sydney_suburbs_df['Population'].notnull() & (sydney_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = sydney_suburbs_df[sydney_suburbs_df['Population'].isnull() | (sydney_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)
# missing_data

Here we see that suburb population is predominately missing from the dataframe. The missing population information will be searched on the Australian Bureau Statistics.

Suburbs with population statistics gathered from Australian Bureau Statistics (ABS) 
* **Cattai** population of [790](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC10859?opendocument).
* **Cornwallis** population of [53](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC11078?opendocument).
* **Forest Glen** population of [65](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC11542?opendocument).
* **Macquarie Links** population of [1360](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC12435?opendocument).
* **Minchinbury** population of [5619](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC12633?opendocument).
* **Pleasure Point** population of [528](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC13229?opendocument).
* **Picnic Point** population of [6160](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC13199?opendocument)
* **Pitt Town Bottoms** population of [102](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC13227?opendocument)
* **South Windsor** population of [5892](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2011/quickstat/SSC12119).
* **Wisemans Ferry** population of [220](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC14344?opendocument).

In [33]:
ABS_population = {'Cattai': 790,
                  'Cornwallis': 53,
                  'Forest Glen': 65,
                  'Macquarie Links': 1360,
                  'Minchinbury': 5619,
                  'Pleasure Point': 528,
                  'Picnic Point': 6160,
                  'Pitt Town Bottoms': 102,
                  'South Windsor': 5892,
                  'Wisemans Ferry': 220}

In [34]:
missing_data.Population = missing_data.Population.fillna(missing_data.Suburb.map(ABS_population))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Suburbs to be removed from the dataframe
* **Camellia** is predominately an industrial suburb and no information on ABS.
* **Chullora** is predominately an industrial area and no information on ABS.
* **Clyde** is exclusively an industrial and commercial area. Wikipedia states, 'Clyde has no permanent population'. And no information on ABS.
* **Huntingwood** is predominately an industrial suburb and no information on ABS.
* **Lucas Heights** 'does not contain a residential area' according to Wikipedia and no information on ABS.
* **Port Botany** is a seaport suburb dominated by traide in containerised manufactured products. Therefore no residence. And no information on ABS.
* **Bickley Vale**, no information on Australian Bureau Statistics (ABS).
* **Currawang Beach**, no information on ABS.
* **McCarrs Creek**, new suburb since 2012 and no information on ABS.
* **Rookwood**, no information on ABS.


In [35]:
missing_data = missing_data[~missing_data['Suburb'].isin(['Bickley Vale',
                                                          'Camellia',
                                                          'Chullora',
                                                          'Clyde',
                                                          'Currawong Beach',
                                                          'Huntingwood',
                                                          'Len Waters Estate',
                                                          'Lucas Heights',
                                                          'McCarrs Creek',
                                                          'Port Botany',
                                                          'Rookwood'])]

Suburbs where the population will be interpolated
* **North Kellyville** was officially proclaimed a suburb on 29th June 2018 and therefore has no information on ABS. However, since it was [previously part of Kellyville](https://en.wikipedia.org/wiki/North_Kellyville,_New_South_Wales) I will use Kellyville's population density for North Kellyville and back calculate the population.
* **Norwest** was officially proclaimed a suburb on 29th June 2018 and therefore has no information on ABS. However, since it was [previously part of Kellyville and Baulkham Hills](https://en.wikipedia.org/wiki/Norwest,_New_South_Wales) I will use the mean population density of Kellyville and Baulkham Hills as the density of Norwest and back calculate the population.

In [36]:
# Get population density for Kellyville
kellyville_density = sydney_suburbs_df.loc[sydney_suburbs_df['Suburb'] == 'Kellyville']['density'].values[0]

# Get index for North Kellyville
index = missing_data.loc[missing_data['Suburb'] == 'North Kellyville'].index.values[0]

# Replace density of North Kellyville with Kellyville density
missing_data.at[index, 'density'] = kellyville_density

# Get North Kellyville area
north_kellyville_area = sydney_suburbs_df.loc[sydney_suburbs_df['Suburb'] == 'North Kellyville']['area'].values[0]

# Calculate population of North Kellyville with North Kellyville area with Kellyville density
missing_data.at[index, 'Population'] = round(north_kellyville_area * kellyville_density, 0)

In [37]:
# Get population for Baulkham Hills
BaulkhamHills_density = sydney_suburbs_df.loc[sydney_suburbs_df['Suburb'] == 'Baulkham Hills']['density'].values[0]

# Calculate mean density of Kellyville and Baulkham Hills
mean_density = round(np.mean([kellyville_density, BaulkhamHills_density]), 0)

# Replace density of Norwest
missing_data.at[index, 'density'] = mean_density

# Get Norwest area
norwest_area = sydney_suburbs_df.loc[sydney_suburbs_df['Suburb'] == 'Norwest']['area'].values[0]

# Get index for Norwest
index = missing_data.loc[missing_data['Suburb'] == 'Norwest'].index.values[0]

# Calculate population of Norwest with Norwest area with mean density
missing_data.at[index, 'Population'] = round(norwest_area * mean_density, 0)

### Calculate missing population densities

In [38]:
missing_data['density'] = round(missing_data['Population'] / missing_data['area'], 0)

### Combine the full data dataframe with the missing data dataframe

In [39]:
sydney_suburbs = pd.concat([full_data, missing_data], ignore_index=True)

In [40]:
sydney_suburbs['State'] = 'New South Wales'
sydney_suburbs['City'] = 'Sydney'

# Scrape Brisbane suburbs from Wikipedia

In [622]:
bri_suburbs_section = get_wiki_suburb_section("view-source_https___en.wikipedia.org_wiki_List_of_Brisbane_suburbs.html")

In [42]:
wiki_state_name = ', Queensland'
wiki_link_extension = ',_Queensland'
city = 'Brisbane'
bne_wiki_urls = get_wiki_urls(bri_suburbs_section, wiki_link_extension, wiki_state_name, city)

In [43]:
bne_suburb_infobox = get_suburb_wiki_infobox(bne_wiki_urls)

In [615]:
brisbane_suburbs_df = conv2df(bne_suburb_infobox)

In [47]:
brisbane_suburbs_df = rearrange_lga(brisbane_suburbs_df)

#### Suburbs that didn't have a postcode on their wikipage.

In [48]:
missing_postcodes = brisbane_suburbs_df[brisbane_suburbs_df['Postcode'].isnull()]
missing_postcodes.reset_index(inplace=True, drop=True)

In [49]:
geo_postcode_dict = get_missing_postcodes_geopy(GOOGLE_MAPS_API, missing_postcodes, 'Queensland')

In [50]:
brisbane_suburbs_df.Postcode = brisbane_suburbs_df.Postcode.fillna(brisbane_suburbs_df.Suburb.map(geo_postcode_dict))

Group suburbs by LGA in dataframe

In [51]:
brisbane_suburbs_df = brisbane_suburbs_df.groupby(['Suburb', 'Postcode']).agg({'LGA' : ', '.join,
                                        'Population': 'mean'}).reset_index().reindex(columns=brisbane_suburbs_df.columns)

#### Get Geographical Coordinates

In [52]:
geo_coords = get_geo_coords(GOOGLE_MAPS_API, brisbane_suburbs_df, 'Queensland')

In [53]:
brisbane_suburbs_df = brisbane_suburbs_df.merge(geo_coords, how = 'inner', on = ['Suburb', 'Postcode'])

#### Calculate geographical distance

In [54]:
geo_dist = calc_dist_to_cbd(brisbane_suburbs_df)

In [55]:
brisbane_suburbs_df = brisbane_suburbs_df.merge(geo_dist, how = 'inner', on = ['Suburb', 'Postcode'])

#### Get Geographical Boundary

In [56]:
with open('suburb_boundaries_qld.json') as f:
    geo_boundary_dict = json.load(f)

In [57]:
geo_boundary = get_geo_boundary(geo_boundary_dict, 'QLD', brisbane_suburbs_df)

In [58]:
brisbane_suburbs_df = brisbane_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])

#### Calculate suburb area

In [59]:
brisbane_suburbs_df['area'] = brisbane_suburbs_df['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

Drop duplicate suburbs

In [60]:
brisbane_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)

#### Calculate population density

In [61]:
brisbane_suburbs_df['density'] = (brisbane_suburbs_df['Population'] / brisbane_suburbs_df['area'])

#### Investigate missing data

In [62]:
print('Total number of missing values in Population column: {}'.format(brisbane_suburbs_df.shape[0]-brisbane_suburbs_df['Population'].count()))

Total number of missing values in Population column: 19


In [63]:
print('Percentage of missing data in Population column: {} %'.format(round(((brisbane_suburbs_df.shape[0]-brisbane_suburbs_df['Population'].count())/brisbane_suburbs_df.shape[0])*100, 2)))

Percentage of missing data in Population column: 4.61 %


In [64]:
# Split dataframe into full data vs missing data
full_data = brisbane_suburbs_df[brisbane_suburbs_df['Population'].notnull() & (brisbane_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = brisbane_suburbs_df[brisbane_suburbs_df['Population'].isnull() | (brisbane_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)
# missing_data

Suburbs with population statistics gathered from Australian Bureau Statistics (ABS) 
* Bellthorpe population of [124](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC30225?opendocument).
* Blacksoil population of [104](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC30280?opendocument).
* Campbells Pocket population of [80](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC30512?opendocument).
* Jeebropilly population of [7](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31450?opendocument).
* Jollys Lookout population of [76](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31464?opendocument).
* Kagaru population of [13](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31481?opendocument).
* Kalinga population of [2126](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31487?opendocument).
* Lyons population of [32](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31725?opendocument).
* Mount Forbes population of [263](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC31979?opendocument).
* Mutdapilly population of [308](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32111?opendocument).
* New Chum population of ...
* Samford Valley population of [3068](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32529?opendocument).
* Samford Village population of [796](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32530?opendocument).
* South Maclean population of [1362](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32620?opendocument).
* Stones Corner population of ...
* Undullah population of [45](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32927?opendocument).
* Veresdale population of [392](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC32966?opendocument).
* Woodhill population of [723](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC33164?opendocument).



Population information gathered from other sources
* New Chum population of [3074](https://profile.id.com.au/ipswich/population?WebID=260).
* Stones Corner population of [9720](https://www.brisbane.qld.gov.au/sites/default/files/20170512-stones_corner_local_business_and_destination_plan.pdf).

In [66]:
ABS_population = {'Bellthorpe': 124,
                  'Blacksoil': 104,
                  'Campbells Pocket': 80,
                  'Jeebropilly': 7,
                  'Jollys Lookout': 76,
                  'Kagaru': 13,
                  'Kalinga': 2126,
                  'Lyons': 32,
                  'Mount Forbes': 263,
                  'Mutdapilly': 308,
                  'New Chum': 3074,
                  'Samford Valley': 3068,
                  'Samford Village': 796,
                  'South Maclean': 1362,
                  'Stones Corner': 9720,
                  'Undullah': 45,
                  'Veresdale': 392,
                  'Woodhill': 723}

In [67]:
missing_data.Population = missing_data.Population.fillna(missing_data.Suburb.map(ABS_population))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Swanbank is predominately an industrial area and has no information on ABS.

In [68]:
missing_data = missing_data[~missing_data['Suburb'].isin(['Swanbank'])]

#### Calculating missing population densities

In [69]:
missing_data['density'] = round(missing_data['Population'] / missing_data['area'], 0)

#### Combine full dataframe with missing dataframe

In [70]:
brisbane_suburbs = pd.concat([full_data, missing_data], ignore_index=True)

In [71]:
brisbane_suburbs['State'] = 'Queensland'
brisbane_suburbs['City'] = 'Brisbane'

# Scrape Melbourne suburbs from Wikipedia

In [None]:
melb_suburbs_section = get_wiki_suburb_section("view-source_https___en.wikipedia.org_wiki_List_of_Melbourne_suburbs.html")

In [73]:
wiki_state_name = ', Victoria'
wiki_link_extension = ',_Victoria'
city = 'Melbourne'
melb_wiki_urls = get_wiki_urls(melb_suburbs_section, wiki_link_extension, wiki_state_name, city)

In [74]:
melb_suburb_infobox = get_suburb_wiki_infobox(melb_wiki_urls)

In [75]:
melbourne_suburbs_df = conv2df(melb_suburb_infobox)

In [77]:
melbourne_suburbs_df = rearrange_lga(melbourne_suburbs_df)

Group suburbs by LGA in dataframe

In [78]:
melbourne_suburbs_df = melbourne_suburbs_df.groupby(['Suburb', 'Postcode']).agg({'LGA' : ', '.join,
                                        'Population': 'mean'}).reset_index().reindex(columns=melbourne_suburbs_df.columns)

#### Get geographical coordinates

In [79]:
geo_coords = get_geo_coords(GOOGLE_MAPS_API, melbourne_suburbs_df, 'Victoria')

melbourne_suburbs_df = melbourne_suburbs_df.merge(geo_coords, how = 'inner', on = ['Suburb', 'Postcode'])

#### Calculate geographical distance

In [80]:
geo_dist = calc_dist_to_cbd(melbourne_suburbs_df)

melbourne_suburbs_df = melbourne_suburbs_df.merge(geo_dist, how = 'inner', on = ['Suburb', 'Postcode'])

#### Get geographical boundary

In [81]:
with open('suburb_boundaries_vic.json') as f:
    geo_boundary_dict = json.load(f)

In [82]:
geo_boundary = get_geo_boundary(geo_boundary_dict, 'VIC', melbourne_suburbs_df)

melbourne_suburbs_df = melbourne_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])

#### Calculate suburb area

In [83]:
melbourne_suburbs_df['area'] = melbourne_suburbs_df['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

melbourne_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)

#### Calculate population density

In [84]:
melbourne_suburbs_df['density'] = (melbourne_suburbs_df['Population'] / melbourne_suburbs_df['area'])

#### Investigate missing data

In [85]:
print('Total number of missing values in Population column: {}'.format(melbourne_suburbs_df.shape[0]-melbourne_suburbs_df['Population'].count()))

Total number of missing values in Population column: 28


In [86]:
print('Percentage of missing data in Population column: {} %'.format(round(((melbourne_suburbs_df.shape[0]-melbourne_suburbs_df['Population'].count())/melbourne_suburbs_df.shape[0])*100, 2)))

Percentage of missing data in Population column: 5.34 %


In [87]:
# Split dataframe into full data vs missing data
full_data = melbourne_suburbs_df[melbourne_suburbs_df['Population'].notnull() & (melbourne_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = melbourne_suburbs_df[melbourne_suburbs_df['Population'].isnull() | (melbourne_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)

Suburb population for the following suburbs on ABS:
   * Beenak - [25](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC20186?opendocument)
   * Big Pats Creek - [73](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC20236?opendocument)
   * Gilderoy - [65](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC20988?opendocument)
   * Mount Toolebewong - [140](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC21802?opendocument)
   * Reefton - [59](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC22147?opendocument)
   * Tarrawarra - [78](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC22445?opendocument)
   * The Patch - [1065](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC22489?opendocument)
   * Wandin North - [3051](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC22660?opendocument)
   * Yering - [115](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC22915?opendocument)

Suburb population from other sources:
   * Manor Lakes - [8667](https://www.wyndham.vic.gov.au/sites/default/files/2017-07/Manor%20Lakes%20suburb%20profile.docx)
   * Somerton - [5288](https://profile.id.com.au/s_campbellfield-somerton/population-of-campbellfield-somerton)

The following suburbs are removed from the list:
   * Aintree - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Aintree,_Victoria) therefore no population information.
   * Bonnie Brook - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Aintree,_Victoria) therefore no population information.
   * Calder Park - primarily a [race track](https://en.wikipedia.org/wiki/Calder_Park,_Victoria) therefore no residence
   * Camabarville - much of its [area is part of the Yarra Ranges National Park](https://en.wikipedia.org/wiki/Cambarville,_Victoria)
   * Cobblebank - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Aintree,_Victoria) therefore no population information.
   * Cocoroc - primarily a [treatment plant](https://en.wikipedia.org/wiki/Cocoroc,_Victoria)
   * Deanside - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Aintree,_Victoria) therefore no population information.
   * Essendon Fields - is primarily an [airport and commercial area](https://en.wikipedia.org/wiki/Essendon_Fields,_Victoria)
   * Fernshaw - much of its [area is part of the Yarra Ranges National Park](https://en.wikipedia.org/wiki/Fernshaw,_Victoria)
   * Fieldstone - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Fieldstone,_Victoria) therefore no population information.
   * Fraser Rise - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Fraser_Rise,_Victoria) therefore no population information.
   * Grangefields - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Grangefields,_Victoria) therefore no population information.
   * Harkness - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Harkness,_Victoria) therefore no population information.
   * Matlock - no information
   * Quandong - [no population information](https://www.realestateinvestar.com.au/property/quandong)
   * Strathtulloh - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Strathtulloh,_Victoria) therefore no population information.
   * Thornhill Park - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Thornhill_Park,_Victoria) therefore no population information.
   * Toorongo - much of its [area is part of the Yarra Ranges National Park](https://en.wikipedia.org/wiki/Toorongo,_Victoria)
   * Weir Views - [new suburb since mid-2017](https://en.wikipedia.org/wiki/Weir_Views,_Victoria) therefore no population information.

In [88]:
ABS_population = {'Beenak': 25,
                  'Big Pats Creek': 73,
                  'Gilderoy': 65,
                  'Mount Toolebewong': 140,
                  'Reefton': 59,
                  'Tarrawarra': 78,
                  'The Patch': 1065,
                  'Wandin North': 3051,
                  'Yering': 115,
                  'Manor Lakes': 8667,
                  'Matlock': 4,
                  'Somerton': 5288}

missing_data.Population = missing_data.Population.fillna(missing_data.Suburb.map(ABS_population))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Drop the following suburbs.

In [89]:
missing_data = missing_data[~missing_data['Suburb'].isin(['Aintree',
                                                          'Bonnie Brook',
                                                          'Calder Park',
                                                          'Cambarville',
                                                          'Cobblebank',
                                                          'Cocoroc',
                                                          'Deanside',
                                                          'Essendon Fields',
                                                          'Fernshaw',
                                                          'Fieldstone',
                                                          'Fraser Rise',
                                                          'Grangefields',
                                                          'Harkness',
                                                          'Matlock',
                                                          'Quandong',
                                                          'Strathtulloh',
                                                          'Thornhill Park',
                                                          'Toorongo',
                                                          'Weir Views'])]

#### Calculating missing population densities

In [90]:
missing_data['density'] = round(missing_data['Population'] / missing_data['area'], 0)

In [91]:
melbourne_suburbs = pd.concat([full_data, missing_data], ignore_index=True)

In [92]:
melbourne_suburbs['State'] = 'Victoria'
melbourne_suburbs['City'] = 'Melbourne'

# Scrape Canberra suburbs from Wikipedia

In [623]:
cbr_suburbs_section = get_wiki_suburb_section("view-source_https___en.wikipedia.org_wiki_List_of_Canberra_suburbs.html")

In [94]:
wiki_state_name = ', Australian Capital Territory'
wiki_link_extension = ',_Australian_Capital_Territory'
city = 'Canberra'
cbr_wiki_urls = get_wiki_urls(cbr_suburbs_section, wiki_link_extension, wiki_state_name, city)

In [95]:
cbr_suburb_infobox = get_suburb_wiki_infobox(cbr_wiki_urls)

In [96]:
canberra_suburbs_df = conv2df(cbr_suburb_infobox)

In [98]:
canberra_suburbs_df = rearrange_lga(canberra_suburbs_df)

Check for suburbs with missing postcodes

In [99]:
missing_postcodes = canberra_suburbs_df[canberra_suburbs_df['Postcode'].isnull()]
missing_postcodes.reset_index(inplace=True, drop=True)

geo_postcode_dict = get_missing_postcodes_geopy(GOOGLE_MAPS_API, missing_postcodes, 'Australian Capital Territory')

canberra_suburbs_df.Postcode = canberra_suburbs_df.Postcode.fillna(canberra_suburbs_df.Suburb.map(geo_postcode_dict))

Group suburbs by LGA in dataframe

In [100]:
canberra_suburbs_df = canberra_suburbs_df.groupby(['Suburb', 'Postcode']).agg({'LGA' : ', '.join,
                                        'Population': 'mean'}).reset_index().reindex(columns=canberra_suburbs_df.columns)

#### Get geographical coordinates

In [101]:
geo_coords = get_geo_coords(GOOGLE_MAPS_API, canberra_suburbs_df, 'Australian Capital Territory')

canberra_suburbs_df = canberra_suburbs_df.merge(geo_coords, how = 'inner', on = ['Suburb', 'Postcode'])

#### Calculate geographical distance

In [102]:
geo_dist = calc_dist_to_cbd(canberra_suburbs_df)

canberra_suburbs_df = canberra_suburbs_df.merge(geo_dist, how = 'inner', on = ['Suburb', 'Postcode'])

#### Get geographical boundary

In [103]:
with open('suburb_boundaries_act.json') as f:
    geo_boundary_dict = json.load(f)
    
geo_boundary = get_geo_boundary(geo_boundary_dict, 'ACT', canberra_suburbs_df)

canberra_suburbs_df = canberra_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])

#### Calculate geographical area

In [104]:
canberra_suburbs_df['area'] = canberra_suburbs_df['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

# Drop duplicates
canberra_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)

#### Calculate population density

In [105]:
canberra_suburbs_df['density'] = (canberra_suburbs_df['Population'] / canberra_suburbs_df['area'])

#### Investigate missing data

In [106]:
print('Total number of missing values in Population column: {}'.format(canberra_suburbs_df.shape[0]-canberra_suburbs_df['Population'].count()))
print('')
print('Percentage of missing data in Population column: {} %'.format(round(((canberra_suburbs_df.shape[0]-canberra_suburbs_df['Population'].count())/canberra_suburbs_df.shape[0])*100, 2)))

Total number of missing values in Population column: 10

Percentage of missing data in Population column: 8.77 %


In [107]:
# Split dataframe into full data vs missing data
full_data = canberra_suburbs_df[canberra_suburbs_df['Population'].notnull() & (canberra_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = canberra_suburbs_df[canberra_suburbs_df['Population'].isnull() | (canberra_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)
missing_data

Unnamed: 0,LGA,Population,Postcode,Suburb,LAT,LON,distance,geometry,area,density
0,Jerrabomberra,,2620,Beard,-35.3413,149.21,9.9151,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",0.443,
1,South Canberra,,2600,Capital Hill,-35.3081,149.124,3.48566,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",0.693,
2,Molonglo Valley,,2611,Denman Prospect,-35.3031,149.024,10.3288,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",4.701,
3,Belconnen,,2615,Macnamara,-35.2139,149.006,13.5535,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",4.119,
4,Molonglo Valley,,2611,Molonglo,-35.286,149.064,6.39584,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",2.17,
5,North Canberra,,2600,Russell,-35.2969,149.15,2.6303,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",0.491,
6,Belconnen,,2615,Strathnairn,-35.2324,148.994,13.6328,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",4.185,
7,Gungahlin,,2913,Taylor,-35.1783,149.11,11.2572,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",4.038,
8,Gungahlin,,2914,Throsby,-35.1881,149.164,10.3371,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",1.459,
9,Molonglo Valley,,2611,Whitlam,-35.2899,149.07,5.94766,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",3.258,


In [108]:
print(missing_data['Suburb'].values)

['Beard' 'Capital Hill' 'Denman Prospect' 'Macnamara' 'Molonglo' 'Russell'
 'Strathnairn' 'Taylor' 'Throsby' 'Whitlam']


There was no residential population information on ABS or else where.
* Beard - no information.
* Capital Hill - site of the Parliament house.
* Denman Prospect - no information.
* Macnamara - no information.
* Molonglo - [suburb underdevelopment](https://en.wikipedia.org/wiki/Molonglo,_Australian_Capital_Territory)
* Russell - comprised of [government offices and no residence](https://en.wikipedia.org/wiki/Russell,_Australian_Capital_Territory)
* Strathnairn - no information.
* Taylor - no information.
* Throsby - no information.
* Whitlam - future suburb in [2020](https://en.wikipedia.org/wiki/Whitlam,_Australian_Capital_Territory).

Therefore missing_data table is discarded entirely.

In [109]:
canberra_suburbs = full_data

In [110]:
canberra_suburbs['State'] = 'Australian Capital Territory'
canberra_suburbs['City'] = 'Canberra'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


# Scrape Hobart suburbs from Wikipedia

In [624]:
hob_suburbs_section = get_wiki_suburb_section("view-source_https___en.wikipedia.org_wiki_List_of_Hobart_suburbs.html")

In [112]:
wiki_state_name = ', Tasmania'
wiki_link_extension = ',_Tasmania'
city = 'Hobart'
hob_wiki_urls = get_wiki_urls(hob_suburbs_section, wiki_link_extension, wiki_state_name, city)

In [113]:
hob_suburb_infobox = get_suburb_wiki_infobox(hob_wiki_urls)

In [114]:
hobart_suburbs_df = conv2df(hob_suburb_infobox)

In [116]:
hobart_suburbs_df = rearrange_lga(hobart_suburbs_df)

In [117]:
hobart_suburbs_df = hobart_suburbs_df.groupby(['Suburb', 'Postcode']).agg({'LGA' : ', '.join,
                                        'Population': 'mean'}).reset_index().reindex(columns=hobart_suburbs_df.columns)

#### Get geographical coordinates

In [118]:
geo_coords = get_geo_coords(GOOGLE_MAPS_API, hobart_suburbs_df, 'Tasmania')

hobart_suburbs_df = hobart_suburbs_df.merge(geo_coords, how = 'inner', on = ['Suburb', 'Postcode'])

#### Calculate geographical distance

In [119]:
geo_dist = calc_dist_to_cbd(hobart_suburbs_df)

hobart_suburbs_df = hobart_suburbs_df.merge(geo_dist, how = 'inner', on = ['Suburb', 'Postcode'])

#### Get geographical boundary

In [120]:
with open('suburb_boundaries_tas.json') as f:
    geo_boundary_dict = json.load(f)
    
geo_boundary = get_geo_boundary(geo_boundary_dict, 'TAS', hobart_suburbs_df)

hobart_suburbs_df = hobart_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])

#### Calculate geographical area

In [121]:
hobart_suburbs_df['area'] = hobart_suburbs_df['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

# Drop duplicates
hobart_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)

#### Calculate population density

In [122]:
hobart_suburbs_df['density'] = (hobart_suburbs_df['Population'] / hobart_suburbs_df['area'])

#### Investigate missing data

In [123]:
print('Total number of missing values in Population column: {}'.format(hobart_suburbs_df.shape[0]-hobart_suburbs_df['Population'].count()))
print('')
print('Percentage of missing data in Population column: {} %'.format(round(((hobart_suburbs_df.shape[0]-hobart_suburbs_df['Population'].count())/hobart_suburbs_df.shape[0])*100, 2)))

Total number of missing values in Population column: 7

Percentage of missing data in Population column: 10.77 %


In [124]:
# Split dataframe into full data vs missing data
full_data = hobart_suburbs_df[hobart_suburbs_df['Population'].notnull() & (hobart_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = hobart_suburbs_df[hobart_suburbs_df['Population'].isnull() | (hobart_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)
missing_data

Unnamed: 0,LGA,Population,Postcode,Suburb,LAT,LON,distance,geometry,area,density
0,City of Clarence,,7170,Acton Park,-42.8658,147.47,11.6096,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",21.14,
1,City of Glenorchy,,7011,Chigwell,-42.8111,147.242,10.4493,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",3.302,
2,City of Glenorchy,,7012,Glenlusk,-42.8239,147.201,12.1767,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",8.017,
3,City of Clarence,,7021,Lauderdale,-42.9043,147.489,13.315,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",5.09,
4,Sorell Council,,7171,Midway Point,-42.794,147.529,18.9756,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",2.379,
5,City of Hobart,,7007,Mount Nelson,-42.9226,147.329,4.69367,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",6.479,
6,City of Hobart,,7054,Ridgeway,-42.9254,147.285,6.12161,"{'type': 'MultiPolygon', 'coordinates': [[[[14...",6.819,


Suburb population from ABS:
* Midway Point - [2859](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC60399?opendocument)
* Acton Park - [2078](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC60006?opendocument)
* Lauderdale - [2411](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC60318?opendocument)
* Chigwell - [2002](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC60110?opendocument)
* Glenlusk - [200](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC60231?opendocument)
* Mount Nelson - [2495](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC60425?opendocument)
* Ridgeway - [175](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC60547?opendocument)





In [125]:
ABS_population = {'Midway Point': 2859,
                  'Acton Park': 2078,
                  'Lauderdale': 2411,
                  'Chigwell': 2022,
                  'Glenlusk': 200,
                  'Mount Nelson': 2495,
                  'Ridgeway': 175}

# Add new population data to the main dataframe
missing_data.Population = missing_data.Population.fillna(missing_data.Suburb.map(ABS_population))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


#### Calculate missing population densities

In [126]:
missing_data['density'] = round(missing_data['Population'] / missing_data['area'], 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [127]:
hobart_suburbs = pd.concat([full_data, missing_data], ignore_index=True)

In [128]:
hobart_suburbs['State'] = 'Tasmania'
hobart_suburbs['City'] = 'Hobart'

# Scrape Adelaide suburbs from wikipedia

In [625]:
adl_suburbs_section = get_wiki_suburb_section("view-source_https___en.wikipedia.org_wiki_List_of_Adelaide_suburbs.html")

In [130]:
wiki_state_name = ', South Australia'
wiki_link_extension = ',_South_Australia'
city = 'Adelaide'
adl_wiki_urls = get_wiki_urls(adl_suburbs_section, wiki_link_extension, wiki_state_name, city)

In [131]:
adl_suburb_infobox = get_suburb_wiki_infobox(adl_wiki_urls)

In [132]:
adelaide_suburbs_df = conv2df(adl_suburb_infobox)

In [134]:
adelaide_suburbs_df = rearrange_lga(adelaide_suburbs_df)

Group suburbs by LGA

In [135]:
adelaide_suburbs_df = adelaide_suburbs_df.groupby(['Suburb', 'Postcode']).agg({'LGA' : ', '.join,
                                        'Population': 'mean'}).reset_index().reindex(columns=adelaide_suburbs_df.columns)

Adjust Gawler [population](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40469?opendocument)

In [136]:
index = adelaide_suburbs_df.loc[adelaide_suburbs_df.Suburb == 'Gawler'].index[0]
adelaide_suburbs_df.at[index, 'Population'] = 650

#### Get geographical coordinates

In [137]:
geo_coords = get_geo_coords(GOOGLE_MAPS_API, adelaide_suburbs_df, 'South Australia')

adelaide_suburbs_df = adelaide_suburbs_df.merge(geo_coords, how = 'inner', on = ['Suburb', 'Postcode'])

#### Calculate geographical distance

In [138]:
geo_dist = calc_dist_to_cbd(adelaide_suburbs_df)

adelaide_suburbs_df = adelaide_suburbs_df.merge(geo_dist, how = 'inner', on = ['Suburb', 'Postcode'])

#### Get geographical boundary

In [139]:
with open('suburb_boundaries_sa.json') as f:
    geo_boundary_dict = json.load(f)
    
geo_boundary = get_geo_boundary(geo_boundary_dict, 'SA', adelaide_suburbs_df)

adelaide_suburbs_df = adelaide_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])

#### Calculate geographical area

In [140]:
adelaide_suburbs_df['area'] = adelaide_suburbs_df['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

# Drop duplicates
adelaide_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)

Adjust Hindmarsh [area](https://en.wikipedia.org/wiki/Hindmarsh,_South_Australia)

In [141]:
index = adelaide_suburbs_df.loc[adelaide_suburbs_df.Suburb == 'Hindmarsh'].index[0]
adelaide_suburbs_df.at[index, 'area'] = 0.88

#### Calculate population density

In [142]:
adelaide_suburbs_df['density'] = (adelaide_suburbs_df['Population'] / adelaide_suburbs_df['area'])

#### Investigate missing data

In [143]:
print('Total number of missing values in Population column: {}'.format(adelaide_suburbs_df.shape[0]-adelaide_suburbs_df['Population'].count()))
print('')
print('Percentage of missing data in Population column: {} %'.format(round(((adelaide_suburbs_df.shape[0]-adelaide_suburbs_df['Population'].count())/adelaide_suburbs_df.shape[0])*100, 2)))

Total number of missing values in Population column: 48

Percentage of missing data in Population column: 12.28 %


In [144]:
# Split dataframe into full data vs missing data
full_data = adelaide_suburbs_df[adelaide_suburbs_df['Population'].notnull() & (adelaide_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = adelaide_suburbs_df[adelaide_suburbs_df['Population'].isnull() | (adelaide_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)
missing_data

Unnamed: 0,LGA,Population,Postcode,Suburb,LAT,LON,distance,geometry,area,density
0,City of Mitcham,,5062,Brown Hill Creek,-34.987,138.653,8.77448,"{'type': 'MultiPolygon', 'coordinates': [[[[13...",6.783,
1,City of Playford,,5120,Buckland Park,-34.6458,138.51,31.8237,"{'type': 'MultiPolygon', 'coordinates': [[[[13...",68.249,
2,City of Playford,,5114,Craigmore,-34.7086,138.706,25.8143,"{'type': 'MultiPolygon', 'coordinates': [[[[13...",4.816,
3,City of Salisbury,,5110,Direk,-34.7235,138.609,22.2048,"{'type': 'MultiPolygon', 'coordinates': [[[[13...",2.402,
4,"City of Onkaparinga, District Council of Mount...",,5157,Dorset Vale,-35.0858,138.678,19.5626,"{'type': 'MultiPolygon', 'coordinates': [[[[13...",30.551,
5,City of Playford,,5113,Edinburgh North,-34.705,138.66,24.9091,"{'type': 'MultiPolygon', 'coordinates': [[[[13...",2.528,
6,City of Marion,,5039,Edwardstown,-34.98,138.571,6.77973,"{'type': 'MultiPolygon', 'coordinates': [[[[13...",2.282,
7,City of Playford,,5113,Elizabeth Downs,-34.6991,138.693,26.3964,"{'type': 'MultiPolygon', 'coordinates': [[[[13...",3.1,
8,City of Playford,,5113,Elizabeth North,-34.7008,138.676,25.7305,"{'type': 'MultiPolygon', 'coordinates': [[[[13...",1.74,
9,City of Playford,,5113,Elizabeth Park,-34.7124,138.685,24.7676,"{'type': 'MultiPolygon', 'coordinates': [[[[13...",2.404,


Suburb population from ABS:
* Brown Hill Creek: 50 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40170?opendocument)
* Buckland Park: 173 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40178?opendocument)
* Craigmore: 10895 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40322?opendocument)
* Edwardstown: 4328 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40391?opendocument)
* Elizabeth Downs: 5069 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40394?opendocument)
* Elizabeth North: 3463 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40397?opendocument)
* Elizabeth Park: 3861 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40398?opendocument)
* Evanston South: 341 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40420?opendocument)
* Eyre: 503 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/IARE402003?opendocument)
* Fairview Park: 3599 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40424?opendocument)
* Fitzroy: 781 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40437?opendocument)
* Gawler East: 5338 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40471?opendocument)
* Gawler West: 948 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40475?opendocument)
* Gould Creek: 242 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40521?opendocument)
* Greenwith: 8988 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40532?opendocument)
* Gulfview Heights: 3642 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40533?opendocument)
* Hillbank: 4610 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40585?opendocument)
* Leawood Gardens: 61 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40750?opendocument)
* Medindie Gardens: 340 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40846?opendocument)
* Munno Para Downs: 228 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC40971?opendocument)
* Para Hills West: 3190 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41086?opendocument)
* Para Vista: 2904 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41087?opendocument)
* Parafield: 105 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41091?opendocument)
* Penfield Gardens: 335 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41121?opendocument)
* Pooraka: 7228 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41171?opendocument)
* Redwood Park: 5421 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41229?opendocument)
* Salisbury Downs: 5984 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41276?opendocument)
* Salisbury Park: 2164 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41280?opendocument)
* Salisbury South: 99 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41282?opendocument)
* Sampson Flat: 124 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41286?opendocument)
* Sefton Park: 1210 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41313?opendocument)
* Semaphore South: 1019 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41318?opendocument)
* Smithfield Plains: 2871 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41334?opendocument)
* St Agnes: 4134 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41353?opendocument)
* Taperoo: 3091 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41407?opendocument)
* Tea Tree Gully: 3242 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41418?opendocument)
* Uleybury: 292 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41468?opendocument)
* Upper Hermitage: 285 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41477?opendocument)
* Vista: 972 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41492?opendocument)
* Windsor Gardens: 5272 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41601?opendocument)
* Wingfield: 478 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41602?opendocument)
* Woodville South: 3179 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41632?opendocument)
* Yatala Vale: 251 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41664?opendocument)
* Yattalunga: 313 [link](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC41666?opendocument)

Suburb population information from other sources:
* Lightsview: 327 [link](https://profile.id.com.au/s_lightsview/population-of-lightsview)


* Direk: primarily an industrial area
* Dorset Vale: Reservoir in Adelaide
* Edinburgh North: primarily an industrial area


In [145]:
ABS_population = {'Brown Hill Creek': 50,
                  'Buckland Park': 173,
                  'Craigmore': 10895,
                  'Edwardstown': 4328,
                  'Elizabeth Downs': 5069,
                  'Elizabeth North': 3463,
                  'Elizabeth Park': 3861,
                  'Evanston South': 341,
                  'Eyre': 503,
                  'Fairview Park': 3599,
                  'Fitzroy': 781,
                  'Gawler East': 5338,
                  'Gawler West': 948,
                  'Gould Creek': 242,
                  'Greenwith': 8988,
                  'Gulfview Heights': 3642,
                  'Hillbank': 4610,
                  'Leawood Gardens': 61,
                  'Medindie Gardens': 340,
                  'Munno Para Downs': 228,
                  'Para Hills West': 3190,
                  'Para Vista': 2904,
                  'Parafield': 105,
                  'Penfield Gardens': 335,
                  'Pooraka': 7228,
                  'Redwood Park': 5421,
                  'Salisbury Downs': 5984,
                  'Salisbury Park': 2164,
                  'Salisbury South': 99,
                  'Sampson Flat': 124,
                  'Sefton Park': 1210,
                  'Semaphore South': 1019,
                  'Smithfield Plains': 2871,
                  'St Agnes': 4134,
                  'Taperoo': 3091,
                  'Tea Tree Gully': 3242,
                  'Uleybury': 292,
                  'Upper Hermitage': 285,
                  'Vista': 972,
                  'Windsor Gardens': 5272,
                  'Wingfield': 478,
                  'Woodville South': 3179,
                  'Yatala Vale': 251,
                  'Yattalunga': 313,
                  'Lightsview': 327}

In [146]:
# Add new population data to the main dataframe
missing_data.Population = missing_data.Population.fillna(missing_data.Suburb.map(ABS_population))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [147]:
# Drop the following suburbs
missing_data = missing_data[~missing_data['Suburb'].isin(['Direk',
                                                          'Dorset Vale',
                                                          'Edinburgh North'])]

#### Calculate missing population densities

In [148]:
missing_data['density'] = round(missing_data['Population'] / missing_data['area'], 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [149]:
adelaide_suburbs = pd.concat([full_data, missing_data], ignore_index=True)

In [150]:
adelaide_suburbs['State'] = 'South Australia'
adelaide_suburbs['City'] = 'Adelaide'

# Scrape Darwin suburbs from wikipedia

In [626]:
drw_suburbs_section = get_wiki_suburb_section("view-source_https___en.wikipedia.org_wiki_List_of_Darwin_suburbs.html")

In [152]:
wiki_state_name = ', Northern Territory'
wiki_link_extension = ',_Northern_Territory'
city = 'Darwin'
drw_wiki_urls = get_wiki_urls(drw_suburbs_section, wiki_link_extension, wiki_state_name, city)

In [153]:
drw_suburb_infobox = get_suburb_wiki_infobox(drw_wiki_urls)

In [154]:
darwin_suburbs_df = conv2df(drw_suburb_infobox)

In [156]:
darwin_suburbs_df = rearrange_lga(darwin_suburbs_df)

Find missing postcodes

In [157]:
missing_postcodes = darwin_suburbs_df[darwin_suburbs_df['Postcode'].isnull()]
missing_postcodes.reset_index(inplace=True, drop=True)

geo_postcode_dict = get_missing_postcodes_geopy(GOOGLE_MAPS_API, missing_postcodes, 'Northern Territory')

darwin_suburbs_df.Postcode = darwin_suburbs_df.Postcode.fillna(darwin_suburbs_df.Suburb.map(geo_postcode_dict))

Group suburbs by LGA

In [158]:
darwin_suburbs_df = darwin_suburbs_df.groupby(['Suburb','Postcode']).agg({'LGA' : ', '.join,
                                                                           'Population': 'mean'}).reset_index().reindex(columns=darwin_suburbs_df.columns)

#### Get geographical coordinates

In [159]:
geo_coords = get_geo_coords(GOOGLE_MAPS_API, darwin_suburbs_df, 'Northern Territory')

darwin_suburbs_df = darwin_suburbs_df.merge(geo_coords, how = 'inner', on = ['Suburb', 'Postcode'])

#### Calculate geographical distance

In [160]:
geo_dist = calc_dist_to_cbd(darwin_suburbs_df)

darwin_suburbs_df = darwin_suburbs_df.merge(geo_dist, how = 'inner', on = ['Suburb', 'Postcode'])

#### Get geographical boundary

In [161]:
with open('suburb_boundaries_nt.json') as f:
    geo_boundary_dict = json.load(f)
    
geo_boundary = get_geo_boundary(geo_boundary_dict, 'NT', darwin_suburbs_df)

darwin_suburbs_df = darwin_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])  

#### Calculate geographical area

In [162]:
darwin_suburbs_df['area'] = darwin_suburbs_df['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

# Drop duplicates
darwin_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)

#### Calculate population density

In [163]:
darwin_suburbs_df['density'] = (darwin_suburbs_df['Population'] / darwin_suburbs_df['area'])

#### Investigate missing data

In [164]:
# Split dataframe into full data vs missing data
full_data = darwin_suburbs_df[darwin_suburbs_df['Population'].notnull() & (darwin_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = darwin_suburbs_df[darwin_suburbs_df['Population'].isnull() | (darwin_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)

There's no missing population data

In [165]:
darwin_suburbs = full_data

In [166]:
darwin_suburbs['State'] = 'Northern Territory'
darwin_suburbs['City'] = 'Darwin'

# Scrape Perth suburbs from wikipedia

In [627]:
per_suburbs_section = get_wiki_suburb_section("view-source_https___en.wikipedia.org_wiki_List_of_Perth_suburbs.html")

In [168]:
wiki_state_name = ', Western Australia'
wiki_link_extension = ',_Western_Australia'
city = 'Perth'
per_wiki_urls = get_wiki_urls(per_suburbs_section, wiki_link_extension, wiki_state_name, city)

In [169]:
per_suburb_infobox = get_suburb_wiki_infobox(per_wiki_urls)

In [170]:
perth_suburbs_df = conv2df(per_suburb_infobox)

In [172]:
perth_suburbs_df = rearrange_lga(perth_suburbs_df)

In [173]:
perth_suburbs_df = perth_suburbs_df.groupby(['Suburb','Postcode']).agg({'LGA' : ', '.join,
                                                                           'Population': 'mean'}).reset_index().reindex(columns=perth_suburbs_df.columns)

#### Get geographical coordinates

In [174]:
geo_coords = get_geo_coords(GOOGLE_MAPS_API, perth_suburbs_df, 'Western Australia')

perth_suburbs_df = perth_suburbs_df.merge(geo_coords, how = 'inner', on = ['Suburb', 'Postcode'])

#### Calculate geographical distance

In [175]:
geo_dist = calc_dist_to_cbd(perth_suburbs_df)

perth_suburbs_df = perth_suburbs_df.merge(geo_dist, how = 'inner', on = ['Suburb', 'Postcode'])

#### Get geographical boundaries

In [176]:
with open('suburb_boundaries_wa.json') as f:
    geo_boundary_dict = json.load(f)
    
geo_boundary = get_geo_boundary(geo_boundary_dict, 'WA', perth_suburbs_df)

perth_suburbs_df = perth_suburbs_df.merge(geo_boundary, how = 'inner', on = ['Suburb', 'Postcode'])    

#### Calculate geographical area

In [177]:
perth_suburbs_df['area'] = perth_suburbs_df['geometry'].apply(lambda x : round(shape(x).area * (10**4), 3))

# Drop duplicates
perth_suburbs_df.drop_duplicates(subset =["Suburb", "LGA"], keep = 'first', inplace = True)

Adjust West Perth area

In [178]:
index = perth_suburbs_df.loc[perth_suburbs_df.Suburb == 'West Perth'].index[0]
perth_suburbs_df.at[index, 'area'] = 1.07

Adjust Cannington [population](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC50245)

In [179]:
index = perth_suburbs_df.loc[perth_suburbs_df.Suburb == 'Cannington'].index[0]
perth_suburbs_df.at[index, 'Population'] = 5929

#### Calculate population density

In [180]:
perth_suburbs_df['density'] = (perth_suburbs_df['Population'] / perth_suburbs_df['area'])

#### Investigate missing data

In [181]:
# Split dataframe into full data vs missing data
full_data = perth_suburbs_df[perth_suburbs_df['Population'].notnull() & (perth_suburbs_df['density'].notnull())]
full_data.reset_index(inplace=True, drop=True)

missing_data = perth_suburbs_df[perth_suburbs_df['Population'].isnull() | (perth_suburbs_df['density'].isnull())]
missing_data.reset_index(inplace=True, drop=True)

In [182]:
missing_data['Suburb'].values

array(['Ashendon', 'Balga', 'Henderson', 'Herdsman', 'Karrakatta',
       'Karrakup', 'Kwinana Beach', 'Kwinana Town Centre', 'Lexia',
       'Malaga', 'Mandogalup', 'Melaleuca', 'Naval Base', "O'Connor",
       'Pinjar', 'Postans', 'The Spectacles', 'Welshpool', 'Whiteman'],
      dtype=object)

#### Suburb population information from [ABS](https://quickstats.censusdata.abs.gov.au/census_services/getproduct/census/2016/quickstat/SSC50033?opendocument):
* Ashendon: 15
* Balga: 12685
* Byford: 14908
* Henderson: 14
* Karrakatta: 5
* Karrakup: 190
* Kwinana Beach: 6
* Kwinana Town Centre: 249
* Lexia: 29
* Malaga: 6
* Mandogalup: 55
* O'Connor: 463
* Pinjar: 96
* Welshpool: 19
* Whiteman: 7

#### Suburb population information from other sources:
* Melaleuca: [4](https://homesales.com.au/location/melaleuca-wa/)

#### Following suburbs are dropped from the dataframe
* Herdsman: Lake and park
* Naval Base: predominately an [industrial suburb](https://en.wikipedia.org/wiki/Naval_Base,_Western_Australia) 
* Postans: area for waste water treatment plant 
* The Spectacles: [wetland reservce and bushland](https://en.wikipedia.org/wiki/The_Spectacles,_Western_Australia)


In [183]:
ABS_population = {'Ashendon': 15,
                  'Balga': 12685,
                  'Byford': 14908,
                  'Henderson': 14,
                  'Karrakatta': 5,
                  'Karrakup': 190,
                  'Kwinana Beach': 6,
                  'Kwinana Town Centre': 249,
                  'Lexia': 29,
                  'Malaga': 6,
                  'Mandogalup': 55,
                  '''O'Connor''': 463,
                  'Pinjar': 96,
                  'Welshpool': 19,
                  'Whiteman': 7,
                  'Melaleuca': 4}

In [184]:
missing_data.Population = missing_data.Population.fillna(missing_data.Suburb.map(ABS_population))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [185]:
# Drop the following suburbs
missing_data = missing_data[~missing_data['Suburb'].isin(['Herdsman',
                                                          'Naval Base',
                                                          'Postans',
                                                          'The Spectacles'])]

#### Calculate missing population densities

In [186]:
missing_data['density'] = round(missing_data['Population'] / missing_data['area'], 0)

In [187]:
perth_suburbs = pd.concat([full_data, missing_data], ignore_index=True)

In [188]:
perth_suburbs['State'] = 'Western Australia'
perth_suburbs['City'] = 'Perth'

#### Concatenate suburb dataframes

In [195]:
aus_suburbs = pd.concat([sydney_suburbs,
                         brisbane_suburbs,
                         melbourne_suburbs,
                         hobart_suburbs,
                         canberra_suburbs,
                         adelaide_suburbs,
                         darwin_suburbs,
                         perth_suburbs])

aus_suburbs.reset_index(inplace=True, drop=True)

In [196]:
# Add suburb_id column
aus_suburbs['Suburb_id'] = np.arange(0, aus_suburbs.shape[0])

# Move suburb_id column to the front
cols = list(aus_suburbs.columns)
cols = [cols[-1]] + cols[:-1]
aus_suburbs = aus_suburbs[cols]

In [628]:
# Re-order columns
aus_suburbs = aus_suburbs[['Suburb_id',
                           'Suburb',
                           'Postcode',
                           'LGA',
                           'City',
                           'State',
                           'Population',
                           'LAT',
                           'LON',
                           'distance',
                           'geometry',
                           'area',
                           'density']]

# Save files

In [199]:
sydney_suburbs.to_csv('sydney_suburbs.csv')
brisbane_suburbs.to_csv('brisbane_suburbs.csv')
melbourne_suburbs.to_csv('melbourne_suburbs.csv')
canberra_suburbs.to_csv('canberra_suburbs.csv')
hobart_suburbs.to_csv('hobart_suburbs.csv')
adelaide_suburbs.to_csv('adelaide_suburbs.csv')
darwin_suburbs.to_csv('darwin_suburbs.csv')
perth_suburbs.to_csv('perth_suburbs.csv')

aus_suburbs.to_csv('aus_suburbs.csv')