In [4]:
import os
import pandas as pd
import numpy as np

import selenium
from selenium import webdriver

import random
import time

from bs4 import BeautifulSoup
from urllib.request import urlopen

In [5]:
# Translate a full state/territory name into the short form used on the ABS site.
def convert_to_acronym(state_var):
    """Return the abbreviation for a full Australian state or territory name.

    Any value that is not one of the eight full names listed below is
    returned unchanged.
    """
    abbreviations = (
        ('New South Wales', 'NSW'),
        ('Victoria', 'Vic'),
        ('Australian Capital Territory', 'ACT'),
        ('Queensland', 'Qld'),
        ('South Australia', 'SA'),
        ('Northern Territory', 'NT'),
        ('Western Australia', 'WA'),
        ('Tasmania', 'Tas'),
    )
    for full_name, short_name in abbreviations:
        if state_var == full_name:
            return short_name
    return state_var

In [35]:
def find_abs_quick_stat_webpage(suburb, state):
    """Search the loaded QuickStats page for a suburb and open its result.

    Uses the module-level ``driver``: types ``suburb`` into the search box,
    waits a random interval, picks the suggestion whose text contains both
    'State Suburb (SSC)' and ``state``, selects it and clicks GO.

    Parameters
    ----------
    suburb : str
        Text typed into the search box.
    state : str
        Text that must also appear in the chosen suggestion (e.g. 'NSW').

    Raises
    ------
    LookupError
        If no suggestion row matches. Previously the last suggestion was
        clicked in that case, which opened an unrelated page.

    Exceptions raised by the ``driver`` element lookups are not caught here.
    """
    # Select the id box
    id_box = driver.find_element_by_class_name('gwt-SearchWidget-SuggestBox-Fade')

    # Send id information
    id_box.send_keys(suburb)

    # Time delay of 0.2-10 sec (minimum of 0.2 sec required)
    num = random.uniform(0.2, 10.0)
    print(num)
    time.sleep(num)

    # Get pop-up table
    popup_table_id = driver.find_element_by_class_name('suggestPopupMiddleCenter')

    # Locate the suggestion row for this suburb and state
    rows = popup_table_id.find_elements_by_tag_name("tr")

    match_index = None
    for position, row in enumerate(rows):
        popup_text = row.text
        if 'State Suburb (SSC)' in popup_text and state in popup_text:
            match_index = position
            break

    if match_index is None:
        # Fail loudly instead of silently clicking the last suggestion.
        raise LookupError(
            'No State Suburb (SSC) suggestion found for {} {}'.format(suburb, state)
        )

    # Get Suburb gwt-uid from popup table
    # (the row position is reused as the cell position, i.e. this assumes one <td> per <tr>)
    IDs = popup_table_id.find_elements_by_tag_name('td')
    suburb_uid = IDs[match_index].get_attribute('id')

    # Find suburb by id
    popup_item = driver.find_element_by_id(suburb_uid)

    # Select suburb
    popup_item.click()

    # Find GO button
    go_button = driver.find_element_by_class_name('gwt-SearchWidget-Button')

    # Click GO
    go_button.click()

In [7]:
# Collect every table inside one content section of the page
def get_tables(soup_html, id_content):
    """Return all <table> elements inside the 'content' div with id ``id_content``.

    When ``soup_html.find`` returns None (no matching div), the following
    ``find_all`` call raises AttributeError.
    """
    wanted = {'class': 'content', 'id': id_content}
    section = soup_html.find('div', attrs=wanted)
    return section.find_all('table')

In [8]:
# Flatten the count/percentage tables of one section into a single dictionary
def scrape_stats(tables):
    """Return ``{'<table>_<row>_count': ..., '<table>_<row>_percentage': ...}``.

    For each table, the table name is the text of the 'dictionaryLink' anchor
    in the first row. Every later row contributes two entries: the text of
    its first <td> as the count and of its second <td> as the percentage.
    Values are stored as the raw cell text.
    """
    stats = {}

    for table in tables:
        rows = table.find_all('tr')
        table_name = rows[0].find('a', attrs={'class': 'dictionaryLink'}).text

        for row in rows[1:]:
            # Prefer the header cell marked scope="row"; fall back to any
            # 'firstCol' header when that lookup yields nothing.
            try:
                label = row.find('th', attrs={'class': 'firstCol', 'scope': 'row'}).text
            except AttributeError:
                label = row.find('th', attrs={'class': 'firstCol'}).text

            cells = row.find_all('td')
            count_text, percentage_text = cells[0].text, cells[1].text

            key_prefix = '{}_{}_'.format(table_name, label)
            stats[key_prefix + 'count'] = count_text
            stats[key_prefix + 'percentage'] = percentage_text

    return stats

In [9]:
# Function combining get_tables and scrape_stats
def get_stats(soup_html, id_content):
    """Return the scraped statistics for the content section ``id_content``.

    Passes the tables found by ``get_tables`` straight to ``scrape_stats``.
    """
    return scrape_stats(get_tables(soup_html, id_content))

In [10]:
# Collect the tables of the quick-stat summary section
def get_qs_tables(soup_html):
    """Return all <table> elements inside the div with id 'summaryTableAP'.

    When ``soup_html.find`` returns None (no such div), the following
    ``find_all`` call raises AttributeError.
    """
    summary_section = soup_html.find('div', attrs={'id': 'summaryTableAP'})
    tables = summary_section.find_all('table')
    return tables

In [11]:
# Function for scraping quick stat tables
def scrape_qs(qs_tables):
    """Flatten the quick-stat summary tables into a single dictionary.

    For every table the first row supplies the table name (the bold row
    header) and its value, stored as '<name>_<name>_count'. Each later row is
    stored as '<name>_<row label>_percentage' when its value contains '%',
    otherwise as '<name>_<row label>_count'. Rows without a 'summaryData'
    cell are skipped. Values are kept as raw cell text.

    Three ambiguous keys are then renamed to more descriptive ones. The
    rename is applied only to keys that are present, so a page lacking the
    Families or dwellings rows (or an empty ``qs_tables``) no longer raises
    KeyError.

    Raises
    ------
    AttributeError
        If a row has no row-header cell, or the first row of a table has no
        'summaryData' cell.
    """
    data = {}

    for qs_table in qs_tables:

        qs_table_rows = qs_table.find_all('tr')

        # Table labels and values for first row
        first_row = qs_table_rows[0]
        qs_table_name = first_row.find('th', attrs={'class': 'addBold', 'scope': 'row'}).text
        first_value = first_row.find('td', attrs={'class': 'summaryData'}).text

        data['{}_{}_{}'.format(qs_table_name, qs_table_name, 'count')] = first_value

        # Table labels and values for second row and onwards
        for qs_row in qs_table_rows[1:]:

            qs_row_label = qs_row.find('th', attrs={'scope': 'row'}).text

            value_cell = qs_row.find('td', attrs={'class': 'summaryData'})
            if value_cell is None:
                # Label-only row: nothing to record.
                continue

            qs_row_value = value_cell.text
            suffix = 'percentage' if "%" in qs_row_value else 'count'
            data['{}_{}_{}'.format(qs_table_name, qs_row_label, suffix)] = qs_row_value

    # Change row label names (only for rows this page actually has)
    renames = (
        ('Families_for families with children_count',
         'Families_Avg children per family_for families with children_count'),
        ('Families_for all families_count',
         'Families_Avg children per family_for all families_count'),
        ('All private dwellings_Average people per household_count',
         'All private dwellings_Average number of people per household_count'),
    )
    for old_key, new_key in renames:
        if old_key in data:
            data[new_key] = data.pop(old_key)

    return data

In [12]:
# Function combining get_qs_tables and scrape_qs
def get_qs_stats(soup_html):
    """Return the quick-stat summary of a page as a dictionary.

    Passes the tables found by ``get_qs_tables`` straight to ``scrape_qs``.
    """
    return scrape_qs(get_qs_tables(soup_html))

### Get demographic data from the Australian Bureau of Statistics

In [14]:
# Read the suburb list; the first CSV column becomes the index
aus_suburbs = pd.read_csv('aus_suburbs.csv', index_col=0)

In [46]:
# Using Chrome to access web
# The scraping loop and find_abs_quick_stat_webpage both use this module-level
# `driver`, so this cell has to run before them.
driver = webdriver.Chrome()

# Open the 2016 QuickStats search page in the browser.
driver.get('https://www.abs.gov.au/websitedbs/D3310114.nsf/Home/2016%20QuickStats')

In [41]:
# Scrape the 2016 census tables for every suburb in aus_suburbs.
# NOTE(review): ABS_data is keyed by suburb name only, so suburbs that share a
# name across states overwrite each other; the DataFrame cell below relies on
# this shape, so it is left unchanged here.
ABS_data = {}

quickstats_url = 'https://www.abs.gov.au/websitedbs/D3310114.nsf/Home/2016%20QuickStats'
year = '2016'

for index, row in aus_suburbs.iterrows():

    # Get suburb and state names from aus_suburbs table
    suburb = row.Suburb
    state = convert_to_acronym(row.State)

    print('{} {}'.format(suburb, state))

    # Reset for every suburb: a failed lookup must never fall through to the
    # page that was parsed for the previous suburb.
    soup = None

    # Retry until the parsed page title mentions the suburb.
    # NOTE(review): this retry is unbounded — consider capping the attempts.
    while True:
        # Open the website
        driver.get(quickstats_url)

        # Select census year
        # NOTE(review): an XPath starting with '//' searches the whole
        # document, not just census_cycle — confirm that is intended.
        census_cycle = driver.find_element_by_class_name('gwt-ListBox')
        census_year = census_cycle.find_element_by_xpath('//option[@value={}]'.format(year))
        census_year.click()

        # Go to suburb's ABS url page
        try:
            find_abs_quick_stat_webpage(suburb, state)
        except Exception:
            print('{} {} does not have a quickstat page'.format(suburb, state))
            # Discard any wrong page parsed on an earlier attempt.
            soup = None
            break

        # Get suburb's QuickStat URL
        url = driver.current_url

        # Get html page from URL (the response is closed once parsed)
        with urlopen(url) as response:
            soup = BeautifulSoup(response, "html.parser")
        print(soup.title.text)

        # Check if it is the correct URL page
        if suburb in soup.title.text:
            break

    if soup is None:
        # No QuickStats page was found for this suburb.
        suburb_stats = {}
    else:
        # Scrape data from tables
        try:
            people = get_stats(soup, 'peopleContent')
            family = get_stats(soup, 'familyContent')
            dwelling = get_stats(soup, 'dwellingContent')
            ingp = get_stats(soup, 'INGPContent')

            # Merge datasets into one dictionary
            suburb_stats = {**people, **family, **dwelling, **ingp}

            # Add state as subkey
            suburb_stats['State'] = state

        except AttributeError:
            # Scrape data that only have a quick stat table
            try:
                suburb_stats = get_qs_stats(soup)
                suburb_stats['State'] = state

            # Suburb does not contain any (usable) stats; KeyError covers a
            # quick-stat table that lacks the rows scrape_qs renames.
            except (AttributeError, KeyError):
                print(suburb)
                suburb_stats = {}

    # Add suburb dictionary to main dictionary
    ABS_data[suburb] = suburb_stats

    # Random time delay before starting next loop
    num = random.uniform(0.2, 10.0)
    print(num)
    time.sleep(num)

#### Convert dictionary to table


In [127]:
# One row per suburb: transpose the dict-of-dicts and turn the suburb keys
# into a regular 'Suburb' column
df = pd.DataFrame(ABS_data).T.rename_axis('Suburb').reset_index()

# Move 'State' column to the front
cols = list(df.columns)
cols.remove('State')
cols = ['State'] + cols
df = df.loc[:, cols]

#### Clean data

In [128]:
# Remove characters from columns
# regex=False makes every pattern a literal string. '$' is a regex anchor, and
# the default of `regex` differs between pandas versions, so without it the
# dollar sign is only stripped under some versions.
for column in df:
    df[column] = (
        df[column]
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
        .str.replace('%', '', regex=False)
        .replace('--', np.nan)  # '--' is replaced by NaN
    )

# Convert string to float
for column in df:
    if ('count' in column) or ('percentage' in column):
        df[column] = pd.to_numeric(df[column])

In [129]:
# Preview the first rows of df
df.head()

Unnamed: 0,State,Age_0-4 years_count,Age_0-4 years_percentage,Age_10-14 years_count,Age_10-14 years_percentage,Age_15-19 years_count,Age_15-19 years_percentage,Age_20-24 years_count,Age_20-24 years_percentage,Age_25-29 years_count,...,"Unpaid domestic work, number of hours_Less than 5 hours per week_count","Unpaid domestic work, number of hours_Less than 5 hours per week_percentage",Unpaid work_Cared for child/children (last two weeks)_count,Unpaid work_Cared for child/children (last two weeks)_percentage,Unpaid work_Did unpaid domestic work (last week)_count,Unpaid work_Did unpaid domestic work (last week)_percentage,Unpaid work_Did voluntary work through an organisation or group (last 12 months)_count,Unpaid work_Did voluntary work through an organisation or group (last 12 months)_percentage,Unpaid work_Provided unpaid assistance to a person with a disability (last two weeks)_count,Unpaid work_Provided unpaid assistance to a person with a disability (last two weeks)_percentage
Abbotsbury,NSW,214.0,5.0,279.0,6.6,347.0,8.2,377.0,8.9,288.0,...,880.0,25.1,884.0,25.2,2346.0,67.0,439.0,12.5,507.0,14.5
Abbotsford,Vic,319.0,3.9,153.0,1.9,210.0,2.6,760.0,9.3,1697.0,...,2688.0,35.7,1132.0,15.0,5489.0,72.8,1506.0,20.0,563.0,7.5
Aberfeldie,Vic,179.0,4.6,291.0,7.5,348.0,8.9,284.0,7.3,203.0,...,891.0,28.1,921.0,29.0,2404.0,75.8,759.0,24.0,476.0,15.0
Aberfoyle Park,SA,587.0,5.3,677.0,6.1,846.0,7.7,691.0,6.3,606.0,...,2205.0,24.2,2775.0,30.5,7181.0,79.0,2151.0,23.7,1167.0,12.8
Acacia Gardens,NSW,295.0,7.8,267.0,7.0,234.0,6.2,250.0,6.6,235.0,...,728.0,25.0,1050.0,36.2,2157.0,74.2,491.0,16.9,357.0,12.3


#### Save dataframe as CSV

In [130]:
# Write df to ABS_2016_data.csv in the current working directory.
# to_csv is called with its defaults, so the DataFrame index is written as the
# first (unnamed) column.
df.to_csv('ABS_2016_data.csv')