In [969]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import geocoder
from geopy.geocoders import Nominatim
import unittest
#!conda install -c conda-forge folium=0.5.0 --yes # Library for Map - Folium
import folium
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import re

# Scraping suburb information

### Scrape information about each Sydney suburb from Wikipedia

Here I use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to scrape the suburbs from Sydney as listed on [wikipedia](https://en.wikipedia.org/wiki/List_of_Sydney_suburbs). For each suburb, I identify the wikipage url for the indexed suburb, request access to the webpage, and then collect the following from their information box:
* **Postcode**: Postcode
* **Density**: Population density
* **Area**: Area size of suburb
* **LGA**: Local government assocation (council)
* **Location**: Distance from the city

In [410]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Sydney_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

syd_suburbs_section = soup.find('div',attrs={'class':'mw-parser-output'})
syd_suburbs_section = syd_suburbs_section.findAll('a', href=True)

In [1480]:
# Scrape url for each suburb
def get_wiki_urls(html_suburbs_section, wiki_link_extension, wiki_state_name, state):
    url_list = {}
    for i in range(len(html_suburbs_section)):
        url = html_suburbs_section[i]['href']
        if wiki_link_extension in url:

            if any(x in url for x in ['File:', 'List_of_']):
                continue        

            else:
                suburb = html_suburbs_section[i]['title'].replace(wiki_state_name,'')

                if suburb in url_list:
                    continue           

                else:
                    url_list[suburb] = 'https://en.wikipedia.org{}'.format(url)
        
        # Sydney cbd
        elif '{}_CBD'.format(state) in url:
            suburb = html_suburbs_section[i]['title'] 
            url_list[suburb] = 'https://en.wikipedia.org{}'.format(url)

        # Brisbane cbd
        elif '{}_central_business_district'.format(state) in url:
            suburb = html_suburbs_section[i]['title']
            url_list[suburb] = 'https://en.wikipedia.org{}'.format(url)
        else:
            continue
    
    return url_list

In [1235]:
wiki_state_name = ', New South Wales'
wiki_link_extension = ',_New_South_Wales'
state = 'Sydney'
syd_wiki_urls = get_wiki_urls(syd_suburbs_section, wiki_link_extension, wiki_state_name, state)
#syd_wiki_urls

In [1237]:
# Scrape information box for each suburb from their wikipage.
# If an information box doesn't exist a try/catch error for the AttributeError will pass it.

def get_suburb_wiki_infobox(wiki_urls):
    
    suburbs_infobox = {}
    for key,value in wiki_urls.items():

        try:
            page = requests.get(value)
            soup_page = BeautifulSoup(page.text, 'html.parser')
            infobox = soup_page.find('table', class_='infobox vcard')
            suburbs_infobox[key] = infobox.find_all('tr', class_='')

        except AttributeError:
            suburbs_infobox[key] = None
            #print("{}'s wikipage does not have an information box".format(key))
            pass
    
    return suburbs_infobox

In [1238]:
syd_suburb_infobox = get_suburb_wiki_infobox(syd_wiki_urls)

In [1427]:
def get_suburb_info(suburb_infobox):
    check_strings = ('Density', 'Postcode', 'Area', 'Location')
    check_strings_2 = ('LGA(s)')

    suburbs = {}

    for key,value in suburb_infobox.items():

        try:
            infobox_val = value
            items = {}

            for val in infobox_val:        

                if any(s in val.text for s in check_strings):
                    # Find check_strings in infobox list of strings
                    keyword = [s for s in check_strings if s in val.text]
                    keyword = keyword[0]

                    infobox_split = val.text.split(' ')
                    #infobox_split = re.split('  |\ ', val.text)

                    if len(infobox_split) > 1:
                        infobox_item = [s for s in infobox_split if str(keyword) in s]
                        # Remove substrings from string
                        info = infobox_item[0]
                        info = info.replace("Density","").replace('/km2',' ').replace(',','')\
                                   .replace("Area","").replace('\xa0km2',' ').replace(',','')\
                                   .replace("Location","").replace('\xa0km',' ').replace(',','')

                        info = info.split('\xa0(')
                        info = info[0]

                        # Convert Area, Density, and Location columns from string to numeric
                        try:
                            info = float(info)
                            items[keyword] = info
                        except:
                            info = None 
                            items[keyword] = info

                    elif len(infobox_split) == 1:
                        info = infobox_split[0]
                        info = info.replace("Postcode(s)","")
                        info = info.split('[')
                        info = info[0]
                        items[keyword] = info

                elif 'LGA' in val.text:
                    info = val.text.replace("LGA(s)","").replace(";",")")
                    info = info.split('[')
                    info = info[0]
                    items['LGA'] = info

                else:
                    continue

            suburbs[key] = items

        except TypeError:
            suburbs[key] = None
    
    return suburbs

In [1428]:
sydney_suburbs = get_suburb_info(syd_suburb_infobox)

In [1389]:
# Convert Sydney suburbs nested dictionary to a dataframe
sydney_suburbs_df = pd.DataFrame(sydney_suburbs).T
sydney_suburbs_df['Suburb'] = sydney_suburbs_df.index
sydney_suburbs_df.index = range(sydney_suburbs_df.shape[0])

sydney_suburbs_df.head(10)

Unnamed: 0,Area,Density,LGA,Location,Postcode,Suburb
0,,,Canterbury-Bankstown Council,16,2200,Bankstown
1,1.22,9550.0,Waverley Council,7,2026,Bondi Beach
2,2.9,8590.0,City of Willoughby,10,2067,Chatswood
3,,,Sutherland Shire,26,2230,Cronulla
4,6.3,4300.0,City of Liverpool,27,2170,Liverpool
5,5.6,2833.0,Northern Beaches Council,17,2095,Manly
6,0.2,10800.0,North Sydney Council,3,2061,Milsons Point
7,8.7,3273.0,Mosman Council,8,2088,Mosman
8,1.6,9390.0,City of Sydney Inner West Council,4,2042,Newtown
9,5.3,4870.0,City of Parramatta,24,2150,Parramatta


Not every suburb has information about it's area size, population density and location distance from the city's cbd. Therefore these columns will be removed for the time being and the dataframe will be grouped according to their postcode.

In [1398]:
syd_suburbs_grouped = sydney_suburbs_df.groupby(["Postcode"]).agg({'Suburb':', '.join,\
                                                                         'Area': lambda x: x.sum(skipna=True),\
                                                                         'Density': lambda x: x.mean(skipna=True),\
                                                                         'Location': lambda x: x.mean(skipna=True)}).reset_index()

syd_suburbs_grouped.rename(columns={'Area': 'Sum Area(km2)', 'Density':'Avg Pop Density (/km2)', 'Location':'Avg distance from cbd(km)'}, inplace=True)
syd_suburbs_grouped.head(10)
#Convert density column from float to int

Unnamed: 0,Postcode,Suburb,Sum Area(km2),Avg Pop Density (/km2),Avg distance from cbd(km)
0,2000,"The Rocks, Barangaroo, Sydney CBD, Dawes Point...",4.22,5979.833333,1.0
1,2007,Ultimo,0.6,14700.0,2.0
2,2008,"Chippendale, Darlington",1.2,8393.0,2.5
3,2009,Pyrmont,1.0,13000.0,2.0
4,2010,"Darlinghurst, Surry Hills",2.0,12585.0,1.0
5,2011,"Elizabeth Bay, Potts Point, Rushcutters Bay, W...",1.7,12748.325,2.5
6,2015,"Alexandria, Beaconsfield, Eveleigh",4.0,2442.5,4.0
7,2016,Redfern,1.2,11000.0,3.0
8,2017,"Waterloo, Zetland",1.9,12950.0,3.5
9,2018,"Eastlakes, Rosebery",2.7,6254.75,7.0



### Scrape information about each Brisbane suburb from Wikipedia

In [1481]:
with open("view-source_https___en.wikipedia.org_wiki_List_of_Brisbane_suburbs.html") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

bri_suburbs_section = soup.find('div',attrs={'id':'mw-content-text', 'class':'mw-content-ltr'})
bri_suburbs_section = bri_suburbs_section.find_all('a', href=True)

In [1482]:
wiki_state_name = ', Queensland'
wiki_link_extension = ',_Queensland'
state = 'Brisbane'
bne_wiki_urls = get_wiki_urls(bri_suburbs_section, wiki_link_extension, wiki_state_name, state)

In [1484]:
bne_suburb_infobox = get_suburb_wiki_infobox(bne_wiki_urls)

In [1485]:
brisbane_suburbs = get_suburb_info(bne_suburb_infobox)

In [1486]:
# Convert Brisbane suburbs nested dictionary to a dataframe
brisbane_suburbs_df = pd.DataFrame(brisbane_suburbs).T
brisbane_suburbs_df['Suburb'] = brisbane_suburbs_df.index
brisbane_suburbs_df.index = range(brisbane_suburbs_df.shape[0])

brisbane_suburbs_df.head(10)

Unnamed: 0,Area,Density,LGA,Location,Postcode,Suburb
0,2.07,4570.0,City of Brisbane(Central Ward),,4000,Brisbane central business district
1,1.7,1900.0,City of Brisbane(Central Ward),,4006,Bowen Hills
2,2.0,3000.0,City of Brisbane (The Gabba Ward),2.5,4169,East Brisbane
3,1.4,4980.0,City of Brisbane(Central Ward),1.0,4006,Fortitude Valley
4,1.7,1300.0,City of Brisbane(Central Ward),3.0,4006,Herston
5,1.2,5160.0,City of Brisbane (The Gabba Ward),2.0,4101,Highgate Hill
6,1.3,6200.0,City of Brisbane (The Gabba Ward),,4169,Kangaroo Point
7,,,City of Brisbane(Paddington Ward),,4059,Kelvin Grove
8,2.0,6300.0,City of Brisbane(Central Ward),2.0,4005,New Farm
9,1.3,1690.0,City of Brisbane(Central Ward),3.0,4006,Newstead


In [1487]:
bne_suburbs_grouped = brisbane_suburbs_df.groupby(["Postcode"]).agg({'Suburb':', '.join,\
                                                                         'Area': lambda x: x.sum(skipna=True),\
                                                                         'Density': lambda x: x.mean(skipna=True),\
                                                                         'Location': lambda x: x.mean(skipna=True)}).reset_index()

bne_suburbs_grouped.rename(columns={'Area': 'Sum Area(km2)', 'Density':'Avg Pop Density (/km2)', 'Location':'Avg distance from cbd(km)'}, inplace=True)
bne_suburbs_grouped.head(10)
#Convert density column from float to int

Unnamed: 0,Postcode,Suburb,Sum Area(km2),Avg Pop Density (/km2),Avg distance from cbd(km)
0,4000,"Brisbane central business district, Petrie Ter...",3.27,4775.0,2.0
1,4005,"New Farm, Teneriffe",3.0,5820.0,2.25
2,4006,"Bowen Hills, Fortitude Valley, Herston, Newstead",6.1,2467.5,2.333333
3,4007,"Ascot, Hamilton",4.4,3125.0,7.5
4,4008,Pinkenba,0.0,,9.0
5,4009,Eagle Farm,0.0,,9.0
6,4010,Albion,1.4,1640.0,6.0
7,4011,"Clayfield, Hendra",5.6,2675.0,8.0
8,4012,"Nundah, Wavell Heights",8.2,2737.5,11.9
9,4013,Northgate,3.1,1388.0,


In [1488]:
print('Sydney dataframe: {}'.format(syd_suburbs_grouped.shape))
print("")
print('Brisbane dataframe: {}'.format(bne_suburbs_grouped.shape))

Sydney dataframe: (228, 5)

Brisbane dataframe: (130, 5)


# Get geographical coordinates for each suburb

In [1406]:
def get_latlong(df_pc, state):
    geolocator = Nominatim(user_agent="specify_your_app_name_here", timeout=3)
    
    lat = []
    long = []

    for index,row in df_pc.iterrows():
        postcode = row.Postcode
        location = geolocator.geocode("{} {} Australia".format(postcode, state))
        lat.append(location.latitude)
        long.append(location.longitude)

    latlong = dict({'lat':lat, 'long':long})
    latlong_df = pd.DataFrame(latlong)
    
    return latlong_df

In [1491]:
# Geospatial coordinates for Sydney suburbs
latlong_syd = get_latlong(syd_suburbs_grouped, 'New South Wales')

In [1492]:
# Geospatial coordinates for Brisbane suburbs
latlong_bne = get_latlong(bne_suburbs_grouped, 'Queensland')

### Merge the latlong dataframe with the suburbs_pc dataframe

In [1493]:
syd_suburbs_coord = pd.concat([syd_suburbs_grouped, latlong_syd], axis=1)
bne_suburbs_coord = pd.concat([bne_suburbs_grouped, latlong_bne], axis=1)

In [1494]:
print('Sydney dataframe: {}'.format(syd_suburbs_coord.shape))
print("")
print('Brisbane dataframe: {}'.format(bne_suburbs_coord.shape))

Sydney dataframe: (228, 7)

Brisbane dataframe: (130, 7)


# Plotting Sydney, Melbourne, and Hobart suburb maps

### Plotting Sydney suburb map

In [1495]:
address = "Sydney, New South Wales"
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Sydney, NSW are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Sydney, NSW are -33.8548157, 151.2164539.


In [1496]:
# create map of Toronto using latitude and longitude values
map_sydney = folium.Map(location=[latitude, longitude], zoom_start=9.5)
# add markers to map
for lat, lng, postcode, suburb in zip(syd_suburbs_coord['lat'],\
                                      syd_suburbs_coord['long'],\
                                      syd_suburbs_coord['Postcode'],\
                                      syd_suburbs_coord['Suburb']):
    #print(lat, lng)
    label = '{}: {}'.format(postcode, suburb)
    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_sydney)  
    
map_sydney

### Plotting Brisbane suburb map

In [1497]:
address = "Brisbane, Queensland"
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Brisbane, Queensland are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Brisbane, Queensland are -27.4689682, 153.0234991.


In [1498]:
# create map of Toronto using latitude and longitude values
map_brisbane = folium.Map(location=[latitude, longitude], zoom_start=9.5)
# add markers to map
for lat, lng, postcode, suburb in zip(bne_suburbs_coord['lat'],\
                                      bne_suburbs_coord['long'],\
                                      bne_suburbs_coord['Postcode'],\
                                      bne_suburbs_coord['Suburb']):
    #print(lat, lng)
    label = '{}: {}'.format(postcode, suburb)
    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_brisbane)  
    
map_brisbane

# Search for local venues in Sydney and Brisbane suburbs with Foursquare API

### Define Foursquare credentials and version

In [1431]:
CLIENT_ID = 'JVA5H5NJXBQTUPSMHXXD0V0NKNP0OVJO0GKU1WJGLER5Q0DU' # Foursquare ID
CLIENT_SECRET = 'DMZV42OBBRZNSPQSGEUD3PE3N5EHUKRZYRTMSCLSORKPAO2W' # Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: JVA5H5NJXBQTUPSMHXXD0V0NKNP0OVJO0GKU1WJGLER5Q0DU
CLIENT_SECRET:DMZV42OBBRZNSPQSGEUD3PE3N5EHUKRZYRTMSCLSORKPAO2W


# Explore suburbs of Sydney
Function that finds top 100 venues within 500m radius of the neighbourhood

In [1443]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Suburb', 
                  'Suburb Latitude', 
                  'Suburb Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Here, I use the function to create a new dataframe that includes the venues.

In [1444]:
sydney_venues = getNearbyVenues(names=syd_suburbs_coord['Suburb'],
                                latitudes=syd_suburbs_coord['lat'],
                                longitudes=syd_suburbs_coord['long'])

Check size of dataframe

In [1450]:
print(sydney_venues.shape)
sydney_venues.head(5)

(3100, 7)


Unnamed: 0,Suburb,Suburb Latitude,Suburb Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"The Rocks, Barangaroo, Sydney CBD, Dawes Point...",-33.869938,151.207212,The Baxter Inn,-33.869707,151.205467,Whisky Bar
1,"The Rocks, Barangaroo, Sydney CBD, Dawes Point...",-33.869938,151.207212,State Theatre,-33.871291,151.207049,Theater
2,"The Rocks, Barangaroo, Sydney CBD, Dawes Point...",-33.869938,151.207212,Queen Victoria Building (QVB),-33.871521,151.206741,Shopping Mall
3,"The Rocks, Barangaroo, Sydney CBD, Dawes Point...",-33.869938,151.207212,Uniqlo,-33.869744,151.208319,Clothing Store
4,"The Rocks, Barangaroo, Sydney CBD, Dawes Point...",-33.869938,151.207212,Haigh's Chocolates,-33.869207,151.207129,Candy Store


Check the number of venues returned for each suburb

In [1447]:
sydney_venues.groupby('Suburb').count()

Unnamed: 0_level_0,Suburb Latitude,Suburb Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Suburb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Abbotsbury, Bossley Park, Edensor Park, Greenfield Park, Prairiewood, St Johns Park, Wakeley",3,3,3,3,3,3
"Abbotsford, Canada Bay, Chiswick, Five Dock, Rodd Point, Russell Lea, Wareemba",7,7,7,7,7,7
"Acacia Gardens, Quakers Hill",7,7,7,7,7,7
"Alexandria, Beaconsfield, Eveleigh",26,26,26,26,26,26
"Alfords Point, Bangor, Barden Ridge, Illawong, Lucas Heights, Menai",4,4,4,4,4,4
"Allambie Heights, Beacon Hill, Brookvale, North Manly, Oxford Falls",18,18,18,18,18,18
"Allawah, Carlton",4,4,4,4,4,4
Annandale,19,19,19,19,19,19
"Arncliffe, Turrella, Wolli Creek",4,4,4,4,4,4
"Arndell Park, Blacktown, Huntingwood, Marayong, Prospect",3,3,3,3,3,3


The number of unique categories that can be curated from all of the returned venues

In [1451]:
print('There are {} uniques categories.'.format(len(sydney_venues['Venue Category'].unique())))

There are 267 uniques categories.


# Analysing each Suburb in Sydney

In [1452]:
# one hot encoding
sydney_onehot = pd.get_dummies(sydney_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
sydney_onehot['Suburb'] = sydney_venues['Suburb'] 

# move neighborhood column to the first column
fixed_columns = [sydney_onehot.columns[-1]] + list(sydney_onehot.columns[:-1])
sydney_onehot = sydney_onehot[fixed_columns]

sydney_onehot.head()

Unnamed: 0,Suburb,Airport,Airport Terminal,American Restaurant,Arcade,Argentinian Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Veterinarian,Video Game Store,Video Store,Vietnamese Restaurant,Water Park,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"The Rocks, Barangaroo, Sydney CBD, Dawes Point...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,"The Rocks, Barangaroo, Sydney CBD, Dawes Point...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"The Rocks, Barangaroo, Sydney CBD, Dawes Point...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"The Rocks, Barangaroo, Sydney CBD, Dawes Point...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Rocks, Barangaroo, Sydney CBD, Dawes Point...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Examine the shape of the dataframe size

In [1453]:
sydney_onehot.shape

(3100, 268)

Group suburbs and calculate the mean of the frequency of occurance for each category

In [1455]:
sydney_grouped = sydney_onehot.groupby('Suburb').mean().reset_index()
sydney_grouped

Unnamed: 0,Suburb,Airport,Airport Terminal,American Restaurant,Arcade,Argentinian Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Veterinarian,Video Game Store,Video Store,Vietnamese Restaurant,Water Park,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"Abbotsbury, Bossley Park, Edensor Park, Greenf...",0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000
1,"Abbotsford, Canada Bay, Chiswick, Five Dock, R...",0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000
2,"Acacia Gardens, Quakers Hill",0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000
3,"Alexandria, Beaconsfield, Eveleigh",0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000
4,"Alfords Point, Bangor, Barden Ridge, Illawong,...",0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.250000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000
5,"Allambie Heights, Beacon Hill, Brookvale, Nort...",0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.055556,0.000000
6,"Allawah, Carlton",0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.250000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000
7,Annandale,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.052632,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000
8,"Arncliffe, Turrella, Wolli Creek",0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000
9,"Arndell Park, Blacktown, Huntingwood, Marayong...",0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000


In [1456]:
sydney_grouped.shape

(203, 268)

Function that sorts most frequent venues in descending order

In [1457]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Creating a new dataframe and display the top 10 venues for each neighbourhood

In [1508]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Suburb']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
sydney_venues_sorted = pd.DataFrame(columns=columns)
sydney_venues_sorted['Suburb'] = sydney_grouped['Suburb']

for ind in np.arange(sydney_grouped.shape[0]):
    sydney_venues_sorted.iloc[ind, 1:] = return_most_common_venues(sydney_grouped.iloc[ind, :], num_top_venues)

sydney_venues_sorted.head(5)

Unnamed: 0,Suburb,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Abbotsbury, Bossley Park, Edensor Park, Greenf...",Pub,Grocery Store,Café,Flower Shop,Filipino Restaurant,Fish & Chips Shop,Fish Market,Flea Market,Food & Drink Shop,Fruit & Vegetable Store
1,"Abbotsford, Canada Bay, Chiswick, Five Dock, R...",Gym,Martial Arts Dojo,Park,Mediterranean Restaurant,Convenience Store,Grocery Store,Café,Flea Market,Filipino Restaurant,Fish & Chips Shop
2,"Acacia Gardens, Quakers Hill",Fast Food Restaurant,Baseball Field,Pizza Place,Sandwich Place,Gas Station,Convenience Store,Snack Place,Flea Market,Filipino Restaurant,Fish & Chips Shop
3,"Alexandria, Beaconsfield, Eveleigh",Café,Furniture / Home Store,Coffee Shop,Pizza Place,Train Station,Bus Station,Sporting Goods Shop,Fast Food Restaurant,Liquor Store,Clothing Store
4,"Alfords Point, Bangor, Barden Ridge, Illawong,...",Fast Food Restaurant,Italian Restaurant,Asian Restaurant,Australian Restaurant,Food Court,Filipino Restaurant,Fish & Chips Shop,Fish Market,Flea Market,Flower Shop


In [1540]:
# Search for suburb
def suburb_top_venues(venues_sorted, suburb_of_interset):
    return venues_sorted[venues_sorted['Suburb'].str.contains(suburb_of_interset)]

suburb_top_venues(sydney_venues_sorted, 'Chatswood')

Unnamed: 0,Suburb,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
69,"Chatswood, Chatswood West",Café,Bed & Breakfast,Sandwich Place,Hotel,Flower Shop,Filipino Restaurant,Fish & Chips Shop,Fish Market,Flea Market,Yoga Studio


# Explore suburbs of Brisbane

Use function to collect venues in Brisbane suburbs

In [1499]:
brisbane_venues = getNearbyVenues(names=bne_suburbs_coord['Suburb'],
                                  latitudes=bne_suburbs_coord['lat'],
                                  longitudes=bne_suburbs_coord['long'])

Check size of dataframe

In [1500]:
print(brisbane_venues.shape)
brisbane_venues.head(5)

(913, 7)


Unnamed: 0,Suburb,Suburb Latitude,Suburb Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Brisbane central business district, Petrie Ter...",-27.467224,153.02774,Vege Rama,-27.46671,153.02737,Vegetarian / Vegan Restaurant
1,"Brisbane central business district, Petrie Ter...",-27.467224,153.02774,The Dark Chocolatier by the Noosa Chocolate Fa...,-27.46783,153.025723,Chocolate Shop
2,"Brisbane central business district, Petrie Ter...",-27.467224,153.02774,Apple Brisbane,-27.468421,153.027355,Electronics Store
3,"Brisbane central business district, Petrie Ter...",-27.467224,153.02774,The Gresham,-27.467169,153.02824,Whisky Bar
4,"Brisbane central business district, Petrie Ter...",-27.467224,153.02774,Corbett & Claude,-27.468011,153.029502,Bar


Check the number of venues returned for each suburb

In [1501]:
brisbane_venues.groupby('Suburb').count()

Unnamed: 0_level_0,Suburb Latitude,Suburb Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Suburb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Acacia Ridge, Heathwood, Larapinta, Pallara, Willawong",1,1,1,1,1,1
Albion,12,12,12,12,12,12
"Alderley, Gaythorne, Grange, Newmarket, Wilston, Enoggera",5,5,5,5,5,5
"Algester, Parkinson",7,7,7,7,7,7
"Amity Point, Dunwich, Point Lookout",5,5,5,5,5,5
"Annerley, Fairfield",9,9,9,9,9,9
"Anstead, Bellbowrie, Moggill",4,4,4,4,4,4
"Archerfield, Coopers Plains",7,7,7,7,7,7
"Ascot, Hamilton",18,18,18,18,18,18
Ashgrove,7,7,7,7,7,7


The number of unique categories that can be curated from all of the returned venues

In [1502]:
print('There are {} uniques categories.'.format(len(brisbane_venues['Venue Category'].unique())))

There are 182 uniques categories.


Analysing each suburb in Brisbane

In [1503]:
# one hot encoding
brisbane_onehot = pd.get_dummies(brisbane_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
brisbane_onehot['Suburb'] = brisbane_venues['Suburb'] 

# move neighborhood column to the first column
fixed_columns = [brisbane_onehot.columns[-1]] + list(brisbane_onehot.columns[:-1])
brisbane_onehot = brisbane_onehot[fixed_columns]

brisbane_onehot.head()

Unnamed: 0,Suburb,Airport Terminal,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Australian Restaurant,Auto Garage,BBQ Joint,Bagel Shop,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Yoga Studio
0,"Brisbane central business district, Petrie Ter...",0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,"Brisbane central business district, Petrie Ter...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Brisbane central business district, Petrie Ter...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Brisbane central business district, Petrie Ter...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,"Brisbane central business district, Petrie Ter...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Examine the shape of the dataframe size

In [1505]:
brisbane_onehot.shape

(913, 183)

Group suburbs and calculate the mean of the frequency of occurance for each category

In [1506]:
brisbane_grouped = brisbane_onehot.groupby('Suburb').mean().reset_index()
brisbane_grouped

Unnamed: 0,Suburb,Airport Terminal,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Australian Restaurant,Auto Garage,BBQ Joint,Bagel Shop,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Yoga Studio
0,"Acacia Ridge, Heathwood, Larapinta, Pallara, W...",0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.00,...,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
1,Albion,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.00,...,0.083333,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
2,"Alderley, Gaythorne, Grange, Newmarket, Wilsto...",0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.00,...,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
3,"Algester, Parkinson",0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.00,...,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,"Amity Point, Dunwich, Point Lookout",0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.00,...,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
5,"Annerley, Fairfield",0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.00,...,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
6,"Anstead, Bellbowrie, Moggill",0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.00,...,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
7,"Archerfield, Coopers Plains",0.0,0.000000,0.000000,0.0,0.142857,0.000000,0.00,0.000000,0.00,...,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
8,"Ascot, Hamilton",0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.00,...,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
9,Ashgrove,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00,0.000000,0.00,...,0.000000,0.00,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


Creating a new dataframe and display the top 10 venues for each neighbourhood

In [1510]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Suburb']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
brisbane_venues_sorted = pd.DataFrame(columns=columns)
brisbane_venues_sorted['Suburb'] = brisbane_grouped['Suburb']

for ind in np.arange(brisbane_grouped.shape[0]):
    brisbane_venues_sorted.iloc[ind, 1:] = return_most_common_venues(brisbane_grouped.iloc[ind, :], num_top_venues)

brisbane_venues_sorted.head(5)

Unnamed: 0,Suburb,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Acacia Ridge, Heathwood, Larapinta, Pallara, W...",Furniture / Home Store,Yoga Studio,Frozen Yogurt Shop,French Restaurant,Frame Store,Forest,Food Court,Food & Drink Shop,Flower Shop,Flea Market
1,Albion,Café,Seafood Restaurant,Fast Food Restaurant,Liquor Store,Bakery,Train Station,Turkish Restaurant,Comedy Club,Sandwich Place,Indian Restaurant
2,"Alderley, Gaythorne, Grange, Newmarket, Wilsto...",Thrift / Vintage Store,Supermarket,Train Station,Grocery Store,Thai Restaurant,Yoga Studio,Frame Store,Forest,Food Court,Food & Drink Shop
3,"Algester, Parkinson",Pharmacy,Shopping Mall,Pizza Place,Seafood Restaurant,Bakery,Grocery Store,Liquor Store,Yoga Studio,Fast Food Restaurant,Food Court
4,"Amity Point, Dunwich, Point Lookout",Park,Bus Stop,Convenience Store,Café,Yoga Studio,Fish Market,Frame Store,Forest,Food Court,Food & Drink Shop


In [1531]:
suburb_top_venues(brisbane_venues_sorted, 'central business district')

Unnamed: 0,Suburb,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
25,"Brisbane central business district, Petrie Ter...",Café,Bar,Coffee Shop,Burger Joint,Hotel,Japanese Restaurant,Restaurant,Italian Restaurant,Sushi Restaurant,Clothing Store


# Cluster Suburbs