# Import packages

- requests to connect to webpage to get data
- BeautifulSoup to webscrape
- pandas to convert data into a dataframe

In [1]:
import getpass
import os
from io import StringIO

import bs4
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

# Retrieve Data

- use requests.get() to open a connection to the webpage
- BeautifulSoup function is used to parse data, helping identify different objects
- prettify repackages data into more user friendly format

In [2]:
# Download the Wikipedia page that lists the 'M' postal codes.
# The timeout stops the cell from hanging indefinitely, and raise_for_status()
# reports an HTTP error here rather than as a confusing parsing failure later.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
my_url = response.text  # despite its name, this holds the page HTML, not a URL
soup = BeautifulSoup(my_url, 'lxml')  # parse the HTML with the lxml parser

# Uncomment to inspect the parsed document:
#print(soup.prettify())

### Extract Table of Neighbourhood Data from Webpage

- .find() is used to extract only the 'table' needed

In [3]:
# Pick out the first <table> whose class attribute is exactly "wikitable sortable".
table = soup.find("table", class_="wikitable sortable")

### Read table into a pandas dataframe

- pd.read_html() will convert the table into a pandas dataframe
- Then keep only rows with complete data, i.e. remove rows where the Borough is 'Not assigned'

In [4]:
# Parse the HTML table into a dataframe. The markup is wrapped in StringIO because
# passing a literal HTML string to read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(table)), header=0)[0]

# Keep only rows with an assigned Borough and renumber the rows from 0 for
# consistent labelling (drop=True discards the old index instead of keeping it
# as an extra 'index' column that then has to be deleted).
df = df[df.Borough != 'Not assigned'].reset_index(drop=True)

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### Group Neighbourhoods by Postcode/Borough

In [5]:
# Collapse the table to one row per Postcode/Borough pair; every other column
# is gathered into a list of that pair's values.
grouped = df.groupby(['Postcode', 'Borough'], as_index=False).aggregate(lambda values: list(values))

# Turn each list of neighbourhood names into one comma-separated string.
grouped['Neighbourhood'] = grouped['Neighbourhood'].map(', '.join)
df = grouped

### Assign Borough where Neighbourhood = 'Not assigned'

In [6]:
# Wherever the neighbourhood text contains 'Not assigned', use the Borough name instead.
unassigned = df['Neighbourhood'].str.contains('Not assigned', regex=False)
df['Neighbourhood'] = df['Neighbourhood'].mask(unassigned, df['Borough'])

In [7]:
# Preview the first rows of df.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
# Show the (rows, columns) size of df.
df.shape

(103, 3)

### Load Latitude/Longitude of Neighbourhoods

- I have purposely searched for coordinates of <b>Neighbourhoods rather than Postcodes</b>. I did not want to sign up to get an API key from Google
- Geocoder could not find coordinates when postcodes were used as the input, so the provided CSV is used instead

In [9]:
# Read the coordinates CSV directly from its URL into a dataframe.
coords = pd.read_csv('http://cocl.us/Geospatial_data')

In [10]:
# Copy the coordinate columns onto df. Series assignment aligns on index labels,
# so row i of coords is paired with row i of df.
# NOTE(review): this presumably relies on coords holding the same postcodes as df
# in the same order. Nothing here checks that; confirm it, or merge on the
# postcode column instead.
df['Latitude'] = coords['Latitude']
df['Longitude'] = coords['Longitude']

In [11]:
# Preview df now that the Latitude/Longitude columns have been added.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Exploring Neighbourhoods

### Select only Boroughs containing the word Toronto to explore

In [12]:
# Boolean mask of the rows whose Borough name includes the word 'Toronto'.
in_toronto = df['Borough'].str.contains('Toronto')

# reset_index() renumbers the rows and keeps the previous labels as an 'index' column.
df_tr = df.loc[in_toronto].reset_index()
print(df_tr.shape)
df_tr.head()

(38, 6)


Unnamed: 0,index,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,37,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,43,M4M,East Toronto,Studio District,43.659526,-79.340923
4,44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


### Create map of Toronto with Neighbourhoods (in Boroughs containing the word Toronto) superimposed

In [13]:
# First get the coordinates of Toronto; they are used as the centre of the maps.
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent = "my-application")
location = geolocator.geocode(address)

# geocode() gives None when nothing matches; fail with a clear message instead
# of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinate of Toronto, Ontario are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, Ontario are 43.653963, -79.387207.


In [14]:
# create map of Toronto using latitude and longitude values
map_ontario = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one circle marker per neighbourhood. Only the rows of df_tr (boroughs whose
# name contains 'Toronto') are plotted, so the map shows the same selection that
# is explored in the rest of the notebook rather than every row of df.
for lat, lng, borough, nb in zip(df_tr['Latitude'], df_tr['Longitude'], df_tr['Borough'], df_tr['Neighbourhood']):
    # parse_html=True makes folium treat the label as text rather than raw HTML
    popup = folium.Popup('{} - {}'.format(nb, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_ontario)

map_ontario

## Define Foursquare credentials
- Mask credentials with getpass

In [15]:
# Credentials are never stored in the notebook. They are read from the environment
# when available (so Restart & Run All works unattended) and otherwise prompted for
# with getpass, which does not echo what is typed.
# NOTE(review): the keys that used to be hard-coded here should be treated as
# compromised and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass.getpass('Client_ID')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass.getpass('Client Secret')
VERSION = '20180605'  # sent as the `v` parameter of every Foursquare request


## Define function to extract top 100 venues in 500m radius around each Neighbourhood

In [16]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    """Collect the venues Foursquare reports around each given location.

    One request is sent to the Foursquare ``venues/explore`` endpoint per
    (name, latitude, longitude) triple, using the module-level CLIENT_ID,
    CLIENT_SECRET and VERSION values.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel iterables describing the locations; they are zipped together,
        so iteration stops at the shortest one.
    radius : int, optional
        Search radius forwarded as the API ``radius`` parameter (default 500).
    limit : int, optional
        Maximum number of venues requested per location (default 100).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The columns
        are present even when no venue was found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string instead of formatting it by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # The timeout keeps one stalled request from blocking the whole loop;
        # raise_for_status() surfaces API errors (bad credentials, quota) directly
        # instead of as a KeyError while digging through the JSON.
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # A venue may come back without any category; record None rather than crash.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing columns= here (rather than assigning them afterwards) keeps the
    # result well-formed when there are no rows at all.
    return pd.DataFrame(rows, columns=columns)

## Apply function to Toronto Neighbourhoods to get nearby venues

In [None]:
# Fetch nearby venues for every row of df_tr, using the default radius and limit.
toronto_venues = getNearbyVenues(
    df_tr['Neighbourhood'],
    df_tr['Latitude'],
    df_tr['Longitude'],
)

In [None]:
# Print the (rows, columns) size of the venue table, then preview its first rows.
print(toronto_venues.shape)
toronto_venues.head()

## Analyse neighbourhoods by counting frequency of each venue category

In [None]:
# One indicator column per venue category (empty prefix, so the column names are
# the bare category names).
venue_dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the neighbourhood of each venue.
venue_dummies['Neighbourhood'] = toronto_venues['Neighbourhood']

# Reorder so that the last column comes first, followed by all the others.
last_column = venue_dummies.columns[-1]
ordered_columns = [last_column, *venue_dummies.columns[:-1]]
toronto_onehot = venue_dummies[ordered_columns]

toronto_onehot.head()

## Group by neighbourhood and take frequency of each category of venue

In [None]:
# Averaging the indicator columns per neighbourhood gives, for each category,
# the share of that neighbourhood's venues belonging to it.
toronto_grouped = toronto_onehot.groupby('Neighbourhood', as_index=False).mean()
toronto_grouped

## Define function which takes top x number of venues for each Neighbourhood

- We will be taking top 10 venues for each neighbourhood

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first entry of ``row`` is skipped, the remaining entries are ordered
    from largest to smallest value, and the index labels of the first
    ``num_top_venues`` of them are returned as an array (fewer if the row is
    shorter).
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

## Create dataframe for each Neighbourhood ranking top 10 most common venue types

In [None]:
num_top_venues = 10


def _ordinal(n):
    """Return n with its English ordinal suffix, e.g. 1 -> '1st', 11 -> '11th', 22 -> '22nd'."""
    if 10 <= n % 100 <= 20:
        suffix = 'th'  # 11th, 12th and 13th are the exceptions to the last-digit rule
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '{}{}'.format(n, suffix)


# One column for the neighbourhood name plus one per rank. The suffix is computed
# explicitly instead of relying on a bare `except:` around a list lookup, which
# hid unrelated errors and produced labels such as '21th' for larger rank counts.
columns = ['Neighbourhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# create a new dataframe with one row per neighbourhood of toronto_grouped
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# fill the rank columns of each row with that neighbourhood's top venue categories
for ind in range(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted

## Run k-means to cluster neighbourhoods into 5 clusters

In [None]:
# set number of clusters
kclusters = 5

# Feature matrix: every column except the neighbourhood name. The keyword form is
# used because the positional `axis` argument of DataFrame.drop was removed in
# pandas 2.0, so drop('Neighbourhood', 1) raises a TypeError there.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

# Run k-means clustering. random_state fixes the initialisation, and n_init is
# pinned so the result does not depend on which scikit-learn default is installed.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check the cluster labels generated for the first 10 rows
kmeans.labels_[0:10]

## Create new dataframe with each neighbourhood, cluster label and their top 10 venues

In [None]:
# kmeans.labels_ has one entry per row of toronto_grouped (the neighbourhoods that
# have venues, in groupby order), NOT per row of df_tr. The labels are therefore
# attached by neighbourhood name; assigning them to df_tr by position would give
# neighbourhoods the wrong cluster, or fail when the lengths differ.
cluster_labels = pd.DataFrame({
    'Neighbourhood': toronto_grouped['Neighbourhood'],
    'Cluster Labels': kmeans.labels_,
})

# Merging builds a new dataframe, so df_tr itself is left unmodified.
# Resulting column order: df_tr columns, 'Cluster Labels', then the top-venue columns.
toronto_merged = (
    df_tr
    .merge(cluster_labels, on='Neighbourhood', how='left')
    .merge(neighbourhoods_venues_sorted, on='Neighbourhood', how='left')
)

# Neighbourhoods without any venue were never clustered: drop them, then restore
# the integer dtype that the missing values had turned into float.
toronto_merged = toronto_merged.dropna(subset=['Cluster Labels']).reset_index(drop=True)
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)

toronto_merged.head()

## Visualise clusters by plotting them on a map

In [None]:
# create map centred on the coordinates found for Toronto
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster, as hex strings.
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add one marker per neighbourhood, coloured by its cluster
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    # Labels run from 0 to kclusters-1, so they index the palette directly
    # (indexing with cluster-1 only worked through negative-index wraparound).
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters