__Section 1: Scraping the data and transforming it into a pandas dataframe__

In [79]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

Parsing data

In [81]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source,'lxml')

Find the table and iterate through the cell values

In [103]:
table = soup.find('table')
tablevalues = table.find_all('td') #finds all the cells in table and creates a list

elementcount = len(tablevalues) #total number of cells

postcode,brough,neighborhood = [],[],[] #sets columns to 3 empty lists

for i in range(0, elementcount, 3): #start at cell 0, iterate through cells in increments of 3
    postcode.append(tablevalues[i].text.strip()) #removes white space of text
    borough.append(tablevalues[i+1].text.strip())
    neighborhood.append(tablevalues[i+2].text.strip())

Build dataframe

In [117]:
df = pd.DataFrame(data=[postcode, borough, neighborhood]).transpose()
df.columns = ['Postcode', 'Borough', 'Neighborhood']
df

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
569,,Etobicoke,
570,,Etobicoke,
571,,Etobicoke,
572,,Etobicoke,


Clean and transform the data per requirements

In [119]:
df.drop(df[df['Borough'] == 'Not assigned'].index, inplace=True)
df.loc[df.Neighborhood == 'Not assigned', "Neighborhood"] = df.Borough

Cleaned dataframe

In [123]:
newdf = newdf.groupby(['Postcode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()
newdf.columns = ['Postcode', 'Borough', 'Neighborhood']
newdf

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


Number of rows in dataframe

In [125]:
newdf.shape

(103, 3)

__Section 2: Finding geographical coordinates and appending it to dataframe__

In [129]:
coordinatesfile = 'http://cocl.us/Geospatial_data'
columns = ["Postcode","Latitude","Longitude"]

coordinates_df = pd.read_csv(coordinatesfile,names=columns,skiprows=1)  #getting data from coordinates file
print("Done")

new_coordinates_df = pd.merge(newdf, coordinates_df, on='Postcode', how='inner')  #join the dfs to also show coordinates info

new_coordinates_df

Done


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


__Section 3: Explore and cluster the Toronto neighborhoods__

In [139]:
#start by importing all necessary libraries 

import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json 

!pip install geocoder==1.5.0
!pip install geopy
#!conda install -c conda-forge geopy --yes #uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim #convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
!pip install sklearn

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
!pip install folium
import folium # map rendering library

print('Libraries imported.')

You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Libraries imported.


Find greographical coordinates of the city of Toronto

In [146]:
df_toronto = new_coordinates_df[new_coordinates_df['Borough'].str.contains("Toronto")]
df_toronto

address = 'Toronto, Ontario' 

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

  


The geograpical coordinate of Toronto City are 43.653963, -79.387207.


Create a map of Toronto with neighborhoods superimposed on top

In [148]:
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

Picking central Toronto (ct) for deep dive

In [149]:
neighborhoods=df_toronto
ct_data = neighborhoods[neighborhoods['Borough'] == 'Central Toronto'].reset_index(drop=True)
ct_data.head()

address = 'Central Toronto, Ontario'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Central Toronto are {}, {}.'.format(latitude, longitude))

  import sys


The geograpical coordinate of Central Toronto are 43.6449033, -79.3818364.


Create map of Central Toronto with neighborhoods superimposed on top

In [151]:
ct_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(ct_data['Latitude'], ct_data['Longitude'], ct_data['Borough'], ct_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(ct_map)  
    
ct_map

Finding nearby venues in Central Toronto

In [154]:
CLIENT_ID = 'VXKTGFX1QDNVIVEVIO1G3ZRYCWIHP24EHIDQPAKN3G1DRQQZ' # your Foursquare ID
CLIENT_SECRET = 'HNMENEA4I0CJM3OSCQOKRGF1UBTDZ3MBQ1J12FH2UU4ZSAKC' # your Foursquare Secret
VERSION = '20190107' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

ct_data.loc[0, 'Neighborhood']

neighborhood_latitude = ct_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = ct_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = ct_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

results = requests.get(url).json()
results

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
    
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Your credentails:
CLIENT_ID: VXKTGFX1QDNVIVEVIO1G3ZRYCWIHP24EHIDQPAKN3G1DRQQZ
CLIENT_SECRET:HNMENEA4I0CJM3OSCQOKRGF1UBTDZ3MBQ1J12FH2UU4ZSAKC
Latitude and longitude values of Lawrence Park are 43.7280205, -79.3887901.


Unnamed: 0,name,categories,lat,lng
0,Lawrence Park Ravine,Park,43.726963,-79.394382
1,Dim Sum Deluxe,Dim Sum Restaurant,43.726953,-79.39426
2,Zodiac Swim School,Swim School,43.728532,-79.38286
3,TTC Bus #162 - Lawrence-Donway,Bus Line,43.728026,-79.382805
