# Import libraries that will be used throughout the assignment

In [15]:
!conda install -c conda-forge geopy --yes
!pip install bs4
import json
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library
from bs4 import BeautifulSoup

warnings.filterwarnings('ignore')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



# Question 1

Obtain the Toronto postal code table from Wikipedia using BeautifulSoup

In [36]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
source = response.text

# Name the parser explicitly: without it BeautifulSoup picks whichever parser happens
# to be installed, so the parsed tree can differ from one machine to the next.
soup = BeautifulSoup(source, 'html.parser')
table_data = soup.find('div', class_='mw-parser-output')
if table_data is None or table_data.table is None:
    raise ValueError('Could not find the postal code table in the page at ' + url)

# Use the first table inside the article body. Fall back to the <table> element itself
# when the markup carries no explicit <tbody>; the rows are reachable either way.
table = table_data.table.tbody or table_data.table

Initialize the columns, then loop over the table rows (the `tr` elements of the HTML) and append the text of each cell (`td`) to its column.

In [37]:
# Column names, in the order the cells appear in each table row.
col = ['Postal Code', 'Borough', 'Neighborhood']
data = {key: [] for key in col}

for row in table.find_all('tr'):
    cells = [cell.text.replace('\n', '') for cell in row.find_all('td')]
    # Skip rows that cannot fill every column (e.g. a header row with no <td> cells);
    # appending a partial row would leave the column lists with different lengths.
    if len(cells) < len(col):
        continue
    for column, value in zip(col, cells):
        data[column].append(value)

df = pd.DataFrame(data, columns=col)
print('Shape of the table:', df.shape)

# Keep only postal codes that have a borough assigned.
df = df[df['Borough'] != 'Not assigned'].reset_index(drop = True)
print('Shape after dropping Not Assigned Boroughs:',df.shape)
print('Number of rows where Neighborhood is "Not assigned" but borough has value: ', 
      df[df['Neighborhood'] == 'Not assigned'].shape[0])
df

Shape of the table: (180, 3)
Shape after dropping Not Assigned Boroughs: (103, 3)
Number of rows where Neighborhood is "Not assigned" but borough has value:  0


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


# Question 2

Read the geospatial data csv file, and append the latitude and longitude using the Postal Code.

In [38]:
# Load the coordinates table and attach it to the postal code table.
# The inner join on 'Postal Code' keeps only postal codes present in both tables.
coordinates = pd.read_csv('https://cocl.us/Geospatial_data')
df = df.merge(coordinates, on='Postal Code', how='inner')
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


# Question 3

Obtain the coordinates for Toronto

In [39]:
# Look up the coordinates of the city itself; they are used to centre the maps below.
where = 'Toronto, Ontario, Canada'
geo = Nominatim(user_agent="tor_exp")
location = geo.geocode(where)
lat, lon = location.latitude, location.longitude
print(where, '| Latitude:', lat, 'Longitude:', lon)

Toronto, Ontario, Canada | Latitude: 43.6534817 Longitude: -79.3839347


Create map of Toronto using Folium

In [42]:
# Map centred on the Toronto coordinates held in `lat` / `lon`.
t_map = folium.Map(location=[lat, lon], zoom_start=10)

# Add one blue marker per postal code.
# The loop variables deliberately do NOT reuse the names `lat` / `lon`: doing so would
# overwrite the city-centre coordinates with those of the last row in `df`.
for borough, hood, hood_lat, hood_lon in zip(df['Borough'], df['Neighborhood'],
                                             df['Latitude'], df['Longitude']):
    label = folium.Popup('{}, {}'.format(borough, hood), parse_html=True)
    folium.CircleMarker(
        [hood_lat, hood_lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.4,
        parse_html=False).add_to(t_map)

t_map

Split the data into one dataframe per borough

In [43]:
def _rows_for_borough(borough_name):
    """Return the rows of `df` whose 'Borough' equals `borough_name`, re-indexed from 0."""
    matches = df['Borough'] == borough_name
    return df[matches].reset_index(drop=True)


# One dataframe per borough.
central = _rows_for_borough('Central Toronto')
downtown = _rows_for_borough('Downtown Toronto')
east = _rows_for_borough('East Toronto')
easty = _rows_for_borough('East York')
eto = _rows_for_borough('Etobicoke')
mis = _rows_for_borough('Mississauga')
northy = _rows_for_borough('North York')
scar = _rows_for_borough('Scarborough')
west = _rows_for_borough('West Toronto')
york = _rows_for_borough('York')

Plot the 10 Boroughs using different colors

In [44]:
# Look up the city-centre coordinates again so this cell does not depend on whatever
# `lat` / `lon` were left holding by earlier cells.
where = 'Toronto, Ontario, Canada'
geo = Nominatim(user_agent="tor_exp")
location = geo.geocode(where)
lat = location.latitude
lon = location.longitude
print(where, '| Latitude:', lat, 'Longitude:', lon)

# Marker outline colour for each borough, listed in the order the markers are drawn.
BOROUGH_COLORS = {
    'Central Toronto': 'yellow',
    'Downtown Toronto': 'red',
    'East Toronto': 'blue',
    'West Toronto': 'green',
    'North York': 'black',
    'East York': 'white',
    'York': 'purple',
    'Mississauga': 'orange',
    'Scarborough': 'brown',
    'Etobicoke': 'gray',
}

# Map of Toronto with one marker per postal code, coloured by borough.
ct_map = folium.Map(location=[lat, lon], zoom_start= 10)

for borough_name, marker_color in BOROUGH_COLORS.items():
    borough_rows = df[df['Borough'] == borough_name]
    # The loop variables avoid the names `lat` / `lon` so the city-centre coordinates
    # stay intact for the cells that follow.
    for hood, hood_lat, hood_lon in zip(borough_rows['Neighborhood'],
                                        borough_rows['Latitude'],
                                        borough_rows['Longitude']):
        label = folium.Popup('{}, {}'.format(borough_name, hood), parse_html=True)
        folium.CircleMarker(
            [hood_lat, hood_lon],
            radius=5,
            popup=label,
            color=marker_color,
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.4,
            parse_html=False).add_to(ct_map)

ct_map

Toronto, Ontario, Canada | Latitude: 43.6534817 Longitude: -79.3839347


Use Foursquare API to gather information about venues in Toronto

In [45]:
# Foursquare credentials are read from the environment so that they are never stored
# in the notebook. Export FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel. Any key pair that was previously committed in this notebook
# should be treated as leaked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') # your Foursquare Secret
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET environment variables.')
VERSION = '20180605' # Foursquare API version

LIMIT = 100   # maximum number of venues requested per call
radius =1000  # value sent as the `radius` query parameter
# NOTE(review): `lat` / `lon` are whatever earlier cells last assigned to them;
# confirm they hold the Toronto centre here rather than a value left by a marker loop.
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, VERSION, lat,lon, radius, LIMIT)

# getting the venues data from the Foursquare API in JSON format
results = requests.get(url, timeout=30).json()

Define the getNearbyVenues function.

In [87]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint once per location and tabulate the venues.

    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated together with zip; each triple is one location to query.
    radius : int, default 500
        Value sent as the `radius` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue returned, with columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame is empty, but still has these
        columns, when no venues are returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    records = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # make the GET request; `params` lets requests build and encode the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()

        # A payload without any group is treated as "no venues" instead of raising
        # KeyError / IndexError.
        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            records.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may come back without any category
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor keeps the schema even when `records`
    # is empty; assigning `.columns` on an empty frame raises a length-mismatch error.
    return pd.DataFrame(records, columns=columns)

Run the getNearbyVenues function, display number of unique categories.

In [88]:
# Fetch the venues around every postal code location (default search radius).
toronto_venues = getNearbyVenues(df['Neighborhood'], df['Latitude'], df['Longitude'])

# Count the distinct venue categories that were returned.
category_count = len(toronto_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(category_count))

There are 273 uniques categories.


Create one-hot encoding of the different categories

In [84]:
# Display the venues dataframe built by getNearbyVenues (one row per venue returned).
toronto_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.332140,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
...,...,...,...,...,...,...,...
2134,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Koala Tan Tanning Salon & Sunless Spa,43.631370,-79.519006,Tanning Salon
2135,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Once Upon A Child,43.631075,-79.518290,Kids Store
2136,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Value Village,43.631269,-79.518238,Thrift / Vintage Store
2137,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Kingsway Boxing Club,43.627254,-79.526684,Gym


In [89]:
# One indicator column per venue category.
t_oh = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Put the neighborhood name in the first column.
# If one of the venue categories is itself called 'Neighborhood' (which the previous
# hard-coded `columns[-82]` lookup suggests was the case), its indicator column would
# collide with the neighborhood name column, so it is dropped first. Inserting at
# position 0 by name avoids any data-dependent column index and keeps every other
# category column, including the last one.
t_oh = t_oh.drop(columns=['Neighborhood'], errors='ignore')
t_oh.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

t_oh.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Group results by neighborhood (first column)

In [95]:
# Mean of every category indicator per neighborhood, i.e. the share of a
# neighborhood's venues that fall in each category.
# The grouping key (first column) is taken out of the frame being averaged, and
# numeric_only=True skips any remaining text column: averaging a string column raises
# TypeError on recent pandas versions.
neighborhood_key = t_oh.iloc[:, 0]
t_grouped = (
    t_oh.iloc[:, 1:]
    .groupby(neighborhood_key)
    .mean(numeric_only=True)
    .reset_index()
)
t_grouped

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,"Willowdale, Willowdale West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
94,York Mills West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Print the top 3 venue categories for each neighborhood

In [98]:
num_top_venues = 3

for hood in t_grouped['Neighborhood']:
    print("----"+hood+"----")

    # Turn the neighborhood's single row into a two-column (venue, freq) table.
    hood_freqs = t_grouped[t_grouped['Neighborhood'] == hood].T.reset_index()
    hood_freqs.columns = ['venue','freq']

    top_venues = (
        hood_freqs.iloc[1:]                 # skip the row holding the neighborhood name
        .astype({'freq': float})
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
        .head(num_top_venues)
    )
    print(top_venues)
    print('\n')

----Agincourt----
            venue  freq
0  Breakfast Spot   0.2
1          Lounge   0.2
2    Skating Rink   0.2


----Alderwood, Long Branch----
            venue  freq
0     Pizza Place  0.29
1             Gym  0.14
2  Sandwich Place  0.14


----Bathurst Manor, Wilson Heights, Downsview North----
         venue  freq
0         Bank  0.10
1  Coffee Shop  0.10
2  Bridal Shop  0.05


----Bayview Village----
                 venue  freq
0  Japanese Restaurant  0.25
1   Chinese Restaurant  0.25
2                 Bank  0.25


----Bedford Park, Lawrence Manor East----
                venue  freq
0         Coffee Shop  0.09
1      Sandwich Place  0.09
2  Italian Restaurant  0.09


----Berczy Park----
            venue  freq
0     Coffee Shop  0.09
1        Beer Bar  0.04
2  Farmers Market  0.04


----Birch Cliff, Cliffside West----
                   venue  freq
0  General Entertainment  0.25
1        College Stadium  0.25
2           Skating Rink  0.25


----Brockton, Parkdale Village, Exh

Create new dataframe displaying the top 3 venues of each neighborhood

In [99]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest values in `row`.

    The first entry of `row` is skipped; the remaining entries are sorted in
    descending order and the index labels of the leading ones are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [111]:
num_top_venues = 3


def _ordinal(n):
    """Return `n` followed by its English ordinal suffix: 1 -> '1st', 11 -> '11th', 22 -> '22nd'."""
    if 10 <= n % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '{}{}'.format(n, suffix)


# create columns according to number of top venues
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# Rank the categories of every neighborhood, then build the table in one step
# instead of filling an empty dataframe row by row.
top_venues = [
    return_most_common_venues(t_grouped.iloc[ind, :], num_top_venues)
    for ind in range(t_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=t_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', t_grouped['Neighborhood'])

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Agincourt,Clothing Store,Latin American Restaurant,Breakfast Spot
1,"Alderwood, Long Branch",Pizza Place,Pharmacy,Gym
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Mobile Phone Shop
3,Bayview Village,Café,Japanese Restaurant,Chinese Restaurant
4,"Bedford Park, Lawrence Manor East",Sandwich Place,Italian Restaurant,Coffee Shop
...,...,...,...,...
91,"Willowdale, Willowdale West",Pharmacy,Pizza Place,Butcher
92,Woburn,Coffee Shop,Korean BBQ Restaurant,Mexican Restaurant
93,Woodbine Heights,Skating Rink,Park,Athletics & Sports
94,York Mills West,Park,Convenience Store,Women's Store
