In [5]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
import lxml

In [8]:
# read_html returns a list with one DataFrame per table found on the page.
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

# There are three tables on the page; the one with the postal codes is
# expected to be the first. Wikipedia pages get edited, so verify that the
# columns the rest of the notebook relies on are present instead of silently
# carrying on with the wrong table.

raw_postal_codes = df[0]

expected_columns = {'Postal Code', 'Borough', 'Neighbourhood'}
missing_columns = expected_columns - set(raw_postal_codes.columns)
if missing_columns:
    raise ValueError(
        "Unexpected table layout on the Wikipedia page; missing columns: {}".format(
            sorted(missing_columns)))

raw_postal_codes.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [9]:
# Keep only rows whose borough AND neighbourhood are assigned. At the time of
# writing no row has an assigned borough with an unassigned neighbourhood,
# but the data set is small enough that the second check costs nothing.

is_assigned = (
    (raw_postal_codes['Borough'] != 'Not assigned')
    & (raw_postal_codes['Neighbourhood'] != 'Not assigned')
)
raw_postal_codes = raw_postal_codes[is_assigned]

# The page currently lists each postal code once, with its neighbourhoods
# already comma-separated on that row. The grouping below still handles a
# postal code spread over several rows: neighbourhoods are joined with ", ",
# and the borough is listed once per distinct value rather than repeated for
# every row.
#
# The columns are renamed here (PostalCode, Neighborhood) because the later
# cells of this notebook use those spellings.

postal_codes = (
    raw_postal_codes
    .groupby('Postal Code')
    .agg(Borough=('Borough', lambda names: ', '.join(names.unique())),
         Neighborhood=('Neighbourhood', ', '.join))
    .reset_index()
    .rename(columns={'Postal Code': 'PostalCode'})
)
postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
postal_codes.shape

(103, 3)

In [11]:
import pandas as pd
import lxml

# Condensed version of the load / filter / group steps above.
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
raw_postal_codes = df[0]

# Keep only rows whose borough and neighbourhood are both assigned.
is_assigned = (
    (raw_postal_codes['Borough'] != 'Not assigned')
    & (raw_postal_codes['Neighbourhood'] != 'Not assigned')
)
raw_postal_codes = raw_postal_codes[is_assigned]

# One row per postal code: neighbourhoods joined with ", ", each distinct
# borough listed once (a plain join would repeat the borough name for every
# source row of the same postal code).
postal_codes = (
    raw_postal_codes
    .groupby('Postal Code')
    .agg(Borough=('Borough', lambda names: ', '.join(names.unique())),
         Neighborhood=('Neighbourhood', ', '.join))
    .reset_index()
    .rename(columns={'Postal Code': 'PostalCode'})
)
print("Dataframe ready!")
print(postal_codes.shape)

Dataframe ready!
(103, 3)


In [12]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.


In [13]:
import geocoder

# Small change from the Coursera template: as a matter of style I don't let a
# loop run forever until it succeeds, so each postal code gets a bounded
# number of attempts. Good thing, too: the geocoder call never returned a
# value for me, no matter how many times the loop ran.
# NOTE(review): geocoder.google presumably needs a Google API key to return
# results - confirm against the geocoder documentation.

max_attempts = 10
found = 0
not_found = 0

for p in postal_codes['PostalCode']:
    print(".", end='')
    # Reset for every postal code; otherwise the first successful lookup
    # would make every later code count as "found" without being queried.
    lat_lng_coords = None
    for _ in range(max_attempts):
        g = geocoder.google("{}, Toronto, Ontario, Canada".format(p))
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            break
    if lat_lng_coords is None:
        not_found += 1
    else:
        found += 1

print("\nfound: {}\nnot found: {}".format(found, not_found))

.......................................................................................................
found: 0
not found: 103


In [14]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [15]:
from geopy.geocoders import Nominatim

# Try the same per-postal-code lookup with Nominatim and tally the results.
geolocator = Nominatim(user_agent="Capstone Week 3")
query_template = "{}, Toronto, Ontario, Canada"

hits = []
for i in postal_codes['PostalCode']:
    print(".", end='')
    location = geolocator.geocode(query_template.format(i))
    hits.append(location is not None)

found = sum(hits)
not_found = len(hits) - found

print("\nfound: {}\nnot found: {}".format(found, not_found))

.......................................................................................................
found: 20
not found: 83


In [16]:
# Latitude/longitude per postal code, read from the Geospatial_data CSV. Its
# key column is renamed so it matches the PostalCode column used here.
long_lat = (
    pd.read_csv("https://cocl.us/Geospatial_data")
    .rename(columns={'Postal Code': 'PostalCode'})
)

# Drop coordinate columns left over from an earlier run of this cell, so that
# re-running it does not create Latitude_x / Latitude_y duplicates.
postal_codes = (
    postal_codes
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .merge(long_lat, on='PostalCode')
)
postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [17]:
postal_codes.shape

(103, 5)

In [18]:
import pandas as pd
import lxml

# Rebuild the postal-code frame from scratch (load, filter, group).
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
raw_postal_codes = df[0]

# Keep only rows whose borough and neighbourhood are both assigned.
is_assigned = (
    (raw_postal_codes['Borough'] != 'Not assigned')
    & (raw_postal_codes['Neighbourhood'] != 'Not assigned')
)
raw_postal_codes = raw_postal_codes[is_assigned]

# One row per postal code: neighbourhoods joined with ", ", each distinct
# borough listed once (a plain join would repeat the borough name for every
# source row of the same postal code).
postal_codes = (
    raw_postal_codes
    .groupby('Postal Code')
    .agg(Borough=('Borough', lambda names: ', '.join(names.unique())),
         Neighborhood=('Neighbourhood', ', '.join))
    .reset_index()
    .rename(columns={'Postal Code': 'PostalCode'})
)
print("Dataframe ready!")
print(postal_codes.shape)

Dataframe ready!
(103, 3)


In [19]:
# Latitude/longitude per postal code, read from the Geospatial_data CSV. Its
# key column is renamed so it matches the PostalCode column used here.
long_lat = (
    pd.read_csv("https://cocl.us/Geospatial_data")
    .rename(columns={'Postal Code': 'PostalCode'})
)

# Drop coordinate columns left over from an earlier run of this cell, so that
# re-running it does not create Latitude_x / Latitude_y duplicates.
postal_codes = (
    postal_codes
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .merge(long_lat, on='PostalCode')
)
print("Longitude and Latitude added to dataframe")
print(postal_codes.shape)

Longitude and Latitude added to dataframe
(103, 5)


In [20]:
pip install folium

Collecting folium
  Downloading folium-0.11.0-py2.py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 3.2 MB/s  eta 0:00:01
[?25hCollecting branca>=0.3.0
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0
Note: you may need to restart the kernel to use updated packages.


In [21]:
from geopy.geocoders import Nominatim
import folium
import requests
from pandas import json_normalize
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [22]:
postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [23]:
# A comma-separated list containing k commas names k + 1 neighbourhoods.
postal_codes['NumberNeighborhoods'] = postal_codes['Neighborhood'].str.count(",") + 1

borough_total = len(postal_codes['Borough'].unique())
neighborhood_total = postal_codes['NumberNeighborhoods'].sum()

print('The dataframe has {} boroughs and {} neighborhoods'.format(
    borough_total, neighborhood_total))

The dataframe has 10 boroughs and 217 neighborhoods


In [24]:
# Look up the map centre. `latitude` and `longitude` are used by the map cell
# below, so they must be defined even when the geocoder returns nothing.
address = "Toronto, Ontario, Canada"
geolocator = Nominatim(user_agent="Capstone_Week_3")
location = geolocator.geocode(address)
if location is not None:
    latitude = location.latitude
    longitude = location.longitude
    print("The coordinates of Toronto are {}, {}".format(latitude, longitude))
else:
    # Fall back to the coordinates this cell reported on an earlier
    # successful run, rather than leaving the names undefined (or stale).
    latitude, longitude = 43.6534817, -79.3839347
    print("Coordinates not found!")
    print("Falling back to {}, {}".format(latitude, longitude))

The coordinates of Toronto are 43.6534817, -79.3839347


In [25]:
# The colour names come from the folium documentation. I experimented
# a little to position them.

folium_colors = ['purple', 'black', 'green', 'red', 'orange', 'darkred',
                 'blue', 'beige', 'darkblue', 'darkgreen']

boroughs = postal_codes['Borough'].unique()

# Wrap around the palette so every borough gets a colour even when there are
# more boroughs than colours (zip() would silently leave the extras out and
# the marker loop would then fail with a KeyError).
borough_colors = {
    borough: folium_colors[position % len(folium_colors)]
    for position, borough in enumerate(boroughs)
}

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle per postal code, coloured by borough, with a "code, borough" popup.
for i, row in postal_codes.iterrows():
    marker_color = borough_colors[row['Borough']]
    label_text = "{}, {}".format(row['PostalCode'], row['Borough'])
    label = folium.Popup(label_text, parse_html=True)
    folium.CircleMarker(
        [row['Latitude'], row['Longitude']],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

In [26]:
# Adapted from New York exercise. Refactored slightly for clarity.

import os
import warnings

# Foursquare credentials are read from the environment so they are never
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been
# published with this notebook and should be revoked/regenerated.
client_secret = os.environ.get("FOURSQUARE_CLIENT_SECRET", "")
client_id = os.environ.get("FOURSQUARE_CLIENT_ID", "")
if not (client_id and client_secret):
    warnings.warn(
        "FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; "
        "Foursquare requests will be rejected.")

api_version = '20180323' # different from what we used in the NY exercise, from FS documentation
limit = 100  # maximum number of venues requested per call


# Radius is in meters, I learn from the Foursquare API documentation

def get_nearby_venues(postal_code, latitude, longitude, radius=500):
    """Fetch the venues Foursquare's explore endpoint returns around a point.

    Parameters
    ----------
    postal_code : value copied unchanged into every result tuple.
    latitude, longitude : coordinates sent as the ``ll`` query parameter and
        copied unchanged into every result tuple.
    radius : search radius in meters (default 500).

    Returns
    -------
    list of tuples
        ``(postal_code, latitude, longitude, venue name, venue latitude,
        venue longitude, venue category)``; the category is ``None`` when the
        venue lists no categories. The list is empty when the response
        contains no groups.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    # Let requests build and URL-encode the query string; the credentials,
    # API version and limit come from the module-level configuration.
    response = requests.get(
        'https://api.foursquare.com/v2/venues/explore',
        params={
            'client_id': client_id,
            'client_secret': client_secret,
            'v': api_version,
            'll': '{},{}'.format(latitude, longitude),
            'radius': radius,
            'limit': limit,
        },
        timeout=30,  # do not hang the notebook on a stalled connection
    )
    # Surface API failures as an HTTP error rather than as a confusing
    # KeyError on 'groups' further down.
    response.raise_for_status()

    groups = response.json()["response"].get('groups') or []
    items = groups[0]["items"] if groups else []

    venues_list = []
    for v in items:
        venue = v['venue']
        categories = venue.get('categories') or []
        venues_list.append((postal_code,
                            latitude,
                            longitude,
                            venue['name'],
                            venue['location']['lat'],
                            venue['location']['lng'],
                            categories[0]['name'] if categories else None))

    return venues_list

In [27]:
# Gather the venue tuples for every postal code into one flat list,
# printing a dot per postal code as a progress indicator.
venues_list = []

for _, row in postal_codes.iterrows():
    print(".", end="")
    nearby = get_nearby_venues(row['PostalCode'], row['Latitude'], row['Longitude'])
    venues_list.extend(nearby)

print("\nVenues list created")

.......................................................................................................
Venues list created


In [28]:
# I keep "PostalCode" without the space, to preserve compatibility
# with the earlier required step. Which, frankly, I wouldn't have done.

# Column names follow the order of the fields in each venue tuple.
venue_columns = [
    'PostalCode',
    'Postal Code Latitude',
    'Postal Code Longitude',
    'Venue',
    'Venue Latitude',
    'Venue Longitude',
    'Venue Category',
]
toronto_venues = pd.DataFrame(data=venues_list, columns=venue_columns)

In [29]:
print(toronto_venues.shape)
toronto_venues.head()

(2122, 7)


Unnamed: 0,PostalCode,Postal Code Latitude,Postal Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,M1B,43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,M1C,43.784535,-79.160497,Chris Effects Painting,43.784343,-79.163742,Construction & Landscaping
3,M1C,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
4,M1E,43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank


In [30]:
toronto_venues.groupby('PostalCode').count()

Unnamed: 0_level_0,Postal Code Latitude,Postal Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1B,2,2,2,2,2,2
M1C,2,2,2,2,2,2
M1E,7,7,7,7,7,7
M1G,4,4,4,4,4,4
M1H,8,8,8,8,8,8
...,...,...,...,...,...,...
M9N,1,1,1,1,1,1
M9P,7,7,7,7,7,7
M9R,4,4,4,4,4,4
M9V,9,9,9,9,9,9


In [31]:
print('There are {} unique categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 267 unique categories.


In [32]:
# one hot encoding for the venue categories

# One indicator column per venue category; the empty prefix and separator
# keep the bare category names as column names.
category_indicators = pd.get_dummies(
    toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Tag every venue row with its postal code. The column is appended last; the
# New York exercise moved it to the front at this point, but the grouping
# step further down takes care of that.
toronto_onehot = category_indicators.assign(PostalCode=toronto_venues['PostalCode'])

toronto_onehot.head()


Unnamed: 0,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio,PostalCode
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M1B
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M1B
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M1C
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M1C
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M1E


In [33]:
toronto_onehot.shape

(2122, 268)

In [34]:
# Average the indicator columns per postal code; PostalCode stays a regular
# column, so the result has a fresh 0..n-1 index.
toronto_grouped = toronto_onehot.groupby('PostalCode', as_index=False).mean()
toronto_grouped

Unnamed: 0,PostalCode,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Truck Stop,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,M9N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,M9P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,M9R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,M9V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
toronto_grouped.shape

(100, 268)

In [36]:
def most_common_venues(row, number=5):
    """Return the labels of the `number` largest values in `row`.

    The first entry of `row` is skipped (in this notebook it holds the
    postal code); the remaining entries are ranked in descending order and
    the index labels of the top `number` are returned as an array.
    """
    return row.iloc[1:].sort_values(ascending=False).index[:number].values

number_top_venues = 5


def _ordinal(position):
    """Return `position` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    if 11 <= position % 100 <= 13:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(position % 10, 'th')
    return "{}{}".format(position, suffix)


# Column headers: PostalCode followed by "1st Most Common Venue", "2nd ...", etc.
columns = ['PostalCode'] + [
    "{} Most Common Venue".format(_ordinal(i))
    for i in range(1, number_top_venues + 1)
]

# Build the table in one go: one record per postal code, holding the code
# followed by its top venue categories. The index of toronto_grouped is kept.
postal_code_venues_sorted = pd.DataFrame(
    [
        [row['PostalCode'], *most_common_venues(row, number_top_venues)]
        for _, row in toronto_grouped.iterrows()
    ],
    columns=columns,
    index=toronto_grouped.index,
)

postal_code_venues_sorted.head()

Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Print Shop,Fast Food Restaurant,Yoga Studio,Dim Sum Restaurant,Diner
1,M1C,Construction & Landscaping,Bar,Yoga Studio,Donut Shop,Diner
2,M1E,Restaurant,Rental Car Location,Breakfast Spot,Medical Center,Intersection
3,M1G,Coffee Shop,Pharmacy,Korean BBQ Restaurant,Escape Room,Ethiopian Restaurant
4,M1H,Fried Chicken Joint,Gas Station,Hakka Restaurant,Bakery,Athletics & Sports


In [37]:
clusters = 4

# Cluster on the category columns only, leaving out the PostalCode column.
# `columns=` replaces the positional axis argument (`drop('PostalCode', 1)`),
# which is deprecated and rejected by newer pandas releases.
toronto_grouped_clustering = toronto_grouped.drop(columns='PostalCode')

# random_state is fixed so the cluster assignment is reproducible.
kmeans = KMeans(n_clusters=clusters, random_state=3).fit(toronto_grouped_clustering)
kmeans.labels_[0:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [38]:
# Not sure if this is a problem, but in the New York notebook the Cluster Labels
# column was added as an int32, while when I did it, it was converted into a float64.
# So I've added the type conversion.

postal_code_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
postal_code_venues_sorted['Cluster Labels'] = postal_code_venues_sorted['Cluster Labels'].astype('int32')


toronto_merged = postal_codes
toronto_merged = toronto_merged.join(
        postal_code_venu

SyntaxError: unexpected EOF while parsing (<ipython-input-38-a54cf3a2efcc>, line 11)