# Dataframe of Canadian Postal Codes

#### Import libraries

In [1]:
try:
    import bs4
except:
    !conda install lxml html5lib BeautifulSoup4 --yes

import numpy as np
import pandas as pd

Solving environment: \ 
  - conda-forge::html5lib-0.9999999-py36_0
  - defaults::html5lib-0.9999999-py36done


  current version: 4.5.11
  latest version: 4.8.0

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - beautifulsoup4
    - html5lib
    - lxml


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    beautifulsoup4-4.8.1       |           py36_0         153 KB
    lxml-4.3.0                 |   py36hefd8a0e_0         1.5 MB
    certifi-2019.11.28         |           py36_0         156 KB
    soupsieve-1.9.5            |           py36_0          61 KB
    openssl-1.1.1d             |       h7b6447c_3         3.7 MB
    ------------------------------------------------------------
                                           Total:         5.6 MB

The

#### Perform Data Scraping from Wikipedia List of Postal Codes

In [2]:
# Scrape the postal-code table with pandas' read_html.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# read_html hands back one frame per matching table on the page. Only one
# "wikitable" is expected, so take the first. The first table column is read
# as the index and then turned straight back into an ordinary column.
canada_frame = pd.read_html(url, index_col=0, attrs={"class": "wikitable"})[0].reset_index()

# Discard every row whose Borough is undefined ('Not assigned').
canada_frame = canada_frame.loc[canada_frame["Borough"].ne("Not assigned")].copy()

# Rows without an assigned neighborhood inherit the borough name instead.
i_no_neighborhoods = canada_frame["Neighborhood"].eq("Not assigned")
canada_frame.loc[i_no_neighborhoods, "Neighborhood"] = canada_frame.loc[i_no_neighborhoods, "Borough"]

# Collapse to one row per postal code: keep the first borough seen and join
# all neighborhood names into a single comma-separated string.
funcs = {
    "Borough": "first",
    "Neighborhood": lambda names: ", ".join(names),
}
canada_frame = canada_frame.groupby(canada_frame["Postcode"]).aggregate(funcs).reset_index()



View new dataframe

In [3]:
# Display the cleaned frame (one row per postal code after the groupby above).
canada_frame

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [4]:
# (number of rows, number of columns) of the cleaned frame.
canada_frame.shape

(103, 3)

# Geocoding Canadian Neighborhoods


In [5]:
#if geo service does not work get coords from file

def get_coords_from_file(source=r'https://cocl.us/Geospatial_data'):
    """Add coordinate columns to the global ``canada_frame`` from a CSV file.

    Intended as the fallback when the geocoding service does not work. The CSV
    is left-joined onto ``canada_frame`` by matching ``canada_frame['Postcode']``
    against the CSV's ``'Postal Code'`` column; the redundant ``'Postal Code'``
    column is removed afterwards.

    Parameters
    ----------
    source : str or file-like, optional
        Anything ``pandas.read_csv`` accepts. Defaults to the geospatial CSV
        used by this notebook. It must contain a ``'Postal Code'`` column.
        NOTE(review): the remaining columns are presumably ``Latitude`` and
        ``Longitude`` -- confirm against the file.

    Returns
    -------
    None
        The result is stored by rebinding the module-level ``canada_frame``.
    """
    global canada_frame
    df_geo = pd.read_csv(source)

    # Columns already present in canada_frame that the CSV is about to supply
    # (e.g. coordinates from an earlier run) are dropped first. Otherwise the
    # merge would emit suffixed duplicates such as Latitude_x / Latitude_y.
    stale = [
        col for col in df_geo.columns
        if col in canada_frame.columns and col != 'Postcode'
    ]
    base = canada_frame.drop(columns=stale)

    canada_frame = pd.merge(
        left=base,
        right=df_geo,
        how='left',
        left_on='Postcode',
        right_on='Postal Code',
    ).drop(columns=['Postal Code'])

#### Create a pandas dataframe for the geospatial data

In [6]:
# Now get the geo coordinates for the postal codes

try:
    import geocoder
except:
    !conda install -c conda-forge geocoder --yes

import geocoder

try:
    canada_frame.drop(["Longitude", "Latitude"], axis=1, inplace=True)
    print("Deleted Longitude, Latitude from data frame")
except:
    print("Longitude, Latitude do not exist yet in data frame")

lats = []
lngs = []
success = True
num_received = 0
num_expected = canada_frame.shape[0]
for row in canada_frame.itertuples():
    # initialize to None
    coords = None
    # try 3 times getting the coords for a plae before giving up
    tries_left = 3
    # loop until we get coordinates
    while(coords is None and tries_left > 0):
        tries_left = tries_left - 1
        #print(row)
        postcode = row.Postcode
        # geocodefarm API seems to work -- google returns REQUEST DENIED
        g = geocoder.geocodefarm('{}, Toronto, Ontario'.format(postcode))
        coords = g.latlng
    if coords is not None:
        # success!!! now unpack coordinates
        lat, lon = coords
        lats.append(lat)
        lngs.append(lon)
        num_received += 1
        print("{} of {} coordinates received".format(num_received, num_expected), end="\r")
    else:
        print("failed to get coordinates for {}; reading from file".format(postcode))
        success = False
        # get out of the for loop
        break

if success:
    print("getting coordinates from geo service successful")
    # add geo coords to data frame
    canada_frame["Latitude"] = lats
    canada_frame["Longitude"] = lngs
else:
    # read geo coords from file
    get_coords_from_file()

canada_frame

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.0

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    scikit-learn-0.20.1        |   py36h22eb022_0         5.7 MB
    liblapack-3.8.0            |      11_openblas          10 KB  conda-forge
    liblapacke-3.8.0           |      11_openblas          10 KB  conda-forge
    libopenblas-0.3.6          |       h5a2b251_2         7.7 MB
    numpy-1.17.3               |   py36h95a1406_0         5.2 MB  conda-forge
    click-7.0                  |             py_0          61 KB  conda-forge
    ratelim-0.1.6              |             py_2     

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.810154,-79.194603
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784672,-79.158958
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.766289,-79.172890
3,M1G,Scarborough,Woburn,43.768288,-79.214111
4,M1H,Scarborough,Cedarbrae,43.769180,-79.238770
...,...,...,...,...,...
98,M9N,York,Weston,43.705570,-79.518616
99,M9P,Etobicoke,Westmount,43.696484,-79.529266
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.684505,-79.557915
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.742935,-79.584831


#### New dataframe consisting of only those postal codes whose borough name contains "Toronto".

In [7]:
# Keep only the rows whose borough name contains "Toronto", as an
# independent copy so later edits do not touch canada_frame.
is_toronto_borough = canada_frame["Borough"].str.contains("Toronto")
toronto_frame = canada_frame.loc[is_toronto_borough].copy()
toronto_frame


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.679611,-79.295692
41,M4K,East Toronto,"The Danforth West, Riverdale",43.682327,-79.355797
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.670906,-79.316269
43,M4M,East Toronto,Studio District,43.657566,-79.340492
44,M4N,Central Toronto,Lawrence Park,43.728504,-79.388443
45,M4P,Central Toronto,Davisville North,43.71302,-79.388565
46,M4R,Central Toronto,North Toronto West,43.714615,-79.406532
47,M4S,Central Toronto,Davisville,43.703163,-79.385895
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.690735,-79.383003
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686035,-79.402267


# Clustering the Toronto Neighborhoods

#### Visualize Toronto boroughs on map

#### Creation of a map of Toronto with neighborhoods superimposed on top.

In [8]:
import folium

# Centre point of the map (also reused by later cells).
lat_toronto, lon_toronto = 43.651070, -79.347015

map1 = folium.Map(location=[lat_toronto, lon_toronto], zoom_start=10)

# Appearance shared by every marker on this map.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per postal code, with a "postcode, neighborhood, borough" popup.
marker_columns = ["Latitude", "Longitude", "Borough", "Neighborhood", "Postcode"]
for lat, lng, borough, neighborhood, postal in zip(*(toronto_frame[col] for col in marker_columns)):
    popup_text = '{}, {}, {}'.format(postal, neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        popup=folium.Popup(popup_text, parse_html=True),
        **marker_style)
    marker.add_to(map1)

map1

#### Define Foursquare Credentials and Version

In [9]:
# This cell is to set up some constants
import os
import warnings

# Credentials are read from the environment so that secrets are never stored
# in (or shared with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE: any key that was previously committed in this cell should be treated
# as leaked and rotated.
CLIENT_ID = os.environ.get("FOURSQUARE_CLIENT_ID", "")  # your Foursquare ID
CLIENT_SECRET = os.environ.get("FOURSQUARE_CLIENT_SECRET", "")  # your Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        "FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; "
        "Foursquare API requests will be rejected."
    )
VERSION = '20180605'  # Foursquare API version

LIMIT = 100   # value sent as the `limit` query parameter
RADIUS = 500  # value sent as the `radius` query parameter

#### A function to repeat the process for all the neighborhoods in Toronto

In [10]:
# from nyc neighborhoods notebook (with some modifications)

import requests

def getNearbyVenues(names, latitudes, longitudes):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label, latitude and longitude of each location; they are walked in
        parallel with ``zip``. Each name is printed as it is processed.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Postcode',
        'Postcode Latitude', 'Postcode Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. When no venues are returned
        (or no locations are given) the frame is empty but still has these
        columns.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.

    Notes
    -----
    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION, RADIUS and LIMIT.
    """
    column_names = ['Postcode',
                  'Postcode Latitude',
                  'Postcode Longitude',
                  'Venue',
                  'Venue Latitude',
                  'Venue Longitude',
                  'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            RADIUS,
            LIMIT)

        # make the GET request; the timeout stops a stalled connection from
        # hanging the notebook, and raise_for_status surfaces HTTP errors
        # directly instead of as an obscure KeyError further down.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # a venue may come back without any category; record None then
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor (rather than assigning
    # .columns afterwards) also works when venue_rows is empty.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

#### The code to run the above function on each neighborhood and create a new dataframe called *toronto_venues*.

In [11]:
# Query nearby venues for every Toronto postal district and keep the results
# in a single dataframe.
venue_query = {
    "names": toronto_frame["Postcode"],
    "latitudes": toronto_frame["Latitude"],
    "longitudes": toronto_frame["Longitude"],
}
toronto_venues = getNearbyVenues(**venue_query)
toronto_venues

M4E
M4K
M4L
M4M
M4N
M4P
M4R
M4S
M4T
M4V
M4W
M4X
M4Y
M5A
M5B
M5C
M5E
M5G
M5H
M5J
M5K
M5L
M5N
M5P
M5R
M5S
M5T
M5V
M5W
M5X
M6G
M6H
M6J
M6K
M6P
M6R
M6S
M7A
M7Y


Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4E,43.679611,-79.295692,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,M4E,43.679611,-79.295692,Glen Manor Ravine,43.676821,-79.293942,Trail
2,M4E,43.679611,-79.295692,Beaches Bake Shop,43.680363,-79.289692,Bakery
3,M4E,43.679611,-79.295692,Grover Pub and Grub,43.679181,-79.297215,Pub
4,M4E,43.679611,-79.295692,Domino's Pizza,43.679058,-79.297382,Pizza Place
...,...,...,...,...,...,...,...
1710,M7Y,43.648689,-79.385437,Starbucks,43.645594,-79.386063,Coffee Shop
1711,M7Y,43.648689,-79.385437,Design Exchange,43.647972,-79.380104,Art Gallery
1712,M7Y,43.648689,-79.385437,Old City Hall,43.652009,-79.381744,Monument / Landmark
1713,M7Y,43.648689,-79.385437,Little India Restaurant,43.650319,-79.388998,Indian Restaurant


Count the unique venue categories

In [12]:
# Number of distinct values in the 'Venue Category' column.
n_categories = toronto_venues['Venue Category'].nunique(dropna=False)
print('we have {} uniques categories.'.format(n_categories))

we have 221 uniques categories.


#### Analyze Each Neighborhood

In [13]:
# one hot encoding of venues
onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add postcode column back to dataframe
onehot["Postcode"] = toronto_venues["Postcode"]

# move postcode column to the first column
fixed_columns = [ onehot.columns[-1] ] + list(onehot.columns[:-1])
onehot = onehot[fixed_columns]

# share of each venue category among the venues of every postal code
grouped = onehot.groupby("Postcode").mean().reset_index()

# make sure all postal codes exist in table, even those that have no returned venues
remaining = set(toronto_frame["Postcode"]) - set(grouped["Postcode"])

if remaining:
    # Build all-zero rows in one go and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and sorting the codes keeps the row order (and
    # with it the clustering input) the same from run to run.
    zero_rows = pd.DataFrame(
        0.0,
        index=range(len(remaining)),
        columns=grouped.columns.drop("Postcode"),
    )
    zero_rows.insert(0, "Postcode", sorted(remaining))
    grouped = pd.concat([grouped, zero_rows], ignore_index=True)

grouped.head()

Unnamed: 0,Postcode,Accessories Store,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Baby Store,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,...,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M4L,0.0,0.0,0.0,0.0,0.0,0.037037,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Cluster the Vector Representation of Each Postal District

In [14]:
from sklearn.cluster import KMeans

# number of clusters
kclusters = 6

# Feature matrix: every column of `grouped` except the postal code label.
# fillna returns a new frame, so its result has to be kept; the previous
# stand-alone `cluster_data.fillna(0.0);` threw the filled copy away.
cluster_data = grouped.drop(columns="Postcode").fillna(0.0)

# run k-means clustering with `kclusters` clusters
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cluster_data)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 5, 5, 1, 3, 1, 2, 1, 5, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       4, 0, 1, 1, 1, 1, 1, 1, 5, 5, 1, 1, 5, 1, 1, 1, 1], dtype=int32)

### Create the new dataframe and display the top 10 venues for each neighborhood.

In [15]:
# most common venues per postal code in data frame

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    The first entry of ``row`` is skipped (callers in this notebook pass rows
    whose first entry is the postal code). The remaining entries are sorted
    in descending order and the index labels of the first ``num_top_venues``
    are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]


num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column headers: "Postcode" followed by "1st", "2nd", "3rd", "4th", ...
# "Most Common Venue". Ranks beyond the listed indicators get a plain "th".
columns = ["Postcode"] + [
    '{}{} Most Common Venue'.format(rank, indicators[rank - 1])
    if rank <= len(indicators)
    else '{}th Most Common Venue'.format(rank)
    for rank in range(1, num_top_venues + 1)
]

# Start from an empty frame that carries the postal codes of `grouped` ...
postcode_venues_sorted = pd.DataFrame(columns=columns)
postcode_venues_sorted["Postcode"] = grouped["Postcode"]

# ... then fill each row with that postal code's top categories.
for position in range(len(grouped)):
    top_categories = return_most_common_venues(grouped.iloc[position, :], num_top_venues)
    postcode_venues_sorted.iloc[position, 1:] = top_categories

postcode_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Pizza Place,Asian Restaurant,Bakery,Pub,Other Great Outdoors,Trail,Park,Health Food Store,Coffee Shop,Falafel Restaurant
1,M4K,Café,Park,Grocery Store,Discount Store,Yoga Studio,Electronics Store,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm
2,M4L,Sandwich Place,Grocery Store,Café,Fast Food Restaurant,Brewery,Burger Joint,Burrito Place,Snack Place,Fish & Chips Shop,Gym
3,M4M,Coffee Shop,Café,Italian Restaurant,Gastropub,Bar,Latin American Restaurant,Brewery,Comfort Food Restaurant,Bookstore,Clothing Store
4,M4N,Bus Line,Swim School,Dim Sum Restaurant,Yoga Studio,Elementary School,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm


## Create a Dataframe With Clusters and 10 Most Common Venues

In [16]:
# add clustering labels
# DataFrame.insert raises ValueError when the column already exists, which
# made this cell fail on a second run; overwrite the column in that case.
if "Cluster Label" in postcode_venues_sorted.columns:
    postcode_venues_sorted["Cluster Label"] = kmeans.labels_
else:
    postcode_venues_sorted.insert(0, "Cluster Label", kmeans.labels_)

# attach the cluster label and top venues to each Toronto postal code
# (toronto_frame supplies borough, neighborhood, latitude and longitude);
# join returns a new frame, so toronto_frame itself is left untouched
merged = toronto_frame.join(postcode_venues_sorted.set_index("Postcode"), on="Postcode")

merged.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches,43.679611,-79.295692,1,Pizza Place,Asian Restaurant,Bakery,Pub,Other Great Outdoors,Trail,Park,Health Food Store,Coffee Shop,Falafel Restaurant
41,M4K,East Toronto,"The Danforth West, Riverdale",43.682327,-79.355797,5,Café,Park,Grocery Store,Discount Store,Yoga Studio,Electronics Store,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.670906,-79.316269,5,Sandwich Place,Grocery Store,Café,Fast Food Restaurant,Brewery,Burger Joint,Burrito Place,Snack Place,Fish & Chips Shop,Gym
43,M4M,East Toronto,Studio District,43.657566,-79.340492,1,Coffee Shop,Café,Italian Restaurant,Gastropub,Bar,Latin American Restaurant,Brewery,Comfort Food Restaurant,Bookstore,Clothing Store
44,M4N,Central Toronto,Lawrence Park,43.728504,-79.388443,3,Bus Line,Swim School,Dim Sum Restaurant,Yoga Studio,Elementary School,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm


#### Visualize the clusters

In [17]:
# colors from matplotlib
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[lat_toronto, lon_toronto], zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow color each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, postcode, cluster in zip(merged["Latitude"], merged["Longitude"], merged["Postcode"], merged["Cluster Label"]):
    label = folium.Popup(str(postcode) + " Cluster " + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly. The
    # previous `cluster - 1` only worked for cluster 0 because index -1
    # wraps around to the last color.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters