# Coursera Week 3 Assignment
## Identifying neighborhoods in toronto
### Author: Kaemon Derrick
### Date: 3/3/19


### Import the libraries used for the asignment

In [1]:
#Import libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import re
import numpy as np
import pandas as pd
import csv

#!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

# import k-means from clustering stage
from sklearn.cluster import KMeans

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

## 1. Get toronto Postal Code information from Wikipedia

### 1.1 HTML Formatting Functions 

In [2]:
# Author: Kaemon Derrick
# Funtion: Remove_tags
# Description: This function removes the html tags for the wikipedia page

def remove_tags(data_arr_list):
    tags = ["<td>", "</td>", "\n", "td>" , "</td", "]]"]
    for i in range(0, len(data_arr_list)):
        for j in range(0, len(tags)):
            if str(tags[j]) in str(data_arr_list[i]):
                data_arr_list[i] = data_arr_list[i].replace(tags[j], "")
                if 'title="' in str(data_arr_list[i]):
                    data_arr_list[i] = str(data_arr_list[i]).split('title="')[1].split('">')[0]
    
    return (data_arr_list)

In [3]:
# Author: Kaemon Derrick
# Funtion: compile_postal
# Description: This function recursivly groups the postal codes and information such as the neighborhoods

def compile_postal(data_arr_list):

    #Compare the postal code to the next one in order
    for i in range (0, len(data_arr_list)-3, 3):

        if str(data_arr_list[i]) == str(data_arr_list[i+3]):
            #Add to the current postal code
            if str(data_arr_list[i+4]) not in data_arr_list[i+1]:
                data_arr_list[i+1] = str(data_arr_list[i+1]) + ", " + str(data_arr_list[i+4])
            if str(data_arr_list[i+5]) not in data_arr_list[i+2]:
                data_arr_list[i+2] = str(data_arr_list[i+2]) + ", " + str(data_arr_list[i+5])
            
            #Remove old entry(s)
            del(data_arr_list[i+3])
            del(data_arr_list[i+3])
            del(data_arr_list[i+3])
            
            data_arr_list = compile_postal(data_arr_list)
            
            break
            
    return data_arr_list

In [4]:
# Author: Kaemon Derrick
# Funtion: drop_na_borough
# Description: Drop borough rows that are N/A - recurivly

def drop_na_borough(data_arr_list):

    for i in range (1, len(data_arr_list)-1, 3):
        if str(data_arr_list[i]) == 'Not assigned':
            
            #Remove the row
            del(data_arr_list[i-1])
            del(data_arr_list[i-1])
            del(data_arr_list[i-1])
            
            data_arr_list = drop_na_borough(data_arr_list)
            break
            
    return data_arr_list

In [5]:
# Author: Kaemon Derrick
# Funtion: neighborhood_borough
# Description: Assign borough value to neighborhood if neighborhood is N/A 
 
def neighborhood_borough(data_arr_list):
    
    for i in range (2, len(data_arr_list), 3):
        if str(data_arr_list[i]) == 'Not assigned':
            
            data_arr_list[i] = str(data_arr_list[i-1])
            data_arr_list = neighborhood_borough(data_arr_list)
            
            break
            
    return data_arr_list

### 1.2 Identify Postal Code Information from the Wikipedia page

In [6]:
# specify the url
quote_page = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# query the website and return the html to the variable ‘page’
page = urlopen(quote_page)

# parse the html using beautiful soup and store in variable `soup`
soup = BeautifulSoup(page, "html.parser")

#Define array to hold all of the data points
data_arr = []

#Get the first table in the html
data = soup.findAll('table')

#assign the cells to the array
for row in data:
    for item in row.findAll('td'):
        if "<td>" in str(item):
            data_arr.append(str(item))

#Remove the last element in the list as it is invalid
data_arr.pop()            

#Clean up the tags and data points

#Remove HTML tags
data_arr = remove_tags(data_arr)

#Compile postal codes
data_arr = compile_postal(data_arr)

#Drop Not assigned boroughs
data_arr = drop_na_borough(data_arr)

#Assign borough to n/a neighborhoods
data_arr = neighborhood_borough(data_arr)

## 2. Create a Pandas Dataframe with Toronto data

### 2.1 Display the Dataframe with Wikipedia Information

In [7]:
#Create a dictionary
toronto_dict = {'Postal_Code':data_arr[0::3], 'Borough': data_arr[1::3], 
                                     'Neighborhood':data_arr[2::3] }

#Pandas Data frame
toronto_df = pd.DataFrame.from_dict(toronto_dict)

#*********Uncomment these lines to focus only on those boroughs in Toronto - containing the word Toronto*********#
#toronto_df = toronto_df[toronto_df['Borough'].str.contains("Toronto")==True]
#toronto_df.reset_index(drop=True, inplace=True)

#Print the shape of the new frame and display the first 5 rows
print(toronto_df.shape)

toronto_df.head()

(103, 3)


Unnamed: 0,Postal_Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront (Toronto), Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park (Toronto),Queen's Park (Toronto)


### 2.2 Add location information to the Dataframe

In [8]:
#Add the Latitude and Longitude columns to the table and initialize with placeholder information
toronto_df['Latitude'] = 'Not Set'
toronto_df['Longitude'] = 'Not Set'

#Open file containing the geospacial coordinates for Toronto
with open('Geospatial_Coordinates.csv', 'r') as csvfile:
    geo_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in geo_reader:
        #Find the postal code in the fame and add coordinates
        toronto_df.loc[toronto_df['Postal_Code'] == str(row[0]), "Latitude"] = str(row[1])
        toronto_df.loc[toronto_df['Postal_Code'] == str(row[0]), "Longitude"] = str(row[2])
        
#Set type to numeric
toronto_df['Latitude'] = pd.to_numeric(toronto_df['Latitude'])
toronto_df['Longitude'] = pd.to_numeric(toronto_df['Longitude'])


toronto_df.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront (Toronto), Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park (Toronto),Queen's Park (Toronto),43.662301,-79.389494


# 3. Get nearby venues from Foursquare

### 3.1 Foursquare credentials

In [9]:
#Foursquare credentials 
CLIENT_ID = 'FESORG5RGQEFUMXHOO4N4X00EEY1GJ2FP0REWGQAYFOLSEUL' # your Foursquare ID
CLIENT_SECRET = 'KKNKRCMCIGILUH0NIJFQ4IRGZGHPJP4MMHCAXCSKC2WRAOIZ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: FESORG5RGQEFUMXHOO4N4X00EEY1GJ2FP0REWGQAYFOLSEUL
CLIENT_SECRET:KKNKRCMCIGILUH0NIJFQ4IRGZGHPJP4MMHCAXCSKC2WRAOIZ


### 3.2 Foursquare function to pull nearby values

In [10]:
## Cognitive Class.ai
## Segmenting and Clustering Neighborhoods in New York City
## Note: This function is taken from : https://labs.cognitiveclass.ai/tools/jupyterlab/lab/tree/labs/DP0701EN/DP0701EN-3-3-2-Neighborhoods-New-York-py-v1.0.ipynb
## I do not take credit for writing the below function
## I have used this function and made changes where nessesary for use in this project

#This function will get the near-by venues of a location using coordinates

def getNearbyVenues(names, latitudes, longitudes, radius):
    
    venues_list=[]
    remove = []
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        #Log postal codees without nearby addresses 
        if not results:
            remove.append(name)

        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal_Code', 
                  'Postal_Latitude', 
                  'Postal_Longitude', 
                  'Venue', 
                  'Venue_Latitude', 
                  'Venue_Longitude', 
                  'Venue_Category']
    
    return(nearby_venues, remove)

### 3.3 Get venues and add to the Dataframe

In [12]:
#Radius of 750m and a limit of 100 venues
radius = 750
LIMIT = 100


#Get the venues near Toronto postal codes
toronto_venues,remove = getNearbyVenues(names=toronto_df['Postal_Code'],
                                   latitudes=toronto_df['Latitude'],
                                   longitudes=toronto_df['Longitude'],
                                   radius = radius
                                  )

#Remove those postal codes with no nearby venues
for item in remove:
    indexNames = toronto_df[ toronto_df['Postal_Code'] == item ].index
    # Delete these row indexes from dataFrame
    toronto_df.drop(indexNames , inplace=True)

### 3.4 Examine the dataframe

In [13]:
# Display the shape of the dataframe and the first 5 rows
print(toronto_venues.shape)
toronto_venues.head()

(3713, 7)


Unnamed: 0,Postal_Code,Postal_Latitude,Postal_Longitude,Venue,Venue_Latitude,Venue_Longitude,Venue_Category
0,M3A,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,M3A,43.753259,-79.329656,DVP at York Mills,43.758899,-79.334099,Road
3,M3A,43.753259,-79.329656,TTC Stop #09083,43.759655,-79.332223,Bus Stop
4,M4A,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena


In [14]:
#Determine how many unique categories there are 
print('There are {} uniques categories.'.format(len(toronto_venues['Venue_Category'].unique())))

There are 320 uniques categories.


# 4. Analyze Each Neighborhood

### 4.1 Initial Analysis

In [15]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue_Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot["Postal_Code"] = toronto_venues["Postal_Code"] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

#### Next, let's group rows by Postal Code and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby("Postal_Code").mean().reset_index()
print(toronto_grouped.shape)
toronto_grouped.head()

(102, 321)


Unnamed: 0,Postal_Code,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478


### 4.2 Identify the top 10 venues

In [16]:
# Author: Kaemon Derrick
# Funtion: return_most_common_venue
# Description: This function returns the most common venues

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [17]:
#Interested in the top 10 venues
num_top_venues = 10

#Numbering indicators (1st, 2nd, 3rd)
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ["Postal_Code"]
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
postal_venues_sorted = pd.DataFrame(columns=columns)
postal_venues_sorted["Postal_Code"] = toronto_grouped["Postal_Code"]

for ind in np.arange(toronto_grouped.shape[0]):
    postal_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    
postal_venues_sorted.head()

Unnamed: 0,Postal_Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Coffee Shop,Fast Food Restaurant,Martial Arts Dojo,Hobby Shop,Spa,Business Service,Paper / Office Supplies Store,Yoga Studio,Donut Shop,Diner
1,M1C,Breakfast Spot,Italian Restaurant,Burger Joint,Bar,Yoga Studio,Drugstore,Discount Store,Dive Bar,Dog Run,Doner Restaurant
2,M1E,Pizza Place,Fast Food Restaurant,Coffee Shop,Grocery Store,Breakfast Spot,Beer Store,Fried Chicken Joint,Greek Restaurant,Sports Bar,Gym
3,M1G,Coffee Shop,Park,Convenience Store,Business Service,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run
4,M1H,Coffee Shop,Bakery,Indian Restaurant,Rental Car Location,Bank,Burger Joint,Bus Line,Caribbean Restaurant,Chinese Restaurant,Pharmacy


# 5. Cluster postalcodes

### 5.1 Assign neach postal code to a cluster

In [18]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop("Postal_Code", 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 0, 3, 3, 0, 3, 3, 0, 0, 0], dtype=int32)

In [19]:
toronto_merged = toronto_df

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_ 

# merge toronto_grouped with toronto_data to add latitude/longitude for each postal code
toronto_merged = toronto_merged.join(postal_venues_sorted.set_index('Postal_Code'), on='Postal_Code')

toronto_merged.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,3,Food & Drink Shop,Road,Park,Bus Stop,Ethiopian Restaurant,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dive Bar
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,Coffee Shop,Pizza Place,Café,Park,Hockey Arena,Intersection,Sporting Goods Shop,Playground,Portuguese Restaurant,Donut Shop
2,M5A,Downtown Toronto,"Harbourfront (Toronto), Regent Park",43.65426,-79.360636,3,Coffee Shop,Italian Restaurant,Park,Pub,Café,Theater,Restaurant,Bakery,Gym / Fitness Center,Cosmetics Shop
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,3,Furniture / Home Store,Clothing Store,Vietnamese Restaurant,Restaurant,Coffee Shop,Miscellaneous Shop,Dessert Shop,Greek Restaurant,Fried Chicken Joint,Boutique
4,M7A,Queen's Park (Toronto),Queen's Park (Toronto),43.662301,-79.389494,0,Coffee Shop,Café,Sandwich Place,Japanese Restaurant,Italian Restaurant,Gastropub,Gym,Bubble Tea Shop,Sushi Restaurant,Ice Cream Shop


# 6. Map postal codes on map and color code by cluster

### 6.1 Display Map

In [21]:
latitude = 43.7
longitude = -79.3832

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters