# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

## Part 1: transform the data in the table on the Wikipedia page into the above pandas dataframe.

### Part 1a: scraping data using Beautiful Soup

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from urllib.request import urlopen
!pip install beautifulsoup4
from bs4 import BeautifulSoup



In [4]:
# URL of the Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page. The context manager closes the HTTP response as soon as the
# markup has been parsed, so the connection is not left open for the kernel's lifetime.
with urlopen(url) as source:
    # 'html.parser' is used because it ships with the standard library.
    soup = BeautifulSoup(source, 'html.parser')

# Examine the type of soup.
type(soup)

bs4.BeautifulSoup

### Part 1b: getting the right table

In [5]:
# Pick the first <table> whose class attribute is 'wikitable sortable'.
table_attrs = {'class': 'wikitable sortable'}
table = soup.find('table', table_attrs)

In [6]:
# Walk the <tr> rows of the table and collect the text of the <td> cells in each row.
table_rows = table.find_all('tr')
res = []
for tr in table_rows:
    # Strip each cell once; a distinct name (`td`) avoids shadowing the row variable.
    cell_texts = [td.text.strip() for td in tr.find_all('td')]
    # Empty cells are dropped, as before.
    row = [text for text in cell_texts if text]
    # Rows that yield no non-empty <td> text (e.g. a row made only of <th> cells)
    # produce an empty list and are skipped.
    if row:
        res.append(row)

# Column names are kept exactly as the rest of the notebook references them.
df = pd.DataFrame(res, columns=["PostalCode",'Borough','Neightborhood'])
print('the original shape of the table is',df.shape)


the original shape of the table is (180, 3)


### Part 1c: cleaning up the data as per the assignment requirements

In [7]:
# Requirement 1: keep only the rows whose borough is assigned; rows whose
# Borough is the literal 'Not assigned' are discarded.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]
print('after removing rows without an assigned borough, the shape is',df.shape)

after removing rows without an assigned borough, the shape is (103, 3)


In [8]:
# Requirement 2: several neighborhoods can share one postal code (the assignment
# cites M5A with Harbourfront and Regent Park). Such rows are to be combined into a
# single row with the neighborhoods separated by commas.
# The describe() summary below is used to compare `count` and `unique` for PostalCode.
duplicate_note = 'Since the count and unique of PostalCode have the same value, there is no doubling of PostaCode in the dataframe'
print(duplicate_note)
df.describe()

Since the count and unique of PostalCode have the same value, there is no doubling of PostaCode in the dataframe


Unnamed: 0,PostalCode,Borough,Neightborhood
count,103,103,103
unique,103,10,98
top,M5M,North York,Downsview
freq,1,24,4


In [9]:
# 3) If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
# Unassigned values are marked with the literal string 'Not assigned' (the same marker
# the borough filter compares against), which a null check cannot detect. Apply the
# rule explicitly: wherever the neighborhood is 'Not assigned', use the borough instead.
neighborhood_unassigned = df['Neightborhood'] == 'Not assigned'
df = df.assign(
    Neightborhood=df['Neightborhood'].where(~neighborhood_unassigned, df['Borough'])
)
print('since there is no null value, none of the cell under neighborhood is not assigned')
# Sanity check kept from the original cell: True would mean some cell is still null.
df.isnull().values.any()

since there is no null value, none of the cell under neighborhood is not assigned


False

In [22]:
# Requirement 4: report the size of the cleaned dataframe via .shape,
# preceded by a short preview of its first rows.
print('this is the final dataframe')
preview = df.head()
print(preview)
final_shape = df.shape
print('and the shape is:', final_shape)

this is the final dataframe
  PostalCode           Borough                                Neightborhood
2        M3A        North York                                    Parkwoods
3        M4A        North York                             Victoria Village
4        M5A  Downtown Toronto                    Regent Park, Harbourfront
5        M6A        North York             Lawrence Manor, Lawrence Heights
6        M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government
and the shape is: (103, 3)


## Part 2: Add the geographical coordinates to the dataframe

In [23]:
# Load the CSV that maps each postal code to a latitude/longitude pair.
geo_csv_url = 'http://cocl.us/Geospatial_data'
df_geo = pd.read_csv(geo_csv_url)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [25]:
# Give the key column the same name it has in `df` so the two frames can be joined on it.
column_renames = {'Postal Code': 'PostalCode'}
df_geo = df_geo.rename(columns=column_renames)
df_geo.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [29]:
# Attach the coordinates to each postal code by joining the two frames on 'PostalCode'.
df_new = df.merge(df_geo, on='PostalCode')

In [32]:
# Preview the merged frame (postal code, borough, neighborhood, latitude, longitude).
df_new.head()

Unnamed: 0,PostalCode,Borough,Neightborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Part 3: Explore and cluster the neighborhoods in Toronto

In [38]:
# Narrow the data to the area of interest: boroughs whose name contains 'Toronto'.
is_toronto_borough = df_new['Borough'].str.contains('Toronto')
df_toronto = df_new[is_toronto_borough]
df_toronto.shape

(39, 5)

In [45]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    folium-0.5.0               |             py_0          45 KB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    ------------------------------------------------------------
                       

In [46]:
# Use the geopy library to get the latitude and longitude values of Toronto.
address = 'Toronto'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear message
# instead of an AttributeError on the attribute reads below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [54]:
# Build a folium map centred on the geocoded Toronto coordinates and overlay one
# circle marker per row of `df_toronto`.
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=13)

# Columns consumed per marker: position first, then the two popup fields.
marker_columns = ('Latitude', 'Longitude', 'Borough', 'Neightborhood')
marker_rows = zip(*(df_toronto[column] for column in marker_columns))

for lat, lng, borough, neighborhood in marker_rows:
    # Popup text reads "<neighborhood>, <borough>".
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_Toronto)

map_Toronto

#### Define Foursquare Credentials and Version

#### Let's explore only Downtown Toronto in our dataframe.

In [70]:
# Trim the Toronto frame down to the 'Downtown Toronto' borough and renumber the rows from 0.
is_downtown = df_toronto['Borough'] == 'Downtown Toronto'
df_downtown = df_toronto[is_downtown].reset_index(drop=True)
df_downtown.head()

# The mean of the downtown coordinates serves as the middle of the map.
Latitude = df_downtown.Latitude.mean()
Longitude = df_downtown.Longitude.mean()

print(Latitude, Longitude)

43.65459717894736 -79.38397156842105


In [73]:
# zoom to downtown
# Create a map of Toronto downtown with Neightborhood superimposed on top.
# Centre on the mean downtown coordinates (`Latitude`, `Longitude`, computed from
# `df_downtown`) rather than on the geocoded city coordinates (`latitude`,
# `longitude`), whose names differ only by letter case.
map_downtown = folium.Map(location=[Latitude, Longitude], zoom_start=13)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_downtown['Latitude'], df_downtown['Longitude'], df_downtown['Borough'], df_downtown['Neightborhood']):
    # Popup text reads "<neighborhood>, <borough>".
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown)

map_downtown

In [74]:
# Display the 'Downtown Toronto' subset built above.
df_downtown

Unnamed: 0,PostalCode,Borough,Neightborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
8,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
9,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576


In [75]:
# Let's find out how many coffee shops there are in downtown Regent Park, Harbourfront.

In [83]:
import os
from getpass import getpass

# Foursquare credentials are secrets and must not be committed with the notebook.
# Read them from the environment; if a variable is unset, prompt for the value
# without echoing it.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client ID: ')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ')
VERSION = '20180605' # Foursquare API version
search_query = 'coffee'  # sent as the `query` parameter of the venue search
radius = 500  # sent as the `radius` parameter; presumably metres — TODO confirm against the API docs
LIMIT = 100  # sent as the `limit` parameter of the venue search

In [85]:
# Read the coordinates and the name of the first downtown row (index label 0).
neighborhood_latitude, neighborhood_longitude, neighborhood_name = (
    df_downtown.loc[0, column]
    for column in ('Latitude', 'Longitude', 'Neightborhood')
)
summary_template = 'Latitude and longitude values of {} are {}, {}.'
print(summary_template.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Latitude and longitude values of Regent Park, Harbourfront are 43.6542599, -79.3606359.


In [86]:
# Assemble the Foursquare venue-search URL from the credentials, the neighborhood
# coordinates and the search settings. The placeholders are filled positionally, so
# the tuple order must match the order of the `{}` fields in the template.
search_url_template = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'
search_url_fields = (
    CLIENT_ID,
    CLIENT_SECRET,
    neighborhood_latitude,
    neighborhood_longitude,
    VERSION,
    search_query,
    radius,
    LIMIT,
)
url = search_url_template.format(*search_url_fields)
# NOTE(review): displaying `url` writes the client secret into the saved cell output.
url

'https://api.foursquare.com/v2/venues/search?client_id=BESBPEDHNTKUY32TFLTWUP3HLLAG45FNSH4DB1PQI2KGPJP2&client_secret=MMB0U1FGJMTCDWEHNOI1YSWMKCGT1E0RBRH3MBBFKNEXLO2M&ll=43.6542599,-79.3606359&v=20180605&query=coffee&radius=500&limit=100'

In [89]:
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [90]:
# Call the Foursquare endpoint. The timeout keeps a stalled connection from hanging
# the kernel indefinitely, and raise_for_status() surfaces an HTTP error here instead
# of as a confusing KeyError when the payload is unpacked later.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

{'meta': {'code': 200, 'requestId': '5eb6643a71c428001b7b2d8e'},
 'response': {'venues': [{'id': '58bdc3a23ef0f629212f1f70',
    'name': 'Arena Coffee Bar',
    'location': {'address': '15 Tank House Lane',
     'lat': 43.65028,
     'lng': -79.35886,
     'labeledLatLngs': [{'label': 'display',
       'lat': 43.65028,
       'lng': -79.35886}],
     'distance': 465,
     'postalCode': 'M5A 3C4',
     'cc': 'CA',
     'city': 'Toronto',
     'state': 'ON',
     'country': 'Canada',
     'formattedAddress': ['15 Tank House Lane',
      'Toronto ON M5A 3C4',
      'Canada']},
    'categories': [{'id': '4bf58dd8d48988d1e0931735',
      'name': 'Coffee Shop',
      'pluralName': 'Coffee Shops',
      'shortName': 'Coffee Shop',
      'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/coffeeshop_',
       'suffix': '.png'},
      'primary': True}],
    'referralId': 'v-1589011811',
    'hasPerk': False},
   {'id': '53b8466a498e83df908c3f21',
    'name': 'Tandem Coffee',
    'lo

In [93]:
# The venue records live under results['response']['venues'].
response_payload = results['response']
venues = response_payload['venues']

# Flatten the nested venue records into a dataframe (nested keys become dotted column names).
dataframe = json_normalize(venues)
dataframe.head()

Unnamed: 0,categories,hasPerk,id,location.address,location.cc,location.city,location.country,location.crossStreet,location.distance,location.formattedAddress,location.labeledLatLngs,location.lat,location.lng,location.postalCode,location.state,name,referralId
0,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",False,58bdc3a23ef0f629212f1f70,15 Tank House Lane,CA,Toronto,Canada,,465,"[15 Tank House Lane, Toronto ON M5A 3C4, Canada]","[{'label': 'display', 'lat': 43.65028, 'lng': ...",43.65028,-79.35886,M5A 3C4,ON,Arena Coffee Bar,v-1589011811
1,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",False,53b8466a498e83df908c3f21,368 King St E,CA,Toronto,Canada,at Trinity St,122,"[368 King St E (at Trinity St), Toronto ON, Ca...","[{'label': 'display', 'lat': 43.65355870959944...",43.653559,-79.361809,,ON,Tandem Coffee,v-1589011811
2,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",False,4d9f1825a77d816e4501fa08,142 Parliament St.,CA,Toronto,Canada,Richmond St.,284,"[142 Parliament St. (Richmond St.), Toronto ON...","[{'label': 'display', 'lat': 43.65454122520666...",43.654541,-79.364147,,ON,Coffee Time,v-1589011811
3,"[{'id': '4bf58dd8d48988d125941735', 'name': 'T...",False,54cb8e59498e14b4a8ab2aba,550 Queen Street East,CA,Toronto,Canada,,417,"[550 Queen Street East, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65729010071037...",43.65729,-79.357592,,ON,Coffee Enterprise,v-1589011811
4,"[{'id': '4bf58dd8d48988d125941735', 'name': 'T...",False,55fc340d498e3e59b6211dc0,550 Queen Street East,CA,Toronto,Canada,River,467,"[550 Queen Street East (River), Toronto ON, Ca...","[{'label': 'display', 'lat': 43.6574868268064,...",43.657487,-79.35693,,ON,Coffee Mobile - Brand Partners,v-1589011811


In [94]:
# Keep the venue name, its categories, every 'location.*' column, and the venue id.
location_columns = [col for col in dataframe.columns if col.startswith('location.')]
filtered_columns = ['name', 'categories', *location_columns, 'id']
dataframe_filtered = dataframe.loc[:, filtered_columns]

# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue row.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by key: a DataFrame row when used with
        ``DataFrame.apply(..., axis=1)``, or a plain dict. The category list is read
        from ``row['categories']``; if that key is missing, ``row['venue.categories']``
        is used instead.

    Returns
    -------
    str or None
        The ``'name'`` of the first category, or ``None`` when the venue has no
        categories (empty list) or the value is not a list at all (e.g. None/NaN).

    Raises
    ------
    KeyError
        If neither ``'categories'`` nor ``'venue.categories'`` is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates
        # instead of being swallowed.
        categories_list = row['venue.categories']

    # Missing values (None/NaN) have no len(); treat them like an empty list.
    if not isinstance(categories_list, (list, tuple)) or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

# Replace each row's raw category list with the single value get_category_type returns for it.
first_category = dataframe_filtered.apply(get_category_type, axis=1)
dataframe_filtered['categories'] = first_category

# Shorten dotted column names to the part after the last '.'.
short_names = [column.rsplit('.', 1)[-1] for column in dataframe_filtered.columns]
dataframe_filtered.columns = short_names

dataframe_filtered

Unnamed: 0,name,categories,address,cc,city,country,crossStreet,distance,formattedAddress,labeledLatLngs,lat,lng,postalCode,state,id
0,Arena Coffee Bar,Coffee Shop,15 Tank House Lane,CA,Toronto,Canada,,465,"[15 Tank House Lane, Toronto ON M5A 3C4, Canada]","[{'label': 'display', 'lat': 43.65028, 'lng': ...",43.65028,-79.35886,M5A 3C4,ON,58bdc3a23ef0f629212f1f70
1,Tandem Coffee,Coffee Shop,368 King St E,CA,Toronto,Canada,at Trinity St,122,"[368 King St E (at Trinity St), Toronto ON, Ca...","[{'label': 'display', 'lat': 43.65355870959944...",43.653559,-79.361809,,ON,53b8466a498e83df908c3f21
2,Coffee Time,Coffee Shop,142 Parliament St.,CA,Toronto,Canada,Richmond St.,284,"[142 Parliament St. (Richmond St.), Toronto ON...","[{'label': 'display', 'lat': 43.65454122520666...",43.654541,-79.364147,,ON,4d9f1825a77d816e4501fa08
3,Coffee Enterprise,Tech Startup,550 Queen Street East,CA,Toronto,Canada,,417,"[550 Queen Street East, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65729010071037...",43.65729,-79.357592,,ON,54cb8e59498e14b4a8ab2aba
4,Coffee Mobile - Brand Partners,Tech Startup,550 Queen Street East,CA,Toronto,Canada,River,467,"[550 Queen Street East (River), Toronto ON, Ca...","[{'label': 'display', 'lat': 43.6574868268064,...",43.657487,-79.35693,,ON,55fc340d498e3e59b6211dc0
5,Keefaa Coffee,Ethiopian Restaurant,,CA,Toronto,Canada,,294,"[Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65589621753793...",43.655896,-79.363509,,ON,5047a0b4e4b046ea951ae2bf
6,Ethiopian Keeffaa Forest Coffee,Ethiopian Restaurant,368 Queen Street East,CA,Toronto,Canada,,328,"[368 Queen Street East, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65595786436085...",43.655958,-79.363977,,ON,4f96be7ae4b01de5727ebcff
7,Cuppa Coffee Studios,Office,53 Ontario Street,CA,Toronto,Canada,,480,"[53 Ontario Street, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.6535443049727,...",43.653544,-79.366518,,ON,514c9b65e4b08b26d3c8d8c0
8,Cup of coffee,,53 Ontario Street,CA,Toronto,Canada,,578,"[53 Ontario Street, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65287, 'lng': ...",43.65287,-79.367554,,ON,4c3f26851ef0d13a28409380
9,Balzac's Coffee,Coffee Shop,1 Trinity Street,CA,Toronto,Canada,at Distillery Ln.,511,"[1 Trinity Street (at Distillery Ln.), Toronto...","[{'label': 'display', 'lat': 43.64979714147618...",43.649797,-79.359142,M5A 3C4,ON,4adb58f7f964a520412621e3


In [98]:
# The search results include venues that are not coffee shops; keep only the rows
# whose category is exactly 'Coffee Shop' and renumber them from 0.
is_coffee_shop = dataframe_filtered['categories'] == 'Coffee Shop'
df_coffee = dataframe_filtered[is_coffee_shop].reset_index(drop=True)
df_coffee.head()

Unnamed: 0,name,categories,address,cc,city,country,crossStreet,distance,formattedAddress,labeledLatLngs,lat,lng,postalCode,state,id
0,Arena Coffee Bar,Coffee Shop,15 Tank House Lane,CA,Toronto,Canada,,465,"[15 Tank House Lane, Toronto ON M5A 3C4, Canada]","[{'label': 'display', 'lat': 43.65028, 'lng': ...",43.65028,-79.35886,M5A 3C4,ON,58bdc3a23ef0f629212f1f70
1,Tandem Coffee,Coffee Shop,368 King St E,CA,Toronto,Canada,at Trinity St,122,"[368 King St E (at Trinity St), Toronto ON, Ca...","[{'label': 'display', 'lat': 43.65355870959944...",43.653559,-79.361809,,ON,53b8466a498e83df908c3f21
2,Coffee Time,Coffee Shop,142 Parliament St.,CA,Toronto,Canada,Richmond St.,284,"[142 Parliament St. (Richmond St.), Toronto ON...","[{'label': 'display', 'lat': 43.65454122520666...",43.654541,-79.364147,,ON,4d9f1825a77d816e4501fa08
3,Balzac's Coffee,Coffee Shop,1 Trinity Street,CA,Toronto,Canada,at Distillery Ln.,511,"[1 Trinity Street (at Distillery Ln.), Toronto...","[{'label': 'display', 'lat': 43.64979714147618...",43.649797,-79.359142,M5A 3C4,ON,4adb58f7f964a520412621e3
4,Rooster Coffee,Coffee Shop,343 King St E,CA,Toronto,Canada,btwn Princess & Berkeley St,479,"[343 King St E (btwn Princess & Berkeley St), ...","[{'label': 'display', 'lat': 43.65189965670432...",43.6519,-79.365609,M5A 1L1,ON,51853a73498e4d97a8b20831


In [99]:
# Zoom to Regent Park, Harbourfront: build a map centred on the neighborhood
# coordinates and overlay one circle marker per coffee shop in `df_coffee`.
map_coffee = folium.Map(location=[neighborhood_latitude, neighborhood_longitude], zoom_start=15)

# Columns consumed per marker: position first, then the popup text.
coffee_columns = ('lat', 'lng', 'name')
coffee_rows = zip(*(df_coffee[column] for column in coffee_columns))

for lat, lng, name in coffee_rows:
    # The popup shows the venue name.
    label = folium.Popup('{}'.format(name), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_coffee)

map_coffee