### Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

# Question 1

#### Importing pandas. Transforming the table in the url into a dataframe, using pandas.

In [1]:
import pandas as pd

# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns a list of DataFrames (one per parsed table); the first one is used.
data = pd.read_html(io=url, flavor='bs4')
df = data[0]

#### Overview of the dataframe.

In [2]:
# Preview the first five rows of the scraped table.
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### The column names are already Postal Code, Borough, and Neighborhood, so no renaming is needed.

In [3]:
# (number of rows, number of columns) of the scraped table
len(df.index), len(df.columns)

(180, 3)

#### Let's check how many rows have a "Not assigned" Borough or Neighborhood.

In [4]:
# Count the rows whose Borough / Neighborhood holds the literal text 'Not assigned'.
unassigned_boroughs = (df['Borough'] == 'Not assigned').sum()
unassigned_neighborhoods = (df['Neighborhood'] == 'Not assigned').sum()

print('# of not assigned Boroughs:', str(unassigned_boroughs))
print('# of not assigned Neighborhoods:', str(unassigned_neighborhoods))

# of not assigned Boroughs: 77
# of not assigned Neighborhoods: 0


#### Let's make sure that only the cells that have an assigned borough are processed, by discarding the rows whose borough is "Not assigned".

In [5]:
# Boolean mask of the rows without an assigned borough.
bol = df['Borough'] == 'Not assigned'

# Keep every other row in an independent copy, preserving the original row labels.
df_ = df.loc[~bol].copy()
df_.shape

(103, 3)

#### 77 rows were dropped (180 − 103), matching the number of "Not assigned" boroughs counted above.

#### As checked above, no cell has a "Not assigned" neighborhood, so there is no case in which a neighborhood has to be replaced by its borough.

#### Let's print the size of the final dataframe.

In [7]:
# (number of rows, number of columns) after dropping the unassigned boroughs
len(df_.index), len(df_.columns)

(103, 3)

# Question 2

#### Gathering latitude and longitude, and updating the dataframe.

In [8]:
# Geocoder (disabled)
#
# This whole cell is commented out and kept for reference only: it queries the
# geocoder package once per postal code and retries until coordinates come back.
# The notebook states below that geocoder did not work, so the coordinates are
# loaded from a CSV file instead.
# NOTE(review): the loop writes into `df`, not the filtered `df_` -- confirm which
# frame is intended before re-enabling this code.

#df['Latitude'] = ""
#df['Longitude'] = ""

#import geocoder # import geocoder

#for i, postal_code in enumerate(df['Postal Code']):

#    # initialize your variable to None
#    lat_lng_coords = None

    # loop until you get the coordinates
#    while(lat_lng_coords is None):
#      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#      lat_lng_coords = g.latlng

#    latitude = lat_lng_coords[0]
#    longitude = lat_lng_coords[1]
    
#    df.loc[i,'Latitude'] = latitude
#    df.loc[i,'Longitude'] = longitude
    
#    print(' '.join([postal_code, str(latitude), str(longitude)]) )

#### Since geocoder didn't work, let's load the CSV file and update the table.

In [9]:
import os

# Location of the postal-code coordinates CSV. The path can be overridden with the
# GEO_CSV_PATH environment variable so the notebook also runs on machines where the
# file is not stored under the original author's home directory; the original path
# is kept as the default.
GEO_CSV_PATH = os.environ.get('GEO_CSV_PATH', '/home/ines/Downloads/Geospatial_Coordinates.csv')

df_csv = pd.read_csv(GEO_CSV_PATH)
df_csv

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


#### Checking that the postal codes in the CSV match those already in the dataframe, and updating the dataframe accordingly.

In [10]:
# Attach the coordinates by looking each postal code up in the CSV by key, instead of
# concatenating per-row matches into lists: the list approach only lines up with df_
# when every postal code matches exactly one CSV row, and rescans the CSV for each row.
# If the CSV repeats a postal code, the first occurrence is used.
coords_by_code = df_csv.drop_duplicates(subset='Postal Code').set_index('Postal Code')

# Fail with an explicit message when a postal code has no coordinates.
missing_codes = sorted(set(df_['Postal Code']) - set(coords_by_code.index))
if missing_codes:
    raise ValueError('No coordinates found for postal codes: {}'.format(missing_codes))

df_['Latitude'] = df_['Postal Code'].map(coords_by_code['Latitude'])
df_['Longitude'] = df_['Postal Code'].map(coords_by_code['Longitude'])

# The original cell exposed these two names as plain lists; keep them available.
latitudes = df_['Latitude'].tolist()
longitudes = df_['Longitude'].tolist()

In [11]:
# Display the filtered table, which now includes the Latitude and Longitude columns.
df_

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
165,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
168,M7Y,East Toronto,Business reply mail Processing Centre,43.662744,-79.321558
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


# Question 3

#### Gather all Boroughs that constitute Toronto

In [12]:
# Keep the boroughs whose name contains "Toronto", order them by borough and then by
# coordinates, and renumber the rows from 0 without keeping the old row labels.
df_Toronto = (
    df_.loc[df_['Borough'].str.contains('Toronto', regex=False, na=False), :]
    .reset_index(drop=True)
    .sort_values(['Borough', 'Latitude', 'Longitude'])
    .reset_index(drop=True)
)
df_Toronto

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678
1,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049
2,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
3,M5P,Central Toronto,Forest Hill North & West,43.696948,-79.411307
4,M4S,Central Toronto,Davisville,43.704324,-79.38879
5,M5N,Central Toronto,Roselawn,43.711695,-79.416936
6,M4P,Central Toronto,Davisville North,43.712751,-79.390197
7,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
8,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442


#### How many Neighborhoods do we have?

In [13]:
# Number of distinct values in the Neighborhood column.
len(df_Toronto['Neighborhood'].unique())

39

#### Toronto map with the Neighborhoods highlighted.

In [14]:
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# centre the map on the mean latitude and longitude of the Toronto rows
map_centre = [df_Toronto['Latitude'].mean(), df_Toronto['Longitude'].mean()]
Toronto_map = folium.Map(location=map_centre, zoom_start=12)

# one circle marker per row, with a "<neighborhood>, <borough>" popup
marker_rows = zip(
    df_Toronto['Latitude'],
    df_Toronto['Longitude'],
    df_Toronto['Borough'],
    df_Toronto['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(Toronto_map)

Toronto_map

## Find the venues available in each Neighborhood.

In [15]:
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

Collecting package metadata (current_repodata.json): done
Solving environment: \ 
  - anaconda/linux-64::ca-certificates-2020.1.1-0, anaconda/linux-64::openssl-1.1.1d-h7b6447c_4
  - anaconda/linux-64::ca-certificates-2020.1.1-0, defaults/linux-64::openssl-1.1.1d-h7b6447c_4
  - anaconda/linux-64::openssl-1.1.1d-h7b6447c_4, defaults/linux-64::ca-certificates-2020.1.1-0
  - defaults/linux-64::ca-certificates-2020.1.1-0, defaults/linux-64::openssl-1.1.1d-h7b6447cdone

# All requested packages already installed.



#### My details to access Foursquare API.

In [16]:
import os
import warnings

# Foursquare credentials are read from the environment rather than stored in the
# notebook, so they are not leaked when the notebook is shared or committed.
# Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): credentials that were previously committed in this cell should be
# treated as exposed and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests made with these values are expected to fail.'
    )

#### Function that gets the venues for each Neighborhood.

In [39]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000, limit=1000):
    """Collect the venues Foursquare returns around each given location.

    For every (name, latitude, longitude) triple, the Foursquare
    ``venues/explore`` endpoint is queried using the module-level
    CLIENT_ID, CLIENT_SECRET and VERSION values.

    Parameters
    ----------
    names : iterable
        Label of each location (here: the neighborhood name); printed as progress.
    latitudes, longitudes : iterable
        Coordinates of each location, aligned with ``names``.
    radius : int, optional
        Value sent as the ``radius`` query parameter.
    limit : int, optional
        Value sent as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty, but still has these columns, when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string instead of formatting it by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,
        )
        # Surface HTTP errors directly rather than as a KeyError on the missing payload.
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            # A venue without any category gets None instead of raising IndexError.
            category = categories[0]['name'] if categories else None
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the columns to the constructor also works when no rows were collected.
    return pd.DataFrame(rows, columns=columns)

#### Get the venues for each Neighborhood in Toronto, using the dataframe and function defined above. 

In [40]:
# Query the venues around every Toronto neighborhood, using the helper's default
# radius and limit.
venues_Toronto = getNearbyVenues(
    df_Toronto['Neighborhood'],
    df_Toronto['Latitude'],
    df_Toronto['Longitude'],
)

The Annex, North Midtown, Yorkville
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Moore Park, Summerhill East
Forest Hill North & West
Davisville
Roselawn
Davisville North
North Toronto West
Lawrence Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Harbourfront East, Union Station, Toronto Islands
Berczy Park
Stn A PO Boxes
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
First Canadian Place, Underground city
Richmond, Adelaide, King
St. James Town
Kensington Market, Chinatown, Grange Park
Regent Park, Harbourfront
Garden District, Ryerson
Central Bay Street
Queen's Park, Ontario Provincial Government
University of Toronto, Harbord
Church and Wellesley
St. James Town, Cabbagetown
Christie
Rosedale
Studio District
Business reply mail Processing Centre
India Bazaar, The Beaches West
The Beaches
The Danforth West, Riverdale
Brockton, Parkdale Village, Exhibition Place
Little Portugal

#### How many venues were gathered? What does the dataframe look like?

In [41]:
# Print (rows, columns), then preview the first five venues.
print((len(venues_Toronto.index), len(venues_Toronto.columns)))
venues_Toronto.head(n=5)

(3188, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"The Annex, North Midtown, Yorkville",43.67271,-79.405678,Roti Cuisine of India,43.674618,-79.408249,Indian Restaurant
1,"The Annex, North Midtown, Yorkville",43.67271,-79.405678,Ezra's Pound,43.675153,-79.405858,Café
2,"The Annex, North Midtown, Yorkville",43.67271,-79.405678,Rose & Sons,43.675668,-79.403617,American Restaurant
3,"The Annex, North Midtown, Yorkville",43.67271,-79.405678,Jean Sibelius Square,43.671426,-79.408831,Park
4,"The Annex, North Midtown, Yorkville",43.67271,-79.405678,Creeds Coffee Bar,43.6741,-79.410838,Coffee Shop


#### How many Venue Categories are there?

In [42]:
# Number of distinct venue categories.
distinct_categories = venues_Toronto['Venue Category'].unique()
len(distinct_categories)

277

#### Check if one of the Venue Categories is called 'Neighborhood'.

In [43]:
import numpy as np

# Count the distinct categories that are literally named 'Neighborhood' (0 or 1).
is_neighborhood_category = venues_Toronto['Venue Category'].unique() == 'Neighborhood'
is_neighborhood_category.sum()

1

### Reshaping the data to be ready for clustering. 
#### Before the hot encoding, the Neighborhood is set as index to facilitate the need of keeping it.
#### To avoid conflicts, we will rename the Venue Category 'Neighborhood' (which, as the previous cell showed, exists) to 'Cat Neighborhood'.
#### Clustering will be done using the feature "Venue Category".

In [44]:
# One-hot encode the venue category, carrying the neighborhood along as the index.
category_by_neighborhood = venues_Toronto.set_index('Neighborhood')['Venue Category']

onehot_Toronto = (
    pd.get_dummies(category_by_neighborhood, prefix="", prefix_sep="")
    # the category called 'Neighborhood' would clash with the restored index column
    .rename(columns={'Neighborhood': 'Cat Neighborhood'})
    .reset_index(level=0)
)
onehot_Toronto

Unnamed: 0,Neighborhood,Accessories Store,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,Art Gallery,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo
0,"The Annex, North Midtown, Yorkville",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"The Annex, North Midtown, Yorkville",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"The Annex, North Midtown, Yorkville",0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"The Annex, North Midtown, Yorkville",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Annex, North Midtown, Yorkville",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3183,"Dufferin, Dovercourt Village",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3184,"Dufferin, Dovercourt Village",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3185,"Dufferin, Dovercourt Village",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3186,"Dufferin, Dovercourt Village",0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


#### Frequency of each category by Neighborhood.

In [46]:
# Mean of each one-hot column per neighborhood, i.e. the share of that neighborhood's
# venues falling in each category; 'Neighborhood' stays a regular column.
category_freq_Toronto = onehot_Toronto.groupby('Neighborhood', as_index=False).mean()
category_freq_Toronto

Unnamed: 0,Neighborhood,Accessories Store,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,Art Gallery,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Berczy Park,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.02,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,...,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business reply mail Processing Centre,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0625,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.02,...,0.02,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.02,0.0
5,Christie,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,...,0.02,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,...,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.01,0.02,0.0
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.02,...,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.01,0.0,0.01,0.0,0.01,0.0,0.0,0.01,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.01,0.0,0.01,0.0,0.01,0.0,0.0,0.02,0.0


## Run k-means Clustering with k equals to 6

In [47]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 6

# Features only: drop the neighborhood name. The axis is given by keyword because
# pandas 2.0 no longer accepts it positionally (drop('Neighborhood', 1)).
category_freq_Toronto4clustering = category_freq_Toronto.drop(columns='Neighborhood')

# run k-means clustering
# n_init is spelled out so the number of restarts does not depend on the installed
# scikit-learn version's default; random_state keeps the run reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(category_freq_Toronto4clustering)

#### Mapping the name of each Neighborhood to its cluster id. This allows us to see which Neighborhood belongs to which cluster.

In [48]:
# Pair each neighborhood name with the k-means label found for its row
# (both follow the row order of category_freq_Toronto).
Neighborhood_Cluster_dict = dict(
    zip(category_freq_Toronto['Neighborhood'], kmeans.labels_.tolist())
)
Neighborhood_Cluster_dict

{'Berczy Park': 0,
 'Brockton, Parkdale Village, Exhibition Place': 3,
 'Business reply mail Processing Centre': 2,
 'CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport': 1,
 'Central Bay Street': 4,
 'Christie': 3,
 'Church and Wellesley': 4,
 'Commerce Court, Victoria Hotel': 0,
 'Davisville': 2,
 'Davisville North': 2,
 'Dufferin, Dovercourt Village': 2,
 'First Canadian Place, Underground city': 0,
 'Forest Hill North & West': 2,
 'Garden District, Ryerson': 4,
 'Harbourfront East, Union Station, Toronto Islands': 0,
 'High Park, The Junction South': 3,
 'India Bazaar, The Beaches West': 3,
 'Kensington Market, Chinatown, Grange Park': 3,
 'Lawrence Park': 5,
 'Little Portugal, Trinity': 3,
 'Moore Park, Summerhill East': 2,
 'North Toronto West': 2,
 'Parkdale, Roncesvalles': 3,
 "Queen's Park, Ontario Provincial Government": 4,
 'Regent Park, Harbourfront': 2,
 'Richmond, Adelaide, King': 0,
 'Rosedale': 4,
 'Roselawn': 2,
 

#### Add the information about the Clustering to the original table with the names of the Neighborhoods and the latitude and longitude.

In [49]:
# Look up each neighborhood's cluster id and assign the result as a whole column.
# The previous in-place replace() on a column selection is chained assignment, which
# does not write back to the frame under pandas Copy-on-Write, and it left any
# unmatched neighborhood name in the column as a string.
# A neighborhood missing from the mapping gets <NA>; the nullable Int64 dtype keeps
# the other labels as integers.
df_Toronto['Cluster Label'] = (
    df_Toronto['Neighborhood'].map(Neighborhood_Cluster_dict).astype('Int64')
)
df_Toronto

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Label
0,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678,3
1,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049,2
2,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,2
3,M5P,Central Toronto,Forest Hill North & West,43.696948,-79.411307,2
4,M4S,Central Toronto,Davisville,43.704324,-79.38879,2
5,M5N,Central Toronto,Roselawn,43.711695,-79.416936,2
6,M4P,Central Toronto,Davisville North,43.712751,-79.390197,2
7,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,2
8,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,5
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442,1


In [50]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map centred on the mean coordinates of the Toronto rows
Toronto_map_with_clusters = folium.Map(location=[df_Toronto['Latitude'].mean(), df_Toronto['Longitude'].mean()], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster id
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_Toronto['Latitude'], df_Toronto['Longitude'], df_Toronto['Neighborhood'], df_Toronto['Cluster Label']):
    # A row without a cluster label cannot be coloured; leave it off the map.
    if pd.isna(cluster):
        continue
    # Index the palette directly by cluster id (the earlier cluster-1 lookup only
    # worked for cluster 0 through negative-index wraparound).
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(Toronto_map_with_clusters)

Toronto_map_with_clusters