# Segmenting and Clustering Neighborhoods in Toronto

## 1. Use webscraping to copy table of Toronto postal codes, neighborhoods and boroughs from Wikipedia

### We'll start by importing Pandas, requests and BeautifulSoup

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Lift pandas' display truncation so whole frames are rendered in the notebook.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

### Download the Wikipedia page and save the contents as html_data

In [None]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were the table.
res = requests.get(url, timeout=30)
res.raise_for_status()
html_data = res.content

### Create a BeautifulSoup object ("soup") and extract the table to a variable called "table"

In [None]:
# Parse the downloaded bytes with the built-in HTML parser and keep the first
# <table> element found in the document.
soup = BeautifulSoup(markup=html_data, features='html.parser')
table = soup.find(name='table')

### Create an empty list called "table_contents". Then we will use a for loop to go cell by cell through the table, saving the values to a dictionary with keys 'Postal Code', 'Borough' and 'Neighborhood'. At the end of each loop iteration, the new dictionary entry is appended to the list.

In [None]:
table_contents = []

# Walk every <td> of the table. The postal code is read from the cell's <p>
# and the "Borough(Neighborhood / Neighborhood)" text from its <span>.
for td in table.find_all('td'):
    # Skip cells that lack the expected tags instead of failing on None.
    if td.p is None or td.span is None:
        continue

    span_text = td.span.text
    # Postal codes without an assigned borough carry no useful data.
    if span_text == 'Not assigned':
        continue

    # Everything before the first '(' is the borough; the piece after it is
    # the neighborhood list. Cells without parentheses get an empty list.
    parts = span_text.split('(')
    raw_neighborhoods = parts[1] if len(parts) > 1 else ''
    neighborhood = (
        raw_neighborhoods
        .strip(')')
        .replace(' /', ',')   # " / " separators become commas
        .replace(')', ' ')
        .strip(' ')
    )

    table_contents.append({
        'Postal Code': td.p.text[:3],   # first three characters of the <p> text
        'Borough': parts[0],
        'Neighborhood': neighborhood,
    })

In [None]:
# One row per scraped postal code; preview the first ten.
Toronto_Boroughs = pd.DataFrame(data=table_contents)
Toronto_Boroughs.head(n=10)

### Replace some PO-box and industrial-address entries in the "Borough" column with simpler borough names

In [None]:
# Some scraped borough strings have several text fragments run together;
# map those exact strings to short, readable borough names.
Toronto_Boroughs['Borough'] = Toronto_Boroughs['Borough'].replace(to_replace={
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
})


### Print the shape (number of entries)

In [None]:
# (rows, columns) of the cleaned postal-code table
Toronto_Boroughs.shape

## 2. Get Geographic Coordinates
### Use !wget to download the provided csv file and the pandas 'read_csv' function to convert it to a dataframe

In [None]:
geo_data = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'

!wget -O Geospatial_Coordinates.csv https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv

lat_long = pd.read_csv('Geospatial_Coordinates.csv')
lat_long.head()

### Merge lat_long and Toronto_Boroughs dataframes

In [None]:
# Match each postal code with its coordinates; only codes present in both
# tables are kept (default merge behaviour).
boroughs_latlong = Toronto_Boroughs.merge(lat_long, on='Postal Code')
boroughs_latlong

## 3. Cluster Toronto Boroughs
### Import additional libraries to fetch data and draw maps

In [None]:
import numpy as np
import random # library for random number generation

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# library for transforming a JSON file into a pandas dataframe
import json
from pandas.io.json import json_normalize

import folium # plotting library

#Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

### Get Lat/Long for Toronto and display a map of the neighborhoods

In [None]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message rather than an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

In [None]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Drop one blue circle per postal-code row, with a "neighborhood, borough" popup.
marker_rows = boroughs_latlong[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_rows.itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

In [None]:
### Input FourSquare Credentials:

In [None]:
import os

# Credentials come from the environment so that secrets are never stored in
# (or committed with) the notebook. Any keys that were previously pasted here
# should be treated as leaked and revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
ACCESS_TOKEN = os.environ.get('FOURSQUARE_ACCESS_TOKEN', '')  # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 100

# Warn early (without aborting the notebook) when the required keys are absent.
_missing = [
    env_name
    for env_name, env_value in (
        ('FOURSQUARE_CLIENT_ID', CLIENT_ID),
        ('FOURSQUARE_CLIENT_SECRET', CLIENT_SECRET),
    )
    if not env_value
]
if _missing:
    print('Warning: set {} before calling the Foursquare API.'.format(', '.join(_missing)))

### Let's borrow the get_category_type function from the Foursquare lab.

In [None]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    The category list is looked up under ``'categories'`` and, if that key is
    absent, under ``'venue.categories'``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by those keys (e.g. a dict or a pandas Series).

    Returns
    -------
    The ``'name'`` of the first category, or ``None`` when the list is empty.

    Raises
    ------
    KeyError
        If neither key is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors propagate.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

### Let's also keep the function to get venue data (using the getNearbyVenues function) but modify it for neighborhoods in Toronto

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated in lockstep; each triple describes one location to query.
    radius : int, default 500
        Sent as the ``radius`` query parameter of every request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but keeps these columns) when nothing is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and
    prints each location name as it is processed.
    """
    column_names = [
        'Neighborhood',
        'Neighborhood Latitude',
        'Neighborhood Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category',
    ]

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string; the timeout keeps a
        # stalled connection from hanging the whole loop.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()

        # A response without any group simply contributes no venues.
        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        for item in items:
            venue = item['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # Venues without any category get None instead of an IndexError.
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor keeps an empty result valid.
    return pd.DataFrame(venue_rows, columns=column_names)

In [None]:
### Now, we'll use the functions above to populate "toronto_venues" with the venue info from FourSquare for each Neighborhood:

In [None]:
# Fetch venues around every postal-code centroid (default search radius).
toronto_venues = getNearbyVenues(
    boroughs_latlong['Neighborhood'],
    boroughs_latlong['Latitude'],
    boroughs_latlong['Longitude'],
)

### Make sure we got a full dataframe

In [None]:
# Sanity check: print (rows, columns), then show the first five venue rows.
print(toronto_venues.shape)
toronto_venues.head()

### How many unique venue categories are there in Toronto?

In [None]:
# Count the distinct values in the 'Venue Category' column.
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

### Analyze each neighborhood

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

toronto_onehot

### Move 'Neighborhood' to the first column.

In [None]:
# Remove the 'Neighborhood' column and re-insert it at position 0.
first_column = toronto_onehot.pop('Neighborhood')
toronto_onehot.insert(loc=0, column='Neighborhood', value=first_column)

toronto_onehot.head()

In [None]:
# (venue rows, 1 + number of category columns)
toronto_onehot.shape

### Next, let's group rows by neighborhood, taking the mean of the frequency of occurrence of each category

In [None]:
# Average the indicator columns per neighborhood: each value becomes the share
# of that neighborhood's venues falling in the category.
toronto_grouped = toronto_onehot.groupby('Neighborhood', as_index=False).mean()
toronto_grouped

### I'm gonna steal this function to put venue categories in descending order of prominence:

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, biggest first.

    The first entry of ``row`` is skipped (callers pass rows whose first
    field is the neighborhood name); the remaining entries are sorted in
    descending order and the index labels of the first ``num_top_venues``
    are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

### ...and create a nice table of the top 10 venue categories for each neighborhood

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Build the headers "1st Most Common Venue", "2nd ...", "3rd ...", "4th ...".
# Ranks beyond the explicit suffix list fall back to 'th'; the explicit bounds
# check replaces a bare except that would have hidden unrelated errors.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# Empty frame with those headers, seeded with the neighborhood names.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# Fill each row with that neighborhood's top categories, most frequent first.
for ind in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

### And then let's do the k-means cluster analysis (and just stick with 5 clusters):

In [None]:
kclusters = 5

# Cluster on the category frequencies only; the name column is not a feature.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering (fixed seed for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
### But first, let's have a reminder of what the "neighborhoods_venues_sorted" df looks like:

In [None]:
# Display the top-venues table as it stands before the cluster labels are added.
neighborhoods_venues_sorted

### Now we'll add the cluster labels to it:

In [None]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
#neighborhoods_venues_sorted #verify above line added cluster label to each neighborhood in neighborhoods_venues_sorted

toronto_merged = boroughs_latlong

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head(105)

### Some of the Cluster Labels ended up as NaN: those neighborhoods have no rows in the venue table, so the join above finds no match for them. The NaN values force "Cluster Labels" to be converted to float, which won't work down the line, so we'll drop those neighborhoods and then convert "Cluster Labels" to int64.

In [None]:
# Drop every row that received no cluster label, whichever rows those are,
# instead of relying on row labels that change with each API response.
toronto_merged = toronto_merged.dropna(subset=['Cluster Labels'])

In [None]:
# Store the labels as integers so they can be used as list indices later.
toronto_merged = toronto_merged.astype({'Cluster Labels': 'int64'})

In [None]:
# Display the merged table after the cleanup above.
toronto_merged

### And lastly, let's map it

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters