# Week 3 CapStone Project
---
## Segmenting and Clustering Neighborhoods in Toronto


We will use `request` and `beautifulSoup` Python libraries to scrape the webpage provided to extract all the required information and put them into a pandas DataFrame

### Part 1: WebScrapping Wiki and Creating DataFrame

In [1]:
!conda install -c conda-forge geopy --yes
!conda install -c conda-forge folium --yes


Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    openssl-1.0.2r             |       h14c3975_0         3.1 MB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    certifi-2018.8.24          |        py35_1001         139 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

   

In [2]:
#import libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs

In [28]:
# create a request for the url
wiki = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
content = bs(wiki,'lxml')

In [29]:
table = content.find('table',{'class':'wikitable sortable'})
# print(table)

In [5]:
# saving all the rows in a csv file
raw_data = "Postcode,Borough,Neighborhood\n"
for r in table.find_all('tr'):
    row = ""
    for item in r.find_all('td'):
        row = row + "," + item.text
    raw_data = raw_data + row[1:]
file = open("toronto.csv",'wb')
file.write(bytes(raw_data,encoding='ascii',errors='ignore'))

8768

In [6]:
# store in data frame
df = pd.read_csv('toronto.csv')
df.head()
df.shape

(288, 3)

In [7]:
# since many rows have not assigned entry we remove it
# indexname = df [ df['Borough'] == 'Not assigned'].index

df.drop(df [ df['Borough'] == 'Not assigned'].index, inplace=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [8]:
df.loc[df['Neighborhood'] == 'Not assigned','Neighborhood'] = df['Borough']
df.shape

(211, 3)

In [9]:
df = df.groupby(['Postcode','Borough'],sort=False).agg(', '.join).reset_index()
df.shape

(103, 3)

### Part 2 Adding Lattitude and Longitude using geocoder

In [30]:
!wget -q -O 'Toronto_locations.csv' http://cocl.us/Geospatial_data
df_loc = pd.read_csv('Toronto_locations.csv')
df_loc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [102]:
# Merge both the data frame into one

# set index to postal code
temp_df = df.set_index('Postcode')
temp_loc = df_loc.set_index('Postal Code')
loc_df = pd.concat([temp_df,temp_loc],axis=1,join='inner')
loc_df.index.name = 'PostalCode'
loc_df.reset_index(inplace=True)

In [103]:
loc_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


### Part 3 Explore and Create Cluster in the Toronto Neighborhood

In [13]:
# importing libraries
from geopy.geocoders import Nominatim

import matplotlib.cm as cm
import matplotlib.colors as colors

# !conda install scikit-learn --yes
from sklearn.cluster import KMeans

import folium
print('done!')

done!


In [14]:
# set up
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="tl-toronto-neigh")
location = geolocator.geocode(address)
lat = location.latitude
long = location.longitude
print(lat, long)

43.653963 -79.387207


In [15]:
map_toronto = folium.Map(location=[lat,long],zoom_start=10)

# adding labels to the map
for lt, lg, pc, bgh, ngh in zip(loc_df['Latitude'],loc_df['Longitude'],
                                loc_df['PostalCode'],loc_df['Borough'],loc_df['Neighborhood']):
    label = "{} [ {} ]: {}".format(bgh,pc,ngh)
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lt,lg],radius=5,popup=popup,color='red',fill=True,
                        fill_color='#3186cc',fill_opacity = 0.6, parse_html=False).add_to(map_toronto)
map_toronto

In [27]:
# The code was removed by Watson Studio for sharing.

In [20]:
import json

radius = 500
LIMIT = 100

venue_list = []

for lt, lg, pc, bgh, ngh in zip(loc_df['Latitude'],loc_df['Longitude'],
                                loc_df['PostalCode'],loc_df['Borough'],loc_df['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lt,
        lg,
        radius, 
        LIMIT)
    result = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in result:
        venue_list.append((pc, 
            bgh,
            ngh,
            lt, 
            lg, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))
    

In [22]:
venue_df = pd.DataFrame(venue_list)
venue_df.columns = ['PostalCode', 'Borough', 'Neighborhood', 'Borough-Latitude', 
                    'Borough-Longitude', 'Venue-Name', 'Venue-Latitude', 'Venue-Longitude', 'Venue-Category']
venue_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Borough-Latitude,Borough-Longitude,Venue-Name,Venue-Latitude,Venue-Longitude,Venue-Category
0,M3A,North York,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,North York,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,M3A,North York,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,M4A,North York,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,M4A,North York,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [26]:
venue_df.groupby(['PostalCode', 'Borough', 'Neighborhood'])['Venue-Name'].count()
len(venue_df['Venue-Category'].unique())

280

#### Analyzing Each Area's Venues

In [39]:
# creating one hot encoding matrix
onehot = pd.get_dummies(venue_df[['Venue-Category']],prefix="",prefix_sep="")

# onehot.head()
onehot['PostalCode'] = venue_df['PostalCode']
onehot['Borough'] = venue_df['Borough']
onehot['Neighborhood'] = venue_df['Neighborhood']
onehot.shape

(2258, 282)

In [41]:
toronto_group_mean = onehot.groupby(['PostalCode','Borough','Neighborhood']).mean().reset_index()
toronto_group_mean.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,Scarborough,"Rouge, Malvern",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,Scarborough,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,Scarborough,Cedarbrae,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
# We will return top 10 most freq venues in our data frame
top_k = 10
indicators = ['st','nd','rd']
area = ['PostalCode','Borough','Neighborhood']
freq=[]

for x in np.arange(top_k):
    try:
        freq.append('{}{} Most Common Venue'.format(x+1, indicators[x]))
    except:
        freq.append('{}th Most Common Venue'.format(x+1))

top_venues = pd.DataFrame(columns = area + freq)
top_venues['PostalCode'] = toronto_group_mean['PostalCode']
top_venues['Borough'] = toronto_group_mean['Borough']
top_venues['Neighborhood'] = toronto_group_mean['Neighborhood']

# adding the ranking
for num in np.arange(toronto_group_mean.shape[0]):
    row = toronto_group_mean.iloc[num, :].iloc[3:]
    row = row.sort_values(ascending=False) # Highest Rank gets first position
    top_venues.iloc[num, 3:] = row.index.values[0:top_k]

top_venues.sort_values(freq, inplace=True)
top_venues

Unnamed: 0,PostalCode,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
66,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Terminal,Airport Lounge,Airport Service,Plane,Sculpture Garden,Bar,Boat or Ferry,Harbor / Marina,Airport Gate,Airport Food Court
74,M6H,West Toronto,"Dovercourt Village, Dufferin",Bakery,Supermarket,Pharmacy,Brazilian Restaurant,Coffee Shop,Bank,Middle Eastern Restaurant,Café,Discount Store,Pool
91,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ...",Bank,Yoga Studio,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant,Dim Sum Restaurant
80,M6P,West Toronto,"High Park, The Junction South",Bar,Café,Mexican Restaurant,Fried Chicken Joint,Gastropub,Bakery,Cajun / Creole Restaurant,Furniture / Home Store,Speakeasy,Flea Market
75,M6J,West Toronto,"Little Portugal, Trinity",Bar,Men's Store,Asian Restaurant,Coffee Shop,Restaurant,Bakery,Café,Pizza Place,New American Restaurant,Cocktail Bar
89,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",Baseball Field,Yoga Studio,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Dim Sum Restaurant
94,M9M,North York,"Emery, Humberlea",Baseball Field,Yoga Studio,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Dim Sum Restaurant
81,M6R,West Toronto,"Parkdale, Roncesvalles",Breakfast Spot,Gift Shop,Bookstore,Dog Run,Italian Restaurant,Restaurant,Bar,Dessert Shop,Movie Theater,Eastern European Restaurant
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",Bus Line,Bakery,Fast Food Restaurant,Intersection,Bus Station,Metro Station,Soccer Field,Park,Gift Shop,German Restaurant
19,M2L,North York,"Silver Hills, York Mills",Cafeteria,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Yoga Studio,Dim Sum Restaurant


#### Building KMeans Cluster

In [115]:
# dropping all info col

Kmeans_df = toronto_group_mean.drop(['PostalCode','Borough','Neighborhood'],1)
KM = KMeans(n_clusters=3,random_state=0).fit(Kmeans_df)
len(KM.labels_)

100

In [105]:
# The code was removed by Watson Studio for sharing.

In [116]:
cluster_df = loc_df
cluster_df['cluster'] = KM.labels_

cluster_df = cluster_df.join(top_venues.drop(
    ['Borough','Neighborhood'],1).set_index('PostalCode'), on='PostalCode')
cluster_df.sort_values(['cluster']+ freq, inplace=True)
cluster_df


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
87,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.394420,0,Airport Terminal,Airport Lounge,Airport Service,Plane,Sculpture Garden,Bar,Boat or Ferry,Harbor / Marina,Airport Gate,Airport Food Court
31,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259,0,Bakery,Supermarket,Pharmacy,Brazilian Restaurant,Coffee Shop,Bank,Middle Eastern Restaurant,Café,Discount Store,Pool
11,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ...",43.650943,-79.554724,0,Bank,Yoga Studio,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant,Dim Sum Restaurant
69,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763,0,Bar,Café,Mexican Restaurant,Fried Chicken Joint,Gastropub,Bakery,Cajun / Creole Restaurant,Furniture / Home Store,Speakeasy,Flea Market
37,M6J,West Toronto,"Little Portugal, Trinity",43.647927,-79.419750,0,Bar,Men's Store,Asian Restaurant,Coffee Shop,Restaurant,Bakery,Café,Pizza Place,New American Restaurant,Cocktail Bar
57,M9M,North York,"Emery, Humberlea",43.724766,-79.532242,0,Baseball Field,Yoga Studio,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Dim Sum Restaurant
75,M6R,West Toronto,"Parkdale, Roncesvalles",43.648960,-79.456325,0,Breakfast Spot,Gift Shop,Bookstore,Dog Run,Italian Restaurant,Restaurant,Bar,Dessert Shop,Movie Theater,Eastern European Restaurant
44,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577,0,Bus Line,Bakery,Fast Food Restaurant,Intersection,Bus Station,Metro Station,Soccer Field,Park,Gift Shop,German Restaurant
45,M2L,North York,"Silver Hills, York Mills",43.757490,-79.374714,0,Cafeteria,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Yoga Studio,Dim Sum Restaurant
80,M5S,Downtown Toronto,"Harbord, University of Toronto",43.662696,-79.400049,0,Café,Bakery,Bar,Italian Restaurant,Japanese Restaurant,Bookstore,Restaurant,Poutine Place,Beer Bar,Beer Store


In [117]:
# create a cluster using this data frame
map_clusters = folium.Map(location=[lat,long],zoom_start=10)

# set color scheme for the clusters
x = np.arange(3)
ys = [i + x + (i*x)**2 for i in range(3)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(cluster_df['Latitude'], cluster_df['Longitude'], cluster_df['Neighborhood'], cluster_df['cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters