1. Scrape Wikipedia for data and create dataframe¨

In [49]:
#import the needed libraries
import requests 
from bs4 import BeautifulSoup
import pandas as pd 
from sklearn.cluster import KMeans
import numpy as np

In [2]:
url ='https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=942851379'

In [3]:
wikipage = requests.get(url)


In [4]:
#Parse the HTML page
soup = BeautifulSoup(wikipage.content, "lxml")
print(soup)

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"XoOv2ApAMM8AAWwA1BUAAAAA","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":948084252,"wgRevisionId":942851379,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgPageContentLanguage":"en","wgPag

In [5]:

# Get Table Data from the wiki page data 
table = soup.find('table', class_='wikitable sortable')
rows = table.find_all('tr')
data =[]
for row in rows:
    data.append([t.text.strip() for t in row.find_all('td')])

neighborhoods = pd.DataFrame(data, columns=['Postal Code', 'Borough', 'Neighborhood'])
neighborhoods =  neighborhoods[neighborhoods.Borough != "Not assigned"]
neighborhoods = neighborhoods.reset_index()
neighborhoods = neighborhoods.drop(["index"], axis=1)
neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,,,
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M6A,North York,Lawrence Heights


In [35]:
#Group Neighborhoods with the same postal code 
filter_neighborhoods = neighborhoods.groupby(['Postal Code', 'Borough']).agg({'Neighborhood': lambda x:' , '.join(x)}).reset_index()

In [7]:
filter_neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Adding the Coordinates of the Postal Codes 

In [8]:
import geocoder # import geocoder



# loop until you get the coordinates
latitude = []
longitude = []

for index, row in filter_neighborhoods.iterrows():
    postal_code = row["Postal Code"]
    lat_lng_coords = None
    while(lat_lng_coords is None):
      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
      lat_lng_coords = g.latlng

latitude.append(lat_lng_coords[0])
longitude.append(lat_lng_coords[1])

print(latitude)
print(longitude)



KeyboardInterrupt: 

In [9]:
!wget -O Geospatial_Coordinates.csv https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv

--2020-03-31 23:02:48--  https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv
Resolving cocl.us (cocl.us)... 159.8.72.228, 159.8.69.21, 159.8.69.24
Connecting to cocl.us (cocl.us)|159.8.72.228|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-03-31 23:02:49--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 185.235.236.197
Connecting to ibm.box.com (ibm.box.com)|185.235.236.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-03-31 23:02:50--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Reusing existing connection to ibm.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/public/static/9afzr83pps4

In [13]:
#Read the CSV and drop the postal code column 
coordinates = pd.read_csv("Geospatial_Coordinates.csv")
coordinates = coordinates.drop(["Postal Code"], axis=1)
rounded_coordinates = coordinates.round({'Latitude':2, 'Longitude':2})

In [14]:
rounded_coordinates.head()

Unnamed: 0,Latitude,Longitude
0,43.81,-79.19
1,43.78,-79.16
2,43.76,-79.19
3,43.77,-79.22
4,43.77,-79.24


In [15]:
#Concat the two data frames 
filter_neighborhoods = pd.concat([filter_neighborhoods, rounded_coordinates], axis=1)
filter_neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.81,-79.19
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.78,-79.16
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76,-79.19
3,M1G,Scarborough,Woburn,43.77,-79.22
4,M1H,Scarborough,Cedarbrae,43.77,-79.24


## Getting and Plotting Foursquare Data

In [16]:
from geopy.geocoders import Nominatim
import json
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
print ('Lirbraries imported.')

Lirbraries imported.


In [17]:
toronto_filtered = filter_neighborhoods[filter_neighborhoods.Borough.str.contains('Toronto')] #filter out 
toronto_filtered.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.68,-79.29
41,M4K,East Toronto,"The Danforth West, Riverdale",43.68,-79.35
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.67,-79.32
43,M4M,East Toronto,Studio District,43.66,-79.34
44,M4N,Central Toronto,Lawrence Park,43.73,-79.39


In [36]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronto_mapper")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

#create the map of Toronto 
map_toronto = folium.Map(location=[latitude, longitude],zoom_start=12)


for lat, lng, neighborhood, borough in zip(toronto_filtered['Latitude'], toronto_filtered['Longitude'], toronto_filtered['Neighborhood'], toronto_filtered['Borough']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto 

## Using K means to cluster neighborhoods

In [47]:
k = 4
toronto_clustered = toronto_filtered.drop(['Postal Code','Borough','Neighborhood'], 1)
kmeans = KMeans(n_clusters= k, random_state=0).fit(toronto_clustered)
kmeans.labels_
toronto_filtered.insert(0, 'Cluster #', kmeans.labels_)

ValueError: cannot insert Cluster #, already exists

In [48]:
toronto_filtered

Unnamed: 0,Cluster #,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,3,M4E,East Toronto,The Beaches,43.68,-79.29
41,3,M4K,East Toronto,"The Danforth West, Riverdale",43.68,-79.35
42,3,M4L,East Toronto,"The Beaches West, India Bazaar",43.67,-79.32
43,3,M4M,East Toronto,Studio District,43.66,-79.34
44,0,M4N,Central Toronto,Lawrence Park,43.73,-79.39
45,0,M4P,Central Toronto,Davisville North,43.71,-79.39
46,0,M4R,Central Toronto,North Toronto West,43.72,-79.41
47,0,M4S,Central Toronto,Davisville,43.7,-79.39
48,0,M4T,Central Toronto,"Moore Park, Summerhill East",43.69,-79.38
49,0,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.69,-79.4


## Map the Clusters

In [55]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronto_mapper")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

#create the map of Toronto 
map_clusters = folium.Map(location=[latitude, longitude],zoom_start=12)

#create colors
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#add markers to the map 
markers_colors = []
for lat, lon, neighborhood, cluster in zip(toronto_filtered['Latitude'], 
                                           toronto_filtered['Longitude'],
                                           toronto_filtered['Neighborhood'],
                                           toronto_filtered['Cluster #']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
    [lat, lon],
    radius=5,
    popup=label,
    color=rainbow[cluster-1],
    fill=True,
    fill_color=rainbow[cluster-1],
    fill_opacity=0.7).add_to(map_clusters)

map_clusters 


### Retrive Foursquare information

In [19]:
CLIENT_ID = 'TJVRDDBR45XWMUQH4EV1QYCPNTLR2RQAROBNMHQXJPBTI33I' # your Foursquare ID
CLIENT_SECRET = 'BHJOCMOCKSYORF3WGYOOG1B50S3CAC2MEPDESQPIEMPCOMT2' # your Foursquare Secret
ACCESS_TOKEN = 'FYXYLPMMWRDIPWIFPTAH3NEVILB5SANF1YLXLM2UNM3MRQEK'

VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TJVRDDBR45XWMUQH4EV1QYCPNTLR2RQAROBNMHQXJPBTI33I
CLIENT_SECRET:BHJOCMOCKSYORF3WGYOOG1B50S3CAC2MEPDESQPIEMPCOMT2


In [39]:

LIMIT = 100
CATEGORY_ID = '4d4b7105d754a06374d81259'
#Function to get the Toronto Neighborhoods
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        print(lat)
        print(lng)
        #API Request URL 
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&oauth_token={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            ACCESS_TOKEN,
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        #make a GET Request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
 
        #reutn only relevant info
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
        
        nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
        nearby_venues.columns = ['Neighborhood',
                               'Neighborhood Latitude',
                               'Neighborhood Longitude',
                               'Venue',
                               'Venue Latitude',
                               'Venue Longitude',
                               'Venue Category']
        return(nearby_venues)


In [40]:
print(toronto_filtered['Neighborhood'])

toronto_venues = getNearbyVenues(names=toronto_filtered['Neighborhood'],
                                 latitudes=toronto_filtered['Latitude'], 
                                 longitudes=toronto_filtered['Longitude'])

37                                          The Beaches
41                         The Danforth West, Riverdale
42                       The Beaches West, India Bazaar
43                                      Studio District
44                                        Lawrence Park
45                                     Davisville North
46                                   North Toronto West
47                                           Davisville
48                          Moore Park, Summerhill East
49    Deer Park, Forest Hill SE, Rathnelly, South Hi...
50                                             Rosedale
51                          Cabbagetown, St. James Town
52                                 Church and Wellesley
53                                         Harbourfront
54                             Ryerson, Garden District
55                                       St. James Town
56                                          Berczy Park
57                                   Central Bay

In [41]:
print(toronto_venues.shape)
toronto_venues

(28, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.68,-79.29,The Beech Tree,43.680493,-79.288846,Gastropub
1,The Beaches,43.68,-79.29,Beaches Bake Shop,43.680363,-79.289692,Bakery
2,The Beaches,43.68,-79.29,The Feathers Pub,43.680501,-79.287522,Pub
3,The Beaches,43.68,-79.29,Glen Manor Ravine,43.676821,-79.293942,Trail
4,The Beaches,43.68,-79.29,The Real Jerk Beaches,43.680781,-79.285727,Caribbean Restaurant
5,The Beaches,43.68,-79.29,The Green Dragon,43.680834,-79.284712,Pub
6,The Beaches,43.68,-79.29,No Bull Burger,43.680761,-79.285019,Burger Joint
7,The Beaches,43.68,-79.29,Subway,43.680716,-79.287006,Sandwich Place
8,The Beaches,43.68,-79.29,Threads Lifestyle,43.680516,-79.287752,Clothing Store
9,The Beaches,43.68,-79.29,Guardian Upper Beach Pharmacy,43.680732,-79.28772,Pharmacy
