In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [6]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# A timeout keeps the cell from hanging, and raise_for_status() stops us from
# parsing an HTTP error page as if it were the article.
wiki = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
wiki.raise_for_status()
soup = BeautifulSoup(wiki.text, 'lxml')

# find table
table = soup.find('table', class_='wikitable')
if table is None:
    # soup.find returns None when nothing matches; fail here with a clear message
    # instead of an AttributeError in the parsing cell.
    raise ValueError("No table with class 'wikitable' was found on the page")
table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

In [7]:
# Store parsed data into Pandas DataFrame

# Read every table row into a list of cell texts (header and body cells alike),
# trimming trailing whitespace/newlines from each cell.
rows = [
    [cell.text.rstrip() for cell in tr.find_all(['th', 'td'])]
    for tr in table.find_all('tr')
]

# First row of data is the header; everything after it is a record.
columns = rows[0] if rows else []
data = rows[1:]

Toronto = pd.DataFrame(data=data, columns=columns)
Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [8]:
#Ignore cells with a borough that is Not assigned.
# .copy() detaches the filtered rows from the original frame, so the column
# assignments made on `Toronto` in later cells operate on a real frame rather
# than on a slice (which triggers SettingWithCopyWarning).
Toronto = Toronto[Toronto['Borough'] != 'Not assigned'].copy()
Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [9]:
#More than one neighborhood can exist in one postal code area. 
#Combined into one row with the neighborhoods separated with a comma

# Build a new frame with assign() instead of writing a column into `Toronto`,
# which at this point is a filtered slice of the scraped table. Every row of a
# postcode receives the same joined string, so drop_duplicates() then keeps
# one row per (Postcode, Borough, Neighbourhood) under its original index label.
Toronto = Toronto.assign(
    Neighbourhood=Toronto.groupby("Postcode")["Neighbourhood"].transform(lambda neigh: ', '.join(neigh))
).drop_duplicates()
Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Harbourfront, Regent Park"
6,M6A,North York,"Lawrence Heights, Lawrence Manor"
8,M7A,Queen's Park,Not assigned


In [10]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

# An in-place replace on Toronto['Neighbourhood'] is chained assignment: it
# modifies a temporary column object and is not guaranteed to update the frame
# (it does not under pandas copy-on-write). Build the column explicitly instead.
unassigned = Toronto['Neighbourhood'] == 'Not assigned'
Toronto = Toronto.assign(
    Neighbourhood=Toronto['Neighbourhood'].mask(unassigned, Toronto['Borough'])
)
# Show a sample rather than dumping the whole frame.
Toronto.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Harbourfront, Regent Park"
6,M6A,North York,"Lawrence Heights, Lawrence Manor"
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,"Rouge, Malvern"
14,M3B,North York,Don Mills North
15,M4B,East York,"Woodbine Gardens, Parkview Hill"
17,M5B,Downtown Toronto,"Ryerson, Garden District"


In [11]:
# Index the cleaned table by postcode and label that index "Postal Code".
TorontoPostalCodes = (
    Toronto
    .set_index("Postcode")
    .rename_axis("Postal Code", axis='index')
)
TorontoPostalCodes.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Harbourfront, Regent Park"
M6A,North York,"Lawrence Heights, Lawrence Manor"
M7A,Queen's Park,Queen's Park


In [12]:
# Shell escape: install the geocoder package from the conda-forge channel,
# answering "yes" to the confirmation prompt (-y).
!conda install -c conda-forge geocoder -y


# NOTE(review): geocoder is not referenced by any cell visible in this part of
# the notebook; confirm it is still needed before keeping the install step.
import geocoder

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.4 MB

The following NEW packages will be INSTALLED:

    geocoder:        1.38.1-py_1       conda-forge
    ratelim:         0.1.6-py_2        conda-forge

The following packages will be UPDATED:

    

In [13]:
# Load the postal-code -> (Latitude, Longitude) lookup table.
toronto_go = pd.read_csv('https://cocl.us/Geospatial_data')

# Index by "Postal Code" so it can be joined onto the postal-code table.
# (The bare `toronto_go` expression that used to sit mid-cell displayed nothing,
# because only a cell's last expression is rendered, so it has been removed.)
toronto_data = toronto_go.set_index("Postal Code")
toronto_data.head(10)


Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476
M1J,43.744734,-79.239476
M1K,43.727929,-79.262029
M1L,43.711112,-79.284577
M1M,43.716316,-79.239476
M1N,43.692657,-79.264848


In [14]:
# Attach coordinates to each postal code by matching on the shared index.
# how="left" is DataFrame.join's default, spelled out here: every row of
# TorontoPostalCodes is kept, whether or not a coordinate row matches.
Toronto_2 = TorontoPostalCodes.join(toronto_data, how="left")
Toronto_2

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Harbourfront, Regent Park",43.654260,-79.360636
M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
M7A,Queen's Park,Queen's Park,43.662301,-79.389494
M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
M3B,North York,Don Mills North,43.745906,-79.352188
M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [15]:
# Imports for the geocoding, clustering and mapping cells.
# NOTE(review): json, cm and colors are not used by any cell visible in this
# part of the notebook; confirm they are needed further down.
import json 
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
# Shell escape: install folium pinned to version 0.5.0 right before importing it.
!pip install folium==0.5.0
import folium 

Collecting folium==0.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/07/37/456fb3699ed23caa0011f8b90d9cad94445eddc656b601e6268090de35f5/folium-0.5.0.tar.gz (79kB)
[K     |████████████████████████████████| 81kB 13.1MB/s eta 0:00:01
[?25hCollecting branca (from folium==0.5.0)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/dsxuser/.cache/pip/wheels/f8/98/ff/954791afc47740d554f0d9e5885fa09dd60c2265d42578e665
Successfully built folium
Installing collected packages: branca, folium
Successfully installed branca-0.3.1 folium-0.5.0


In [16]:
address = 'Toronto, Ontario Canada'

# Nominatim needs an application-specific user agent: geopy 1.x emits a
# deprecation warning when it is omitted and geopy 2.x refuses to run.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match for the address.
    raise ValueError("Could not geocode address: {!r}".format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto Canada are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Toronto Canada are 43.653963, -79.387207.


In [18]:
# Base map centred on the geocoded Toronto coordinates.
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Appearance shared by every marker, collected once instead of repeated per call.
marker_style = dict(
    radius=4,
    color='blue',
    fill=True,
    fill_color='#87cefa',
    fill_opacity=0.5,
    parse_html=False,
)

# One circle marker per postal code, with a "<neighbourhood>, <borough>" popup.
for lat, lng, borough, neighborhood in zip(Toronto_2['Latitude'], Toronto_2['Longitude'], Toronto_2['Borough'], Toronto_2['Neighbourhood']):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=popup, **marker_style)
    marker.add_to(map_Toronto)
map_Toronto

In [19]:
from pandas.io.json import json_normalize
import folium
from geopy.geocoders import Nominatim 
import requests

In [23]:
import os

# Foursquare credentials are read from the environment so that they are never
# stored in (or published with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; a missing variable
# raises KeyError naming it.
# NOTE(review): the key pair that was previously hard-coded here should be
# treated as leaked and rotated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180604'  # value sent as the API's `v` query parameter
LIMIT = 100  # value sent as the API's `limit` query parameter

In [28]:
radius = 2000  # value sent as the API's `radius` query parameter
LIMIT = 100

venues = []

for lat, lng, borough, neighborhood in zip(Toronto_2['Latitude'], Toronto_2['Longitude'], Toronto_2['Borough'], Toronto_2['Neighbourhood']):

    # Let requests build and URL-encode the query string rather than
    # formatting the credentials and coordinates into the URL by hand.
    params = {
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "v": VERSION,
        "ll": "{},{}".format(lat, lng),
        "radius": radius,
        "limit": LIMIT,
    }

    # make the GET request; the timeout prevents one stalled call from hanging
    # the loop, and raise_for_status() surfaces HTTP errors (bad credentials,
    # quota exceeded) instead of a confusing KeyError on the JSON below.
    response = requests.get("https://api.foursquare.com/v2/venues/explore", params=params, timeout=30)
    response.raise_for_status()
    groups = response.json()["response"].get("groups") or []
    results = groups[0]["items"] if groups else []

    # return only relevant information for each nearby venue
    for item in results:
        venue = item['venue']
        # Fall back to None if a venue carries no category, instead of
        # aborting the whole crawl with an IndexError.
        categories = venue.get('categories') or []
        venues.append((
            neighborhood,
            lat,
            lng,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            categories[0]['name'] if categories else None))

In [27]:
# convert the venues list into a new DataFrame, naming the columns up front.
# Passing `columns=` to the constructor also works when `venues` is empty,
# whereas assigning seven names to a zero-column frame afterwards raises.
venues_df = pd.DataFrame(
    venues,
    columns=['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory'],
)

print(venues_df.shape)
venues_df.head()

(8562, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.753259,-79.329656,Donalda Golf & Country Club,43.752816,-79.342741,Golf Course
2,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
3,Parkwoods,43.753259,-79.329656,LCBO,43.757774,-79.314257,Liquor Store
4,Parkwoods,43.753259,-79.329656,Graydon Hall Manor,43.763923,-79.342961,Event Space


In [29]:
 # check how many venues were returned for each neighborhood
    
    
# Non-null entries per column for each neighbourhood, i.e. its venue count.
venues_df.groupby("Neighborhood").count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,100,100,100,100,100,100
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",98,98,98,98,98,98
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",47,47,47,47,47,47
"Alderwood, Long Branch",100,100,100,100,100,100
"Bathurst Manor, Downsview North, Wilson Heights",53,53,53,53,53,53
Bayview Village,47,47,47,47,47,47
"Bedford Park, Lawrence Manor East",100,100,100,100,100,100
Berczy Park,100,100,100,100,100,100
"Birch Cliff, Cliffside West",46,46,46,46,46,46


In [30]:
# Report how many distinct venue categories appear across all returned venues.

print('There are {} uniques categories.'.format(venues_df['VenueCategory'].unique().size))

There are 327 uniques categories.


In [31]:
# Preview the first 50 distinct venue categories.
venues_df.loc[:, 'VenueCategory'].unique()[:50]

array(['Caribbean Restaurant', 'Golf Course', 'Park', 'Liquor Store',
       'Event Space', 'Middle Eastern Restaurant', 'Supermarket',
       'Gym / Fitness Center', 'Mediterranean Restaurant',
       'Japanese Restaurant', 'Seafood Restaurant', 'Persian Restaurant',
       'Fast Food Restaurant', 'Café', 'Ice Cream Shop',
       'Asian Restaurant', 'Pizza Place', 'American Restaurant',
       'Steakhouse', 'Chinese Restaurant', 'Coffee Shop',
       'Discount Store', 'Pool Hall', 'Burger Joint', 'Fish & Chips Shop',
       'Grocery Store', 'Fried Chicken Joint', 'Pharmacy',
       'Italian Restaurant', 'Hotel', 'Salad Place',
       'Paper / Office Supplies Store', 'Gym', 'Bank', 'Hakka Restaurant',
       'Beer Store', 'Sandwich Place', 'Thai Restaurant',
       'Greek Restaurant', 'Breakfast Spot', 'Wings Joint', 'Bar',
       'Automotive Shop', 'Diner', 'Furniture / Home Store',
       'Skating Rink', 'Toy / Game Store', 'Bus Line', 'Sushi Restaurant',
       'Video Store'], dtype

In [32]:
# check if the results contain "Shopping Mall" -- the analysis below indexes
# the "Shopping Mall" column, so that is the category that must be present.
# (The cell previously tested the string "Neighborhood", which did not match
# this description. Its recorded True shows that a venue category literally
# named "Neighborhood" exists, which is why the one-hot frame stores the
# neighbourhood names under "Neighborhoods" instead.)
"Shopping Mall" in venues_df['VenueCategory'].unique()

True

In [34]:

# One indicator column per venue category, named after the bare category.
To_onehot = pd.get_dummies(venues_df['VenueCategory'])

# Re-attach the neighbourhood each venue row belongs to.
To_onehot['Neighborhoods'] = venues_df['Neighborhood']

# Rotate the column order by one so that the last column leads.
fixed_columns = list(To_onehot.columns[-1:]) + list(To_onehot.columns[:-1])
To_onehot = To_onehot.loc[:, fixed_columns]

print(To_onehot.shape)
To_onehot.head()

(8562, 328)


Unnamed: 0,Neighborhoods,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Average each category indicator within a neighbourhood, giving the share of
# that neighbourhood's venues that fall in each category.

To_grouped = To_onehot.groupby("Neighborhoods", as_index=False).mean()

print(To_grouped.shape)
To_grouped

(103, 328)


Unnamed: 0,Neighborhoods,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Adelaide, King, Richmond",0.0,0.00,0.000000,0.00,0.00,0.020000,0.00,0.00,0.000000,...,0.000000,0.000000,0.00,0.0,0.00,0.000000,0.000000,0.010000,0.000000,0.000000
1,Agincourt,0.0,0.00,0.000000,0.00,0.00,0.010000,0.00,0.00,0.000000,...,0.010000,0.000000,0.00,0.0,0.01,0.000000,0.000000,0.000000,0.000000,0.000000
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.00,0.000000,0.00,0.00,0.000000,0.00,0.00,0.000000,...,0.030612,0.000000,0.00,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.00,0.000000,0.00,0.00,0.000000,0.00,0.00,0.000000,...,0.000000,0.000000,0.00,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
4,"Alderwood, Long Branch",0.0,0.00,0.000000,0.00,0.00,0.010000,0.00,0.00,0.000000,...,0.000000,0.000000,0.00,0.0,0.00,0.020000,0.000000,0.000000,0.000000,0.000000
5,"Bathurst Manor, Downsview North, Wilson Heights",0.0,0.00,0.000000,0.00,0.00,0.018868,0.00,0.00,0.000000,...,0.000000,0.000000,0.00,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
6,Bayview Village,0.0,0.00,0.000000,0.00,0.00,0.000000,0.00,0.00,0.000000,...,0.021277,0.000000,0.00,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
7,"Bedford Park, Lawrence Manor East",0.0,0.00,0.000000,0.00,0.00,0.000000,0.00,0.00,0.000000,...,0.000000,0.000000,0.00,0.0,0.00,0.010000,0.000000,0.000000,0.000000,0.000000
8,Berczy Park,0.0,0.00,0.000000,0.00,0.00,0.020000,0.00,0.00,0.000000,...,0.000000,0.000000,0.00,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
9,"Birch Cliff, Cliffside West",0.0,0.00,0.000000,0.00,0.00,0.000000,0.00,0.00,0.000000,...,0.000000,0.000000,0.00,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000


In [38]:
# Number of neighbourhoods whose "Shopping Mall" frequency is above zero.
int((To_grouped["Shopping Mall"] > 0).sum())

43

In [40]:
# Keep only the neighbourhood name and its "Shopping Mall" frequency.
To_mall = To_grouped.loc[:, ["Neighborhoods", "Shopping Mall"]]
To_mall.head()

Unnamed: 0,Neighborhoods,Shopping Mall
0,"Adelaide, King, Richmond",0.01
1,Agincourt,0.02
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.010204
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0
4,"Alderwood, Long Branch",0.01


In [42]:
# Run k-means to cluster the neighborhoods in Toronto into 3 clusters.

# set number of clusters
Tclusters = 3

# Cluster on the numeric feature only. `columns=` replaces the positional
# axis argument (`drop([...], 1)`), which pandas 2.0 no longer accepts.
To_clustering = To_mall.drop(columns=["Neighborhoods"])

# run k-means clustering; n_init is pinned to 10 (the long-standing default)
# so results do not shift with scikit-learn's newer 'auto' default.
kmeans = KMeans(n_clusters=Tclusters, n_init=10, random_state=0).fit(To_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 1, 1, 0, 1, 1, 2, 0, 0, 0], dtype=int32)

In [58]:
# Build a frame holding each neighbourhood, its "Shopping Mall" frequency and
# the k-means cluster label assigned to it; the name column becomes "Neighborhood".
To_merged = (
    To_mall
    .assign(**{"Cluster Labels": kmeans.labels_})
    .rename(columns={"Neighborhoods": "Neighborhood"})
)
To_merged.head()

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels
0,"Adelaide, King, Richmond",0.01,1
1,Agincourt,0.02,1
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.010204,1
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0
4,"Alderwood, Long Branch",0.01,1


In [59]:

# Report the frame's dimensions, then order its rows by cluster label.
print(To_merged.shape)
To_merged = To_merged.sort_values(by="Cluster Labels")
To_merged

(103, 3)


Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels
51,"Harbourfront, Regent Park",0.000000,0
64,Lawrence Park,0.000000,0
62,L'Amoreaux West,0.000000,0
58,Humewood-Cedarvale,0.000000,0
56,"Humber Bay, King's Mill Park, Kingsway Park So...",0.000000,0
55,"Humber Bay Shores, Mimico South, New Toronto",0.000000,0
53,"Highland Creek, Rouge Hill, Port Union",0.000000,0
52,"High Park, The Junction South",0.000000,0
101,Woodbine Heights,0.000000,0
50,"Harbourfront East, Toronto Islands, Union Station",0.000000,0
