# part 1 - scraping the data and cleaning the table

### Import the required packages

In [1]:
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
import matplotlib.cm as cm
import matplotlib.colors as colors
import pandas as pd
import numpy as np
import geopy
from geopy.geocoders import Nominatim
import os
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium 

In [2]:
!pip install -U numpy

!pip install -U pandas

!pip install -U scipy

!pip install -U scikit-learn

!pip install -U imbalanced-learn




### Webscraping the data from https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M


In [3]:
pip install pandas as pd


Note: you may need to restart the kernel to use updated packages.


In [4]:
# The URL carries an explicit `oldid`, i.e. it addresses one specific revision of the page.
my_url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&direction=prev&oldid=926287641'

# Fetch the page once (the response is shared by both names) and fail loudly
# on an HTTP error instead of parsing an error page.
wiki_page_request = requests.get(my_url)
wiki_page_request.raise_for_status()
r = wiki_page_request

# BeautifulSoup is imported in the first cell; `soup` is the parsed document.
soup = BeautifulSoup(r.content, 'html5lib')


#find the table
wiki_tables = soup.find_all('table', {"class": "wikitable sortable"})


#find the table headers
column = [i.text.strip() for i in wiki_tables[0].find_all('th')]


#extract the table body
data = wiki_tables[0].find_all('td')

# The <td> cells are consumed three at a time: postcode, borough, neighbourhood.
cell_text = [cell.text.strip() for cell in data]

#create list for postcode
postcode = cell_text[0::3]

#create list for borough
borough = cell_text[1::3]

#create list for neighbourhood
neighbourhood = cell_text[2::3]

#create dictionary (header text -> list of cell values)
table_df = dict(zip(column, [postcode, borough, neighbourhood]))

#create the dataframe
table = pd.DataFrame(table_df)
table


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor


### cleaning the table 

In [5]:
#remove rows where borough is not assigned 
table = table.loc[table["Borough"] != "Not assigned"]

# count how many neighbourhood rows each postcode has
temp = table.groupby(['Postcode'])['Neighbourhood'].count().to_frame().reset_index(drop=False)

# postcodes listed with several neighbourhoods vs. exactly one
multi = temp.query("Neighbourhood > 1")['Postcode'].values
single = temp.query("Neighbourhood == 1")['Postcode'].values

# mergesort is stable, so rows sharing a postcode keep the order they had in the
# scraped table; the joined neighbourhood names below are therefore deterministic.
t1 = table.loc[table.Postcode.isin(multi), :].sort_values(by='Postcode', kind='mergesort').reset_index(drop=True)
t2 = table.loc[table.Postcode.isin(single), :].sort_values(by='Postcode', kind='mergesort').reset_index(drop=True)

# a new dataframe for 'Postcode' with > 1 'Neighbourhood':
# one row per postcode, the alphabetically first borough name (what np.unique(...)[0]
# selected before), and all neighbourhood names joined with ', '.
multiple = t1.groupby('Postcode', as_index=False).agg({'Borough': 'min', 'Neighbourhood': ', '.join})

# combine the tables
tablenew = pd.concat([multiple, t2], axis=0).sort_values(by='Postcode').reset_index(drop=True)



### fixing neighbourhoods not assigned

In [6]:
# Look up rows whose neighbourhood is still the 'Not assigned' placeholder.
# (This is not the cell's last expression, so the result is evaluated but not displayed.)
tablenew.query("Neighbourhood == 'Not assigned'")

# Give postcode M7A the neighbourhood name "Queen's Park".
tablenew.loc[tablenew['Postcode'].eq('M7A'), 'Neighbourhood'] = "Queen's Park"

### counting rows and columns


In [7]:
# (number of rows, number of columns) of the cleaned postcode table
tablenew.shape

(103, 3)

# Part 2: inserting latitude and longitude

### inserting lat and long

In [8]:
# NOTE(review): these two dicts are not used anywhere in the visible cells.
geo_lat = {}
geo_lon = {}

# CSV with one latitude/longitude pair per postal code
url1 = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv"
geo_cords = pd.read_csv(url1)

#combine: inner join keeps only postcodes present in both tables
datacombine = tablenew.merge(geo_cords, how="inner", left_on="Postcode", right_on="Postal Code")

# Select the output columns by name rather than by position so the result does not
# silently change if either input gains or reorders a column.
dataclean = datacombine[["Postal Code", "Borough", "Neighbourhood", "Latitude", "Longitude"]]
dataclean.head()


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"West Hill, Morningside, Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Part 3: analysing the data

### Create a dataframe for the Toronto borough data

In [36]:
# Keep the rows whose borough name contains "Toronto", renumber them from 0,
# and switch the neighbourhood column to the spelling used in the cells below.
Toronto_data = (
    dataclean.loc[dataclean['Borough'].str.contains("Toronto")]
    .reset_index(drop=True)
    .rename(columns={"Neighbourhood": "Neighborhood"})
)
Toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


### Calling Foursquare API

In [10]:
# Foursquare credentials are read from the environment so that they never end up
# in the notebook source or in its saved outputs.
# NOTE(review): the key pair that used to be hard-coded here was published with the
# notebook and should be revoked/rotated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET environment '
        'variables before running this cell.'
    )

# Confirm the credentials were found without echoing them.
print('Foursquare credentials loaded from the environment.')

Your credentails:
CLIENT_ID: [redacted]
CLIENT_SECRET:[redacted]


In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Walked in parallel; each triple describes one location. The name is
        printed as a progress marker and copied into every venue row.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but still has these columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # make the GET request; requests builds and escapes the query string,
        # and the timeout keeps a stalled connection from hanging the notebook
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()

        # a successful answer without any group is treated as "no venues here"
        groups = response.json().get('response', {}).get('groups', [])
        results = groups[0]['items'] if groups else []

        # return only relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # a venue may come back with an empty category list; keep the venue
            # and leave its category missing instead of raising IndexError
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing columns= keeps the frame well-formed even when venue_rows is empty
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return(nearby_venues)

In [37]:
# Fetch the venues around the coordinates of every row in Toronto_data
# (getNearbyVenues' default radius is used).
toronto_venues = getNearbyVenues(
    names=Toronto_data['Neighborhood'],
    latitudes=Toronto_data['Latitude'],
    longitudes=Toronto_data['Longitude'],
)

The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Summerhill East, Moore Park
Rathnelly, Deer Park, South Hill, Summerhill West, Forest Hill SE
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
King, Adelaide, Richmond
Harbourfront East, Toronto Islands, Union Station
Toronto Dominion Centre, Design Exchange
Victoria Hotel, Commerce Court
Roselawn
Forest Hill West, Forest Hill North
The Annex, Yorkville, North Midtown
Harbord, University of Toronto
Chinatown, Kensington Market, Grange Park
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
Underground city, First Canadian Place
Christie
Dufferin, Dovercourt Village
Trinity, Little Portugal
Parkdale Village, Exhibition Place, Brockton
High Park, The 

### analysing each neighbourhood 


In [13]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Adult Boutique,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### group rows by neighborhood

In [14]:
# Average every indicator column within each neighborhood; 'Neighborhood' stays
# an ordinary first column and the rows are numbered from 0.
toronto_grouped = toronto_onehot.groupby('Neighborhood', as_index=False).mean()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0
1,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.066667,0.066667,0.133333,0.133333,0.066667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021739,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.016667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.016667,0.0,0.0,0.016667,0.0
5,"Chinatown, Kensington Market, Grange Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.046154,0.0,0.046154,0.015385,0.0
6,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Church and Wellesley,0.026316,0.013158,0.0,0.0,0.0,0.0,0.0,0.013158,0.0,...,0.013158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


###  print each neighborhood along with the top 5 most common venues

In [15]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    # turn the neighborhood's single row into a two-column (venue, freq) table
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # skip the first row (the 'Neighborhood' label itself); assign() returns a new
    # frame, so nothing is written into a slice of another frame
    temp = temp.iloc[1:].assign(freq=lambda d: d['freq'].astype(float))
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
          venue  freq
0   Coffee Shop  0.10
1  Cocktail Bar  0.07
2        Bakery  0.05
3    Restaurant  0.03
4      Pharmacy  0.03


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0  Light Rail Station  0.12
1          Comic Shop  0.06
2          Restaurant  0.06
3             Brewery  0.06
4         Pizza Place  0.06


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
                 venue  freq
0       Airport Lounge  0.13
1      Airport Service  0.13
2                Plane  0.07
3  Rental Car Location  0.07
4             Boutique  0.07


----Cabbagetown, St. James Town----
                venue  freq
0         Coffee Shop  0.09
1                Café  0.07
2  Italian Restaurant  0.04
3         Pizza Place  0.04
4           Pet Store  0.04


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.18
1      Sandwich Place  0.07
2         

### put in a pandas df and display the top 10 venues for each neighbourhood

In [16]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``, ignoring its first entry.

    The first element of ``row`` is skipped, the remaining values are sorted in
    descending order, and the index labels of the first ``num_top_venues`` of
    them are returned as an array (fewer if the row is shorter).
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [38]:
num_top_venues = 10

# ordinal suffixes for ranks 1-3; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # explicit bounds check instead of a bare `except:`, which would also have
    # hidden any unrelated error raised inside the try block
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill each row with that neighborhood's top categories, most frequent first
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Restaurant,Pharmacy,Cheese Shop,Seafood Restaurant,Beer Bar,Farmers Market,Department Store
1,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Comic Shop,Restaurant,Brewery,Pizza Place,Skate Park,Farmers Market,Burrito Place,Butcher,Garden
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Service,Plane,Rental Car Location,Boutique,Boat or Ferry,Sculpture Garden,Bar,Coffee Shop,Airport
3,"Cabbagetown, St. James Town",Coffee Shop,Café,Italian Restaurant,Pizza Place,Pet Store,Bakery,Pub,Restaurant,Market,Sri Lankan Restaurant
4,Central Bay Street,Coffee Shop,Sandwich Place,Café,Italian Restaurant,Burger Joint,Salad Place,Bubble Tea Shop,Donut Shop,Restaurant,Comic Shop


## Clustering Neighbourhoods

### Run k-means to group the neighborhoods into 5 clusters

In [18]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

In [39]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [40]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = Toronto_data

# merge toronto grouped with toronto data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Health Food Store,Trail,Pub,Yoga Studio,Museum,Martial Arts School,Mediterranean Restaurant,Men's Store,Mexican Restaurant,Middle Eastern Restaurant
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,1,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Yoga Studio,Lounge,Frozen Yogurt Shop,Liquor Store,Spa
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,1,Pizza Place,Park,Fast Food Restaurant,Gym,Restaurant,Fish & Chips Shop,Pet Store,Board Shop,Food & Drink Shop,Liquor Store
3,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Coffee Shop,Brewery,Café,Bakery,American Restaurant,Gastropub,Yoga Studio,Diner,Seafood Restaurant,Cheese Shop
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,3,Park,Bus Line,Swim School,Yoga Studio,Movie Theater,Martial Arts School,Mediterranean Restaurant,Men's Store,Mexican Restaurant,Middle Eastern Restaurant


### Finally, let's visualize the resulting clusters

In [45]:
address = 'Toronto, Ontario'

# Nominatim is imported from geopy.geocoders in the first cell.
geolocator = Nominatim(user_agent="ny_explorer", timeout=30)
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; fail with a clear message instead of
# an AttributeError on the next line.
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Toronto 43.6534817, -79.3839347.


In [46]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters