# Part 1) Extracting Wiki Data, Formatting Table

<b> Parsing the Wikipedia Page into a pandas DataFrame </b>

In [1]:
from bs4 import BeautifulSoup
import requests 

# Download the Wikipedia page listing the "M" (Toronto) postal codes.
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
# Stop here on an HTTP error instead of parsing an error page further down.
page.raise_for_status()
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(page.content, "html.parser")


In [2]:
import pandas as pd

First, I extract the table from the Wikipedia page and assign it to 'toronto_neighborhoods'.
Then, I iterate through each row of the table and collect the postal code, borough, and neighborhood values into three lists.

In [3]:
# First <table> on the page whose CSS class is "wikitable"
toronto_neighborhoods = soup.find("table", attrs={"class": "wikitable"})

In [4]:
# Collect the cells of every table row (the header row included), then split
# the first three cells of each row into one list per column.

table_rows = [
    table_row.find_all(["td", "th"])
    for table_row in toronto_neighborhoods.find_all("tr")
]

postal_code = [cells[0].text for cells in table_rows]
borough = [cells[1].text for cells in table_rows]
neighborhood = [cells[2].text for cells in table_rows]



Now, I combine the lists into a dictionary and convert it into a pandas DataFrame:

In [5]:
# Map each column name to the list of values scraped for it
toronto_dict = dict(
    PostalCode=postal_code,
    Borough=borough,
    Neighborhood=neighborhood,
)

# Build the DataFrame from that mapping and preview the first rows
toronto_df = pd.DataFrame(toronto_dict)
toronto_df.head()



Unnamed: 0,PostalCode,Borough,Neighborhood
0,Postal Code\n,Borough\n,Neighbourhood\n
1,M1A\n,Not assigned\n,Not assigned\n
2,M2A\n,Not assigned\n,Not assigned\n
3,M3A\n,North York\n,Parkwoods\n
4,M4A\n,North York\n,Victoria Village\n


In [6]:
# Remove the newline characters that the scraped cell text carries

column_names = ['PostalCode', 'Borough', 'Neighborhood']

# Apply the same replacement to all three text columns in one pass
toronto_df[column_names] = toronto_df[column_names].apply(
    lambda values: values.str.replace('\n', '')
)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,Postal Code,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [7]:
toronto_df.shape

(181, 3)

In [8]:
# The first row holds the table's header text rather than data; remove it
header_label = toronto_df.index[0]
toronto_df = toronto_df.drop(index=header_label)
toronto_df.shape

(180, 3)

Final dataframe:

In [9]:
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [10]:
# Keep only the rows that are assigned to a borough, then renumber from 0

has_borough = toronto_df['Borough'] != 'Not assigned'
toronto_df = toronto_df[has_borough].reset_index(drop=True)

In [11]:
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


# Part 2) Getting Latitude and Longitude Details, adding to DataFrame

In [12]:
import numpy as np
import json
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



The API was not working, so I read the latitude/longitude information from a CSV file instead:

In [13]:
# Load the postal-code coordinates and align the key column's name with toronto_df

geospatial_df = (
    pd.read_csv('Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code': 'PostalCode'})
)
geospatial_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Now, I am adding two columns on toronto_df dataframe to include the latitude and longitude values corresponding to each postal code:

In [14]:
#add latitude, longitude info to toronto df

# Index the coordinates by postal code once, so each lookup is a label match
# rather than a full scan of geospatial_df for every row of toronto_df.
# drop_duplicates keeps the first row per postal code, i.e. the same row the
# earlier per-row `.iloc[0]` lookup selected.
coordinates_by_code = geospatial_df.drop_duplicates('PostalCode').set_index('PostalCode')

# Name the postal codes that have no coordinates instead of failing with an
# anonymous IndexError in the middle of the lookup.
codes_without_match = sorted(set(toronto_df['PostalCode']) - set(coordinates_by_code.index))
if codes_without_match:
    raise KeyError('No coordinates found for postal codes: {}'.format(codes_without_match))

# Lists in toronto_df row order, one entry per postal code
latitudes = toronto_df['PostalCode'].map(coordinates_by_code['Latitude']).tolist()
longitudes = toronto_df['PostalCode'].map(coordinates_by_code['Longitude']).tolist()

toronto_df['Latitude'] = latitudes
toronto_df['Longitude'] = longitudes

In [15]:
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Part 3) Clustering Neighborhoods

In [16]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [17]:
# Restrict the analysis to boroughs whose name contains "Toronto"

is_toronto_borough = toronto_df['Borough'].str.contains("Toronto")
toronto = toronto_df[is_toronto_borough]
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [18]:
# API information for foursquare:
# The credentials are read from the environment so they are never stored in
# the notebook. Keys that were previously committed here should be treated as
# exposed and rotated in the Foursquare developer console.
import os
import warnings

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

if not CLIENT_ID or not CLIENT_SECRET:
    # Warn right away so a later failed API call is easy to trace back here.
    warnings.warn(
        'Foursquare credentials are not set: export FOURSQUARE_CLIENT_ID and '
        'FOURSQUARE_CLIENT_SECRET before starting the notebook.'
    )

Getting all the nearby venues for boroughs in Toronto:

In [19]:
def getNearbyVenues(names, postal, latitudes, longitudes, radius=300, limit=100):
    """Query the Foursquare "explore" endpoint once per location and tabulate the venues.

    The four positional arguments are iterated in lockstep, one entry per
    location. The module-level CLIENT_ID, CLIENT_SECRET and VERSION are sent
    with every request.

    Parameters
    ----------
    names : iterable
        Neighborhood name for each location.
    postal : iterable
        Postal code for each location.
    latitudes, longitudes : iterable
        Coordinates for each location.
    radius : int, optional
        Value sent as the `radius` query parameter (default 300).
    limit : int, optional
        Value sent as the `limit` query parameter (default 100).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'PostalCode', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    column_names = ['Neighborhood',
                    'PostalCode',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, pst, lat, lng in zip(names, postal, latitudes, longitudes):

        # let requests build and escape the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30)

        # surface bad credentials / quota errors as an HTTP error instead of
        # a KeyError while digging through the error payload
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # a venue may come back with an empty category list
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                pst,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the columns to the constructor keeps an empty result well-formed
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

In [21]:
# Fetch the venues around every Toronto postal-code location

venues = getNearbyVenues(
    names=toronto['Neighborhood'],
    postal=toronto['PostalCode'],
    latitudes=toronto['Latitude'],
    longitudes=toronto['Longitude'],
)



We are grouping the venues by their Postal Codes:

In [22]:
#GROUP by postal code
venues.groupby('PostalCode').count().head()

Unnamed: 0_level_0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
M4E,3,3,3,3,3,3,3
M4K,23,23,23,23,23,23,23
M4L,16,16,16,16,16,16,16
M4M,23,23,23,23,23,23,23
M4N,2,2,2,2,2,2,2


In [23]:
# One-hot encode the venue categories: one indicator column per category

venue_categories = venues[['Venue Category']]
toronto_onehot = pd.get_dummies(venue_categories, prefix="", prefix_sep="")

# Put the postal code back as the first column so rows can be grouped by it
toronto_onehot.insert(0, 'PostalCode', venues['PostalCode'])
toronto_onehot.head()

Unnamed: 0,PostalCode,Accessories Store,Adult Boutique,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Arepa Restaurant,Art Gallery,...,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Average the indicator columns per postal code, giving the share of each
# category among that postal code's venues

toronto_grouped = toronto_onehot.groupby('PostalCode', as_index=False).mean()
toronto_grouped.head()

Unnamed: 0,PostalCode,Accessories Store,Adult Boutique,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Arepa Restaurant,Art Gallery,...,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now, for each Postal Code, we are creating a dataframe that tells us the most popular venues in that area:

In [25]:
#put into pandas df

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest values in `row`.

    The first entry of `row` is skipped (it is the 'PostalCode' value when
    called on a row of toronto_grouped); the remaining entries are sorted in
    descending order and the leading labels are returned as an array.
    """
    return (
        row.iloc[1:]
        .sort_values(ascending=False)
        .index.values[:num_top_venues]
    )

num_top_venues = 5
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['PostalCode']

for ind in np.arange(num_top_venues):
    # an explicit bounds check replaces the bare `except`, which would also
    # have hidden any unrelated error raised while formatting
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# rank the categories for every postal code, then build the frame in one go
# instead of filling a pre-allocated frame row by row
top_venues_per_code = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in np.arange(toronto_grouped.shape[0])
]

postalcodes_venues_sorted = pd.DataFrame(
    top_venues_per_code,
    columns=columns[1:],
    index=toronto_grouped.index,
)
postalcodes_venues_sorted.insert(0, 'PostalCode', toronto_grouped['PostalCode'])

postalcodes_venues_sorted.head()

Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M4E,Playground,Park,Trail,Yoga Studio,Diner
1,M4K,Greek Restaurant,Restaurant,Ice Cream Shop,Yoga Studio,Italian Restaurant
2,M4L,Light Rail Station,Italian Restaurant,Pet Store,Pizza Place,Movie Theater
3,M4M,Coffee Shop,Bar,Thai Restaurant,Fish Market,Cheese Shop
4,M4N,Photography Studio,Lake,Fast Food Restaurant,Falafel Restaurant,Ethiopian Restaurant


# Using KMeans to Cluster Toronto Neighborhoods:

In [27]:
#arrange Toronto into 5 clusters 

kclusters = 5

# `columns=` replaces the positional axis argument (`drop('PostalCode', 1)`),
# which pandas 2.x rejects.
toronto_grouped_clustering = toronto_grouped.drop(columns='PostalCode')

# n_init is pinned to 10 (the long-standing default) so that, together with
# random_state, the clustering does not change with the scikit-learn version.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)
kmeans.labels_[0:10]



array([3, 3, 3, 3, 3, 3, 0, 3, 3, 3], dtype=int32)

In [28]:
toronto_grouped_clustering.head()

Unnamed: 0,Accessories Store,Adult Boutique,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Arepa Restaurant,Art Gallery,Arts & Crafts Store,...,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [30]:
#add clustering labels
# Drop a label column left over from a previous run of this cell first:
# `insert` raises if the column already exists, which made the cell fail
# when executed a second time.
if 'Cluster Labels' in postalcodes_venues_sorted.columns:
    postalcodes_venues_sorted = postalcodes_venues_sorted.drop(columns='Cluster Labels')
postalcodes_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

#join the cluster label and top venues onto toronto (which carries the
#latitude/longitude) by postal code; postal codes that have no row in
#postalcodes_venues_sorted keep NaN in the joined columns
toronto_merged = toronto.join(postalcodes_venues_sorted.set_index('PostalCode'), on='PostalCode')

In [31]:
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,3.0,History Museum,Bakery,Park,Design Studio,Coffee Shop
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,3.0,Coffee Shop,Italian Restaurant,Park,Sandwich Place,Café
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,3.0,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Movie Theater
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3.0,Gastropub,Coffee Shop,Restaurant,Japanese Restaurant,Gym
19,M4E,East Toronto,The Beaches,43.676357,-79.293031,3.0,Playground,Park,Trail,Yoga Studio,Diner


In [34]:
#get coordinates for toronto

address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="yyz_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the address cannot be resolved
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# one colour per cluster, spread evenly over the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]
unclustered_color = '#808080'  # rows whose cluster label is NaN after the join

# colour used for each marker, in row order
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    if pd.isna(cluster):
        cluster_text = 'n/a'
        marker_color = unclustered_color
    else:
        # the join turns the labels into floats; convert back for indexing
        cluster_text = str(int(cluster))
        marker_color = rainbow[int(cluster)]
    markers_colors.append(marker_color)

    label = folium.Popup(str(poi) + ' Cluster ' + cluster_text, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.
