# Neighborhoods in Toronto

## Explore and cluster the neighborhoods

## Author: Hyojae Lee

### Import all the necessary libraries

In [2]:
import numpy as np 
import pandas as pd 
# Lift pandas' display limits so frames are shown with all columns and all rows.
# NOTE(review): with max_rows unset, displaying a large frame renders every row;
# prefer .head() / .sample() when inspecting big tables.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 

# import k-means from clustering stage
from sklearn.cluster import KMeans
from sklearn.datasets.samples_generator import make_blobs

import folium # map rendering library

# import webscraping package
from bs4 import BeautifulSoup 
from urllib.request import urlopen

print("Libraries imported!")

Libraries imported!


### Use the BeautifulSoup package to import the table information from the Wikipedia link

In [3]:
# Download the Wikipedia page and collect every <table> element found in it.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Read the body inside a context manager so the HTTP response (and its socket)
# is closed as soon as the download finishes instead of being left open.
with urlopen(url) as response:
    html = response.read()

soup = BeautifulSoup(html, 'lxml')
tables = soup.find_all('table')

**Find `<tr>` and `<td>` to distinguish rows and columns**<br>
Each row's `cells` list has 3 elements since the table has 3 columns.


In [4]:
# Parallel lists, one entry per extracted table row.
PostalCode = []
Borough = []
Neighborhood = []

for table in tables:
    for row in table.find_all('tr'):
        cells = row.find_all('td')

        # Only rows made of exactly three <td> cells are kept; rows with any
        # other number of <td> cells are skipped.
        if len(cells) != 3:
            continue

        # get_text() joins *all* text inside the cell, so a value wrapped in links
        # or other nested tags is captured in full (find(text=True) returned only
        # the first text node, and None for an empty cell). strip() drops the
        # trailing newline that table cells carry.
        code, borough, neighborhood = (cell.get_text().strip() for cell in cells)
        PostalCode.append(code)
        Borough.append(borough)
        Neighborhood.append(neighborhood)

### Use pandas to create a DataFrame from the table information

In [5]:
# Build the frame from the three scraped lists, then clean it in one chain:
#   1. remove every newline character from the cell values
#   2. drop the rows whose Borough is "Not assigned"
df = (
    pd.DataFrame({
        'PostalCode': PostalCode,
        'Borough': Borough,
        'Neighborhood': Neighborhood,
    })
    .replace('\n', '', regex=True)
    .loc[lambda frame: frame['Borough'] != "Not assigned"]
)
df.shape

(103, 3)

In [6]:
# Preview the first rows of the cleaned postal-code table.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Import csv file containing geographical coordinates of each postal code 

In [7]:
# Load the CSV of geographical coordinates from its hosted location.
GEOSPATIAL_CSV_URL = "http://cocl.us/Geospatial_data"
coordinates = pd.read_csv(GEOSPATIAL_CSV_URL)

In [8]:
# Overwrite the column labels in place so the postal-code column is named
# 'PostalCode' (the key the scraped table uses), alongside Latitude/Longitude.
coordinates.columns = ['PostalCode', 'Latitude', 'Longitude']
# Preview the first rows to confirm the new headers.
coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge coordinates with the original dataset by matching postal codes. 

In [9]:
# Attach latitude/longitude to each row by postal code. The inner join keeps
# only postal codes present in both frames.
merged_df = pd.merge(df, coordinates, on='PostalCode', how='inner')
merged_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### Use the geopy library to get the latitude and longitude values of Toronto

In [10]:
# Look up the coordinates of the city through the Nominatim geocoding service.
address = 'Toronto'

geolocator = Nominatim(user_agent="explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message here rather than with an AttributeError on the lookups below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


## Use folium library to generate a map of Toronto 

In [11]:
# create a map centered on the geocoded latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row of merged_df, with a "<neighborhood>, <borough>" popup
for lat, lng, borough, neighborhood in zip(merged_df['Latitude'], merged_df['Longitude'], merged_df['Borough'], merged_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html is an option of Popup, so it is passed here only; it was
    # previously also handed to CircleMarker, whose extra keyword arguments are
    # meant for Leaflet path options.
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='red',
    ).add_to(map_toronto)

# last expression of the cell: renders the interactive map
map_toronto