## Segmenting and Clustering Neighborhoods in Toronto

##### Explore, segment, and cluster the neighborhoods in the city of Toronto based on postal code and borough information

### Part 1: Scrape the Wikipedia page to obtain the data

In [7]:
## Install required packages
!conda install bs4
!pip install urllib3

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [15]:
# Import libraries
import bs4 as bs
import urllib.request
import pandas as pd
import numpy as np

In [10]:
# Download the Wikipedia page that lists the "M" (Toronto) postal codes and
# parse it into a BeautifulSoup tree.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The context manager closes the HTTP response as soon as the body is read,
# instead of leaving the connection open until garbage collection.
with urllib.request.urlopen(wiki_url) as response:
    source = response.read()
soup = bs.BeautifulSoup(source, 'lxml')

In [27]:
# Locate the postal-code table on the page
table = soup.find('table', attrs={'class':'wikitable sortable'})
# Collect all of its rows
table_rows = table.find_all('tr')
# Convert every data row into a list of cell texts with newlines removed.
# The first row is the table header, so it is skipped.
table_data = [
    [cell.text.replace("\n", "") for cell in table_row.find_all('td')]
    for table_row in table_rows[1:]
]

# Create the pandas dataframe
neighbours = pd.DataFrame(table_data, columns=["PostalCode", "Borough", "Neighbourhood"])
# Drop postal codes that have no borough. The explicit .copy() makes the result
# an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
neighbours = neighbours[neighbours.Borough != "Not assigned"].copy()


# Show the first 5 rows
neighbours.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [26]:
# Shape of the subset of rows whose neighbourhood is still "Not assigned"
# (a row count of 0 means none are left)
unassigned_mask = neighbours["Neighbourhood"] == "Not assigned"
neighbours[unassigned_mask].shape

(0, 3)

In [28]:
# Show the (rows, columns) shape of the neighbours dataframe after the
# "Not assigned" boroughs were removed
neighbours.shape

(103, 3)

### Part 2: Get Latitude and Longitude for each row

In [38]:
# Load the postal code -> latitude/longitude lookup table as a dataframe.
# The "Postal Code" column keeps its original name: the coordinate lookup
# further down selects it as geo['Postal Code']. The rename() call that used
# to sit here returned a new frame that was discarded, so it never changed geo.
geo = pd.read_csv(r'http://cocl.us/Geospatial_data')
geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [31]:
# Add placeholder (empty-string) coordinate columns to the dataframe
for coordinate_column in ('Latitude', 'Longitude'):
    neighbours[coordinate_column] = ""
neighbours.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,,
3,M4A,North York,Victoria Village,,
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
5,M6A,North York,"Lawrence Manor, Lawrence Heights",,
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,


In [47]:
# Loop through the neighbours dataframe and insert latitude and longitude
for index, row in neighbours.iterrows():
    row['Latitude'] = geo.loc[geo['Postal Code'] == row['PostalCode'], 'Latitude'].values[0]
    row['Longitude'] = geo.loc[geo['Postal Code'] == row['PostalCode'], 'Longitude'].values[0]
    
# Print the first 10 rows of the dataframe
neighbours.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.7533,-79.3297
3,M4A,North York,Victoria Village,43.7259,-79.3156
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6543,-79.3606
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7185,-79.4648
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6623,-79.3895
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.6679,-79.5322
9,M1B,Scarborough,"Malvern, Rouge",43.8067,-79.1944
11,M3B,North York,Don Mills,43.7459,-79.3522
12,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7064,-79.3099
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3789


### Part 3: Explore and cluster the neighborhoods in Toronto