# Segmenting and Clustering Neighborhoods in Toronto

## PART 1

###  Libraries

In [32]:
import numpy as np
import pandas as pd 
import requests 
from bs4 import BeautifulSoup 

### Wikipedia Scrape

In [33]:
# Download the Wikipedia page that lists the "M" postal codes.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops the notebook on an HTTP error
# instead of silently parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data, 'html.parser')

# Column accumulators that the table-parsing cell fills, one entry per row.
postalcode = []
borough = []
neighborhood = []

In [34]:
# Parse the first <table> on the page into three parallel column lists.
# The lists are rebuilt from scratch here so that re-running this cell does
# not append the same rows a second time.
postalcode = []
borough = []
neighborhood = []

table = soup.find('table')  # looked up once instead of once per use
for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows with no <td> cells (presumably the header row) are skipped, as are
    # rows with fewer than the three cells read below, which would otherwise
    # raise IndexError.
    if len(cells) >= 3:
        postalcode.append(cells[0].text.rstrip('\n'))
        borough.append(cells[1].text.rstrip('\n'))
        neighborhood.append(cells[2].text.rstrip('\n'))

### Create dataframe

In [35]:
# Assemble the three scraped column lists into a single table and preview it.
df = pd.DataFrame(
    data=dict(
        postalcode=postalcode,
        borough=borough,
        neighborhood=neighborhood,
    )
)
df.head()

Unnamed: 0,postalcode,borough,neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Drop unassigned boroughs and group neighborhoods by postal code

In [36]:
# 1) Discard rows whose borough is the literal "Not assigned" and renumber.
# 2) Collapse rows sharing a (postalcode, borough) pair into one row, joining
#    the remaining text column values with ", ".
df = (
    df.loc[df["borough"].ne("Not assigned")]
    .reset_index(drop=True)
    .groupby(by=["postalcode", "borough"], as_index=False)
    .agg(", ".join)
)

In [37]:
# Postal codes to pull out of df, in the order they should appear.
postcodes = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]
# Retained for any later cell that refers to it; the selection below keeps
# whatever columns df already has.
column_names = ["postalcode", "borough", "neighborhood"]

# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies it on every pass. Collect the per-code slices and concatenate
# them once instead. Rows come out in the order the codes are listed, and a
# code that is absent from df simply contributes no rows.
df_final = pd.concat(
    [df[df["postalcode"] == postcode] for postcode in postcodes],
    ignore_index=True,
)

In [38]:
# (rows, columns) of the filtered and grouped table: one row per
# (postalcode, borough) pair.
df.shape

(103, 3)

## PART 2

### Load geospatial coordinates

In [39]:
# Read the per-postal-code coordinate table from the local CSV file and
# preview its first rows.
geo_coordinates = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv")
geo_coordinates.head(5)

Unnamed: 0,postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge coordinates into the postal code table

In [40]:
# Attach the coordinate columns to every postal code (left join keeps all
# rows of df even when a code has no coordinates).
# Starting from the three base columns keeps this cell idempotent: re-running
# it would otherwise merge a second time and produce suffixed duplicates
# (Latitude_x, Latitude_y, ...).
# validate="many_to_one" makes pandas raise if the coordinates table lists a
# postal code more than once, which would otherwise silently duplicate rows.
df = df[["postalcode", "borough", "neighborhood"]].merge(
    geo_coordinates,
    on="postalcode",
    how="left",
    validate="many_to_one",
)

In [41]:
# Postal codes to pull out of the merged table, in display order.
postcodes = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]
# Retained for any later cell that refers to it; the selection below keeps
# every column of df, including the merged coordinate columns.
column_names = ["postalcode", "borough", "neighborhood"]

# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies it on every pass. Collect the per-code slices and concatenate
# them once instead. Rows come out in the order the codes are listed, and a
# code that is absent from df simply contributes no rows.
df_final = pd.concat(
    [df[df["postalcode"] == postcode] for postcode in postcodes],
    ignore_index=True,
)

In [42]:
# Display the selected postal codes together with their merged coordinates.
df_final

Unnamed: 0,Latitude,Longitude,borough,neighborhood,postalcode
0,43.657952,-79.387383,Downtown Toronto,Central Bay Street,M5G
1,43.803762,-79.363452,North York,Hillcrest Village,M2H
2,43.706397,-79.309937,East York,Parkview Hill / Woodbine Gardens,M4B
3,43.744734,-79.239476,Scarborough,Scarborough Village,M1J
4,43.70906,-79.363452,East York,Leaside,M4G
5,43.659526,-79.340923,East Toronto,Studio District,M4M
6,43.750072,-79.295849,Scarborough,Wexford / Maryvale,M1R
7,43.739416,-79.588437,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...,M9V
8,43.756303,-79.565963,North York,Humber Summit,M9L
9,43.628947,-79.39442,Downtown Toronto,CN Tower / King and Spadina / Railway Lands / ...,M5V
