# **Neighborhood Clustering of Toronto, Canada**

In [24]:
# This will import the libraries needed to scrape the data
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Scraping the data and creating a pandas DataFrame

In [25]:
# This will scrape the Toronto postal code page from Wikipedia and create a Beautiful Soup object
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the article.
response.raise_for_status()

# NOTE: despite its name, `toronto_url` holds the page's HTML text, not a URL.
toronto_url = response.text
soup = BeautifulSoup(toronto_url, 'html.parser')

In [26]:
# This will parse the first table of the soup object into a list of row dicts.
# For every <td>, the first three characters of its <p> text are used as the
# postal code and its <span> text is split into borough and neighborhood.
table_contents = []
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

for td in table.find_all('td'):
    # Cells without a <span> or <p> cannot be parsed; skip them instead of
    # failing with AttributeError on `None.text`.
    if td.span is None or td.p is None:
        continue

    label = td.span.text
    if label == 'Not assigned':
        continue

    # Text before the first '(' is the borough; text between the first and
    # second '(' is the neighborhood list.
    parts = label.split('(')
    borough = parts[0]
    if len(parts) > 1:
        neighborhood = (
            parts[1]
            .strip(')')
            .replace(' /', ',')   # " / " separators become ", "
            .replace(')', ' ')
            .strip(' ')
        )
    else:
        # No parenthesised neighborhood list: reuse the borough name so the
        # row is kept instead of raising IndexError.
        neighborhood = borough

    table_contents.append({
        'Postal Code': td.p.text[:3],
        'Borough': borough,
        'Neighborhood': neighborhood,
    })

# This will create the pandas dataframe from the parsed rows and clean up the Borough field.
# Passing `columns` keeps the three columns present even if no row was parsed.
df = pd.DataFrame(table_contents, columns=['Postal Code', 'Borough', 'Neighborhood'])

# A few labels run extra text straight into the borough name; map those exact
# strings to readable borough names.
df['Borough'] = df['Borough'].replace({
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
})

df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [27]:
# This will display the shape of the dataframe as a (rows, columns) tuple
df.shape

(103, 3)

# Part II - Get the latitude and longitude for each postal code

In [28]:
# This will import the geospatial coordinates csv file into a dataframe.
# The path is relative, so the file must be in the notebook's working directory.
toronto_coordinates = pd.read_csv('Geospatial_Coordinates.csv')

In [29]:
# Will merge the two dataframes together to get the coordinates and neighborhood data.
# Both frames are indexed by 'Postal Code' and joined on that index; the inner
# join keeps only postal codes present in both.
df_toronto = df.set_index(['Postal Code'])

# Re-index the coordinates only once. This cell rebinds `toronto_coordinates`,
# so without the guard a second run would raise KeyError because
# 'Postal Code' is no longer a column.
if toronto_coordinates.index.name != 'Postal Code':
    toronto_coordinates = toronto_coordinates.set_index(['Postal Code'])

# Name the joined index and turn it back into an ordinary column, without
# mutating the frame in place.
toronto_new_df = (
    pd.concat([df_toronto, toronto_coordinates], axis=1, join='inner')
    .rename_axis('Postal Code')
    .reset_index()
)

toronto_new_df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
