# Segmenting and Clustering Neighborhoods in Toronto

The project is divided into three parts.

### Part I -  Web Scraping the data

In [1]:
# Libraries used to scrape the data from the Wikipedia page.

from bs4 import BeautifulSoup # Python library for pulling data out of HTML and XML files.
import requests
import csv
import pandas as pd
import numpy as np

In [2]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops the notebook on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

# Locate the postal-code table and fail with a clear message if it is missing
# (for example because the page layout changed).
table = soup.find('table', class_='wikitable sortable')
if table is None:
    raise ValueError("Could not find a 'wikitable sortable' table on the page")

# Build the CSV text: within each <tr>, the texts of the <td> cells are joined
# with commas. Rows without any <td> contribute an empty string.
# NOTE: no separator is inserted between rows; the line breaks visible in the
# printed output come from the cell text itself. Fields are not quoted, so a
# cell whose text contains a comma would yield an extra column.
# No file is opened here; the text is only collected in `cell`.
cell = "".join(
    ",".join(element.text for element in row.find_all('td'))
    for row in table.find_all('tr')
)

print(cell)

M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M5A,Downtown Toronto,Regent Park
M6A,North York,Lawrence Heights
M6A,North York,Lawrence Manor
M7A,Queen's Park,Not assigned
M8A,Not assigned,Not assigned
M9A,Etobicoke,Islington Avenue
M1B,Scarborough,Rouge
M1B,Scarborough,Malvern
M2B,Not assigned,Not assigned
M3B,North York,Don Mills North
M4B,East York,Woodbine Gardens
M4B,East York,Parkview Hill
M5B,Downtown Toronto,Ryerson
M5B,Downtown Toronto,Garden District
M6B,North York,Glencairn
M7B,Not assigned,Not assigned
M8B,Not assigned,Not assigned
M9B,Etobicoke,Cloverdale
M9B,Etobicoke,Islington
M9B,Etobicoke,Martin Grove
M9B,Etobicoke,Princess Gardens
M9B,Etobicoke,West Deane Park
M1C,Scarborough,Highland Creek
M1C,Scarborough,Rouge Hill
M1C,Scarborough,Port Union
M2C,Not assigned,Not assigned
M3C,North York,Flemingdon Park
M3C,North York,Don Mills South
M4C,East York,Woodbine Heights
M

In [3]:
# Writing our data in a csv file.
# The context manager closes (and therefore flushes) the file as soon as the
# write is done, so the complete content is on disk before anything reads it.
# errors="ignore" silently drops every character that cannot be encoded as ASCII.
with open('Neibor_toronto.csv', 'wb') as csv_file:
    bytes_written = csv_file.write(bytes(cell, encoding="ascii", errors="ignore"))

# Display the number of bytes written as the cell output.
bytes_written

8738

In [4]:
# Load the scraped CSV. The file has no header row, so the three columns
# are labelled explicitly after reading.
column_names = ["PostalCode", "Borough", "Neighborhood"]
df = pd.read_csv('Neibor_toronto.csv', header=None)
df.columns = column_names

# Preview the first ten rows.
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [5]:
# Preview the last rows of the table (DataFrame.tail shows five rows by default).
df.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor
287,M9Z,Not assigned,Not assigned


In [8]:
# Rows whose borough is "Not assigned" are removed from df (in place).
borough_unassigned = df['Borough'].eq('Not assigned')
index = df.loc[borough_unassigned].index
df.drop(index=index, inplace=True)

# Where a borough exists but the neighborhood is "Not assigned",
# the neighborhood takes the borough's name.
neighborhood_unassigned = df['Neighborhood'].eq('Not assigned')
df.loc[neighborhood_unassigned, 'Neighborhood'] = df.loc[neighborhood_unassigned, 'Borough']

# A postal code can cover several neighborhoods: collapse them into one
# comma-separated string per (PostalCode, Borough) pair, keeping the
# order in which the pairs first appear (sort=False).
grouped = df.groupby(['PostalCode', 'Borough'], sort=False)
join_table = grouped.agg(','.join)
final_table = join_table.reset_index()

final_table.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


In [7]:
# (number of rows, number of columns) of the grouped table.
final_table.shape

(103, 3)