# Segmenting and Clustering Neighborhoods in Toronto
## Part 1: Web Scraping from Wikipedia

### Import needed libraries

In [10]:
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
from urllib import request as urre

### I have used BeautifulSoup to scrap the Wikipedia HTML

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
website_url = rq.get(url).text
sp = bs(website_url,'lxml')

### And then find table and store columns and rows of it

In [4]:
tb = sp.find('table',{'class':'wikitable sortable'})
columns = [th.text.replace('\n', '') for th in tb.find('tr').find_all('th')]

trs = tb.find_all('tr')[1:]
rows = list()
for tr in trs:
    rows.append([td.text.replace('\n', '').replace('\xa0', '') for td in tr.find_all('td')])

### Finally I created DFs according to the conditions
#### Condition 1 : all data

#### Condition 2: all data except Borough is equal to Not assigned 

#### Condition 3: Con2 + use comma to separate code with more than 1 neighborhood
#### Wiki's list has been changed, they have grouped all neoghborhood that shares same postal code
#### so this condition will only replace the separator as the question requested

In [5]:
df_con1 = pd.DataFrame(rows,columns=columns) 
df_con2 = df_con1[df_con1.Borough != 'Not assigned']
df_con3 = df_con2.replace(' / ',', ',regex=True)
df_con3

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road , Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Business reply mail Processing CentrE
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [6]:
df_con3.shape

(103, 3)

## Part 2: Geo-Coding Neighborhoods

Retrieve CSV files and store it as DataFrame

In [30]:
#urre.urlretrieve('https://cocl.us/Geospatial_data','geo.csv')
df_lalo = pd.read_csv('geo.csv')
df_part2 = pd.merge(left=df_con3,right=df_lalo,left_on='Postal code', right_on='Postal Code')
df_part2.drop(columns='Postal Code',inplace=True)
df_part2

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road , Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing CentrE,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
