# Segmenting and Clustering Neighborhoods in Toronto

## Description
### This notebook contains the code to complete webpage scraping using beautiful soup 

In [2]:
import numpy as np
import pandas as pd

In [5]:
#install beautifulsoup4
!conda install beautifulsoup4 -y

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    cryptography-2.5           |   py36h1ba5d50_0         643 KB
    ca-certificates-2019.1.23  |                0         126 KB
    soupsieve-1.7.1            |           py36_0          50 KB
    grpcio-1.16.1              |   py36hf8bcb03_1         1.1 MB
    libarchive-3.3.3           |       h5d8350f_5         1.5 MB
    certifi-2018.11.29         |           py36_0         146 KB
    beautifulsoup4-4.7.1       |           py36_1         143 KB
    conda-4.6.3                |           py36_0         1.7 MB
    python-3.6.8               |       h0371630_0        34.4 MB
    ------------------------------------------------------------
                                           Total:        39.7 

In [3]:
from bs4 import BeautifulSoup

In [61]:
#install lxml parser
import sys
!{sys.executable} -m pip install lxml



In [4]:
import requests

In [5]:
# open webpage with beautiful soup using the lxml web parser
html_file = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(html_file, 'lxml')

In [10]:
# print soup to find html to find
#print(soup.prettify()) #Hide once we explore html code 

In [71]:
# Scrape table using beautifulsoup and pandas
table = soup.find('table',class_='wikitable sortable')
table_rows = table.find_all('tr')
    
res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)
 
df = pd.DataFrame(res, columns=["Postcode", "Borough", "Neighborhood"])
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [72]:
# Ignore cells with a borough that is Not assigned

df.drop(df[df['Borough']=="Not assigned"].index,axis=0, inplace=True)
df.head(5)

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [75]:
# Group by postal code and combine neighborhoods, separating by commas
df1=df.groupby("Postcode").agg(lambda x:','.join(set(x)))
df1.head()

Unnamed: 0_level_0,Borough,Neighborhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern,Rouge"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Morningside,Guildwood,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [85]:
# if a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
df1.loc[df1['Neighborhood']=="Not assigned",'Neighborhood']=df1.loc[df1['Neighborhood']=="Not assigned",'Borough']

In [86]:
print(df1)

                   Borough                                       Neighborhood
Postcode                                                                     
M1B            Scarborough                                      Malvern,Rouge
M1C            Scarborough               Highland Creek,Rouge Hill,Port Union
M1E            Scarborough                    Morningside,Guildwood,West Hill
M1G            Scarborough                                             Woburn
M1H            Scarborough                                          Cedarbrae
M1J            Scarborough                                Scarborough Village
M1K            Scarborough          Ionview,Kennedy Park,East Birchmount Park
M1L            Scarborough                      Golden Mile,Clairlea,Oakridge
M1M            Scarborough      Cliffside,Scarborough Village West,Cliffcrest
M1N            Scarborough                         Cliffside West,Birch Cliff
M1P            Scarborough  Scarborough Town Centre,Wexford Heig

In [84]:
df1.shape

(103, 2)