# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

## Part 1: transform the data in the table on the Wikipedia page into the above pandas dataframe.

### Part 1a: scraping data using Beautiful Soup

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from urllib.request import urlopen
!pip install beautifulsoup4
from bs4 import BeautifulSoup



In [4]:
#define the URL
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#Fetch the page and parse it while the HTTP response is open; the `with` block
#closes the response afterwards instead of leaving the connection dangling.
#The 'html.parser' backend is used because it ships with the standard library.
with urlopen(url) as source:
    soup = BeautifulSoup(source,'html.parser')
#examine the type of soup
type(soup)

bs4.BeautifulSoup

### Part 1b: getting the right table

In [5]:
#locate the <table> element whose class attribute is 'wikitable sortable' and keep it as 'table'
table = soup.find('table', attrs={'class':'wikitable sortable'})

In [6]:
# Walk the <tr> rows of the table and collect the stripped text of their <td> cells.
# Cells whose text is empty are dropped, and rows that end up with no text at all
# (e.g. a header row made only of <th> cells) are skipped.
table_rows = table.find_all('tr')
res = []
for tr in table_rows:
    # distinct names for row and cell avoid shadowing the loop variable,
    # and each cell is stripped only once
    cell_texts = (cell.text.strip() for cell in tr.find_all('td'))
    row = [text for text in cell_texts if text]
    if row:
        res.append(row)

# NOTE: the column label 'Neightborhood' (sic) is kept unchanged so that any code
# referring to this label keeps working.
df = pd.DataFrame(res, columns=["PostalCode",'Borough','Neightborhood'])
print('the original shape of the table is',df.shape)


the original shape of the table is (180, 3)


### Part 1c: cleaning up data as per the assignment requirements

In [7]:
# 1) Keep only the rows whose borough is assigned; rows with the borough value 'Not assigned' are discarded.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]
print('after removing rows without an assigned borough, the shape is', df.shape)

after removing rows without an assigned borough, the shape is (103, 3)


In [8]:
# 2) More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.
# The uniqueness of PostalCode is checked explicitly rather than assumed, so the
# message below is only printed when it is actually true for the scraped data.
if df['PostalCode'].is_unique:
    print('Since the count and unique of PostalCode have the same value, there is no doubling of PostaCode in the dataframe')
else:
    # Collapse rows sharing a postal code: keep the first borough and join the
    # (non-null) neighborhood names with a comma, preserving first-seen order.
    df = (
        df.groupby('PostalCode', sort=False, as_index=False)
          .agg({'Borough': 'first',
                'Neightborhood': lambda names: ', '.join(names.dropna())})
    )
    print('rows sharing a PostalCode were combined, the shape is now', df.shape)
df.describe()

Since the count and unique of PostalCode have the same value, there is no doubling of PostaCode in the dataframe


Unnamed: 0,PostalCode,Borough,Neightborhood
count,103,103,103
unique,103,10,98
top,M5M,North York,Downsview
freq,1,24,4


In [9]:
# 3) If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
# A missing neighborhood can show up either as a null value or as the literal
# text 'Not assigned'; isnull() alone would not detect the latter, so both are checked.
unassigned = df['Neightborhood'].isnull() | df['Neightborhood'].eq('Not assigned')
if unassigned.any():
    # fall back to the borough name for the affected rows; assign() builds a new
    # frame, which avoids writing into a filtered slice of an earlier frame
    df = df.assign(Neightborhood=df['Neightborhood'].mask(unassigned, df['Borough']))
    print(unassigned.sum(), 'neighborhood cell(s) were not assigned and now use the borough name')
else:
    print('since there is no null value, none of the cell under neighborhood is not assigned')
# final sanity check: True would mean some null cell is still left in the dataframe
df.isnull().values.any()

since there is no null value, none of the cell under neighborhood is not assigned


False

In [10]:
# 4) Final cell: show the first rows of the cleaned dataframe and report its dimensions via .shape.
print('this is the final dataframe', df.head(), sep='\n')
print(f'and the shape is: {df.shape}')

this is the final dataframe
  PostalCode           Borough                                Neightborhood
2        M3A        North York                                    Parkwoods
3        M4A        North York                             Victoria Village
4        M5A  Downtown Toronto                    Regent Park, Harbourfront
5        M6A        North York             Lawrence Manor, Lawrence Heights
6        M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government
and the shape is: (103, 3)
