In [1]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
#Code to display the entire rows and colomns for ease of analysis

pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)

#### Getting the html document of the link using requests method

In [3]:
data_url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", verify = False).text
data_url



'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of postal codes of Canada: M - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XkqtQgpAIDEAADI1A80AAADN","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":935851093,"wgRevisionId":935851093,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in O

#### Parsing through the html document using BeautifulSoap

In [4]:
soup = BeautifulSoup(data_url, 'lxml')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XkqtQgpAIDEAADI1A80AAADN","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":935851093,"wgRevisionId":935851093,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communi

#### finding the table tag and curresponding data

In [5]:
table = soup.find('table', {'class', 'wikitable sortable'})
table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North

#### Parsing through the table to scrap its content

In [6]:
content = table.findAll('tr')
postcode = []
borough = []
neighbourhood = []
for data in content[1:]:
    postcode.append(str(data.findAll('td')[0].get_text()))
    borough.append(str(data.findAll('td')[1].get_text()))
    neighbourhood.append(str(data.findAll('td')[2].get_text()))

#### Constructing pandas dataframe from the table data

In [7]:
df = pd.DataFrame()
df['Postcode'] = postcode
df['Borough'] = borough
df['Neighbourhood'] = neighbourhood

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


# Cleaning the Pandas Dtaframe


#### removing the rows having values  "Not assigned" in df['Borough']

In [8]:
df_cleaned = df[df.Borough != 'Not assigned']
df_cleaned

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M6A,North York,Lawrence Heights\n
6,M6A,North York,Lawrence Manor\n
7,M7A,Downtown Toronto,Queen's Park\n
9,M9A,Queen's Park,Not assigned\n
10,M1B,Scarborough,Rouge\n
11,M1B,Scarborough,Malvern\n
13,M3B,North York,Don Mills North\n


In [9]:
 df_cleaned['Borough'].value_counts()

Etobicoke           44
North York          38
Downtown Toronto    37
Scarborough         37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

In [10]:
print(df_cleaned['Neighbourhood'].value_counts())

Runnymede\n                                            2
St. James Town\n                                       2
Summerhill East\n                                      1
Stn A PO Boxes 25 The Esplanade\n                      1
Riverdale\n                                            1
Beaumond Heights\n                                     1
Lawrence Heights\n                                     1
York University\n                                      1
Scarborough Town Centre\n                              1
Downsview\n                                            1
Brockton\n                                             1
Henry Farm\n                                           1
Cliffcrest\n                                           1
Woburn\n                                               1
Downsview East\n                                       1
Silverstone\n                                          1
St. Phillips\n                                         1
South Hill\n                   

#### Removing '\n' from the end of every values in Neighbourhood Columns

In [11]:
df_cleaned['Neighbourhood'] = df_cleaned['Neighbourhood'].map(lambda x: x.rstrip('\n'))
df_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


#### Finding the postal code in more than one neibours

In [12]:
multi_neibour = []
for key, value in dict(df['Postcode'].value_counts()).items():
    if int(value) > 1:
        multi_neibour.append(key)
        
print(multi_neibour)

['M9V', 'M8Y', 'M5V', 'M8Z', 'M9B', 'M4V', 'M9R', 'M6M', 'M9C', 'M1V', 'M6L', 'M5T', 'M1M', 'M5J', 'M3H', 'M5H', 'M8X', 'M1T', 'M8V', 'M1C', 'M2J', 'M1L', 'M1E', 'M5R', 'M6K', 'M1P', 'M1K', 'M3J', 'M5B', 'M1R', 'M5K', 'M6J', 'M5X', 'M6N', 'M6S', 'M4B', 'M6H', 'M5P', 'M4L', 'M5M', 'M6P', 'M6A', 'M3K', 'M1B', 'M2L', 'M5S', 'M2M', 'M9M', 'M6R', 'M3C', 'M4K', 'M8W', 'M1N', 'M4T', 'M5L', 'M4X']


In [13]:
borough = []
neibourhood = []
for code in multi_neibour:
    neibours = ''
    borough.append(df_cleaned[df_cleaned['Postcode'] == code]['Borough'].values[0])
    for neibour in df_cleaned[df_cleaned['Postcode'] == code]['Neighbourhood'].values:
        neibours = neibours + neibour + ','
    neibourhood.append(neibours)

In [14]:
df_multiNeibours = pd.DataFrame()
df_multiNeibours['Postcode'] = multi_neibour
df_multiNeibours['Borough'] = borough
df_multiNeibours['Neighbourhood'] = neibourhood
df_multiNeibours

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."
1,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout..."
2,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf..."
3,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw..."
4,M9B,Etobicoke,"Cloverdale,Islington,Martin Grove,Princess Gar..."
5,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,..."
6,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
7,M6M,York,"Del Ray,Keelesdale,Mount Dennis,Silverthorn,"
8,M9C,Etobicoke,"Bloordale Gardens,Eringate,Markland Wood,Old B..."
9,M1V,Scarborough,"Agincourt North,L'Amoreaux East,Milliken,Steel..."


In [15]:
for code in multi_neibour:
    df_cleaned = df_cleaned[df_cleaned.Postcode != code]

In [16]:
X = pd.concat([df_cleaned, df_multiNeibours], axis = 0)

In [17]:
X.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned


In [18]:
X.shape

(103, 3)