### Import the libraries needed to scrape the table and build the final pandas DataFrame

In [1]:
import requests
import pandas as pd
import lxml.html as lh

### Use the requests library to download the page containing the Toronto postal codes table from the URL below, then parse it with lxml

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The timeout keeps this cell from hanging indefinitely on
# a stalled connection, and raise_for_status() stops the notebook on an HTTP
# error response instead of silently parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the raw response bytes into an lxml HTML tree.
doc = lh.fromstring(page.content)

# Collect every <tr> element in the document. The XPath is not restricted to
# a single table, so rows from any table on the page are included.
tr_elements = doc.xpath('//tr')


### Create columns with empty lists and column header names

In [3]:
# Build one (header text, empty value list) pair per cell of the first <tr>,
# which is assumed to be the header row of the postal-code table.
# The header text is stored exactly as returned by text_content().
col = [(header_cell.text_content(), []) for header_cell in tr_elements[0]]

### Populate each column with the associated data from the table

In [4]:
# A data row must have exactly one cell per header column.
n_columns = len(col)

# Skip the first <tr> (the header row) and walk the remaining rows in order.
for row in tr_elements[1:]:
    # Stop at the first row whose cell count differs from the header's:
    # from that point on the //tr elements are treated as not belonging
    # to the table.
    if len(row) != n_columns:
        break

    # Walk the cells of the row together with their column position.
    for column_index, cell in enumerate(row.iterchildren()):
        data = cell.text_content()
        # The first column is always kept as text. For the other columns,
        # values that parse as an integer are stored as int.
        if column_index > 0:
            try:
                data = int(data)
            except ValueError:
                # Not an integer literal: keep the original text.
                pass
        # Append the value to the list that belongs to this column.
        col[column_index][1].append(data)

### Make sure all columns have the same length

In [5]:
# Number of collected values per column; the header text is ignored here.
[len(values) for _header, values in col]

[181, 181, 181]

### Create dictionary with the keys as the column names and the values as the column values

In [6]:
# Turn the (header, values) pairs into a header -> values mapping and build
# the DataFrame from it, one column per header.
Dict = dict(col)
df = pd.DataFrame(data=Dict)

In [7]:
# Preview the first rows of the raw frame; headers and cell values still
# carry the trailing newline character at this point.
df.head()

Unnamed: 0,Postal Code\n,Borough\n,Neighborhood\n
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


### Replace all instances of '\n'

In [8]:
# Remove the newline character from every cell value, column by column.
# The pattern is the plain string '\n' matched literally (regex=False).
# The raw string r'\n' is the two characters backslash + n, which only
# matches a newline when interpreted as a regular expression, and
# pandas >= 2.0 no longer treats the pattern as a regex by default.
# Only the values are cleaned here; the column labels are left untouched.
for column in df:
    df[column] = df[column].str.replace('\n', '', regex=False)

In [9]:
# Confirm the newline characters are gone from the cell values
# (the column labels are still the original header text).
df.head()

Unnamed: 0,Postal Code\n,Borough\n,Neighborhood\n
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Rename columns

In [10]:
# Replace the column labels in place. The new names are assigned by
# position, so this relies on the scraped column order being
# postal code, borough, neighborhood.
df.columns = ['PostalCode', 'Borough', 'Neighborhood']

In [11]:
# Check that the new column labels are in place.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Keep only the rows whose Borough is not 'Not assigned'

In [12]:
# Keep the rows whose Borough differs from the placeholder 'Not assigned'.
df = df.loc[df['Borough'].ne('Not assigned')]

In [13]:
# Preview after filtering; the surviving rows keep their original index labels.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Check that each row has an assigned neighborhood

In [14]:
# Select the rows whose Neighborhood is the empty string.
df_no_neighborhood = df.loc[df['Neighborhood'].eq('')]

In [15]:
# Show the rows (if any) that have no neighborhood text.
df_no_neighborhood.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
180,,Canadian postal codes,


### Drop row that does not correspond to a neighborhood

In [16]:
# Remove rows with an empty Neighborhood by value rather than by a fixed
# index label. A hard-coded label depends on the exact row order of the
# scraped page, and dropping it a second time raises KeyError; this filter
# gives the same result and is safe to re-run.
df = df[df['Neighborhood'] != '']

In [17]:
# Inspect the last rows to confirm the non-neighborhood row is gone.
df.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Business reply mail Processing Centre
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Reset index

In [18]:
# Renumber the rows from 0; drop=True discards the old index labels instead
# of keeping them as a new column.
df = df.reset_index(drop = True)

In [19]:
# The first rows should now be labelled starting at 0.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [20]:
# The last rows should now carry consecutive labels with no gaps.
df.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Display shape of final df

In [22]:
# Dimensions of the final frame as a (rows, columns) tuple.
df.shape

(103, 3)