## This notebook will be used for the Capstone Project from the 
## "Applied Data Science Capstone" IBM course

### First, import the libraries and packages used in this notebook.

In [1]:
# import libraries and packages
import pandas as pd
import numpy as np
import requests
import lxml.html as lh
#from IPython.display import display, HTML

### Next, download the Wikipedia page whose URL was provided in the assignment.
### The page contents are stored and the HTML is parsed; the `<tr>` elements are collected in a list, since the table rows should be contained in them.

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page. The timeout keeps this cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error response instead of silently parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the raw response body into an lxml element tree
doc = lh.fromstring(page.content)

# Collect every <tr>..</tr> element found anywhere in the document
tr_elements = doc.xpath('//tr')

# Display the first ten row elements as a quick sanity check
tr_elements[0:10]

[<Element tr at 0x7f60af204a48>,
 <Element tr at 0x7f60af2049f8>,
 <Element tr at 0x7f60af204a98>,
 <Element tr at 0x7f60af204ae8>,
 <Element tr at 0x7f60af204d18>,
 <Element tr at 0x7f60af204d68>,
 <Element tr at 0x7f60af204db8>,
 <Element tr at 0x7f60af204e08>,
 <Element tr at 0x7f60af204e58>,
 <Element tr at 0x7f60af204ea8>]

### Check that each of the first 10 rows has 3 columns, as observed on the Wikipedia page

In [3]:
# Number of child cells in each of the first ten <tr> elements
row_length = list(map(len, tr_elements[:10]))
print(row_length)

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


### A second check: print the header cells of the first row

In [4]:
tr_elements = doc.xpath('//tr')

# One (header text, empty list) pair per cell of the first row; the lists are
# filled with that column's values later on.
column = []
for i, t in enumerate(tr_elements[0], start=1):
    name = t.text_content()
    # Show the 1-based position and the raw header text
    print('%d: "%s"' % (i, name))
    column.append((name, []))

1: "Postal Code
"
2: "Borough
"
3: "Neighbourhood
"


### Since the first row is the header, the data is stored starting from the second row

In [5]:
# Walk every row after the header and append each cell's text to the list of
# the matching column in `column`.
for T in tr_elements[1:]:
    # A row whose cell count differs from the first row's is not part of our
    # table, so stop reading at that point.
    if len(T) != row_length[0]:
        break

    # i is the index of the column the current cell belongs to
    for i, t in enumerate(T.iterchildren()):
        data = t.text_content()
        # Only cells after the first column are tried as integers
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a number: keep the original text. Catching ValueError
                # only, so unrelated errors and KeyboardInterrupt still surface.
                pass
        # Append the data to the list of the i'th column
        column[i][1].append(data)

### Another check, now on the initial number of rows for each column

In [6]:
# Number of values collected for each column (all should be equal)
[len(values) for (_header, values) in column]

[181, 181, 181]

### The data is first stored in a dictionary, then it is turned into a Dataframe

In [8]:
# `column` is a list of (header, values) pairs, so it maps directly to a dict
# keyed by header, which pandas turns into one DataFrame column per key.
Dict = dict(column)
df = pd.DataFrame(Dict)

### These are the cleaning steps:
* Get rid of the \n symbols
* Drop rows in the Borough column containing 'Not assigned'
* Replace elements in the Neighbourhood column containing 'Not assigned' 
  with the Borough value from the same row

In [9]:
# Clean the data
# Get rid of the \n symbols in both the cell values and the column names
df = df.replace('\n', '', regex=True)
df.columns = df.columns.str.replace('\n', '', regex=True)

# Drop rows with Borough == 'Not assigned'
df = df[df['Borough'] != 'Not assigned']

# Turn empty strings into NaN so that rows with any missing value are dropped
nan_value = np.nan
df = df.replace('', nan_value).dropna(how='any')

# Replace cell in Neighbourhood == 'Not assigned' with Borough values from the same row.
# A single .loc assignment is used instead of chained indexing
# (df.Neighbourhood[condition] = ...), which raises SettingWithCopyWarning and
# may write to a temporary copy rather than to df itself.
condition = df['Neighbourhood'] == 'Not assigned'
df.loc[condition, 'Neighbourhood'] = df.loc[condition, 'Borough']
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Final assessment of the remaining number of rows in the dataframe

In [10]:
# Report how many rows remain in the cleaned dataframe
row_count = len(df)
print('The number of rows in the dataframe is: {}'.format(row_count))

The number of rows in the dataframe is: 103
