### Web Scraping 

In [1]:
# importing some required libraries useful to scrape the website 
import requests
import pandas as pd  

In [3]:
url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M") # 

In [4]:
from bs4 import BeautifulSoup
import lxml.html as hl

In [5]:
doc = hl.fromstring(url.content) # parsing the html content as single  element

In [6]:
tr_elements = doc.xpath("//tr")  # evaluating /tr  element and  storing  it. 

In [7]:
[len(C) for C in tr_elements[:5]]

[3, 3, 3, 3, 3]

In [8]:
tr_elements = doc.xpath('//tr')

#Create empty list
col=[]
i=0

#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print('%d : %s'%(i,name))
    col.append((name,[]))

1 : Postcode
2 : Borough
3 : Neighbourhood



### Storing all Table content

In [9]:
#Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 10, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

### Storing dictionary as pandas dataframe 

In [11]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict) 

In [10]:
[len(C) for (title, C) in col]

[288, 288, 288]

In [12]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [13]:
df.dtypes

Postcode           object
Borough            object
Neighbourhood\n    object
dtype: object

In [14]:
df.shape

(288, 3)

### Filtering out unnecessary content  out of  dataframe  

In [15]:
f1 = df['Borough'] == 'Not assigned' 

In [16]:
data = df[~f1] 

In [17]:
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n


#### Reset the index

In [18]:
data.set_index('Postcode', inplace = True)

In [19]:
data.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods\n
M4A,North York,Victoria Village\n
M5A,Downtown Toronto,Harbourfront\n
M5A,Downtown Toronto,Regent Park\n
M6A,North York,Lawrence Heights\n


In [20]:
data.index.name = 'Postcode'
data.reset_index(inplace = True)

In [21]:
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods\n
1,M4A,North York,Victoria Village\n
2,M5A,Downtown Toronto,Harbourfront\n
3,M5A,Downtown Toronto,Regent Park\n
4,M6A,North York,Lawrence Heights\n


In [22]:
data.rename(columns={'Neighbourhood\n':'Neighbourhood'}, inplace = True)  #  removing some string part of Neighbourhood\n

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [23]:
data_crop = data["Neighbourhood"]

In [24]:
data.drop("Neighbourhood", axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [25]:
data_df = data_crop.to_frame()

In [26]:
data_2 = data_df.Neighbourhood.str.replace("\n","")

In [27]:
type(data_2)

pandas.core.series.Series

In [28]:
cleaned_data = data.merge(data_2.to_frame(), left_index=True, right_index=True)

In [29]:
cleaned_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [30]:
type(cleaned_data)

pandas.core.frame.DataFrame

In [31]:
cleaned_data.shape

(211, 3)

####  Combine  the rows who have same Postcode value

In [32]:
merge_data = cleaned_data.groupby(['Postcode','Borough'], as_index = False, sort= False).agg(','.join)

#### Assign the value at certain position 

In [33]:
merge_data.at[4, 'Neighbourhood'] = 'Queen\'s Park'

In [35]:
merge_data.shape # Filtered datasets shape 

(103, 3)