# Web scraping

### Imports for web scraping

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import csv

### Find the table on the page

In [3]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops the notebook on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

# Parse the HTML and keep the first <table> element found on the page.
soup = BeautifulSoup(source, 'lxml')
tbl = soup.find('table')
if tbl is None:
    # Fail here with a clear message rather than later with an AttributeError.
    raise ValueError('No <table> element found on the page')

### Set up column names

In [4]:
# Header labels for the three columns of the scraped table, in table order.
col1_name, col2_name, col3_name = 'PostalCode', 'Borough', 'Neighborhood'

### Retrieve the data from the table into one list per column:

In [5]:
# One accumulator list per table column.
col1_list, col2_list, col3_list = [], [], []

# Skip the first <tr> (the slice starts at index 1); for every other row,
# split its stripped text on newlines and take the first three pieces as
# the values for the three columns.
for row in tbl.tbody.find_all('tr')[1:]:
    fields = row.text.strip().split('\n')
    col1_list.append(fields[0])
    col2_list.append(fields[1])
    col3_list.append(fields[2])

# Preview the first five entries of each column.
print(col1_list[:5],'\n',col2_list[:5],'\n',col3_list[:5])
   

['M1A', 'M2A', 'M3A', 'M4A', 'M5A'] 
 ['Not assigned', 'Not assigned', 'North York', 'North York', 'Downtown Toronto'] 
 ['Not assigned', 'Not assigned', 'Parkwoods', 'Victoria Village', 'Harbourfront']


### Transform the dictionary into a dataframe:

In [6]:
# Pair each column name with its list of scraped values, then build the frame.
column_names = [col1_name, col2_name, col3_name]
column_values = [col1_list, col2_list, col3_list]
tbl_dic = dict(zip(column_names, column_values))
df = pd.DataFrame(data=tbl_dic)
print(df.shape)
df.head()

(287, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# Data cleaning

### Drop the rows whose borough is 'Not assigned':

In [7]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
has_borough = df['Borough'] != 'Not assigned'
df1 = df.loc[has_borough]
print(df1.shape)
df1.head()

(210, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


### Combine the neighborhoods:

In [8]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighborhood is the comma-joined string of the group's neighborhoods.
df2 = (
    df1
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(lambda names: ','.join(names))
    .reset_index()
)
print(df2.shape)
df2.head()

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Replace neighborhoods that are 'Not assigned' with their borough name:

In [9]:
# Show the rows whose Neighborhood is exactly 'Not assigned'.
df2.loc[df2['Neighborhood'].isin(['Not assigned'])]

Unnamed: 0,PostalCode,Borough,Neighborhood
93,M9A,Queen's Park,Not assigned


In [10]:
# Work on a copy so df2 keeps the 'Not assigned' placeholder for reference.
df3 = df2.copy()

# Wherever the neighborhood is the 'Not assigned' placeholder, substitute the
# row's borough name. A boolean mask with .loc updates every matching row at
# once, replacing the earlier per-row loop and its no-op `else: next` branch.
unassigned = df3['Neighborhood'] == 'Not assigned'
df3.loc[unassigned, 'Neighborhood'] = df3.loc[unassigned, 'Borough']
    

In [11]:
# Show the rows whose Borough is "Queen's Park" to check the replacement.
df3.loc[df3['Borough'].isin(["Queen's Park"])]

Unnamed: 0,PostalCode,Borough,Neighborhood
93,M9A,Queen's Park,Queen's Park


### The number of rows in the dataframe:

In [12]:
# Report how many rows the cleaned dataframe contains.
print('The number of rows in the dataframe is: ', len(df3))

The number of rows in the dataframe is:  103
