### Let's Scrape the data using Selenium and BeautifulSoup

In [1]:
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

In [2]:
def scraper(url: str, driver_path: str):
    """
    Scrape an HTML table from a web page into a pandas DataFrame.

    A Chrome session is started with the chromedriver binary found at
    ``driver_path``, ``url`` is loaded, and after a fixed two second pause
    the table located by a hard-coded absolute XPath is read. The text of
    every ``<th>`` element becomes a column name and the text of the ``<td>``
    cells of each ``<tr>`` becomes one record.

    Rows without any ``<td>`` cell are skipped, as are rows whose second
    cell reads ``'Not assigned'``.

    Parameters
    ----------
    url : str
        Address of the page to load.
    driver_path : str
        Filesystem path of the chromedriver executable.

    Returns
    -------
    pandas.DataFrame
        One column per table header, one row per retained table row.

    Notes
    -----
    The browser is always closed, even when loading or parsing fails.
    """
    # NOTE(review): `executable_path` is the Selenium 3 style argument; recent
    # Selenium 4 releases expect a `Service` object instead -- confirm against
    # the installed Selenium version.
    driver = webdriver.Chrome(executable_path=driver_path)
    try:
        driver.get(url)
        # Fixed pause before reading the DOM (presumably to let the page finish rendering).
        time.sleep(2)
        # The generic find_element(by, value) form is used instead of the
        # find_element_by_xpath helpers, which were removed in Selenium 4.3.
        # The literal 'xpath' is the value of selenium's By.XPATH constant.
        table_element = driver.find_element('xpath', '/html/body/div[3]/div[3]/div[4]/div/table[1]')
        data = {col.text: [] for col in table_element.find_elements('xpath', ".//*[self::th]")}
        header = list(data.keys())
        for table_row in table_element.find_elements('xpath', ".//*[self::tr]"):
            row_data = [table_data.text for table_data in table_row.find_elements('xpath', ".//*[self::td]")]
            # We skip empty rows (e.g. the header row, which has no <td> cells)
            if not row_data:
                continue
            # We skip rows whose Borough (second cell) is 'Not assigned'
            if row_data[1] == 'Not assigned':
                continue
            # Cells are matched to columns by position
            for i, col in enumerate(row_data):
                data[header[i]].append(col)
        return pd.DataFrame(data=data)
    finally:
        # Release the browser process whether or not scraping succeeded.
        driver.quit()

In [3]:
# Wikipedia page "List of postal codes of Canada: M" that holds the table to scrape.
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to run the notebook on another machine; the original
# machine-specific path is kept as the fallback so existing setups still work.
DRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    '/Users/koutouodilon/Documents/ibm-data-science-professional-certificate/applied-data-science-capstone/Coursera_Capstone/chromedriver',
)
dataframe = scraper(url=URL, driver_path=DRIVER_PATH)

In [4]:
# Preview the first rows of the scraped table as a sanity check.
dataframe.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [5]:
def transform_neighborhood(row):
    """
    Normalise the 'Neighborhood' entry of a single record.

    When the neighborhood is the placeholder 'Not assigned', the record's
    'Borough' value is used in its place. The resulting text is split on
    '/', each piece is stripped of surrounding whitespace, and the pieces
    are re-joined with ', '.

    The record is updated in place and the same object is returned, which
    makes the function usable with ``DataFrame.apply(..., axis=1)``.
    """
    name = row['Neighborhood']
    # A neighborhood that is 'Not assigned' takes the name of its Borough
    if name == 'Not assigned':
        name = row['Borough']

    # 'A / B' becomes 'A, B'
    pieces = map(str.strip, name.split('/'))
    row['Neighborhood'] = ", ".join(pieces)
    return row

In [6]:
# Run transform_neighborhood on every row (axis=1) and rebind `dataframe` to the result.
dataframe = dataframe.apply(transform_neighborhood, axis=1)

In [7]:
# Display up to the first 12 rows to check the transformed Neighborhood values.
dataframe.head(12)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [8]:
# shape[0] is the row count of the DataFrame.
print(f"Number of rows in the dataframe: {dataframe.shape[0]}")

Number of rows in the dataframe: 103
