In [14]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait

In [20]:
"""
By default the table shows only part of the locations, but the `table1_length` drop-down offers an option to show all of them in a single table.
Use selenium to select that option automatically.
The drop-down is looked up through an explicit wait: this cell used to fail on its first run,
presumably because the page had not finished building the drop-down when it was looked up.
"""

website_address = 'https://ircc.canada.ca/english/newcomers/services/index.asp'
# Maximum number of seconds to wait for the drop-down to appear.
PAGE_LOAD_TIMEOUT_S = 30

driver = webdriver.Chrome()
try:
    driver.get(website_address)
    # Block until the drop-down exists in the DOM instead of assuming it is already there.
    length_dropdown = WebDriverWait(driver, PAGE_LOAD_TIMEOUT_S).until(
        EC.presence_of_element_located((By.NAME, "table1_length"))
    )
    select = Select(length_dropdown)
    select.select_by_visible_text('all')
except Exception:
    # Do not leave an orphaned browser window behind when the setup fails.
    driver.quit()
    raise

In [21]:
"""
Grab the rendered table body from the selenium page and parse it into a soup.
"""
try:
    table = driver.find_element(By.ID, "table1tbody")
    # Copy the markup out as a plain string so it stays usable after the browser is closed.
    table_html = table.get_attribute('outerHTML')
finally:
    # Close the browser even when the table lookup fails, so no Chrome process is leaked.
    driver.quit()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(table_html, 'html.parser')

In [17]:
def get_texts(col):
    """
    Collect the bare text fragments that follow the <br> tags inside `col`.

    Some fields of an entry are not wrapped in an element of their own (unlike
    the <a> tags); they are plain text placed after a <br> tag. For every <br>
    tag found in `col`, this function:
    1) Looks up the first text node among the siblings that follow the tag.
    2) Strips the surrounding whitespace from that text.
    3) Appends the result to the returned list.

    A <br> tag with no text node after it contributes nothing.

    Parameters:
        col: a parsed element exposing `find_all` (here, a table cell).

    Returns:
        list of stripped strings, in the order the <br> tags appear.
    """
    texts = []
    for br_tag in col.find_all('br'):
        # `string=True` restricts the search to text nodes; it replaces the
        # deprecated `text=True` spelling of the same filter.
        following_text = br_tag.find_next_sibling(string=True)
        # No text after this <br>: skip it explicitly instead of relying on a
        # swallowed AttributeError.
        if following_text is not None:
            texts.append(following_text.strip())
    return texts

In [18]:
# Header row for the CSV file; one record per table row is appended below it.
SPOs = [["Name", "Address", "City/Province", "Postal Code", "Country", "Telephone", "Email"]]

# Placeholder written to the CSV when an entry lacks a field.
MISSING = " "


def _field(values, index):
    """Return `values[index]`, or `MISSING` when the entry has no such item."""
    return values[index] if index < len(values) else MISSING


def _build_spo(link_texts, br_texts):
    """
    Map one cell's link texts and <br>-delimited texts onto the CSV columns.

    Some entries in the table are missing information, such as a telephone
    number or an email link. Every field is therefore looked up by position
    with a default value, so an incomplete entry yields placeholders instead of
    raising IndexError.
    """
    return [
        _field(link_texts, 0),  # Name
        _field(link_texts, 1),  # Address
        _field(br_texts, 0),    # City/Province
        _field(br_texts, 1),    # Postal Code
        _field(br_texts, 2),    # Country
        _field(br_texts, 3),    # Telephone
        _field(link_texts, 2),  # Email
    ]


rows = soup.find_all('tr')
for row in rows:
    # Reset per row: a row without any links must not re-append the record
    # built for the previous row (or fail because no record exists yet).
    SPO = None
    for col in row.find_all('td'):
        all_a = col.find_all("a")
        # Only cells that contain links describe an organisation; when several
        # cells of a row qualify, the last one wins.
        if all_a:
            SPO = _build_spo([a.text for a in all_a], get_texts(col))
    if SPO is not None:
        SPOs.append(SPO)

In [19]:
import csv

# newline="" lets the csv module control the line endings itself.
# The explicit encoding keeps the output independent of the platform's default
# encoding, which may be unable to represent some characters in the scraped text.
with open("SPOs.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(SPOs)