# Scraping the IRCC (Immigration, Refugees and Citizenship Canada) Help Centre page

<img src = 'https://user-images.githubusercontent.com/83589431/260982896-24daffcf-769e-4eae-8b75-e863523d153a.png'>

## Importing Libraries

In [1]:
import re
import json

from tqdm import tqdm
from time import sleep
from datetime import datetime

import requests
from bs4.element import Tag
from bs4 import BeautifulSoup as sp

In [2]:
# Address pieces of the IRCC Help Centre site

# Base address that the page names below are appended to
URL = "https://www.cic.gc.ca/english/helpcentre/"
# Index page from which the list of topics is scraped (used as URL + MAIN_PAGE)
MAIN_PAGE = "index-a-z-can.asp"
# Question-and-answer page of a topic; the topic number is appended after 'top='
QNA_URL = "questions-answers-by-topic.asp?top="

## Scraping Starts

In [3]:
# Defining helper function to get requests object
def get_request_obj(full_url, timeout=30):

    '''
    Send an HTTP GET request to the given address.

    Input : website URL
            timeout (optional) : seconds to wait for the server before giving up,
                                 passed straight to requests.get (default 30)
    Output : requests object

    A timeout is always supplied because requests.get waits indefinitely by
    default, which would freeze the whole scraping run on an unresponsive server.
    When the timeout is exceeded, requests raises requests.exceptions.Timeout.
    '''
    return requests.get(full_url, timeout=timeout)

# Defining helper function to get BeautifulSoup object
def get_soup_obj(request_obj):
    '''
    Build a parse tree from the body of an HTTP response.

    Input : requests object (its raw .content bytes are parsed)
    Output : BeautifulSoup Object created with the 'lxml' parser
    '''
    raw_markup = request_obj.content
    parser_name = 'lxml'
    return sp(raw_markup, parser_name)

In [4]:
# Fetching the source code of the index page and parsing it into 'master_soup'

master_req = get_request_obj(URL + MAIN_PAGE)

# Any status other than 200 means the index page was not delivered, so there is
# nothing to parse: stop here with an explicit error instead of scraping an error page.
if master_req.status_code != 200:
    raise requests.exceptions.ConnectionError(f'Expects response code 200, but received {master_req}')

master_soup = get_soup_obj(master_req)

print('Parsed page successfully')

Parsed page successfully


In [5]:
# Every topic is an 'h2' heading inside the <section class="container"> element
container_section = master_soup.find('section', {'class' : 'container'})
main_topics = container_section.find_all('h2')
main_topics

[<h2 id="a"><a href="results-by-topic.asp?top=1">Access to Information and Privacy</a></h2>,
 <h2><a href="results-by-topic.asp?top=2">Adoption</a></h2>,
 <h2><a href="results-by-topic.asp?top=3">Application status</a></h2>,
 <h2><a href="results-by-topic.asp?top=4">Applying - General</a></h2>,
 <h2><a href="results-by-topic.asp?top=23">Applying online</a></h2>,
 <h2><a href="results-by-topic.asp?top=20">Asylum claims</a></h2>,
 <h2 id="b"><a href="results-by-topic.asp?top=19">Biometrics</a></h2>,
 <h2 id="c"><a href="results-by-topic.asp?top=28">Caregiver Program</a></h2>,
 <h2><a href="results-by-topic.asp?top=32">Changing the sex or gender identifier on your documents</a></h2>,
 <h2><a href="results-by-topic.asp?top=5">Citizenship</a></h2>,
 <h2><a href="results-by-topic.asp?top=34">Contact us</a></h2>,
 <h2 id="d"><a href="results-by-topic.asp?top=18">Downloading files</a></h2>,
 <h2><a href="results-by-topic.asp?top=35">Destination Canada</a></h2>,
 <h2 id="e"><a href="results-by-

In [6]:
# Extracting the link ('a' tag) of each topic; headings without a link are skipped.
# Each heading is searched only once, and the result is compared to None by identity.
topics_with_link = [
    link
    for link in (heading.find('a') for heading in main_topics)
    if link is not None
]
topics_with_link

[<a href="results-by-topic.asp?top=1">Access to Information and Privacy</a>,
 <a href="results-by-topic.asp?top=2">Adoption</a>,
 <a href="results-by-topic.asp?top=3">Application status</a>,
 <a href="results-by-topic.asp?top=4">Applying - General</a>,
 <a href="results-by-topic.asp?top=23">Applying online</a>,
 <a href="results-by-topic.asp?top=20">Asylum claims</a>,
 <a href="results-by-topic.asp?top=19">Biometrics</a>,
 <a href="results-by-topic.asp?top=28">Caregiver Program</a>,
 <a href="results-by-topic.asp?top=32">Changing the sex or gender identifier on your documents</a>,
 <a href="results-by-topic.asp?top=5">Citizenship</a>,
 <a href="results-by-topic.asp?top=34">Contact us</a>,
 <a href="results-by-topic.asp?top=18">Downloading files</a>,
 <a href="results-by-topic.asp?top=35">Destination Canada</a>,
 <a href="results-by-topic.asp?st=16.7">Electronic Travel Authorization (eTA)</a>,
 <a href="results-by-topic.asp?top=29">Express Entry</a>,
 <a href="results-by-topic.asp?top=3

**Extracting the numeric IDs from the list generated above and building a Python dictionary**

Example:

From `<a href="results-by-topic.asp?top=16">Visiting</a>` we are only interested in the **16** from `top=16` and its text, **Visiting**.

Using these two pieces of information, we'll create a dictionary like this:
```python
{
    'Visiting' : '16'
}
```



In [7]:
# Map each topic title to the value that follows the first '=' in its link,
# leaving out links that have no visible text
topic_sections = {
    anchor.text: anchor['href'].split('=')[1]
    for anchor in topics_with_link
    if anchor.text
}
topic_sections

{'Access to Information and Privacy': '1',
 'Adoption': '2',
 'Application status': '3',
 'Applying - General': '4',
 'Applying online': '23',
 'Asylum claims': '20',
 'Biometrics': '19',
 'Caregiver Program': '28',
 'Changing the sex or gender identifier on your documents': '32',
 'Citizenship': '5',
 'Contact us': '34',
 'Downloading files': '18',
 'Destination Canada': '35',
 'Electronic Travel Authorization (eTA)': '16.7',
 'Express Entry': '29',
 'Fraud and scams': '31',
 'Immigrating': '6',
 'Immigration representatives': '7',
 'Interim Federal Health Program': '33',
 'Inadmissibility': '8',
 'International Experience Canada': '25',
 'Leaving/Returning to Canada': '22',
 'Link your application to your online account': '23.4',
 'Permanent Residents': '10',
 'Port of Entry Letter': '26',
 'Refugees': '11',
 'Replacing documents': '12',
 'Service standards': '13',
 'Settlement services': '27',
 'Sponsoring your family': '14',
 'Studying': '15',
 'Ukraine: Immigration measures': '38'

**Final data storage structure**

```python
{'Topic' :
        {
            'Question' :
                {
                    'Content' : Answers,
                    'Source' : URL,
                    'DateOfScrapping' : Date
                }
        }
}
```

In [8]:
from datetime import timezone  # needed to produce a timestamp that really is UTC

# Matches any run of whitespace; in Python 3 str patterns '\s' also covers
# new-line, carriage return, tab and the non-breaking space '\xa0'.
WHITESPACE_RUN = re.compile(r'\s+')

topic_qna = {} # Scraped data : topic -> question -> {'Content', 'Source', 'DateOfScrapping'}
failed_fetching = [] # Topics whose page could not be fetched or did not have the expected layout

for topic, number in tqdm(topic_sections.items()):
    source_url = URL + QNA_URL + number
    faq_section = None

    # A network error for one topic is recorded as a failure instead of aborting the whole run
    try:
        r_temp = get_request_obj(source_url)
    except requests.exceptions.RequestException:
        r_temp = None

    if r_temp is not None and r_temp.status_code == 200: # the page is parsed only on response code 200
        qna_object = get_soup_obj(r_temp)
        main_tag = qna_object.find('main')
        # find() returns None when a tag is missing; guard so an unexpected layout does not crash
        faq_section = main_tag.find('section') if main_tag is not None else None

    if faq_section is None:
        failed_fetching.append(topic)
    else:
        # Timezone-aware "now", so the ' UTC' suffix is truthful (datetime.now() alone is local time)
        scraped_at = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S') + ' UTC'
        topic_qna[topic] = {}
        key = None # current question; reset per topic so nothing leaks in from the previous page
        solutions_ = []

        for faq_tag in faq_section:
            if not isinstance(faq_tag, Tag): # skip bare text nodes between the tags
                continue
            if faq_tag.name == 'h2':
                # An 'h2' heading starts a new question; the tags that follow it form the answer
                key = faq_tag.text.strip()
                topic_qna[topic][key] = {}
                solutions_ = []
            elif key is not None: # content before the first question has no key to attach to
                solutions_.append(faq_tag.text.strip())
                # Joining with a space keeps the text of adjacent tags from running together;
                # every run of whitespace is then collapsed to a single space.
                answer_text = WHITESPACE_RUN.sub(' ', ' '.join(solutions_)).strip()
                topic_qna[topic][key]['Content'] = answer_text
                topic_qna[topic][key]['Source'] = source_url
                topic_qna[topic][key]['DateOfScrapping'] = scraped_at
    sleep(1) # pause between requests, whether or not the topic succeeded

print('\n\nTopics failed to scrape due to errors\n', '-' * 35)
for topics in failed_fetching:
    print(topics)

100%|███████████████████████████████| 34/34 [01:26<00:00,  2.56s/it]



Topics failed to scrape due to errors
 -----------------------------------
Application status
Studying





## Saving data into JSON format

In [9]:
# Serialise the scraped dictionary and write it to 'IRCC_Data.json'
with open('IRCC_Data.json', mode='w') as json_file:
    serialised = json.dumps(topic_qna)
    json_file.write(serialised)

IRCC_Data.json is available at: https://github.com/OmdenaAI/toronto-canada-smartguide/tree/task-1-data-collection/src/data/task-1-data-collection/IRCC_Data.json