In [1]:
import pandas as pd

# IRCC open-data extract of Express Entry invitations by score band.
# Despite the .csv extension, the file is read here as tab-delimited.
url = 'https://www.ircc.canada.ca/opendata-donneesouvertes/data/ODP-EE_candidates-ITA_score.csv'
df = pd.read_csv(url, delimiter='\t')

# Quick confirmation of the load, followed by a preview of the first rows.
print("DataFrame reloaded successfully with tab separator.")
print(f"Shape of the DataFrame: {df.shape}")
df.head()

DataFrame reloaded successfully with tab separator.
Shape of the DataFrame: (1767, 9)


Unnamed: 0,EN_YEAR,FR_ANNEÉ,EN_PROVINCE_TERRITORY,FR_PROVINCE_TERRITOIRE,EN_INVITATION_CATEGORY,FR_CATEGORIE_D'INVITATION,EN_ITA_SCORE,FR_NOTE_D'IPD,TOTAL
0,2015,2015,Alberta,Alberta,Canadian Experience Class,Catégorie de l'expérience canadienne,Score 1001 - 1100,Note 1001 - 1100,55
1,2015,2015,Alberta,Alberta,Canadian Experience Class,Catégorie de l'expérience canadienne,Score 401 - 450,Note 401 - 450,--
2,2015,2015,Alberta,Alberta,Canadian Experience Class,Catégorie de l'expérience canadienne,Score 451 - 500,Note 451 - 500,275
3,2015,2015,Alberta,Alberta,Canadian Experience Class,Catégorie de l'expérience canadienne,Score 501 - 600,Note 501 - 600,25
4,2015,2015,Alberta,Alberta,Canadian Experience Class,Catégorie de l'expérience canadienne,Score 601 - 700,Note 601 - 700,95


In [2]:
# remove unnecessary columns
import re

# The French-language columns carry the 'FR_' prefix; collect their names
# with a vectorised prefix test on the column index, then remove them.
columns_to_drop = df.columns[df.columns.str.startswith('FR_')].tolist()
df = df.drop(columns=columns_to_drop)
print(f"Dropped columns: {columns_to_drop}")

# Extract the minimum score from 'EN_ITA_SCORE'
def extract_min_score(score_range):
    """Return the lower bound of a score-band label as an int.

    Parameters
    ----------
    score_range : str or missing value
        A label such as ``'Score 401 - 450'`` or ``'Score 700'``.

    Returns
    -------
    int or None
        The first number of an ``'<a> - <b>'`` range when one is present;
        otherwise the number following ``'Score'`` when the label contains
        no hyphen; ``None`` for missing values or when neither form matches.
    """
    # Missing values (NaN / None) carry no score.
    if pd.isna(score_range):
        return None

    # Preferred form: an explicit "<low> - <high>" range anywhere in the label.
    range_hit = re.search(r'(\d+)\s*-\s*\d+', score_range)
    if range_hit is not None:
        return int(range_hit.group(1))

    # The single-value fallback only applies to hyphen-free 'Score ...' labels.
    if '-' in score_range or 'Score' not in score_range:
        return None

    lone_hit = re.search(r'Score\s*(\d+)', score_range)
    return int(lone_hit.group(1)) if lone_hit is not None else None

# Derive the lower bound of every score band and store it as a numeric column.
# errors='coerce' makes pd.to_numeric emit NaN for anything it cannot parse.
df['EN_ITA_SCORE_MIN'] = (
    df['EN_ITA_SCORE']
    .apply(extract_min_score)
    .pipe(pd.to_numeric, errors='coerce')
)

print("\n'EN_ITA_SCORE' column transformed to 'EN_ITA_SCORE_MIN' (minimum score extracted).")

# Column dtypes / non-null counts after the transformation.
print("\nUpdated DataFrame Info:")
df.info()

# Preview of the result; left as the last expression so the notebook renders it.
print("\nFirst 5 rows of the updated DataFrame:")
df.head()

Dropped columns: ['FR_ANNEÉ', 'FR_PROVINCE_TERRITOIRE', "FR_CATEGORIE_D'INVITATION", "FR_NOTE_D'IPD"]

'EN_ITA_SCORE' column transformed to 'EN_ITA_SCORE_MIN' (minimum score extracted).

Updated DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1767 entries, 0 to 1766
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   EN_YEAR                 1767 non-null   int64 
 1   EN_PROVINCE_TERRITORY   1767 non-null   object
 2   EN_INVITATION_CATEGORY  1767 non-null   object
 3   EN_ITA_SCORE            1767 non-null   object
 4   TOTAL                   1767 non-null   object
 5   EN_ITA_SCORE_MIN        1767 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 83.0+ KB

First 5 rows of the updated DataFrame:


Unnamed: 0,EN_YEAR,EN_PROVINCE_TERRITORY,EN_INVITATION_CATEGORY,EN_ITA_SCORE,TOTAL,EN_ITA_SCORE_MIN
0,2015,Alberta,Canadian Experience Class,Score 1001 - 1100,55,1001
1,2015,Alberta,Canadian Experience Class,Score 401 - 450,--,401
2,2015,Alberta,Canadian Experience Class,Score 451 - 500,275,451
3,2015,Alberta,Canadian Experience Class,Score 501 - 600,25,501
4,2015,Alberta,Canadian Experience Class,Score 601 - 700,95,601


In [1]:
import requests
from bs4 import BeautifulSoup

url = "https://www.canada.ca/en/immigration-refugees-citizenship/corporate/mandate/corporate-initiatives/levels/supplementary-immigration-levels-2026-2028.html"

# Fetch the webpage content.
# requests applies no timeout unless one is given, so a stalled server would
# block this cell indefinitely; give up after 30 seconds instead.
response = requests.get(url, timeout=30)
response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)

# Parse the HTML content with the standard-library parser backend.
soup = BeautifulSoup(response.text, 'html.parser')

print("Webpage content fetched and parsed successfully. You can now use the 'soup' object to extract data.")

Webpage content fetched and parsed successfully. You can now use the 'soup' object to extract data.


In [2]:
from io import StringIO

import pandas as pd

# Find the table directly by its ID
target_table = soup.find('table', id='pr')

if target_table:
    # Convert the BeautifulSoup table object to a string for pandas.read_html
    table_html_string = str(target_table)

    # Use pandas to read the HTML table into a list of DataFrames.
    # The markup is wrapped in StringIO because passing a literal HTML string
    # to read_html is deprecated and emits a FutureWarning.
    # read_html raises ValueError when it finds no table instead of returning
    # an empty list, so that case is mapped to an empty result here.
    try:
        tables = pd.read_html(StringIO(table_html_string))
    except ValueError:
        tables = []

    if tables:
        # Even if there's only one table, read_html returns a list
        df_targets = tables[0]
        print("Table extracted successfully into a pandas DataFrame.")
        print("First 5 rows of the DataFrame:")
        display(df_targets.head())
    else:
        print("Could not extract table data with pandas.read_html.")
else:
    print("Table with id 'pr' not found on the page. Please double-check the HTML structure or ID.")

Table extracted successfully into a pandas DataFrame.
First 5 rows of the DataFrame:


  tables = pd.read_html(table_html_string)


Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,2026,2026,2026,2027,2027,2027,2028,2028,2028
Unnamed: 0_level_1,Immigrant Category,Immigrant Category,Target,Low Range,High Range,TargetTable footnote 1,Low RangeTable footnote 2,High Range,Target,Low Range,High Range
0,Overall Planned Permanent Resident Admissions,Overall Planned Permanent Resident Admissions,"380,000 (350,000 – 420,000)Table footnote 3","380,000 (350,000 – 420,000)Table footnote 3","380,000 (350,000 – 420,000)Table footnote 3","380,000 (350,000 – 420,000)","380,000 (350,000 – 420,000)","380,000 (350,000 – 420,000)","380,000 (350,000 – 420,000)","380,000 (350,000 – 420,000)","380,000 (350,000 – 420,000)"
1,Overall French-speaking Permanent Resident Adm...,Overall French-speaking Permanent Resident Adm...,"9% (30,267)","9% (30,267)","9% (30,267)","9.5% (31,825)","9.5% (31,825)","9.5% (31,825)","10.5% (35,175)","10.5% (35,175)","10.5% (35,175)"
2,Economic,Federal High SkilledTable footnote 5,109000,85000,120000,111000,86000,122000,111000,86000,122000
3,Economic,Federal BusinessTable footnote 6,500,250,1000,500,250,1000,500,250,1000
4,Economic,Federal Economic Pilots: CaregiversTable footn...,8175,5000,11800,8775,6600,12400,8775,6600,12400
