In [10]:
# Install dependencies in Colab
!apt-get update
!apt install -y chromium-chromedriver
!pip install selenium

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait  # Added this import
from selenium.webdriver.support import expected_conditions as EC  # Added this import
import pandas as pd

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading

In [11]:
# Catalogue landing page; the scraping cells append ?start=...&type=... to it
base_url = "https://www.shl.com/solutions/products/product-catalog/"

# Build the Chrome options: no visible window, plus the sandbox / shared-memory
# switches (presumably needed for the Colab container environment).
options = Options()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

# Start the browser session that every later cell reuses
driver = webdriver.Chrome(options=options)

In [14]:
# Function to parse a table from a page
def parse_table(driver, table_name, timeout=10):
    """Scrape the catalogue rows of the page currently loaded in ``driver``.

    Args:
        driver: Selenium WebDriver that has already navigated to a catalogue page.
        table_name: ``"Pre-packaged Job Solutions"`` selects rows carrying a
            ``data-course-id`` attribute; any other value selects rows carrying
            ``data-entity-id``. The value is also stored in each record's
            ``"Table"`` field.
        timeout: Seconds to wait for the first matching row to appear
            (defaults to 10, the previously hard-coded value).

    Returns:
        A list with one dict per row, with the keys ``"id"``, ``"Table"``,
        ``"Pre-packaged Job Solutions"`` (the product name, used for both
        tables), ``"URL"``, ``"Remote Testing (y/n)"``, ``"Adaptive/IRT (y/n)"``
        and ``"Test Type"``. Rows whose name link cannot be read are reported
        with ``print`` and left out.

    Raises:
        Whatever ``WebDriverWait.until`` raises when no matching row shows up
        within ``timeout`` seconds (Selenium documents ``TimeoutException``).
    """
    # The two catalogue tables tag their data rows with different attributes.
    if table_name == "Pre-packaged Job Solutions":
        row_selector = "tr[data-course-id]"
    else:  # Individual Test Solutions
        row_selector = "tr[data-entity-id]"

    # Block until at least one data row is present, then collect all of them.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, row_selector))
    )
    rows = driver.find_elements(By.CSS_SELECTOR, row_selector)

    # Debug: Print number of rows found
    print(f"Found {len(rows)} rows on this page")

    data = []
    for row in rows:
        row_id = row.get_attribute("data-course-id") or row.get_attribute("data-entity-id")
        try:
            name_elem = row.find_element(By.CSS_SELECTOR, "td.custom__table-heading__title a")
            name = name_elem.text.strip()
            url = name_elem.get_attribute("href")
        except Exception as e:
            # Best effort: a row without a readable title link is skipped, not fatal.
            print(f"Error finding name/URL: {e}")
            continue

        # A "-yes" circle in the 2nd / 3rd cell marks remote testing / adaptive support.
        remote = "Yes" if row.find_elements(By.CSS_SELECTOR, "td:nth-child(2) .catalogue__circle.-yes") else "No"
        adaptive = "Yes" if row.find_elements(By.CSS_SELECTOR, "td:nth-child(3) .catalogue__circle.-yes") else "No"
        # Test-type badges are flattened into a single space-separated string.
        test_type = " ".join(
            key.text for key in row.find_elements(By.CSS_SELECTOR, ".product-catalogue__key")
        )

        data.append({
            "id": row_id,
            "Table": table_name,
            "Pre-packaged Job Solutions": name,
            "URL": url,
            "Remote Testing (y/n)": remote,
            "Adaptive/IRT (y/n)": adaptive,
            "Test Type": test_type
        })
    return data

In [15]:
# Extract Table 1 (Pre-packaged Job Solutions) - 12 pages, 144 rows
# `page` is the row offset sent as ?start=; page // 12 + 1 is the 1-based page number.
table1_data = []
for page in range(0, 144, 12):
    url = f"{base_url}?start={page}&type=2&type=2"
    print(f"Loading Table 1, page {page // 12 + 1}/12: {url}")
    try:
        # Navigation is inside the try so that a page-load failure skips only
        # this page instead of aborting the whole scrape.
        driver.get(url)
        table1_data.extend(parse_table(driver, "Pre-packaged Job Solutions"))
        print(f"Scraped Table 1, page {page // 12 + 1}/12 ({len(table1_data)} rows so far)")
    except Exception as e:
        # Best effort: report the failed page and carry on with the next one.
        print(f"Error on Table 1, page {page // 12 + 1}: {e}")

Loading Table 1, page 1/12: https://www.shl.com/solutions/products/product-catalog/?start=0&type=2&type=2
Found 12 rows on this page
Scraped Table 1, page 1/12 (12 rows so far)
Loading Table 1, page 2/12: https://www.shl.com/solutions/products/product-catalog/?start=12&type=2&type=2
Found 12 rows on this page
Scraped Table 1, page 2/12 (24 rows so far)
Loading Table 1, page 3/12: https://www.shl.com/solutions/products/product-catalog/?start=24&type=2&type=2
Found 12 rows on this page
Scraped Table 1, page 3/12 (36 rows so far)
Loading Table 1, page 4/12: https://www.shl.com/solutions/products/product-catalog/?start=36&type=2&type=2
Found 12 rows on this page
Scraped Table 1, page 4/12 (48 rows so far)
Loading Table 1, page 5/12: https://www.shl.com/solutions/products/product-catalog/?start=48&type=2&type=2
Found 12 rows on this page
Scraped Table 1, page 5/12 (60 rows so far)
Loading Table 1, page 6/12: https://www.shl.com/solutions/products/product-catalog/?start=60&type=2&type=2
Foun

In [5]:
# driver.get("https://www.shl.com/solutions/products/product-catalog/?start=0&type=2")
# print(driver.page_source[:1000])  # First 1000 characters of HTML

<html lang="en-US" data-locale="en_US" data-language="en" data-localeroot="/" class="ss-productcataloguelistpage -colour-theme-blue || js-html-tag js" data-fl-id="1" data-geo="GB"><head>
        
<!-- Google Tag Manager -->
<script async="" src="https://www.clarity.ms/s/0.8.1/clarity.js"></script><script type="text/javascript" charset="UTF-8" async="" src="https://consent.cookiebot.com/34466d9c-a30f-4341-82cc-bfb2ce498814/cc.js?renew=false&amp;referer=www.shl.com&amp;dnt=false&amp;init=false"></script><script async="" src="https://www.clarity.ms/tag/qgxubwj2gc?ref=gtm2"></script><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/destination?id=AW-1009032174&amp;l=dataLayer&amp;cx=c&amp;gtm=45He5421v849425486za200&amp;tag_exp=102788824~102803279~102813109~102887799~102926062~102975949~103016951~103021830~103027016"></script><script type="text/javascript" async="" src="https://consent.cookiebot.com/uc.js?cbid=34466d9c-a30f-4341-82cc-bfb2ce498814&amp;consen

In [16]:
# Extract Table 2 (Individual Test Solutions) - 32 pages, 384 rows
# `page` is the row offset sent as ?start=; page // 12 + 1 is the 1-based page number.
table2_data = []
for page in range(0, 384, 12):
    url = f"{base_url}?start={page}&type=1"
    print(f"Loading Table 2, page {page // 12 + 1}/32: {url}")
    try:
        # Navigation is inside the try so that a page-load failure skips only
        # this page instead of aborting the whole scrape.
        driver.get(url)
        table2_data.extend(parse_table(driver, "Individual Test Solutions"))
        print(f"Scraped Table 2, page {page // 12 + 1}/32 ({len(table2_data)} rows so far)")
    except Exception as e:
        # Best effort: report the failed page and carry on with the next one.
        print(f"Error on Table 2, page {page // 12 + 1}: {e}")

Loading Table 2, page 1/32: https://www.shl.com/solutions/products/product-catalog/?start=0&type=1
Found 12 rows on this page
Scraped Table 2, page 1/32 (12 rows so far)
Loading Table 2, page 2/32: https://www.shl.com/solutions/products/product-catalog/?start=12&type=1
Found 12 rows on this page
Scraped Table 2, page 2/32 (24 rows so far)
Loading Table 2, page 3/32: https://www.shl.com/solutions/products/product-catalog/?start=24&type=1
Found 12 rows on this page
Scraped Table 2, page 3/32 (36 rows so far)
Loading Table 2, page 4/32: https://www.shl.com/solutions/products/product-catalog/?start=36&type=1
Found 12 rows on this page
Scraped Table 2, page 4/32 (48 rows so far)
Loading Table 2, page 5/32: https://www.shl.com/solutions/products/product-catalog/?start=48&type=1
Found 12 rows on this page
Scraped Table 2, page 5/32 (60 rows so far)
Loading Table 2, page 6/32: https://www.shl.com/solutions/products/product-catalog/?start=60&type=1
Found 12 rows on this page
Scraped Table 2, pa

In [17]:
# Merge the records of both tables into one new list (Table 1 rows first)
all_data = [*table1_data, *table2_data]

In [18]:
# Convert to DataFrame, keeping every field except 'Table'.
# Passing columns= to the constructor selects these keys from the records and,
# unlike indexing the frame afterwards, still yields an empty frame with the
# right header (instead of a KeyError) when no rows were scraped.
df = pd.DataFrame(
    all_data,
    columns=[
        "id",
        "Pre-packaged Job Solutions",
        "URL",
        "Remote Testing (y/n)",
        "Adaptive/IRT (y/n)",
        "Test Type",
    ],
)

# Save to CSV (no index column)
df.to_csv("shl_catalog.csv", index=False)