In [1]:
pip install selenium

Collecting selenium
  Downloading selenium-4.31.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.31.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading trio-0.29.0-py3-none-any.whl (492 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.

In [5]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv

def scrape_shl_assessments(max_pages=32, page_size=12, timeout=30):
    """Scrape SHL's paginated product catalog into a list of assessment records.

    Each page is requested as ``.../product-catalog/?start=<offset>`` where the
    offset grows by ``page_size`` per page. Pagination stops early when a
    request fails (connection error, timeout or HTTP error status) or when a
    page contains no ``tr[data-entity-id]`` rows; whatever was collected up to
    that point is still returned.

    Args:
        max_pages: Upper bound on the number of catalog pages to request.
        page_size: Offset step between consecutive pages.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``name``, ``url``, ``remote``,
        ``adaptive`` (both ``'Yes'``/``'No'``) and ``test_type`` (a list of
        human-readable test-type names; unmapped codes become ``'Unknown'``).
    """
    base_url = "https://www.shl.com"
    request_headers = {'User-Agent': 'Mozilla/5.0'}
    # Class string carried by the <span> that marks a "yes" cell.
    yes_marker_class = 'catalogue__circle -yes'

    # Single-letter catalog codes -> readable test type names.
    test_type_map = {
        'A': 'Ability & Aptitude',
        'B': 'Biodata & Situational Judgement',
        'C': 'Competencies',
        'D': 'Development & 360',
        'E': 'Assessment Exercises',
        'K': 'Knowledge & Skills',
        'P': 'Personality & Behavior',
        'S': 'Simulations'
    }

    assessments = []

    for page_num in range(max_pages):
        start = page_num * page_size
        catalog_url = f"{base_url}/solutions/products/product-catalog/?start={start}"

        print(f"Scraping page: {catalog_url}")
        try:
            # A timeout keeps a stalled connection from hanging the notebook,
            # and raise_for_status() stops error pages from being parsed as
            # if they were catalog pages.
            response = requests.get(catalog_url, headers=request_headers, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            # Keep the rows gathered so far instead of losing the whole run.
            print(f"Request failed, stopping pagination: {e}")
            break

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find table rows containing assessment data
        assessment_rows = soup.select('tr[data-entity-id]')

        if not assessment_rows:
            print("No assessment rows found on this page.")
            break  # Stop if no rows are found on a page

        for row in assessment_rows:
            try:
                # Name and detail-page link live in the title cell.
                title_cell = row.select_one('td.custom__table-heading__title')
                name = title_cell.text.strip()
                relative_url = title_cell.find('a')['href']
                full_url = urljoin(base_url, relative_url)

                # Cells 2 and 3 hold the remote-testing / adaptive markers.
                remote_testing_cell = row.select_one('td:nth-child(2)')
                adaptive_cell = row.select_one('td:nth-child(3)')

                remote = 'Yes' if remote_testing_cell.find('span', class_=yes_marker_class) else 'No'
                adaptive = 'Yes' if adaptive_cell.find('span', class_=yes_marker_class) else 'No'

                # Cell 4 holds one tooltip <span> per test-type code.
                test_type_cell = row.select_one('td:nth-child(4)')
                test_type_spans = test_type_cell.select('span[data-has-tooltip="true"]')

                test_types = [test_type_map.get(span.text.strip(), 'Unknown') for span in test_type_spans]

                assessments.append({
                    'name': name,
                    'url': full_url,
                    'remote': remote,
                    'adaptive': adaptive,
                    'test_type': test_types,
                })

            except Exception as e:
                # Best effort: a malformed row (missing cell or link) is
                # reported and skipped rather than aborting the scrape.
                print(f"Error parsing assessment row: {e}")
                continue

    return assessments

def save_to_csv(assessments, filename="shl_assessments.csv"):
    """Write the scraped assessment records to ``filename`` as UTF-8 CSV.

    The column order is taken from the keys of the first record. Non-string
    values (such as the ``test_type`` list) are written using their ``str()``
    form. When ``assessments`` is empty nothing is written and a notice is
    printed instead.
    """
    if assessments:
        # The first record defines the header row.
        columns = list(assessments[0])

        with open(filename, 'w', newline='', encoding='utf-8') as out_file:
            csv_writer = csv.DictWriter(out_file, fieldnames=columns)
            csv_writer.writeheader()
            for record in assessments:
                csv_writer.writerow(record)

        print(f"Data saved to {filename}")
    else:
        print("No data to save.")

# Entry point for this cell: scrape the catalog, then write the records to
# save_to_csv's default file (shl_assessments.csv). The write is skipped when
# the scrape returned no records.
if __name__ == '__main__':
    assessments = scrape_shl_assessments()
    
    if assessments:
        save_to_csv(assessments)


Scraping page: https://www.shl.com/solutions/products/product-catalog/?start=0
Scraping page: https://www.shl.com/solutions/products/product-catalog/?start=12
Scraping page: https://www.shl.com/solutions/products/product-catalog/?start=24
Scraping page: https://www.shl.com/solutions/products/product-catalog/?start=36
Scraping page: https://www.shl.com/solutions/products/product-catalog/?start=48
Scraping page: https://www.shl.com/solutions/products/product-catalog/?start=60
Scraping page: https://www.shl.com/solutions/products/product-catalog/?start=72
Scraping page: https://www.shl.com/solutions/products/product-catalog/?start=84
Scraping page: https://www.shl.com/solutions/products/product-catalog/?start=96
Scraping page: https://www.shl.com/solutions/products/product-catalog/?start=108
Scraping page: https://www.shl.com/solutions/products/product-catalog/?start=120
Scraping page: https://www.shl.com/solutions/products/product-catalog/?start=132
Scraping page: https://www.shl.com/sol

In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import pandas as pd
import time
import re

def slugify(name):
    """Turn an assessment name into the slug appended to the catalog ``/view/`` URL.

    Parentheses are removed outright, the text is lower-cased, every run of
    characters outside ``a-z0-9`` (dots included) collapses into one hyphen,
    and leading/trailing hyphens are trimmed.
    """
    slug = re.sub(r'[()]', '', name).replace('.', '-').lower()
    return re.sub(r'[^a-z0-9]+', '-', slug).strip('-')

def setup_driver():
    """Build a Chrome WebDriver started with the headless/sandbox flags listed below."""
    chrome_options = Options()
    for flag in ("--headless=new", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def fetch_duration_selenium(driver, slug):
    """Open the catalog detail page for ``slug`` and return its duration.

    The first ``<h4>`` whose text is "assessment length" (case-insensitive)
    is located and the first run of digits in its following ``<p>`` sibling
    is returned as a string. ``'N/A'`` is returned when there is no such
    heading, when the paragraph has no digits, or when any error occurs while
    loading or reading the page.
    """
    url = f"https://www.shl.com/solutions/products/product-catalog/view/{slug}"
    print(f"\n🔗 Visiting: {url}")
    try:
        driver.get(url)
        time.sleep(2)  # Allow time for the page to load

        # Lazily pick the first matching heading, or None when absent.
        length_heading = next(
            (
                h4 for h4 in driver.find_elements(By.TAG_NAME, 'h4')
                if h4.text.strip().lower() == 'assessment length'
            ),
            None,
        )
        if length_heading is not None:
            paragraph = length_heading.find_element(By.XPATH, 'following-sibling::p')
            digits = re.search(r'\d+', paragraph.text)
            duration = digits.group() if digits else 'N/A'
            print(f"🕒 Duration found: {duration} minutes")
            return duration
    except Exception as e:
        print(f"❌ Error fetching from {url}: {e}")
        return 'N/A'
    print("⚠️ Assessment length not found.")
    return 'N/A'

def update_assessments_with_duration_selenium(input_csv="shl_assessments.csv", output_csv="shl_assessments_with_duration.csv"):
    """Add a ``duration_minutes`` column to the scraped catalog CSV.

    For every value in the ``name`` column of ``input_csv`` the detail page is
    visited (slug built with ``slugify``) and the value returned by
    ``fetch_duration_selenium`` is recorded. The result is written to
    ``output_csv`` without the index.

    Args:
        input_csv: CSV file containing at least a ``name`` column.
        output_csv: Destination CSV for the enriched table.
    """
    df = pd.read_csv(input_csv)
    total = len(df)
    durations = []
    driver = setup_driver()

    # try/finally guarantees the browser process is shut down even if a row
    # raises (for example a non-string name) or the run is interrupted.
    try:
        for index, name in enumerate(df['name']):
            print(f"\n➡️ Processing {index + 1}/{total}: {name}")
            slug = slugify(name)
            duration = fetch_duration_selenium(driver, slug)
            durations.append(duration)
            time.sleep(1)  # Be nice to the server
    finally:
        driver.quit()

    df['duration_minutes'] = durations
    df.to_csv(output_csv, index=False)
    print(f"\n✅ CSV saved with durations: {output_csv}")

# Run the duration scraper with its default paths (reads shl_assessments.csv,
# writes shl_assessments_with_duration.csv). It loads one detail page per
# catalog row with fixed sleeps, so a full run is slow.
if __name__ == "__main__":
    update_assessments_with_duration_selenium()



➡️ Processing 1/384: Global Skills Development Report

🔗 Visiting: https://www.shl.com/solutions/products/product-catalog/view/global-skills-development-report
⚠️ Assessment length not found.

➡️ Processing 2/384: .NET Framework 4.5

🔗 Visiting: https://www.shl.com/solutions/products/product-catalog/view/net-framework-4-5
🕒 Duration found: 30 minutes

➡️ Processing 3/384: .NET MVC (New)

🔗 Visiting: https://www.shl.com/solutions/products/product-catalog/view/net-mvc-new
🕒 Duration found: 17 minutes

➡️ Processing 4/384: .NET MVVM (New)

🔗 Visiting: https://www.shl.com/solutions/products/product-catalog/view/net-mvvm-new
🕒 Duration found: 5 minutes

➡️ Processing 5/384: .NET WCF (New)

🔗 Visiting: https://www.shl.com/solutions/products/product-catalog/view/net-wcf-new
🕒 Duration found: 11 minutes

➡️ Processing 6/384: .NET WPF (New)

🔗 Visiting: https://www.shl.com/solutions/products/product-catalog/view/net-wpf-new
🕒 Duration found: 9 minutes

➡️ Processing 7/384: .NET XAML (New)

🔗 V

In [6]:
# Reload the file written by update_assessments_with_duration_selenium().
# NOTE: the scraper's 'N/A' placeholders are among pandas' default NA strings,
# so they come back as NaN and duration_minutes loads as a float column.
df=pd.read_csv('shl_assessments_with_duration.csv')

In [7]:
# Column dtypes and non-null counts of the reloaded table (shows how many rows lack a duration).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              384 non-null    object 
 1   url               384 non-null    object 
 2   remote            384 non-null    object 
 3   adaptive          384 non-null    object 
 4   test_type         384 non-null    object 
 5   duration_minutes  327 non-null    float64
dtypes: float64(1), object(5)
memory usage: 18.1+ KB


In [12]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import pandas as pd
import time
import re

def slugify(name):
    """Convert an assessment name into the slug used in the catalog ``/view/`` URL.

    Steps: drop parentheses, turn dots into hyphens, lower-case, collapse
    every run of non ``a-z0-9`` characters into a single hyphen, and trim
    hyphens from both ends.
    """
    without_parens = re.sub(r'[()]', '', name)
    lowered = without_parens.replace('.', '-').lower()
    hyphenated = re.sub(r'[^a-z0-9]+', '-', lowered)
    return hyphenated.strip('-')

def setup_driver():
    """Return a Chrome WebDriver launched with the headless/sandbox flags below."""
    launch_flags = (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )
    opts = Options()
    for launch_flag in launch_flags:
        opts.add_argument(launch_flag)
    return webdriver.Chrome(options=opts)

def extract_detail_text(driver, heading_text):
    """Return the text of the ``<p>`` that follows the ``<h4>`` named ``heading_text``.

    Headings are compared case-insensitively after stripping whitespace from
    the page text. Only the first matching heading is considered.

    Args:
        driver: WebDriver positioned on the page to inspect.
        heading_text: Heading label to look for, e.g. ``'Description'``.

    Returns:
        The stripped paragraph text, or ``''`` when no heading matches or the
        matching heading's paragraph cannot be read.
    """
    wanted = heading_text.lower()
    for heading in driver.find_elements(By.TAG_NAME, 'h4'):
        if heading.text.strip().lower() != wanted:
            continue
        try:
            sibling = heading.find_element(By.XPATH, 'following-sibling::p')
            return sibling.text.strip()
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit can still stop a long scrape.
            print(f"⚠️ No <p> tag found after heading: {heading_text}")
            return ''
    return ''

def clean_duration(raw_text):
    """Normalise a raw duration string.

    Uses the first two integers found in ``raw_text``: two or more numbers
    give ``'<first> to <second> minutes'``, a single number gives
    ``'<n> minutes'``, and text without digits gives ``'N/A'``.
    """
    leading_numbers = re.findall(r'\d+', raw_text)[:2]
    if not leading_numbers:
        return "N/A"
    return " to ".join(leading_numbers) + " minutes"

def fetch_assessment_info(driver, slug):
    """Load the catalog detail page for ``slug`` and pull out its key fields.

    Returns a ``(description, job_levels, duration)`` tuple, where the first
    two come from ``extract_detail_text`` and the duration is the
    "Assessment length" text passed through ``clean_duration``. Any error
    while loading or reading the page yields ``('', '', 'N/A')``.
    """
    url = f"https://www.shl.com/solutions/products/product-catalog/view/{slug}"
    print(f"\n🔗 Visiting: {url}")
    try:
        driver.get(url)
        time.sleep(2)

        # Read the three sections in page order: description, levels, length.
        description, job_levels, raw_duration = [
            extract_detail_text(driver, section)
            for section in ('Description', 'Job levels', 'Assessment length')
        ]
        duration = clean_duration(raw_duration)

        print(f"📝 Description: {description[:60]}...")
        print(f"📌 Job Levels: {job_levels}")
        print(f"🕒 Duration: {duration}")

        result = (description, job_levels, duration)
        return result

    except Exception as e:
        print(f"❌ Error on {slug}: {e}")
        return '', '', 'N/A'

def update_assessments_csv(input_csv="shl_assessments.csv", output_csv="shl_assessments_full.csv"):
    """Enrich the scraped catalog CSV with description, job levels and duration.

    For every value in the ``name`` column of ``input_csv`` the detail page is
    visited (slug built with ``slugify``) and the tuple returned by
    ``fetch_assessment_info`` is stored in the new ``description``,
    ``job_levels`` and ``duration`` columns. The result is written to
    ``output_csv`` without the index.

    Args:
        input_csv: CSV file containing at least a ``name`` column.
        output_csv: Destination CSV for the enriched table.
    """
    df = pd.read_csv(input_csv)
    total = len(df)
    descriptions, job_levels, durations = [], [], []

    driver = setup_driver()

    # try/finally guarantees the browser process is shut down even if a row
    # raises (for example a non-string name) or the run is interrupted.
    try:
        for idx, name in enumerate(df['name']):
            print(f"\n➡️ Processing {idx + 1}/{total}: {name}")
            slug = slugify(name)
            desc, jobs, dur = fetch_assessment_info(driver, slug)
            descriptions.append(desc)
            job_levels.append(jobs)
            durations.append(dur)
            time.sleep(1)
    finally:
        driver.quit()

    df['description'] = descriptions
    df['job_levels'] = job_levels
    df['duration'] = durations
    df.to_csv(output_csv, index=False)
    print(f"\n✅ CSV updated and saved as: {output_csv}")

# Run the detail-page scraper with its default paths (reads shl_assessments.csv,
# writes shl_assessments_full.csv). It loads one detail page per catalog row
# with fixed sleeps, so a full run is slow.
if __name__ == "__main__":
    update_assessments_csv()



➡️ Processing 1/384: Global Skills Development Report

🔗 Visiting: https://www.shl.com/solutions/products/product-catalog/view/global-skills-development-report
📝 Description: This report is designed to be given to individuals who have ...
📌 Job Levels: Director, Entry-Level, Executive, General Population, Graduate, Manager, Mid-Professional, Front Line Manager, Supervisor,
🕒 Duration: N/A

➡️ Processing 2/384: .NET Framework 4.5

🔗 Visiting: https://www.shl.com/solutions/products/product-catalog/view/net-framework-4-5
📝 Description: The.NET Framework 4.5 test measures knowledge of .NET enviro...
📌 Job Levels: Professional Individual Contributor, Mid-Professional,
🕒 Duration: 30 minutes

➡️ Processing 3/384: .NET MVC (New)

🔗 Visiting: https://www.shl.com/solutions/products/product-catalog/view/net-mvc-new
📝 Description: Multi-choice test that measures the knowledge of Model-View-...
📌 Job Levels: Mid-Professional, Professional Individual Contributor,
🕒 Duration: 17 minutes

➡️ Process

In [14]:
# Reload the file written by update_assessments_csv().
# NOTE: empty strings and the 'N/A' duration placeholder are parsed as NaN by
# read_csv's defaults, which is why the text columns show missing values.
df=pd.read_csv('shl_assessments_full.csv')

In [15]:
# Preview the first rows of the enriched catalog.
df.head()

Unnamed: 0,name,url,remote,adaptive,test_type,description,job_levels,duration
0,Global Skills Development Report,https://www.shl.com/solutions/products/product...,Yes,No,"['Ability & Aptitude', 'Assessment Exercises',...",This report is designed to be given to individ...,"Director, Entry-Level, Executive, General Popu...",
1,.NET Framework 4.5,https://www.shl.com/solutions/products/product...,Yes,Yes,['Knowledge & Skills'],The.NET Framework 4.5 test measures knowledge ...,"Professional Individual Contributor, Mid-Profe...",30 minutes
2,.NET MVC (New),https://www.shl.com/solutions/products/product...,Yes,No,['Knowledge & Skills'],Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...",17 minutes
3,.NET MVVM (New),https://www.shl.com/solutions/products/product...,Yes,No,['Knowledge & Skills'],Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...",5 minutes
4,.NET WCF (New),https://www.shl.com/solutions/products/product...,Yes,No,['Knowledge & Skills'],Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...",11 minutes


In [16]:
# Column dtypes and non-null counts (shows how sparse description, job_levels and duration are).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         384 non-null    object
 1   url          384 non-null    object
 2   remote       384 non-null    object
 3   adaptive     384 non-null    object
 4   test_type    384 non-null    object
 5   description  356 non-null    object
 6   job_levels   321 non-null    object
 7   duration     248 non-null    object
dtypes: object(8)
memory usage: 24.1+ KB


In [30]:
import pandas as pd
import itertools

# Exploratory summary of the enriched catalog: shape, dtypes, per-column
# unique values, duplicate rows, missing values, and the distinct job levels
# and durations.

# Load dataset
df = pd.read_csv("shl_assessments_full.csv")

print("📄 BASIC DATASET INFO")
print("-" * 40)
print(f"🔢 Number of rows: {df.shape[0]}")
print(f"🔠 Number of columns: {df.shape[1]}")
print(f"🧾 Columns: {list(df.columns)}\n")

print("🧪 DATA TYPES AND NON-NULL COUNTS")
print("-" * 40)
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()

print("\n📊 BASIC STATISTICS (numerical columns)")
print("-" * 40)
print(df.describe())

print("\n🧭 UNIQUE VALUES PER COLUMN")
print("-" * 40)
for col in df.columns:
    print(f"\n🔹 Column: '{col}'")
    print(f"🔸 Number of unique values: {df[col].nunique()}")
    print(df[col].unique()[:10])  # show top 10 unique values

print("\n📌 DUPLICATE ROWS CHECK")
print("-" * 40)
duplicate_rows = df[df.duplicated()]
print(f"🔁 Total duplicate rows: {duplicate_rows.shape[0]}")
if not duplicate_rows.empty:
    print(duplicate_rows.head())

print("\n🚫 MISSING VALUES CHECK")
print("-" * 40)
print(df.isnull().sum())

# Optional: Flatten and print unique job levels if available
if 'job_levels' in df.columns:
    print("\n👔 UNIQUE JOB LEVELS (flattened):")
    # Scraped values end with a trailing comma ("..., Supervisor,"), so a
    # plain split leaves an empty piece; drop blanks so '' is not reported as
    # a job level.
    job_levels_split = df['job_levels'].dropna().apply(
        lambda x: [j.strip() for j in x.split(',') if j.strip()]
    )
    flat_job_levels = sorted(set(itertools.chain.from_iterable(job_levels_split)))
    print(flat_job_levels)

# Optional: Summary and value counts of durations
if 'duration' in df.columns:
    print("\n⏱️ UNIQUE DURATIONS:")
    print(df['duration'].unique())

    print("\n📈 DURATION VALUE COUNTS:")
    print(df['duration'].value_counts())


📄 BASIC DATASET INFO
----------------------------------------
🔢 Number of rows: 384
🔠 Number of columns: 8
🧾 Columns: ['name', 'url', 'remote', 'adaptive', 'test_type', 'description', 'job_levels', 'duration']

🧪 DATA TYPES AND NON-NULL COUNTS
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         384 non-null    object
 1   url          384 non-null    object
 2   remote       384 non-null    object
 3   adaptive     384 non-null    object
 4   test_type    384 non-null    object
 5   description  356 non-null    object
 6   job_levels   321 non-null    object
 7   duration     248 non-null    object
dtypes: object(8)
memory usage: 24.1+ KB
None

📊 BASIC STATISTICS (numerical columns)
----------------------------------------
                              name  \
count                          384  