In [7]:
!pip install -q requests beautifulsoup4 pandas tqdm
!pip install -q sentence-transformers faiss-cpu
!pip install -q langchain langchain-community openai


In [3]:
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Catalog listing endpoint; the scraper below paginates it via the `start` query parameter.
BASE_URL = "https://www.shl.com/products/product-catalog/"
# Headers sent with every request. Presumably the browser-like User-Agent is
# there to avoid the default client being rejected — TODO confirm.
HEADERS = {"User-Agent": "Mozilla/5.0"}

def scrape_individual_tests(limit=None, timeout=30, delay=1.0):
    """Collect the name and detail-page URL of every row in the catalog listing.

    Pages through ``BASE_URL`` with ``?start=<offset>&type=1`` in steps of 12
    and stops at the first page that contains no ``<tr data-entity-id=...>``
    rows.

    Args:
        limit: Optional maximum number of entries to return. ``None`` (or 0)
            means "crawl until an empty page is reached".
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the crawl indefinitely.
        delay: Seconds to pause between listing pages.

    Returns:
        A list of ``{"name": str, "url": str}`` dicts in listing order.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an HTTP
            error status. Failing loudly matters here: an error page has no
            rows and would otherwise look like the end of the catalog,
            silently truncating the result.
    """
    results = []
    start = 0
    step = 12  # SHL page size

    while True:
        url = f"{BASE_URL}?start={start}&type=1"
        print(f"Scraping main page start={start}")

        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        rows = soup.find_all("tr", attrs={"data-entity-id": True})
        print("Rows found:", len(rows))

        # An empty page marks the end of the listing.
        if not rows:
            break

        for row in rows:
            title_td = row.find("td", class_="custom__table-heading__title")
            if not title_td:
                continue

            link = title_td.find("a")
            if not link:
                continue

            name = link.text.strip()
            href = link.get("href")

            if name and href:
                results.append({
                    "name": name,
                    # urljoin resolves relative hrefs against the catalog URL
                    # and leaves already-absolute hrefs untouched.
                    "url": urljoin(BASE_URL, href)
                })

            if limit and len(results) >= limit:
                return results

        start += step
        time.sleep(delay)  # polite pause between listing pages

    return results


In [5]:
# Crawl the whole catalog listing (no limit) and tabulate the name/url pairs.
basic_data = scrape_individual_tests()
df_basic = pd.DataFrame(basic_data)
print(len(df_basic))  # row count as a quick sanity check


Scraping main page start=0
Rows found: 12
Scraping main page start=12
Rows found: 12
Scraping main page start=24
Rows found: 12
Scraping main page start=36
Rows found: 12
Scraping main page start=48
Rows found: 12
Scraping main page start=60
Rows found: 12
Scraping main page start=72
Rows found: 12
Scraping main page start=84
Rows found: 12
Scraping main page start=96
Rows found: 12
Scraping main page start=108
Rows found: 12
Scraping main page start=120
Rows found: 12
Scraping main page start=132
Rows found: 12
Scraping main page start=144
Rows found: 12
Scraping main page start=156
Rows found: 12
Scraping main page start=168
Rows found: 12
Scraping main page start=180
Rows found: 12
Scraping main page start=192
Rows found: 12
Scraping main page start=204
Rows found: 12
Scraping main page start=216
Rows found: 12
Scraping main page start=228
Rows found: 12
Scraping main page start=240
Rows found: 12
Scraping main page start=252
Rows found: 12
Scraping main page start=264
Rows found: 1

In [7]:
# Inspect the scraped name/url pairs.
df_basic

Unnamed: 0,name,url
0,Global Skills Development Report,https://www.shl.com/products/product-catalog/v...
1,.NET Framework 4.5,https://www.shl.com/products/product-catalog/v...
2,.NET MVC (New),https://www.shl.com/products/product-catalog/v...
3,.NET MVVM (New),https://www.shl.com/products/product-catalog/v...
4,.NET WCF (New),https://www.shl.com/products/product-catalog/v...
...,...,...
372,Written English v1,https://www.shl.com/products/product-catalog/v...
373,Written Spanish,https://www.shl.com/products/product-catalog/v...
374,Zabbix (New),https://www.shl.com/products/product-catalog/v...
375,360 Digital Report,https://www.shl.com/products/product-catalog/v...


In [8]:
# Persist the listing (name, url) without the index column.
df_basic.to_csv("shl_assessments.csv", index=False)


In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

# Same value as the HEADERS defined in the listing-scraper cell; presumably
# repeated so this cell can be run on its own.
HEADERS = {"User-Agent": "Mozilla/5.0"}

def extract_description(soup):
    """Return the text of the first <p> after the "Description" heading.

    The heading is the first h2/h3/h4 whose text contains "Description".
    Returns an empty string when the heading, or a paragraph after it, is
    not found.
    """
    def is_description_heading(tag):
        return tag.name in ["h2", "h3", "h4"] and "Description" in tag.text

    heading = soup.find(is_description_heading)
    paragraph = heading.find_next("p") if heading else None
    if not paragraph:
        return ""
    return paragraph.text.strip()

def extract_job_levels(soup):
    """Return the job levels listed after the "Job levels" heading.

    The heading is the first h2/h3/h4 whose text contains "Job levels"; the
    first <p> after it is split on commas.

    Returns:
        A list of stripped, non-empty level names, in page order. Empty list
        when the heading or its paragraph is missing. Blank fragments (from a
        trailing or doubled comma, or an empty paragraph) are dropped so the
        list never contains "" entries.
    """
    header = soup.find(
        lambda tag: tag.name in ["h2", "h3", "h4"] and "Job levels" in tag.text
    )
    if not header:
        return []
    p = header.find_next("p")
    if not p:
        return []
    levels = (part.strip() for part in p.text.split(","))
    return [level for level in levels if level]

def extract_test_types(soup):
    """Return the test-type codes shown in the page's "Test Type" paragraph.

    Looks at ``<p class="product-catalogue__small-text">`` elements and uses
    only the first one whose text contains "Test Type"; the codes are the
    stripped texts of its ``<span class="product-catalogue__key">`` children.

    Returns:
        A list of unique codes in the order they appear on the page (empty if
        no matching paragraph exists). Order is preserved so repeated scrapes
        of the same page produce identical output.
    """
    p_tags = soup.find_all("p", class_="product-catalogue__small-text")

    for p in p_tags:
        if "Test Type" in p.text:
            spans = p.find_all("span", class_="product-catalogue__key")
            # dict.fromkeys de-duplicates while keeping first-seen order.
            return list(dict.fromkeys(s.text.strip() for s in spans))

    return []

def extract_assessment_length(soup):
    """Return the first integer in the paragraph after "Assessment length".

    The heading is the first h2/h3/h4 whose text contains "Assessment length".
    Returns None when the heading, the following <p>, or a run of digits in
    that paragraph is missing.
    """
    def is_length_heading(tag):
        if tag.name not in ["h2", "h3", "h4"]:
            return False
        return "Assessment length" in tag.text

    heading = soup.find(is_length_heading)
    if heading:
        paragraph = heading.find_next("p")
        if paragraph:
            digits = re.search(r"(\d+)", paragraph.text)
            if digits:
                return int(digits.group(1))
    return None

def scrape_assessment_details(df_basic, timeout=30, delay=1.0):
    """Fetch each assessment's detail page and extract its metadata.

    Args:
        df_basic: DataFrame with at least ``name`` and ``url`` columns; one
            HTTP request is made per row.
        timeout: Seconds to wait for each HTTP request before giving up.
        delay: Seconds to pause after every request, successful or not.

    Returns:
        A DataFrame with columns ``name``, ``url``, ``description``,
        ``job_levels``, ``test_types`` and ``assessment_length_mins``, one row
        per page that was fetched and parsed successfully. Rows whose request
        fails (network error, timeout, HTTP error status) or whose parsing
        raises are reported on stdout and skipped, so one bad page does not
        abort the whole run.
    """
    detailed_results = []
    total = len(df_basic)

    # Count by position rather than index label so the progress message is
    # correct (and does not raise) for a filtered or non-integer index.
    for position, (_, row) in enumerate(df_basic.iterrows(), start=1):
        print(f"Scraping details {position}/{total} : {row['name']}")

        try:
            response = requests.get(row["url"], headers=HEADERS, timeout=timeout)
            # Without this, an error page would be parsed into a record with
            # empty fields instead of being reported as a failure.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            detailed_results.append({
                "name": row["name"],
                "url": row["url"],
                "description": extract_description(soup),
                "job_levels": extract_job_levels(soup),
                "test_types": extract_test_types(soup),
                "assessment_length_mins": extract_assessment_length(soup)
            })

        except Exception as e:
            # Deliberately broad: this is a best-effort crawl.
            print("Failed:", row["url"], e)

        finally:
            time.sleep(delay)  # polite delay, also after a failed request

    return pd.DataFrame(detailed_results)

# ---------- RUN SCRAPER 2 ----------
# Makes one HTTP request per row of df_basic, pausing between requests, so
# this cell takes several minutes for the full catalog.
df_final = scrape_assessment_details(df_basic)
df_final


Scraping details 1/377 : Global Skills Development Report
Scraping details 2/377 : .NET Framework 4.5
Scraping details 3/377 : .NET MVC (New)
Scraping details 4/377 : .NET MVVM (New)
Scraping details 5/377 : .NET WCF (New)
Scraping details 6/377 : .NET WPF (New)
Scraping details 7/377 : .NET XAML (New)
Scraping details 8/377 : Accounts Payable (New)
Scraping details 9/377 : Accounts Payable Simulation (New)
Scraping details 10/377 : Accounts Receivable (New)
Scraping details 11/377 : Accounts Receivable Simulation (New)
Scraping details 12/377 : ADO.NET (New)
Scraping details 13/377 : Adobe Experience Manager (New)
Scraping details 14/377 : Adobe Photoshop CC
Scraping details 15/377 : Aeronautical Engineering (New)
Scraping details 16/377 : Aerospace Engineering (New)
Scraping details 17/377 : Agile Software Development
Scraping details 18/377 : Agile Testing (New)
Scraping details 19/377 : AI Skills
Scraping details 20/377 : Amazon Web Services (AWS) Development (New)
Scraping details

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,name,url,description,job_levels,test_types,assessment_length_mins
0,Global Skills Development Report,https://www.shl.com/products/product-catalog/v...,This report is designed to be given to individ...,"[Director, Entry-Level, Executive, General Pop...","[P, E, A, D, C, B]",
1,.NET Framework 4.5,https://www.shl.com/products/product-catalog/v...,The.NET Framework 4.5 test measures knowledge ...,"[Professional Individual Contributor, Mid-Prof...",[K],30.0
2,.NET MVC (New),https://www.shl.com/products/product-catalog/v...,Multi-choice test that measures the knowledge ...,"[Mid-Professional, Professional Individual Con...",[K],17.0
3,.NET MVVM (New),https://www.shl.com/products/product-catalog/v...,Multi-choice test that measures the knowledge ...,"[Mid-Professional, Professional Individual Con...",[K],5.0
4,.NET WCF (New),https://www.shl.com/products/product-catalog/v...,Multi-choice test that measures the knowledge ...,"[Mid-Professional, Professional Individual Con...",[K],11.0
...,...,...,...,...,...,...
372,Written English v1,https://www.shl.com/products/product-catalog/v...,The Written English test measures knowledge of...,"[Entry-Level, Front Line Manager, Mid-Professi...",[K],30.0
373,Written Spanish,https://www.shl.com/products/product-catalog/v...,This test measures knowledge of Spanish gramma...,"[Entry-Level, Front Line Manager, Mid-Professi...",[K],22.0
374,Zabbix (New),https://www.shl.com/products/product-catalog/v...,Multi-choice test that measures the knowledge ...,"[Mid-Professional, Professional Individual Con...",[K],9.0
375,360 Digital Report,https://www.shl.com/products/product-catalog/v...,Introducing the new 360 Digital Report from SH...,"[Director, Manager, Entry-Level, Executive, Fr...",[D],


In [13]:
# Persist the detailed table. The list-valued columns (job_levels, test_types)
# are written as their string form, so they come back as strings, not lists,
# when this CSV is read again.
df_final.to_csv("shl_assessments_full.csv", index=False)


In [2]:
import pandas as pd

In [4]:
# NOTE(review): absolute Kaggle input path — only resolves where that dataset
# is attached. Presumably this is the shl_assessments_full.csv written by the
# scraping cell; confirm.
df = pd.read_csv("/kaggle/input/shl-asses/shl_assessments_full.csv")

In [11]:
import ast

# Single-letter test-type codes used in the catalog -> human-readable label.
TEST_TYPE_MAP = {
    "A": "Ability & Aptitude",
    "B": "Biodata & Situational Judgement",
    "C": "Competencies",
    "D": "Development & 360",
    "E": "Assessment Exercises",
    "K": "Knowledge & Skills",
    "P": "Personality & Behavior",
    "S": "Simulations"
}


def _label_for(code):
    """Map one code to its label; unknown codes are returned unchanged (cleaned)."""
    key = str(code).strip().strip("'\"").strip()
    return TEST_TYPE_MAP.get(key, key)


def expand_test_types(codes):
    """Translate test-type codes into their human-readable labels.

    Accepted inputs:
      * a list/tuple of codes                      -> ["K", "P"]
      * the string form of a list, quoted or not   -> "['K', 'P']" or "[K, P]"
      * a single code string                       -> "K"
      * anything else (NaN, None, other types) or a blank string -> no codes

    Args:
        codes: Value of one ``test_types`` cell.

    Returns:
        A list of labels in input order. Codes missing from TEST_TYPE_MAP are
        passed through as-is.
    """
    # Containers are handled first: calling pd.isna() on a list returns an
    # array, and using that in an `if` raises for multi-element lists.
    if isinstance(codes, (list, tuple)):
        return [_label_for(c) for c in codes]

    # Missing values (NaN / None / pd.NA) and other non-text scalars.
    if not isinstance(codes, str):
        return []

    text = codes.strip()
    if not text:
        return []

    if text.startswith("[") and text.endswith("]"):
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError):
            parsed = None  # e.g. unquoted items such as "[K, P]"
        if isinstance(parsed, (list, tuple)):
            return [_label_for(c) for c in parsed]
        # Fallback for the unquoted form: split the bracket contents by hand.
        return [_label_for(part) for part in text[1:-1].split(",") if part.strip()]

    # Single value string → "K"
    return [_label_for(text)]
# Keep the raw codes under a clearer name and add a parallel column holding
# the human-readable labels for each row.
df["test_type_codes"] = df["test_types"]
df["test_type_labels"] = df["test_types"].apply(expand_test_types)


In [12]:
# Confirm the two derived columns now sit alongside the original test_types.
df.columns

Index(['name', 'url', 'description', 'job_levels', 'test_types',
       'assessment_length_mins', 'test_type_codes', 'test_type_labels'],
      dtype='object')

In [14]:
# Spot-check the expanded labels.
df['test_type_labels']

0      [Personality & Behavior, Assessment Exercises,...
1                                   [Knowledge & Skills]
2                                   [Knowledge & Skills]
3                                   [Knowledge & Skills]
4                                   [Knowledge & Skills]
                             ...                        
372                                 [Knowledge & Skills]
373                                 [Knowledge & Skills]
374                                 [Knowledge & Skills]
375                                  [Development & 360]
376          [Personality & Behavior, Development & 360]
Name: test_type_labels, Length: 377, dtype: object

In [17]:
# test_type_codes holds the same values, so the original column is redundant.
# Non-inplace with errors="ignore" so re-running this cell does not raise a
# KeyError once the column is already gone.
df = df.drop(columns=["test_types"], errors="ignore")

In [18]:
# Confirm test_types is gone and the derived columns remain.
df.columns

Index(['name', 'url', 'description', 'job_levels', 'assessment_length_mins',
       'test_type_codes', 'test_type_labels'],
      dtype='object')

In [20]:
# Check column dtypes before exporting.
df.dtypes 

name                       object
url                        object
description                object
job_levels                 object
assessment_length_mins    float64
test_type_codes            object
test_type_labels           object
dtype: object

In [21]:
# Write the cleaned table as CSV, without the index column.
# NOTE(review): the output name has no ".csv" extension — confirm that is what
# downstream readers expect.
df.to_csv("final",index = False)