# Method 1: Embed the raw catalog text and retrieve with FAISS

In [None]:
# Install dependencies
!pip install sentence-transformers faiss-cpu pandas

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import re



In [None]:
# Load the dataset: the SHL catalog CSV is read into the global `df` that every
# later cell in this method reads from and adds columns to.
df = pd.read_csv("shl_catalog_detailed.csv")
print(f"Loaded {len(df)} rows from shl_catalog_detailed.csv")

Loaded 518 rows from shl_catalog_detailed.csv


In [None]:
# Preprocess Assessment Length
def parse_duration(text):
    """Normalise one raw "Assessment Length" value.

    Returns:
        - NaN for missing or empty input, and for text without any digits;
        - the lower-cased keyword itself for "variable" / "untimed";
        - otherwise the first run of digits found in the text, as a float.
    """
    is_missing = pd.isna(text) or text == ""
    if is_missing:
        return np.nan

    cleaned = str(text).strip().lower()
    if cleaned == "variable" or cleaned == "untimed":
        return cleaned

    # Only the first number matters (e.g. "max 30 minutes" -> 30.0).
    numbers = re.findall(r'(\d+)', cleaned)
    if not numbers:
        return np.nan
    return float(numbers[0])

In [None]:
# Add the parsed duration as a new column. parse_duration can return a float,
# NaN, or the strings "variable"/"untimed", so the column may be mixed-type.
df["Assessment Length Parsed"] = df["Assessment Length"].apply(parse_duration)
print("Sample of parsed durations:")
print(df[["Assessment Length", "Assessment Length Parsed"]].head(10))

Sample of parsed durations:
  Assessment Length Assessment Length Parsed
0                49                     49.0
1                36                     36.0
2                51                     51.0
3                30                     30.0
4                20                     20.0
5                35                     35.0
6                45                     45.0
7                45                     45.0
8                43                     43.0
9                49                     49.0


In [None]:
# Combine fields for embedding
def combine_text(row):
    """Build the text that gets embedded for one catalog row.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) providing the keys
            'Pre-packaged Job Solutions', 'Description' and 'Test Type'.

    Returns:
        str: "<name>. <description> Test Type: <test type>". A missing field
        (None / NaN) is rendered as an empty string so the literal text
        "nan" or "None" never leaks into the embedding input.
    """
    def _clean(value):
        # pd.isna covers None, float NaN and pd.NA.
        return "" if pd.isna(value) else value

    name = _clean(row['Pre-packaged Job Solutions'])
    description = _clean(row['Description'])
    test_type = _clean(row['Test Type'])
    return f"{name}. {description} Test Type: {test_type}"

# axis=1 passes each row to combine_text; the resulting column is what gets embedded.
df["combined_text"] = df.apply(combine_text, axis=1)

In [None]:
# Generate embeddings: encode every combined_text string with the
# all-MiniLM-L6-v2 sentence-transformer and convert the result to a float32
# array before it is handed to FAISS.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df["combined_text"].tolist(), show_progress_bar=True)
embeddings = np.array(embeddings, dtype='float32')
print(f"Generated embeddings shape: {embeddings.shape}")

# Setup FAISS index: a flat L2 index holding one vector per catalog row, added
# in DataFrame order so FAISS labels can be used as positions for df.iloc.
# NOTE(review): retrieve_assessments turns distances into scores with
# 1 - d / 2, which presumably assumes unit-length embeddings — confirm.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print(f"FAISS index built with {index.ntotal} vectors")

# Save index and DataFrame (including the derived columns) to disk.
faiss.write_index(index, "shl_assessments_index.faiss")
df.to_csv("shl_catalog_with_text.csv", index=False)
print("Saved FAISS index and updated CSV")

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Generated embeddings shape: (518, 384)
FAISS index built with 518 vectors
Saved FAISS index and updated CSV


In [None]:
# Retrieval function
def _duration_allowed(duration, max_duration, wants_untimed, wants_variable):
    """Return True when one parsed duration passes the requested filters."""
    if pd.isna(duration):
        return True  # Missing duration: never excluded
    if isinstance(duration, str):  # "variable" or "untimed"
        return (duration == "untimed" and wants_untimed) or \
               (duration == "variable" and wants_variable)
    if isinstance(duration, float) and max_duration is not None:  # Numeric
        return duration <= max_duration
    return True  # No specific filter applies


def retrieve_assessments(query, k=10, max_duration=None):
    """Return up to `k` catalog rows most similar to `query`.

    Relies on the notebook globals `model` (sentence embedder), `index`
    (FAISS index) and `df` (catalog with an "Assessment Length Parsed" column).

    Args:
        query: Free-text request. The words "untimed" / "variable" in it
            allow assessments whose parsed duration is that keyword.
        k: Maximum number of rows to return.
        max_duration: Optional upper bound for numeric durations.

    Returns:
        pandas.DataFrame with the id, name, URL, remote/adaptive flags,
        assessment length, test type and similarity_score columns, sorted by
        descending similarity. The frame is empty (but keeps its columns) when
        no candidate satisfies the duration filter.
    """
    query_lower = query.lower()
    wants_untimed = "untimed" in query_lower
    wants_variable = "variable" in query_lower

    # Embed the query
    query_embedding = model.encode([query], show_progress_bar=False)[0]
    query_embedding = np.array([query_embedding], dtype='float32')

    # Search FAISS index. Fetch extra candidates to survive filtering, but never
    # more than the index holds: FAISS pads missing hits with label -1, which
    # df.iloc would silently resolve to the last catalog row.
    n_candidates = min(k * 2, index.ntotal)
    distances, indices = index.search(query_embedding, n_candidates)
    results = df.iloc[indices[0]].copy()
    # Assumes unit-length embeddings, where 1 - d / 2 is the cosine similarity
    # for a squared L2 distance d.
    results["similarity_score"] = 1 - distances[0] / 2

    # Filter based on duration and query intent
    if max_duration is not None or wants_untimed or wants_variable:
        keep = np.array(
            [_duration_allowed(duration, max_duration, wants_untimed, wants_variable)
             for duration in results["Assessment Length Parsed"]],
            dtype=bool,
        )
        # A boolean mask keeps the columns even when nothing matches; rebuilding
        # the frame from an empty list dropped them and broke sort_values below.
        results = results[keep]

    # Sort and limit
    results = results.sort_values("similarity_score", ascending=False).head(k)
    return results[["id", "Pre-packaged Job Solutions", "URL", "Remote Testing (y/n)",
                    "Adaptive/IRT (y/n)", "Assessment Length", "Test Type", "similarity_score"]]

In [None]:
# Test the pipeline
sample_queries = [
    "Java developers, 40 mins",
    "untimed test for managers",
    "variable length coding test"
]
for query in sample_queries:
    # Decide from the regex match itself: a query can contain "min" (e.g.
    # "admin", "minimum") without a "<number> min" phrase, in which case
    # re.search returns None and calling .group() on it would raise.
    duration_match = re.search(r'(\d+)\s*min', query)
    max_duration = float(duration_match.group(1)) if duration_match else None
    results = retrieve_assessments(query, k=10, max_duration=max_duration)
    print(f"\nSample query: '{query}'")
    print(results)

print("\nRAG pipeline built successfully!")


Sample query: 'Java developers, 40 mins'
       id                      Pre-packaged Job Solutions  \
205  4034                Core Java (Advanced Level) (New)   
206  4032                   Core Java (Entry Level) (New)   
161  4160                       Android Development (New)   
237  4063                     Enterprise Java Beans (New)   
431  4130                    Salesforce Development (New)   
223  4221       Digital Readiness Development Report - IC   
224  4239  Digital Readiness Development Report - Manager   
298  4085                      Job Control Language (New)   
384  4156                    Oracle WebLogic Server (New)   
478   251                          Time Management (U.S.)   

                                                   URL Remote Testing (y/n)  \
205  https://www.shl.com/solutions/products/product...                  Yes   
206  https://www.shl.com/solutions/products/product...                  Yes   
161  https://www.shl.com/solutions/products/produ

In [None]:
# Test the pipeline
sample_queries = [
    "I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes"
]
for query in sample_queries:
    # Use the match object as the condition: "min" may occur in a query without
    # a preceding number, and re.search would then return None.
    duration_match = re.search(r'(\d+)\s*min', query)
    max_duration = float(duration_match.group(1)) if duration_match else None
    results = retrieve_assessments(query, k=10, max_duration=max_duration)
    print(f"\nSample query: '{query}'")
    print(results)

print("\nRAG pipeline built successfully!")


Sample query: 'I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes'
       id                         Pre-packaged Job Solutions  \
502  4215         Virtual Assessment and Development Centers   
130   492  Technology Professional 8.0 Job Focused Assess...   
46    497                Graduate 8.0 Job Focused Assessment   
487  3733                       Verify - Deductive Reasoning   
454   116                         Software Business Analysis   
298  4085                         Job Control Language (New)   
44    496              Graduate + 8.0 Job Focused Assessment   
45    494                Graduate 7.1 Job Focused Assessment   
399  3484                               PJM Selection Report   
501  4290              Verify Interactive Process Monitoring   

                                                   URL Remote Testing (y/n)  \
502  https://www.shl.com/solutions/produc

# Method 2: Summarize the catalog text with Gemini, then embed and retrieve with FAISS

In [None]:
# Install dependencies
!pip install sentence-transformers faiss-cpu pandas google-generativeai

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import google.generativeai as genai
import time
import re

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [None]:
# Configure Gemini API
# The key is read from the environment so it is never stored in the notebook
# or its outputs. Any key that was previously committed here must be treated
# as leaked and revoked.
import os

API_KEY = os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
genai.configure(api_key=API_KEY)
# `model` is the Gemini client used by the LLM helper functions below.
model = genai.GenerativeModel('gemini-2.0-flash')

In [None]:
# Load the dataset again: this rebinds the global `df`, so any columns derived
# in an earlier section of the notebook are not carried over.
df = pd.read_csv("shl_catalog_detailed.csv")
print(f"Loaded {len(df)} rows from shl_catalog_detailed.csv")

Loaded 518 rows from shl_catalog_detailed.csv


In [None]:
# Preprocess Assessment Length
def parse_duration(text):
    """Normalise one raw "Assessment Length" value.

    Missing, empty, or digit-free input yields NaN; "variable" and "untimed"
    (any case, surrounding whitespace ignored) both collapse to the label
    "flexible duration"; anything else yields its first number as a float.
    """
    if pd.isna(text) or text == "":
        return np.nan

    normalized = str(text).strip().lower()
    if normalized in ("variable", "untimed"):
        # Single shared label so both cases are treated alike downstream.
        return "flexible duration"

    first_number = re.search(r'(\d+)', normalized)
    if first_number is None:
        return np.nan
    return float(first_number.group(1))

# Add the parsed duration column; values are floats, NaN, or the string
# "flexible duration" returned by parse_duration for variable/untimed entries.
df["Assessment Length Parsed"] = df["Assessment Length"].apply(parse_duration)
print("Sample of parsed durations:")
print(df[["Assessment Length", "Assessment Length Parsed"]].head(10))

Sample of parsed durations:
  Assessment Length Assessment Length Parsed
0                49                     49.0
1                36                     36.0
2                51                     51.0
3                30                     30.0
4                20                     20.0
5                35                     35.0
6                45                     45.0
7                45                     45.0
8                43                     43.0
9                49                     49.0


In [None]:
# Combine fields for LLM input
def combine_text(row):
    """Build the text sent to the LLM for one catalog row.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) providing the keys
            'Pre-packaged Job Solutions', 'Description', 'Test Type' and
            'Assessment Length'.

    Returns:
        str: "<name>. <description> Test Type: <type>, Duration: <length>".
        Every missing field (None / NaN) is rendered as "unknown" — the same
        placeholder already used for a missing duration — so the literal
        "nan" never reaches the LLM prompt.
    """
    def _or_unknown(value):
        return value if pd.notna(value) else "unknown"

    name = _or_unknown(row['Pre-packaged Job Solutions'])
    description = _or_unknown(row['Description'])
    test_type = _or_unknown(row['Test Type'])
    duration = _or_unknown(row["Assessment Length"])
    return f"{name}. {description} Test Type: {test_type}, Duration: {duration}"

# axis=1 passes each row to combine_text; this column is the input that is later summarised by the LLM.
df["combined_text"] = df.apply(combine_text, axis=1)

In [None]:
# df["combined_text"]

In [None]:
# LLM preprocessing with Gemini
def llm_summarize(text):
    """Ask the global Gemini `model` for a one-line summary of `text`.

    Returns the stripped response text. On any exception the error is printed
    and the unmodified input text is returned instead.
    """
    instructions = "Compress the following assessment text into a short line. Just include the target role, key skills , duration and test types — expand test codes as: A = Ability & Aptitude, B = Biodata & Situational Judgement, C = Competencies, D = Development & 360, E = Assessment Exercises, K = Knowledge & Skills, P = Personality & Behavior, S = Simulations. Keep it short and crisp, avoid extra details and return plain text."
    try:
        # Reading .text stays inside the try so a failure there also falls back.
        summary = model.generate_content(instructions + text).text.strip()
    except Exception as err:
        print(f"LLM error: {err}")
        summary = text  # Fallback to raw text if API fails
    return summary

In [None]:
# Process dataset with rate limiting: requests are sent back to back and the
# loop pauses for 60 seconds after every 15th row (a burst-then-wait pattern,
# not an even 4-second spacing between requests).
summaries = []
for i, text in enumerate(df["combined_text"]):
    summaries.append(llm_summarize(text))
    if (i + 1) % 15 == 0:  # 15 requests done: wait a minute to respect 15 RPM
        time.sleep(60)
    # end="\r" rewrites the same output line instead of printing one per row.
    print(f"Processed {i + 1}/{len(df)} rows", end="\r")
# llm_summarize returns the raw text on failure, so summaries always has one entry per row.
df["llm_summary"] = summaries
print("\nLLM preprocessing complete. Sample summaries:")
print(df[["combined_text", "llm_summary"]].head(5))

Processed 518/518 rows
LLM preprocessing complete. Sample summaries:
                                       combined_text  \
0  Account Manager Solution. The Account Manager ...   
1  Administrative Professional - Short Form. The ...   
2  Agency Manager Solution. The Agency Manager so...   
3  Apprentice + 8.0 Job Focused Assessment. The A...   
4  Apprentice 8.0 Job Focused Assessment. The App...   

                                         llm_summary  
0  Account Manager, Mid-level Leadership, C P A B...  
1  Admin Professional - Skills: A(Ability/Aptitud...  
2  Agency Manager: Sales Management, A, B, P, S; ...  
3  Apprentice, Biodata & Situational Judgement (B...  
4  Apprentice, B=Biodata & Situational Judgement,...  


In [None]:
# Generate embeddings from the LLM summaries (not the raw combined text).
# The embedder is named embedding_model because `model` is the Gemini client here.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(df["llm_summary"].tolist(), show_progress_bar=True)
embeddings = np.array(embeddings, dtype='float32')
print(f"Generated embeddings shape: {embeddings.shape}")

# Setup FAISS index: a flat L2 index holding one vector per catalog row, added
# in DataFrame order so FAISS labels can be used as positions for df.iloc.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print(f"FAISS index built with {index.ntotal} vectors")

# Save index and DataFrame (including the llm_summary column) to disk.
faiss.write_index(index, "shl_assessments_index_new.faiss")
df.to_csv("shl_catalog_with_summaries_new.csv", index=False)
print("Saved FAISS index and updated CSV")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Generated embeddings shape: (518, 384)
FAISS index built with 518 vectors
Saved FAISS index and updated CSV


In [None]:
# LLM preprocessing function for query
def llm_shorten_query(query):
    """Reduce a free-text query to its technical skills and job roles.

    Sends the query to the global Gemini `model` and returns the stripped
    response text.

    Args:
        query: The user's free-text request.

    Returns:
        str: The comma-separated skills/roles produced by the LLM, or the
        unmodified query when the LLM call fails for any reason.
    """
    prompt = "Extract and return only the technical skills and job roles from this text, comma-separated: "

    try:
        response = model.generate_content(prompt + query)
        return response.text.strip()
    except Exception as e:
        # Report with print, like llm_summarize does. The previous st.error
        # call referenced a name that is never imported in this notebook, so
        # the fallback itself crashed with NameError.
        print(f"Query LLM error: {e}")
        return query  # Fallback to raw query

# Retrieval function
def _duration_matches(duration, max_duration, wants_flexible):
    """Return True when one parsed duration satisfies the active filters."""
    if pd.isna(duration):
        return True  # Unknown length: never excluded
    if duration == "flexible duration":
        return wants_flexible
    return (isinstance(duration, float)
            and max_duration is not None
            and duration <= max_duration)


def retrieve_assessments(query, k=10, max_duration=None):
    """Return up to `k` catalog rows that best match `query`.

    Relies on the notebook globals `embedding_model`, `index` (FAISS), `df`
    (catalog with an "Assessment Length Parsed" column) and on
    llm_shorten_query, which condenses the query before it is embedded.

    Args:
        query: Free-text request. The words "untimed", "variable" or
            "flexible" in it allow assessments with a flexible duration.
        k: Maximum number of rows to return.
        max_duration: Optional upper bound for numeric durations.

    Returns:
        pandas.DataFrame in FAISS ranking order (nearest first) with the
        columns "Assessment Name", "URL", "Remote Testing (y/n)",
        "Adaptive/IRT (y/n)", "Duration" and "Test Type". When the duration
        filter would remove every candidate, the unfiltered candidates are
        returned instead.
    """
    query_lower = query.lower()
    wants_flexible = any(x in query_lower for x in ["untimed", "variable", "flexible"])

    # LLM preprocess query
    processed_query = llm_shorten_query(query)

    # Embed query
    query_embedding = embedding_model.encode([processed_query], show_progress_bar=False)[0]
    query_embedding = np.array([query_embedding], dtype='float32')

    # Search FAISS. Fetch extra candidates to survive filtering, but never more
    # than the index holds: FAISS pads missing hits with label -1, which
    # df.iloc would silently resolve to the last catalog row.
    n_candidates = min(k * 2, index.ntotal)
    _, indices = index.search(query_embedding, n_candidates)
    results = df.iloc[indices[0]].copy()

    # Filter by duration
    if max_duration is not None or wants_flexible:
        keep = np.array(
            [_duration_matches(duration, max_duration, wants_flexible)
             for duration in results["Assessment Length Parsed"]],
            dtype=bool,
        )
        if keep.any():  # Otherwise fall back to the unfiltered candidates
            results = results[keep]

    # Rename columns for SHL spec
    results = results.rename(columns={"Pre-packaged Job Solutions": "Assessment Name",
                                      "Assessment Length": "Duration"})
    return results[["Assessment Name", "URL", "Remote Testing (y/n)",
                    "Adaptive/IRT (y/n)", "Duration", "Test Type"]].head(k)


In [None]:
# Test the pipeline
sample_queries = [
    "I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes."
]
for query in sample_queries:
    # Use the match object as the condition: "min" may occur in a query without
    # a preceding number, and re.search would then return None.
    duration_match = re.search(r'(\d+)\s*min', query)
    max_duration = float(duration_match.group(1)) if duration_match else None
    results = retrieve_assessments(query, k=10, max_duration=max_duration)
    print(f"\nSample query: '{query}'")
    print(results)

print("\nRAG pipeline built successfully!")


Sample query: 'I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes.'
                      Assessment Name  \
206     Core Java (Entry Level) (New)   
474                       Swing (New)   
205  Core Java (Advanced Level) (New)   
298        Job Control Language (New)   
237       Enterprise Java Beans (New)   
174              Automata - Fix (New)   
431      Salesforce Development (New)   
406              Programming Concepts   
179                Automata Front End   
161         Android Development (New)   

                                                   URL Remote Testing (y/n)  \
206  https://www.shl.com/solutions/products/product...                  Yes   
474  https://www.shl.com/solutions/products/product...                  Yes   
205  https://www.shl.com/solutions/products/product...                  Yes   
298  https://www.shl.com/solutions/products/product...  

In [None]:
# Next: shorten the query with the LLM first, then retrieve using the shortened text

In [None]:
# Test the pipeline
sample_queries = [
    "I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes."
]

# Earlier prompt variant for llm_shorten_query, kept for reference only:
# # LLM preprocessing function for query
# def llm_shorten_query(query):
#     prompt = "Shorten the given line (or word, dont do anything if its already short) , retaining key skills, test type, and duration preferences.return only 1 that best suits."
#     try:
#         response = model.generate_content(prompt + query)
#         return response.text.strip()
#     except Exception as e:
#         print(f"Query LLM error: {e}")
#         return query  # Fallback to raw query if API fails

for query in sample_queries:
    # Shorten query with LLM
    shortened_query = llm_shorten_query(query)
    print(f"Original query: '{query}'")
    print(f"Shortened query: '{shortened_query}'")

    # Extract max_duration from original query. The match object is the
    # condition: "min" may occur without a preceding number, and re.search
    # would then return None.
    duration_match = re.search(r'(\d+)\s*min', query)
    max_duration = float(duration_match.group(1)) if duration_match else None

    # Retrieve results using shortened query.
    # NOTE(review): the retrieve_assessments defined in this section shortens
    # its input again and looks for "untimed"/"variable"/"flexible" in the
    # text it receives, so those words are lost if the shortening drops them
    # — confirm that double shortening is intended here.
    results = retrieve_assessments(shortened_query, k=10, max_duration=max_duration)
    print(f"\nSample query (shortened): '{shortened_query}'")
    print(results)

print("\nRAG pipeline built successfully!")

Original query: 'I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes.'
Shortened query: 'Java dev assessment: collaboration skills, 40 minutes.'

Sample query (shortened): 'Java dev assessment: collaboration skills, 40 minutes.'
       id              Pre-packaged Job Solutions  \
237  4063             Enterprise Java Beans (New)   
205  4034        Core Java (Advanced Level) (New)   
206  4032           Core Java (Entry Level) (New)   
45    494     Graduate 7.1 Job Focused Assessment   
406   219                    Programming Concepts   
495  3940       Verify - Working with Information   
309  4157                    Manual Testing (New)   
505   400                  Visual Comparison - US   
499  3976  Verify Interactive G+ Candidate Report   
256   382     Following Instructions v1 - UK (R1)   

                                                   URL Remote Testing (y/n)  \
23

In [None]:
# Leftover scratch cell: prints a fixed string and does not touch the pipeline state.
print("Nice")

Nice
