In [1]:
!pip install -q langchain langchain-community faiss-cpu sentence-transformers transformers sentencepiece requests beautifulsoup4


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m87.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
xmanager 0.7.1 requires sqlalchemy==1.2.19, but you have sqlalchemy 2.0.45 which is incompatible.[0m[31m
[0m

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("/kaggle/input/final1/final1.csv")

In [4]:
def build_embedding_text(row):
    return f"""
Assessment Name: {row['name']}
Description: {row['description']}
Job Levels: {', '.join(row['job_levels']) if isinstance(row['job_levels'], list) else row['job_levels']}
Assessment Length: {row['assessment_length_mins']} minutes
Test Domains: {', '.join(row['test_type_labels']) if isinstance(row['test_type_labels'], list) else row['test_type_labels']}
""".strip()

df["embedding_text"] = df.apply(build_embedding_text, axis=1)


In [7]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embedding_model = HuggingFaceEmbeddings(
    model_name="intfloat/e5-large-v2",
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True}
)

vectorstore = FAISS.from_texts(
    texts=df["embedding_text"].tolist(),
    embedding=embedding_model,
    metadatas=df.to_dict("records")
)


In [8]:
train_df = pd.read_excel("/kaggle/input/input1/Gen_AI Dataset.xlsx")

In [9]:
from collections import Counter

url_popularity = Counter(train_df["Assessment_url"])


In [10]:
from transformers import pipeline

llm_parser = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    max_new_tokens=256
)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [12]:
QUERY_PARSER_PROMPT = """
Extract structured hiring intent from the query below.

Return JSON with keys:
skills (list),
job_level (entry | mid | senior),
max_duration_minutes (number or null),
role_family (string).

Query:
{query}

JSON:
"""


In [13]:
import json
import re

def parse_query_with_llm(query):
    prompt = QUERY_PARSER_PROMPT.format(query=query)
    output = llm_parser(prompt)[0]["generated_text"]

    match = re.search(r"\{.*\}", output, re.S)
    if not match:
        return {}

    try:
        return json.loads(match.group())
    except:
        return {}


In [14]:
import requests
from bs4 import BeautifulSoup

def extract_text_from_url(url):
    try:
        html = requests.get(url, timeout=15).text
        soup = BeautifulSoup(html, "html.parser")
        return soup.get_text(separator=" ", strip=True)
    except:
        return ""


In [15]:
def rerank_with_train_bias(docs_scores, popularity_map, alpha=0.15):
    reranked = []

    for doc, score in docs_scores:
        url = doc.metadata["url"]

        popularity_boost = popularity_map.get(url, 0)
        final_score = score - alpha * popularity_boost

        reranked.append((doc, final_score))

    reranked.sort(key=lambda x: x[1])  # lower score = better
    return reranked


In [16]:
def retrieve_assessments(query, k=50):
    # 1. Parse query using LLM
    parsed = parse_query_with_llm(query)

    rewritten_query = parsed.get("rewritten_query", "").strip()
    if not rewritten_query:
        rewritten_query = query  # fallback safety

    # 2. Retrieve large candidate pool
    docs_scores = vectorstore.similarity_search_with_score(
        "query: " + rewritten_query,
        k=80
    )

    # 3. Train-aware re-ranking (CRITICAL)
    docs_scores = rerank_with_train_bias(
        docs_scores,
        popularity_map=url_popularity,
        alpha=0.2
    )

    # 4. Optional hard filtering (duration, job level)
    filtered = []
    for doc, score in docs_scores:
        meta = doc.metadata

        # Duration constraint
        if parsed.get("max_duration"):
            if meta.get("assessment_length_mins"):
                if meta["assessment_length_mins"] > parsed["max_duration"]:
                    continue

        filtered.append((doc, score))

        if len(filtered) >= k:
            break

    return filtered, parsed


In [17]:
def apply_train_prior(docs):
    for d in docs:
        prior = url_prior.get(d["url"], 0)
        d["prior_boost"] = 1 + 0.15 * prior
        d["final_score"] = d["score"] / d["prior_boost"]
    return sorted(docs, key=lambda x: x["final_score"])


In [18]:
from collections import defaultdict

def balanced_selection(docs, max_total=10):
    """
    Enforces balanced selection across test domains
    without breaking ranking.
    """
    buckets = defaultdict(list)

    for d in docs:
        types = d.metadata.get("test_type_labels", [])
        primary = types[0] if isinstance(types, list) and types else "Other"
        buckets[primary].append(d)

    final = []
    while len(final) < max_total:
        added = False
        for domain in list(buckets.keys()):
            if buckets[domain]:
                final.append(buckets[domain].pop(0))
                added = True
                if len(final) == max_total:
                    break
        if not added:
            break

    return final


In [19]:
def recommend_assessments(query, top_k=10):
    docs_scores, parsed = retrieve_assessments(query, k=50)

    docs = [d for d, _ in docs_scores]

    final_docs = balanced_selection(docs, max_total=top_k)

    return [
        {
            "name": d.metadata["name"],
            "url": d.metadata["url"],
            "test_types": d.metadata.get("test_type_labels", [])
        }
        for d in final_docs
    ]


In [21]:
queries = [
    "I want to hire a Senior Data Analyst with 5 years of experience in SQL, Excel and Python. Assessment should be 1-2 hour long."
    
]

for q in queries:
    print("\nQUERY:", q)
    recs = recommend_assessments(q)
    for r in recs:
        print("-", r["name"], "→", r["url"])



QUERY: I want to hire a Senior Data Analyst with 5 years of experience in SQL, Excel and Python. Assessment should be 1-2 hour long.
- SQL Server (New) → https://www.shl.com/solutions/products/product-catalog/view/sql-server-new/
- SHL Verify Interactive - Inductive Reasoning → https://www.shl.com/solutions/products/product-catalog/view/shl-verify-interactive-inductive-reasoning/
- Automata - SQL (New) → https://www.shl.com/solutions/products/product-catalog/view/automata-sql-new/
- Microsoft Excel 365 - Essentials (New) → https://www.shl.com/solutions/products/product-catalog/view/microsoft-excel-365-essentials-new/
- Python (New) → https://www.shl.com/solutions/products/product-catalog/view/python-new/
- SQL Server Analysis Services (SSAS) (New) → https://www.shl.com/solutions/products/product-catalog/view/sql-server-analysis-services-%28ssas%29-%28new%29/
- Selenium (New) → https://www.shl.com/solutions/products/product-catalog/view/selenium-new/
- Tableau (New) → https://www.shl.c

In [22]:
def recall_at_k(true_urls, predicted_urls, k=10):
    true_urls = set(true_urls)
    predicted_urls = set(predicted_urls[:k])
    return len(true_urls & predicted_urls) / len(true_urls)


In [23]:
def evaluate_on_train_data(train_df, k=10):
    scores = []
    detailed_results = []

    grouped = train_df.groupby("Query")

    print(f"Evaluating on {len(grouped)} unique queries...\n")

    for query, group in grouped:
        true_urls = group["Assessment_url"].tolist()

        preds = recommend_assessments(query, top_k=k)
        predicted_urls = [p["url"] for p in preds]

        recall = recall_at_k(true_urls, predicted_urls, k)
        scores.append(recall)

        detailed_results.append({
            "query": query,
            "recall@10": recall,
            "num_ground_truth": len(true_urls),
            "num_hits": len(set(true_urls) & set(predicted_urls))
        })

    mean_recall = sum(scores) / len(scores)

    return mean_recall, pd.DataFrame(detailed_results)
