# Job-candidate matching

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import os

  from .autonotebook import tqdm as notebook_tqdm





##### 1. Load datasets

In [2]:
# Read the CV table and the job-posting table from the working directory.
cv_df = pd.read_csv("cv_records.csv")
job_df = pd.read_csv("job_records.csv")

# Lookup tables: record id -> 0-based row position in the corresponding frame.
# (If an id occurs more than once, the last occurrence wins.)
cv_id_to_idx = {cv_id: pos for pos, cv_id in enumerate(cv_df["cv_id"])}
job_id_to_idx = {job_id: pos for pos, job_id in enumerate(job_df["job_id"])}

In [3]:
# Build all CV-Job posting matched pairs (same ROLE)
# collect_cv_job_pairs is a project-local helper whose source is not part of this
# notebook. The scoring loops further down unpack every element of job_pairs as
# (cv_id, job_id) -- presumably the helper returns a list of such tuples; verify
# against its implementation.
from collect_cv_job_pairs import collect_cv_job_pairs

job_pairs = collect_cv_job_pairs(cv_df, job_df)

Total matched pairs: 11520


In [4]:
# Directory that receives every similarity CSV written by the cells below.
out_dir = "similarity_matching_output"
# Create it up front: DataFrame.to_csv does not create missing parent directories,
# so on a fresh checkout the first save would otherwise fail with an OSError.
os.makedirs(out_dir, exist_ok=True)

## JobBERT-v2

https://huggingface.co/TechWolf/JobBERT-v2

##### Load model and encode texts

In [5]:
# Load the JobBERT-v2 sentence-embedding model by its Hugging Face repo id.
# NOTE: the name `model` is rebound to JobBERT-v3 further down, so the v2 encode
# cell must be run right after this one (run the notebook top to bottom).
model = SentenceTransformer("TechWolf/JobBERT-v2")

In [None]:
# Embed every CV description and every job-posting text with the model currently
# bound to `model` (JobBERT-v2 when cells are run in order).
# normalize_embeddings=True asks encode() for unit-length vectors.
# Row i of each result corresponds to row i of cv_df / job_df respectively.
cv_embeddings_v2 = model.encode(cv_df["description"].tolist(), normalize_embeddings=True)
job_embeddings_v2 = model.encode(job_df["text"].tolist(), normalize_embeddings=True)

##### Compute similarity only for matched pairs

In [None]:
# Cosine similarity for each matched (CV, job posting) pair using the JobBERT-v2
# embeddings; one output row per pair, written to CSV at the end.
records = []
job_has_domain = "domain" in job_df.columns  # same answer for every pair, so check once

for cv_id, job_id in job_pairs:
    # The id -> index maps give the row position, used both as the DataFrame row
    # label (assumes the default RangeIndex from read_csv) and as the embedding row.
    cv_idx, job_idx = cv_id_to_idx[cv_id], job_id_to_idx[job_id]

    sim = util.cos_sim(cv_embeddings_v2[cv_idx], job_embeddings_v2[job_idx]).item()

    # Candidate attributes copied through unchanged for downstream analysis.
    cv_fields = {col: cv_df.loc[cv_idx, col] for col in ("name", "gender", "ethnicity", "role")}
    job_fields = {
        "job_id": job_id,
        "job_role": job_df.loc[job_idx, "role"],
        "job_domain": job_df.loc[job_idx, "domain"] if job_has_domain else None,
        "job_level": job_df.loc[job_idx, "level"],
        "job_text": job_df.loc[job_idx, "text"],
    }
    records.append({"cv_id": cv_id, **cv_fields, **job_fields, "similarity": sim})

out_df = pd.DataFrame(records)
out_path = os.path.join(out_dir, "cv_job_similarity_jb_v2.csv")
out_df.to_csv(out_path, index=False)

print(f"Saved → {out_path}")
out_df.head()

---

## JobBERT-v3

https://huggingface.co/TechWolf/JobBERT-v3

##### Load model and encode texts

In [None]:
# Load the JobBERT-v3 sentence-embedding model by its Hugging Face repo id.
# NOTE: this rebinds `model`, which previously held JobBERT-v2; any cell that
# calls model.encode after this point uses v3.
model = SentenceTransformer("TechWolf/JobBERT-v3")

In [None]:
# Embed every CV description and every job-posting text with the model currently
# bound to `model` (JobBERT-v3 when cells are run in order).
# normalize_embeddings=True asks encode() for unit-length vectors.
# Row i of each result corresponds to row i of cv_df / job_df respectively.
cv_embeddings_v3 = model.encode(cv_df["description"].tolist(), normalize_embeddings=True)
job_embeddings_v3 = model.encode(job_df["text"].tolist(), normalize_embeddings=True)

##### Compute similarity only for matched pairs

In [None]:
# Cosine similarity for each matched (CV, job posting) pair using the JobBERT-v3
# embeddings; one output row per pair, written to CSV at the end.
records = []
job_has_domain = "domain" in job_df.columns  # same answer for every pair, so check once

for cv_id, job_id in job_pairs:
    # The id -> index maps give the row position, used both as the DataFrame row
    # label (assumes the default RangeIndex from read_csv) and as the embedding row.
    cv_idx, job_idx = cv_id_to_idx[cv_id], job_id_to_idx[job_id]

    sim = util.cos_sim(cv_embeddings_v3[cv_idx], job_embeddings_v3[job_idx]).item()

    # Candidate attributes copied through unchanged for downstream analysis.
    cv_fields = {col: cv_df.loc[cv_idx, col] for col in ("name", "gender", "ethnicity", "role")}
    job_fields = {
        "job_id": job_id,
        "job_role": job_df.loc[job_idx, "role"],
        "job_domain": job_df.loc[job_idx, "domain"] if job_has_domain else None,
        "job_level": job_df.loc[job_idx, "level"],
        "job_text": job_df.loc[job_idx, "text"],
    }
    records.append({"cv_id": cv_id, **cv_fields, **job_fields, "similarity": sim})

out_df = pd.DataFrame(records)
out_path = os.path.join(out_dir, "cv_job_similarity_jb_v3.csv")
out_df.to_csv(out_path, index=False)

print(f"Saved → {out_path}")
out_df.head()

---

## Paraphrase-multilingual-MiniLM-L12 V2 

A job-matching sentence-transformers model fine-tuned from **sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2** \
https://huggingface.co/forestav/job_matching_sentence_transformer

##### Load model and encode texts

In [None]:
# Load the job-matching MiniLM fine-tune by its Hugging Face repo id.
# Kept under its own name (model_2) so it does not clobber the JobBERT `model`.
model_2 = SentenceTransformer("forestav/job_matching_sentence_transformer")

In [None]:
# Embed every CV description and every job-posting text with model_2.
# normalize_embeddings=True asks encode() for unit-length vectors.
# Row i of each result corresponds to row i of cv_df / job_df respectively.
cv_embeddings_minilm = model_2.encode(cv_df["description"].tolist(), normalize_embeddings=True)
job_embeddings_minilm = model_2.encode(job_df["text"].tolist(), normalize_embeddings=True)

In [None]:
# Cosine similarity for each matched (CV, job posting) pair using the MiniLM
# embeddings; one output row per pair, written to CSV at the end.
records = []
job_has_domain = "domain" in job_df.columns  # same answer for every pair, so check once

for cv_id, job_id in job_pairs:
    # The id -> index maps give the row position, used both as the DataFrame row
    # label (assumes the default RangeIndex from read_csv) and as the embedding row.
    cv_idx, job_idx = cv_id_to_idx[cv_id], job_id_to_idx[job_id]

    sim = util.cos_sim(cv_embeddings_minilm[cv_idx], job_embeddings_minilm[job_idx]).item()

    # Candidate attributes copied through unchanged for downstream analysis.
    cv_fields = {col: cv_df.loc[cv_idx, col] for col in ("name", "gender", "ethnicity", "role")}
    job_fields = {
        "job_id": job_id,
        "job_role": job_df.loc[job_idx, "role"],
        "job_domain": job_df.loc[job_idx, "domain"] if job_has_domain else None,
        "job_level": job_df.loc[job_idx, "level"],
        "job_text": job_df.loc[job_idx, "text"],
    }
    records.append({"cv_id": cv_id, **cv_fields, **job_fields, "similarity": sim})

out_df = pd.DataFrame(records)
out_path = os.path.join(out_dir, "cv_job_similarity_minilm_base.csv")
out_df.to_csv(out_path, index=False)

print(f"Saved → {out_path}")
out_df.head()

---

## Fairness LoRA 

Fairness-aware LoRA adapter for resume–job matching built on top of **BAAI/bge-large-en-v1.5** \
https://huggingface.co/renhehuang/fair-resume-job-matcher-lora

In [None]:
from transformers import AutoTokenizer, AutoModel
from peft import PeftModel
import torch
import torch.nn.functional as F

# Hugging Face repo ids: the base encoder and the LoRA adapter applied on top of it.
BASE = "BAAI/bge-large-en-v1.5"
ADAPTER = "renhehuang/fair-resume-job-matcher-lora"  # HF repo id

# Tokenizer and backbone both come from the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE)
base_model = AutoModel.from_pretrained(BASE)

# Load LoRA adapter from Hugging Face
model_bge = PeftModel.from_pretrained(base_model, ADAPTER)
# Inference mode (disables dropout); the trailing ';' suppresses the cell's repr output.
model_bge.eval();

In [None]:
def encode_batch(texts, max_length: int = 256):
    """Encode a batch of texts into L2-normalized embeddings (torch.Tensor).

    Args:
        texts: List of strings to embed together as one padded batch.
        max_length: Token limit per text; longer inputs are truncated.

    Returns:
        Tensor of shape (batch, hidden) with unit-length rows, on the same
        device as ``model_bge``.
    """
    enc = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    # Follow the model's device instead of hard-coding "cpu", so the function keeps
    # working if model_bge is moved to an accelerator.
    device = next(model_bge.parameters()).device
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        out = model_bge(**enc)
        hidden = out.last_hidden_state                      # (batch, seq_len, hidden)

        # Mean-pool over real tokens only. A plain hidden.mean(dim=1) also averages
        # the padding positions, which makes a text's embedding depend on how long
        # the other texts in its batch happen to be.
        mask = enc["attention_mask"].unsqueeze(-1).to(hidden.dtype)  # (batch, seq_len, 1)
        token_counts = mask.sum(dim=1).clamp(min=1.0)       # avoid 0-division on all-pad rows
        emb = (hidden * mask).sum(dim=1) / token_counts
        emb = F.normalize(emb, p=2, dim=1)                  # L2 norm

    return emb  # (batch, hidden)

def encode_all(texts, batch_size: int = 16, max_length: int = 256):
    """Embed a sequence of texts chunk by chunk and stack the results.

    ``texts`` must support ``len()`` and slicing. Each chunk of at most
    ``batch_size`` items goes through ``encode_batch``; the per-chunk tensors
    are moved to CPU and concatenated row-wise into one (N, d) tensor.
    """
    chunks = (
        texts[start : start + batch_size]
        for start in range(0, len(texts), batch_size)
    )
    return torch.cat(
        [encode_batch(chunk, max_length=max_length).cpu() for chunk in chunks],
        dim=0,
    )   # (N, d)

In [None]:
# Embed every CV description and every job-posting text with the LoRA-adapted BGE
# model via encode_all (16 texts per forward pass, default max_length of 256 tokens).
# Row i of each result corresponds to row i of cv_df / job_df respectively.
cv_embeddings_bge = encode_all(cv_df["description"].tolist(), batch_size=16)
job_embeddings_bge = encode_all(job_df["text"].tolist(), batch_size=16)

In [None]:
# Similarity for each matched (CV, job posting) pair using the fairness-LoRA BGE
# embeddings; one output row per pair, written to CSV at the end.
records = []
job_has_domain = "domain" in job_df.columns  # same answer for every pair, so check once

for cv_id, job_id in job_pairs:
    # The id -> index maps give the row position, used both as the DataFrame row
    # label (assumes the default RangeIndex from read_csv) and as the embedding row.
    cv_idx, job_idx = cv_id_to_idx[cv_id], job_id_to_idx[job_id]

    # Rows are already L2-normalized, so the dot product is the cosine similarity.
    sim = torch.dot(cv_embeddings_bge[cv_idx], job_embeddings_bge[job_idx]).item()

    # Optional "probability"-style score (as in the adapter's example usage):
    #   prob = torch.sigmoid(torch.tensor(sim)).item()
    # If enabled, also add a "prob" entry to the row below.

    # Candidate attributes copied through unchanged for downstream analysis.
    cv_fields = {col: cv_df.loc[cv_idx, col] for col in ("name", "gender", "ethnicity", "role")}
    job_fields = {
        "job_id": job_id,
        "job_role": job_df.loc[job_idx, "role"],
        "job_domain": job_df.loc[job_idx, "domain"] if job_has_domain else None,
        "job_level": job_df.loc[job_idx, "level"],
        "job_text": job_df.loc[job_idx, "text"],
    }
    records.append({"cv_id": cv_id, **cv_fields, **job_fields, "similarity": sim})

out_df = pd.DataFrame(records)
out_path = os.path.join(out_dir, "cv_job_similarity_fair_lora.csv")
out_df.to_csv(out_path, index=False)

print(f"Saved → {out_path}")
out_df.head()