
# Job–CV Matching (All-in-One) — **SBERT + FAISS** + 5-Fold CV + 8 Methods

**Điểm mới**: SBERT thật (multilingual) + FAISS thật, đủ 3 cải tiến **C6, C7, C8**, classification trước ranking, GroupKFold theo `job_id`, bộ metrics P@K, R@K, MAP, MRR, nDCG@K.  
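Notebook tự động fallback khi thiếu thư viện: dùng TF-IDF cosine nếu không có `sentence-transformers`, và tìm kiếm cosine brute-force nếu không có `faiss`.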


In [1]:

%pip install matplotlib

# ==== Environment & Imports ====
# !pip install faiss-cpu sentence-transformers  # nếu thiếu thư viện

import os, math, random
from typing import List
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# Availability flags for the two optional dependencies. Later cells check these
# flags and switch to TF-IDF / brute-force code paths when a library is missing.
SBERT_OK = True
FAISS_OK = True
try:
    from sentence_transformers import SentenceTransformer
except Exception:
    # Best-effort import: any failure just disables the SBERT code path.
    SBERT_OK = False
    print("[WARN] sentence-transformers unavailable. pip install sentence-transformers")
try:
    import faiss  # type: ignore
except Exception:
    # Best-effort import: any failure just disables the FAISS code path.
    FAISS_OK = False
    print("[WARN] faiss unavailable. pip install faiss-cpu")

# Seed both the stdlib and the NumPy global RNGs (the synthetic generator draws
# from the NumPy one).
random.seed(42); np.random.seed(42)
# Directory searched for jobs.csv / cvs.csv.
DATA_DIR = "./data"
# Destination of the aggregated metrics table.
# NOTE(review): absolute POSIX-style path, unlike the relative DATA_DIR — confirm it exists on the target machine.
RESULTS_CSV_PATH = "/mnt/data/match_results_summary_sbert_faiss.csv"
# Model identifier passed to SentenceTransformer.
SBERT_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"


Collecting matplotlib
  Downloading matplotlib-3.10.5-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.3-cp311-cp311-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.59.1-cp311-cp311-win_amd64.whl.metadata (111 kB)
     ---------------------------------------- 0.0/111.1 kB ? eta -:--:--
     ---------------------------------------- 0.0/111.1 kB ? eta -:--:--
     --- ------------------------------------ 10.2/111.1 kB ? eta -:--:--
     ---------- -------------------------- 30.7/111.1 kB 262.6 kB/s eta 0:00:01
     ------------- ----------------------- 41.0/111.1 kB 281.8 kB/s eta 0:00:01
     -------------------- ---------------- 61.4/111.1 kB 409.6 kB/s eta 0:00:01
     -------------------- ---------------- 61.4/111.1 kB 409.6 kB/s eta 0:00:01
     ----------


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm


[WARN] faiss unavailable. pip install faiss-cpu


In [2]:

# ==== Ontology & Utils ====
# Canonical skill name -> list of surface forms. Insertion order matters:
# the synthetic data generator samples from ``list(SKILL_ONTO.keys())``.
SKILL_ONTO = {
    "python": ["python"],
    "java": ["java"],
    "javascript": ["javascript", "js"],
    "typescript": ["typescript", "ts"],
    "c#": ["c#", "csharp"],
    "cpp": ["c++", "cpp"],
    "php": ["php"],
    "go": ["golang", "go"],
    "html": ["html"],
    "css": ["css"],
    "react": ["react", "react.js", "reactjs"],
    "vue": ["vue", "vue.js", "nuxt"],
    "node.js": ["node", "node.js", "nodejs", "express"],
    "django": ["django"],
    "flask": ["flask"],
    "spring": ["spring"],
    "sql": ["sql"],
    "nosql": ["mongodb", "cassandra", "dynamodb"],
    "aws": ["aws"],
    "gcp": ["gcp"],
    "azure": ["azure"],
    "docker": ["docker"],
    "kubernetes": ["kubernetes", "k8s"],
    "spark": ["spark"],
    "hadoop": ["hadoop"],
    "communication": ["communication", "presentation"],
    "teamwork": ["teamwork", "collaboration"],
    "problem-solving": ["problem solving", "analytical"],
}

# Canonical skills treated as technical (weight 1.0 in the group-weighted score).
TECH_GROUP = {
    "python", "java", "javascript", "typescript", "c#", "cpp", "php", "go",
    "html", "css", "react", "vue", "node.js", "django", "flask", "spring",
    "sql", "nosql", "aws", "gcp", "azure", "docker", "kubernetes",
    "spark", "hadoop",
}

# Canonical skills treated as soft skills (weight 0.3 in the group-weighted score).
SOFT_GROUP = {"communication", "teamwork", "problem-solving"}

def normalize_text(s: str) -> str:
    """Lower-case ``s`` and collapse each run of whitespace into one space.

    The input is coerced with ``str()`` first, so non-string values are accepted.
    """
    lowered = str(s).lower()
    tokens = lowered.strip().split()
    return " ".join(tokens)

def jaccard(a: set, b: set) -> float:
    """Return the Jaccard similarity |a ∩ b| / |a ∪ b| of two sets.

    Two empty sets yield 0.0 rather than a division by zero.
    """
    union_size = len(a | b)
    if union_size == 0:
        return 0.0
    return len(a & b) / union_size

def keyword_overlap(job_terms: set, cv_terms: set) -> float:
    """Return the fraction of ``job_terms`` that also occur in ``cv_terms``.

    Returns 0.0 when either set is empty.
    """
    if job_terms and cv_terms:
        shared = job_terms & cv_terms
        return len(shared) / max(1, len(job_terms))
    return 0.0

def exp_gap_penalty(required_years: int, cand_years: int) -> float:
    """Return the candidate's experience shortfall relative to the requirement.

    The shortfall is clamped at zero (surplus experience is not rewarded) and
    divided by ``required_years`` plus a small epsilon, so a zero requirement
    gives 0.0 instead of a division by zero.
    """
    shortfall = max(0, required_years - cand_years)
    denominator = required_years + 1e-6
    return shortfall / denominator

def group_weighted_skill_score(job_skills: set, cv_skills: set) -> float:
    """Score the skills shared by a job and a CV, weighted by skill group.

    Each shared skill contributes 1.0 if it is in ``TECH_GROUP``, 0.3 if it is
    in ``SOFT_GROUP`` and 0.5 otherwise. The total is divided by the number of
    job skills (plus a small epsilon). No shared skill yields 0.0.
    """
    matched = job_skills & cv_skills
    if not matched:
        return 0.0

    total = 0.0
    for skill in matched:
        if skill in TECH_GROUP:
            weight = 1.0
        elif skill in SOFT_GROUP:
            weight = 0.3
        else:
            weight = 0.5
        total += weight
    return total / (len(job_skills) + 1e-6)


In [3]:

# ==== Data Loader & Synthetic ====
def generate_synthetic(n_jobs=100, n_cvs=500):
    """Generate random job and CV tables for experimenting without real data.

    All draws use the global NumPy RNG (``np.random``), so the output depends
    on the seed set earlier and on the exact order of the draws below.

    Args:
        n_jobs: Number of job rows to create.
        n_cvs: Number of CV rows to create.

    Returns:
        ``(jobs, cvs)`` as two DataFrames. ``jobs`` has columns ``job_id``,
        ``title``, ``description``, ``required_skills``, ``required_exp``,
        ``industry``, ``location``; ``cvs`` has ``cv_id``, ``raw_text``,
        ``skills``, ``experience_years``, ``industry``, ``location``.
    """
    industries = ["fintech","ecommerce","healthcare","education","gaming"]
    locations = ["hanoi","danang","hcmc","remote","bangkok"]
    # Skills are sampled from the canonical names of the ontology.
    all_skills = list(SKILL_ONTO.keys())

    jobs = []
    for jid in range(n_jobs):
        # 3 to 6 distinct required skills, stored in sorted order.
        req_skills = sorted(set(np.random.choice(all_skills, size=np.random.randint(3,7), replace=False)))
        jobs.append({
            "job_id": f"J{jid:04d}",
            "title": f"{np.random.choice(['Backend','Frontend','Fullstack','Data','DevOps'])} Engineer",
            # NOTE(review): the industry word appended here is drawn independently of
            # the "industry" field below, so the two can disagree — confirm this is intended.
            "description": " ".join(req_skills) + " " + np.random.choice(industries),
            "required_skills": req_skills,
            "required_exp": int(np.random.choice([0,1,2,3,4,5,7,10])),
            "industry": np.random.choice(industries),
            "location": np.random.choice(locations),
        })
    jobs = pd.DataFrame(jobs)

    cvs = []
    for cid in range(n_cvs):
        # 4 to 9 distinct skills per CV, stored in sorted order.
        cv_skills = sorted(set(np.random.choice(all_skills, size=np.random.randint(4,10), replace=False)))
        years = int(np.random.choice([0,1,2,3,4,5,7,10]))
        cvs.append({
            "cv_id": f"C{cid:05d}",
            # The CV text is just its skill names followed by a fixed suffix.
            "raw_text": " ".join(cv_skills) + " experienced projects",
            "skills": cv_skills,
            "experience_years": years,
            "industry": np.random.choice(industries),
            "location": np.random.choice(locations),
        })
    cvs = pd.DataFrame(cvs)
    return jobs, cvs

def load_or_generate():
    """Return ``(jobs, cvs)`` from CSV files, or synthetic data as a fallback.

    Reads ``jobs.csv`` and ``cvs.csv`` from ``DATA_DIR`` when both exist;
    otherwise calls ``generate_synthetic()`` with its default sizes.
    """
    csv_paths = [os.path.join(DATA_DIR, fname) for fname in ("jobs.csv", "cvs.csv")]
    if not all(os.path.exists(path) for path in csv_paths):
        print("CSV not found, generating synthetic data...")
        return generate_synthetic()

    jobs, cvs = (pd.read_csv(path) for path in csv_paths)
    print("Loaded real CSVs from", DATA_DIR)
    return jobs, cvs

# Load (or synthesise) the two tables once; every later cell reads these globals.
jobs, cvs = load_or_generate()
# Preview the first two rows of each table in the notebook output.
display(jobs.head(2)); display(cvs.head(2))


CSV not found, generating synthetic data...


Unnamed: 0,job_id,title,description,required_skills,required_exp,industry,location
0,J0000,Fullstack Engineer,html kubernetes node.js problem-solving sql he...,"[html, kubernetes, node.js, problem-solving, sql]",7,ecommerce,remote
1,J0001,Data Engineer,azure cpp django go problem-solving python gaming,"[azure, cpp, django, go, problem-solving, python]",10,healthcare,hanoi


Unnamed: 0,cv_id,raw_text,skills,experience_years,industry,location
0,C00000,hadoop kubernetes php spring experienced projects,"[hadoop, kubernetes, php, spring]",3,education,remote
1,C00001,c# communication django kubernetes php react t...,"[c#, communication, django, kubernetes, php, r...",0,healthcare,hcmc


In [4]:

# ==== Build pairs & Ground Truth ====
def _as_skill_set(value) -> set:
    """Return ``value`` as a set of skills; non-collections are whitespace-split."""
    if isinstance(value, (list, set, tuple, frozenset)):
        return set(value)
    # NOTE(review): CSV-loaded skills arrive as strings; this assumes they are
    # whitespace-separated — confirm against the real file format.
    return set(str(value).split())


def build_pairs(jobs: pd.DataFrame, cvs: pd.DataFrame, max_pairs_per_job=150, random_state=42):
    """Build the job–CV pair table with the skill-overlap ground truth.

    For every job, up to ``max_pairs_per_job`` CVs are sampled without
    replacement and one row per (job, CV) pair is emitted.

    Each job gets its own deterministic seed (``random_state`` plus the job's
    position). A single fixed seed would hand every job the identical CV
    sample and leave the remaining CVs unused.

    Args:
        jobs: Table with ``job_id``, ``title``, ``description``,
            ``required_skills``, ``required_exp``, ``industry``, ``location``.
        cvs: Table with ``cv_id``, ``raw_text``, ``skills``,
            ``experience_years``, ``industry``, ``location``.
        max_pairs_per_job: Upper bound on the CVs sampled per job.
        random_state: Base seed for the per-job sampling; ``None`` disables
            seeding.

    Returns:
        DataFrame with one row per pair. ``label_gt1`` is 1 when the job and
        the CV share at least two skills, else 0. ``job_skills`` and
        ``cv_skills`` are sorted lists so the output is reproducible.
    """
    n_sample = min(max_pairs_per_job, len(cvs))
    rows = []
    for offset, job in enumerate(jobs.to_dict("records")):
        seed = None if random_state is None else random_state + offset
        sampled = cvs.sample(n_sample, random_state=seed)
        jskills = _as_skill_set(job["required_skills"])
        job_skill_list = sorted(jskills)
        for cv in sampled.to_dict("records"):
            cskills = _as_skill_set(cv["skills"])
            rows.append({
                "job_id": job["job_id"], "cv_id": cv["cv_id"],
                "job_title": job["title"], "job_desc": job["description"],
                "job_skills": job_skill_list, "job_reqexp": job["required_exp"],
                "job_industry": job["industry"], "job_location": job["location"],
                "cv_text": cv["raw_text"],
                "cv_skills": sorted(cskills), "cv_exp": cv["experience_years"],
                "cv_industry": cv["industry"], "cv_location": cv["location"],
                "label_gt1": 1 if len(jskills & cskills) >= 2 else 0,
            })
    return pd.DataFrame(rows)

# Global pair table: one row per (job, sampled CV). Later cells keep their
# text, embedding and candidate arrays aligned with the rows of this frame.
pairs = build_pairs(jobs, cvs, max_pairs_per_job=120)
# Show the pair count and a two-row preview.
len(pairs), pairs.head(2)


(12000,
   job_id   cv_id           job_title  \
 0  J0000  C00361  Fullstack Engineer   
 1  J0000  C00073  Fullstack Engineer   
 
                                             job_desc  \
 0  html kubernetes node.js problem-solving sql he...   
 1  html kubernetes node.js problem-solving sql he...   
 
                                           job_skills  job_reqexp job_industry  \
 0  [html, kubernetes, sql, problem-solving, node.js]           7    ecommerce   
 1  [html, kubernetes, sql, problem-solving, node.js]           7    ecommerce   
 
   job_location                                            cv_text  \
 0       remote  c# communication cpp docker flask spring sql v...   
 1       remote  django docker gcp hadoop nosql problem-solving...   
 
                                            cv_skills  cv_exp cv_industry  \
 0  [c#, spring, sql, cpp, docker, vue, communicat...       3     fintech   
 1  [django, gcp, sql, docker, problem-solving, no...       7   ecommerce   
 
 

In [None]:

# ==== TF-IDF & BM25-lite ====
def compute_tfidf_vectors(job_texts, cv_texts):
    """Fit one TF-IDF model on job and CV texts and return both matrices.

    A single vectorizer (unigrams and bigrams, ``min_df=2``) is fitted on the
    combined corpus so job and CV rows share one vocabulary.

    Args:
        job_texts: Iterable of job description strings.
        cv_texts: Iterable of CV text strings.

    Returns:
        ``(vectorizer, job_matrix, cv_matrix)`` where the matrices hold the
        rows for ``job_texts`` and ``cv_texts`` respectively, in input order.
    """
    # Coerce to lists first: ``+`` on pandas Series or NumPy arrays is
    # element-wise, which would fuse texts instead of concatenating the corpora.
    job_texts = list(job_texts)
    cv_texts = list(cv_texts)
    n_jobs = len(job_texts)
    vect = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
    X = vect.fit_transform(job_texts + cv_texts)
    return vect, X[:n_jobs], X[n_jobs:]

class BM25OkapiLite:
    """Minimal in-memory Okapi BM25 scorer over a pre-tokenised corpus.

    Args:
        corpus_tokens: List of documents, each a list of hashable tokens.
        k1: Term-frequency saturation parameter.
        b: Length-normalisation strength (0 disables it, 1 is full).

    Attributes:
        corpus: The token lists as passed in.
        term_freqs: Per-document ``Counter`` of token occurrences.
        doc_freq: Token -> number of documents containing it.
        doc_len: Token count of each document.
        avgdl: Mean document length (0.0 for an empty corpus).
        N: Number of documents.
    """

    def __init__(self, corpus_tokens, k1=1.5, b=0.75):
        # Local import keeps this block self-contained without touching the
        # notebook's import cell.
        from collections import Counter

        self.k1 = k1
        self.b = b
        self.corpus = corpus_tokens
        # Count terms once up front so scoring does not rescan each document
        # for every query term.
        self.term_freqs = [Counter(doc) for doc in corpus_tokens]
        self.doc_len = [len(doc) for doc in corpus_tokens]
        self.avgdl = np.mean(self.doc_len) if self.doc_len else 0.0
        self.doc_freq = {}
        for tf in self.term_freqs:
            for term in tf:
                self.doc_freq[term] = self.doc_freq.get(term, 0) + 1
        self.N = len(corpus_tokens)

    def idf(self, term):
        """Return ``log((N - df + 0.5) / (df + 0.5) + 1)`` for ``term``.

        The ``+ 1`` inside the log keeps the value positive even for terms
        that occur in most documents.
        """
        df = self.doc_freq.get(term, 0)
        return np.log((self.N - df + 0.5) / (df + 0.5) + 1.0)

    def score(self, query_tokens, index):
        """Return the BM25 score of document ``index`` for ``query_tokens``.

        Repeated query tokens contribute once per occurrence.
        """
        tf = self.term_freqs[index]
        dl = self.doc_len[index] or 1  # treat an empty document as length 1
        length_norm = self.k1 * (1 - self.b + self.b * dl / (self.avgdl + 1e-9))
        score = 0.0
        for t in query_tokens:
            f = tf.get(t, 0)
            if f == 0:
                continue
            score += self.idf(t) * (f * (self.k1 + 1)) / (f + length_norm)
        return score

    def get_scores(self, query_tokens):
        """Return an array with the BM25 score of every document."""
        return np.array([self.score(query_tokens, i) for i in range(self.N)])

def tokenize_simple(s: str):
    """Return the whitespace-delimited tokens of ``s`` after ``normalize_text``."""
    cleaned = normalize_text(s)
    return cleaned.split()

# Both text lists are aligned with the rows of `pairs` (one entry per pair),
# so a job description appears once for every CV paired with that job.
job_texts = pairs["job_desc"].tolist()
cv_texts = pairs["cv_text"].tolist()
# J_mat[i] / C_mat[i] are the TF-IDF rows of the job and CV of pair i.
# NOTE(review): fitting on pair-level (duplicated) texts affects document
# frequencies and makes min_df=2 easy to satisfy — confirm this is intended.
tfidf_vect, J_mat, C_mat = compute_tfidf_vectors(job_texts, cv_texts)
# Vocabulary as an array so it can be indexed by column position.
feat_names = np.array(tfidf_vect.get_feature_names_out())

def top_terms(matrix, feature_names, row_i, topk=12):
    """Return the highest-weighted feature names of one matrix row as a set.

    Args:
        matrix: Dense array or sparse matrix of feature weights.
        feature_names: Sequence mapping column index to feature name.
        row_i: Index of the row to inspect.
        topk: Maximum number of features to return.

    Returns:
        Set of at most ``topk`` names; only features with weight > 0 are kept.
    """
    weights = matrix[row_i]
    if hasattr(weights, "toarray"):
        # Sparse rows must be densified before sorting.
        weights = weights.toarray()
    weights = weights.flatten()

    best_columns = np.argsort(-weights)[:topk]
    selected = set()
    for col in best_columns:
        if weights[col] > 0:
            selected.add(feature_names[col])
    return selected

# Top-12 TF-IDF terms for the job side and the CV side of every pair row;
# these feed the keyword-overlap feature.
job_terms_top = [top_terms(J_mat, feat_names, i, 12) for i in range(J_mat.shape[0])]
cv_terms_top  = [top_terms(C_mat, feat_names, i, 12) for i in range(C_mat.shape[0])]

# BM25 index over the tokenised CV texts; document i is the CV of pair row i.
bm25 = BM25OkapiLite([tokenize_simple(t) for t in cv_texts])


: 

In [None]:

# ==== SBERT encode & FAISS build ====
# Loaded SentenceTransformer instances keyed by model name, so repeated calls
# do not reload the model weights.
_SBERT_MODEL_CACHE = {}


def sbert_encode(texts: List[str], model_name: str):
    """Encode ``texts`` into L2-normalised vectors.

    With sentence-transformers available, returns a dense float32 array of
    normalised SBERT embeddings together with the model.

    Without it, returns a row-normalised sparse TF-IDF matrix and ``None``.
    That vectorizer is fitted on ``texts`` alone, so matrices from separate
    calls do not share a vocabulary. Callers tell the two cases apart by
    checking whether the first return value is an ``np.ndarray``.

    Args:
        texts: Strings to encode.
        model_name: SentenceTransformer model identifier.

    Returns:
        ``(embeddings, model)``; ``model`` is ``None`` in the fallback case.
    """
    if not SBERT_OK:
        print("[FALLBACK] Using TF-IDF cosine for semantic features.")
        vect = TfidfVectorizer(ngram_range=(1,2), min_df=2).fit(texts)
        mat = vect.transform(texts).astype("float32")
        from sklearn.preprocessing import normalize
        mat = normalize(mat)
        return mat, None

    model = _SBERT_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SBERT_MODEL_CACHE[model_name] = model
    emb = model.encode(texts, batch_size=64, show_progress_bar=False, normalize_embeddings=True)
    return np.asarray(emb, dtype="float32"), model

# Pair-aligned embeddings: row i holds the job / CV vector of pair row i.
# The returned model object is not needed afterwards and is discarded.
job_embeds, _ = sbert_encode(job_texts, SBERT_MODEL_NAME)
cv_embeds, _  = sbert_encode(cv_texts, SBERT_MODEL_NAME)

def build_faiss_index(vectors: np.ndarray):
    """Build an exact inner-product FAISS index over ``vectors``.

    Args:
        vectors: 2-D array with one vector per row. For the inner product to
            equal cosine similarity the rows must already be L2-normalised.

    Returns:
        A ``faiss.IndexFlatIP`` holding the vectors, or ``None`` when FAISS is
        unavailable or ``vectors`` is not a dense 2-D array. The sparse TF-IDF
        fallback matrix is the case that matters: FAISS cannot ingest it, and
        callers already treat ``None`` as "no index".
    """
    if not FAISS_OK:
        return None
    if not isinstance(vectors, np.ndarray) or vectors.ndim != 2:
        return None
    # FAISS expects C-contiguous float32 input.
    vectors = np.ascontiguousarray(vectors, dtype="float32")
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    return index

# Index over the pair-aligned CV embeddings, so search results are row
# positions of `pairs` rather than unique CV ids.
# NOTE(review): each CV occurs once per pair it belongs to, so one CV can fill
# many of the top-K slots — confirm this is intended.
faiss_index = build_faiss_index(cv_embeds)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:

# ==== Assemble features (SBERT cosine) & labels ====
n_pairs = len(pairs)

# Semantic similarity: a row-wise dot product when dense, pair-aligned
# embeddings are available; otherwise TF-IDF cosine per pair.
if isinstance(job_embeds, np.ndarray) and isinstance(cv_embeds, np.ndarray) and job_embeds.shape[0] == n_pairs:
    sem_cos = np.sum(job_embeds * cv_embeds, axis=1)
else:
    sem_cos = [float(cosine_similarity(J_mat[i], C_mat[i])[0, 0]) for i in range(n_pairs)]

pairs["feat_semantic_cosine"] = sem_cos
pairs["feat_keyword_overlap"] = [
    keyword_overlap(job_terms, cv_terms)
    for job_terms, cv_terms in zip(job_terms_top, cv_terms_top)
]
pairs["feat_skill_jaccard"] = [
    jaccard(set(js), set(cs)) for js, cs in zip(pairs["job_skills"], pairs["cv_skills"])
]
pairs["feat_group_weighted_skill"] = [
    group_weighted_skill_score(set(js), set(cs))
    for js, cs in zip(pairs["job_skills"], pairs["cv_skills"])
]
pairs["feat_exp_gap_penalty"] = [
    exp_gap_penalty(req, got) for req, got in zip(pairs["job_reqexp"], pairs["cv_exp"])
]
pairs["feat_location_match"] = (pairs["job_location"] == pairs["cv_location"]).astype(float)
pairs["feat_industry_match"] = (pairs["job_industry"] == pairs["cv_industry"]).astype(float)
# Score only document i (the CV of pair i). Scoring the whole corpus per pair
# and then picking entry i gives the same number at quadratic cost.
pairs["feat_bm25"] = [
    bm25.score(tokenize_simple(desc), i) for i, desc in enumerate(pairs["job_desc"])
]

# Second ground truth: the pair shares at least one top keyword and its cosine
# lies in the top quarter of its job's pairs.
# NOTE(review): this label is derived from two columns that are also model
# inputs (feat_keyword_overlap, feat_semantic_cosine) — confirm the leakage is acceptable.
pairs["cos_by_job_rank"] = pairs.groupby("job_id")["feat_semantic_cosine"].rank(pct=True)
pairs["label_gt2"] = ((pairs["feat_keyword_overlap"] > 0) & (pairs["cos_by_job_rank"] >= 0.75)).astype(int)
# Final label: positive when either ground truth marks the pair relevant.
pairs["label"] = ((pairs["label_gt2"] == 1) | (pairs["label_gt1"] == 1)).astype(int)
pairs[["feat_semantic_cosine","feat_keyword_overlap","feat_skill_jaccard","feat_bm25","label"]].head(3)


In [None]:

# ==== Scoring functions ====
def score_B1_IWF_Textrank(row):
    """Baseline B1: 0.6 * keyword overlap + 0.4 * semantic cosine of ``row``."""
    keyword_part = 0.6 * row["feat_keyword_overlap"]
    semantic_part = 0.4 * row["feat_semantic_cosine"]
    return keyword_part + semantic_part
def score_B2_Embedding(row):
    """Baseline B2: the semantic cosine feature of ``row``, unchanged."""
    semantic_cosine = row["feat_semantic_cosine"]
    return semantic_cosine
def score_B3_JobVacancy(row, alpha=0.7, beta=0.3):
    """Baseline B3: ``alpha`` * semantic cosine + ``beta`` * keyword overlap."""
    semantic_part = alpha * row["feat_semantic_cosine"]
    keyword_part = beta * row["feat_keyword_overlap"]
    return semantic_part + keyword_part
def score_B4_KSA_SPLS(row):
    """Baseline B4: 0.6 * group-weighted skill score + 0.4 * skill Jaccard."""
    weighted_part = 0.6 * row["feat_group_weighted_skill"]
    jaccard_part = 0.4 * row["feat_skill_jaccard"]
    return weighted_part + jaccard_part
def score_B5_ResumeSummarizer(row):
    """Baseline B5: the semantic cosine feature of ``row`` (same as B2 here)."""
    semantic_cosine = row["feat_semantic_cosine"]
    return semantic_cosine
def score_C6_Hybrid_Improved(row):
    """Method C6: weighted similarity blend minus an experience-gap penalty.

    Blend: 0.5 * semantic cosine + 0.3 * skill Jaccard + 0.2 * keyword overlap.
    Penalty: 0.25 * experience-gap penalty.
    """
    blended = (
        0.5 * row["feat_semantic_cosine"]
        + 0.3 * row["feat_skill_jaccard"]
        + 0.2 * row["feat_keyword_overlap"]
    )
    return blended - 0.25 * row["feat_exp_gap_penalty"]
def score_C7_BM25_SkillWeight(row):
    """Method C7: 0.6 * BM25 score + 0.4 * group-weighted skill score."""
    lexical_part = 0.6 * row["feat_bm25"]
    skill_part = 0.4 * row["feat_group_weighted_skill"]
    return lexical_part + skill_part


In [None]:

# ==== Metrics ====
def precision_at_k(labels_sorted, k):
    """Return the fraction of positives among the first ``k`` ranked labels.

    When fewer than ``k`` labels exist, the denominator is the list length;
    an empty list (or ``k == 0``) yields 0.0.
    """
    cutoff = min(k, len(labels_sorted))
    if cutoff == 0:
        return 0.0
    return sum(labels_sorted[:cutoff]) / cutoff
def recall_at_k(labels_sorted, k):
    """Return the share of all positives found in the first ``k`` ranked labels.

    Returns 0.0 when the list contains no positives.
    """
    total_relevant = sum(labels_sorted)
    if total_relevant == 0:
        return 0.0
    head = labels_sorted[:min(k, len(labels_sorted))]
    return sum(head) / total_relevant
def average_precision(labels_sorted):
    """Return the average precision of a ranked 0/1 label list.

    Precision is taken at every rank that holds a positive and the sum is
    divided by the number of positives; no positives yields 0.0.
    """
    num_pos = sum(labels_sorted)
    if num_pos == 0:
        return 0.0

    running = 0.0
    hits = 0
    for rank, y in enumerate(labels_sorted, start=1):
        if y != 1:
            continue
        hits += 1
        running += hits / rank
    return running / num_pos
def reciprocal_rank(labels_sorted):
    """Return 1 / rank of the first positive label, or 0.0 if there is none."""
    first_hit = next(
        (rank for rank, y in enumerate(labels_sorted, start=1) if y == 1),
        None,
    )
    return 0.0 if first_hit is None else 1.0 / first_hit
def dcg_at_k(labels_sorted, k):
    """Return the discounted cumulative gain of the first ``k`` ranked labels.

    Each label contributes ``(2**label - 1) / log2(rank + 1)`` with ranks
    starting at 1.
    """
    total = 0.0
    for pos, rel in enumerate(labels_sorted[:k]):
        gain = 2**rel - 1
        total += gain / math.log2(pos + 2)  # pos is 0-based, so rank + 1 == pos + 2
    return total
def ndcg_at_k(labels_sorted, k):
    """Return DCG@k divided by the DCG@k of the ideal (descending) ordering.

    Returns 0.0 when the ideal DCG is zero, i.e. no positive gain exists.
    """
    best_dcg = dcg_at_k(sorted(labels_sorted, reverse=True), k)
    if best_dcg == 0:
        return 0.0
    return dcg_at_k(labels_sorted, k) / best_dcg
def evaluate_ranking(group_df, score_col, k_list=(5, 10)):
    """Compute ranking metrics for one group of candidates.

    Args:
        group_df: DataFrame with a ``label`` column and the score column.
        score_col: Name of the column to rank by, highest first.
        k_list: Cut-offs for the @k metrics.

    Returns:
        Dict with ``P@k``, ``R@k`` and ``nDCG@k`` for every ``k`` in
        ``k_list``, plus ``MAP`` and ``MRR``.
    """
    # A stable sort keeps tied scores in their input order, so metrics are
    # reproducible when several candidates share a score.
    ranked = group_df.sort_values(by=score_col, ascending=False, kind="mergesort")
    labels = ranked["label"].astype(int).tolist()
    metrics = {}
    for k in k_list:
        metrics[f"P@{k}"] = precision_at_k(labels, k)
        metrics[f"R@{k}"] = recall_at_k(labels, k)
        metrics[f"nDCG@{k}"] = ndcg_at_k(labels, k)
    metrics["MAP"] = average_precision(labels)
    metrics["MRR"] = reciprocal_rank(labels)
    return metrics


In [None]:

# ==== FAISS retrieval + classifier + reranker ====
# Columns of `pairs` fed to the scaler and the logistic-regression models.
# Kept as a list because it is used directly for DataFrame column selection.
FEATURE_COLS = [
    "feat_semantic_cosine","feat_skill_jaccard","feat_keyword_overlap","feat_exp_gap_penalty",
    "feat_industry_match","feat_location_match","feat_bm25","feat_group_weighted_skill"
]

def retrieve_candidates_alljobs(K=200):
    """Return, for every job-side row, the top-``K`` CV-side row indices.

    Three back-ends are tried in order: the FAISS index, brute-force cosine
    over the dense embeddings, and TF-IDF cosine. The result is a list of
    index lists, one per row of the job-side matrix, ordered by decreasing
    similarity and capped at the number of CV-side rows.
    """
    # 1) FAISS search over the prebuilt index.
    use_faiss = FAISS_OK and isinstance(cv_embeds, np.ndarray) and faiss_index is not None
    if use_faiss:
        top_k = min(K, cv_embeds.shape[0])
        queries = np.asarray(job_embeds, dtype="float32")
        _, neighbours = faiss_index.search(queries, top_k)
        return neighbours.tolist()

    # 2) Brute-force cosine similarity on dense embeddings.
    if isinstance(job_embeds, np.ndarray) and isinstance(cv_embeds, np.ndarray):
        job_norms = np.linalg.norm(job_embeds, axis=1, keepdims=True) + 1e-12
        cv_norms = np.linalg.norm(cv_embeds, axis=1, keepdims=True) + 1e-12
        job_unit = job_embeds / job_norms
        cv_unit = cv_embeds / cv_norms
        ranking = np.argsort(-(job_unit @ cv_unit.T), axis=1)
        return ranking[:, :min(K, cv_unit.shape[0])].tolist()

    # 3) TF-IDF cosine, one job row at a time.
    top_k = min(K, C_mat.shape[0])
    candidates = []
    for i in range(J_mat.shape[0]):
        similarities = cosine_similarity(J_mat[i], C_mat).flatten()
        candidates.append(np.argsort(-similarities)[:top_k].tolist())
    return candidates

def run_fold(train_df, test_df, k_list=(5, 10), K_candidates=200):
    """Train on one fold and evaluate all eight ranking methods on the other.

    Pipeline per test job: keep pairs the classifier accepts, restrict them to
    the retrieved candidate CVs, score them with each method and compute
    ranking metrics. Metrics are then averaged over the jobs that still have
    candidates.

    Args:
        train_df: Training pairs with ``FEATURE_COLS`` and ``label``.
        test_df: Test pairs with ``FEATURE_COLS``, ``label``, ``job_id`` and
            ``cv_id``. Its index is not used.
        k_list: Cut-offs for P@k, R@k and nDCG@k.
        K_candidates: Number of candidates retrieved per job.

    Returns:
        DataFrame with one row per method and columns ``method``, the @k
        metrics for every ``k`` in ``k_list``, ``MAP`` and ``MRR``. A method
        with no evaluable job gets 0.0 everywhere.

    Notes:
        Jobs left without candidates after filtering are excluded from the
        averages. Recall is measured relative to the filtered candidate set.
    """
    k_list = list(k_list)

    X_tr = train_df[FEATURE_COLS].values
    y_tr = train_df["label"].values
    scaler = MinMaxScaler().fit(X_tr)
    clf = LogisticRegression(max_iter=1000).fit(scaler.transform(X_tr), y_tr)

    # Candidate lists are aligned with the rows of the global `pairs` table:
    # entry p holds `pairs` row positions whose CVs are nearest to the job of
    # row p.
    cand_lists = retrieve_candidates_alljobs(K=K_candidates)
    all_cv_ids = pairs["cv_id"].to_numpy()
    # Every row of a job carries the same description, hence the same query,
    # so one row per job is enough to look up that job's candidates. Looking
    # up by job_id also keeps this independent of how the caller indexed
    # `test_df`.
    first_row_of_job = {}
    for pos, jid in enumerate(pairs["job_id"].to_numpy()):
        first_row_of_job.setdefault(jid, pos)

    test_df = test_df.copy()
    X_te = scaler.transform(test_df[FEATURE_COLS].values)
    test_df["clf_proba"] = clf.predict_proba(X_te)[:, 1]
    thr = 0.5  # tune on val if needed

    methods = {
        "B1_IWF_TextRank": score_B1_IWF_Textrank,
        "B2_Embedding": score_B2_Embedding,
        "B3_JobVacancy": score_B3_JobVacancy,
        "B4_KSA_SPLS": score_B4_KSA_SPLS,
        "B5_ResumeSummarizer": score_B5_ResumeSummarizer,
        "C6_Hybrid_Improved": score_C6_Hybrid_Improved,
        "C7_BM25_SkillWeight": score_C7_BM25_SkillWeight,
    }
    method_names = list(methods) + ["C8_ML_Reranker"]

    # The classifier filter and the candidate restriction do not depend on the
    # scoring method, so they are applied once per job.
    job_groups = []
    for job_id, g in test_df.groupby("job_id"):
        g = g[g["clf_proba"] >= thr]
        if g.empty:
            continue
        row_pos = first_row_of_job.get(job_id)
        if row_pos is None:
            continue
        # Negative ids mean "no neighbour" in FAISS results; drop them.
        cand_rows = [idx for idx in cand_lists[row_pos] if idx >= 0]
        cand_cv_ids = set(all_cv_ids[cand_rows])
        g = g[g["cv_id"].isin(cand_cv_ids)].copy()
        if g.empty:
            continue
        job_groups.append(g)

    results = []
    for name in method_names:
        per_job = []
        for g in job_groups:
            if name == "C8_ML_Reranker":
                # The reranker is a logistic regression with the same
                # settings, scaler and training data as `clf`, so its
                # probabilities are the classifier's.
                scored = g.assign(**{name: g["clf_proba"]})
            else:
                scored = g.assign(**{name: g.apply(methods[name], axis=1)})
            per_job.append(evaluate_ranking(scored, score_col=name, k_list=k_list))

        row = {"method": name}
        for k in k_list:
            for prefix in ("P", "R", "nDCG"):
                key = f"{prefix}@{k}"
                row[key] = float(np.mean([m[key] for m in per_job])) if per_job else 0.0
        for key in ("MAP", "MRR"):
            row[key] = float(np.mean([m[key] for m in per_job])) if per_job else 0.0
        results.append(row)

    return pd.DataFrame(results)


In [None]:

# ==== 5-fold GroupKFold ====
N_SPLITS = 5
gkf = GroupKFold(n_splits=N_SPLITS)
fold_summaries = []
for fold_idx, (tr_idx, te_idx) in enumerate(gkf.split(pairs, groups=pairs["job_id"]), start=1):
    print(f"Fold {fold_idx}/{N_SPLITS}")
    # Keep the original row labels: `pairs` has a default 0..n-1 index, so the
    # labels are the global row positions that the embeddings and candidate
    # lists are aligned with. Resetting the index would break that alignment.
    tr = pairs.iloc[tr_idx]
    te = pairs.iloc[te_idx]
    fold_res = run_fold(tr, te, k_list=[5, 10], K_candidates=200)
    fold_res["fold"] = fold_idx
    fold_summaries.append(fold_res)

# One row per (method, fold).
cv_results = pd.concat(fold_summaries, ignore_index=True)
cv_results.head(10)


In [None]:

# ==== Aggregate & Save ====
# Average every metric over the folds, one row per method.
agg = cv_results.groupby("method").agg({
    "P@5":"mean","R@5":"mean","nDCG@5":"mean",
    "P@10":"mean","R@10":"mean","nDCG@10":"mean",
    "MAP":"mean","MRR":"mean"
}).reset_index()

# Best method first, ties broken by MRR, nDCG@10, then P@5.
agg = agg.sort_values(by=["MAP","MRR","nDCG@10","P@5"], ascending=False).reset_index(drop=True)
display(agg)

# to_csv does not create missing directories, so make sure the target exists.
results_dir = os.path.dirname(RESULTS_CSV_PATH)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)
agg.to_csv(RESULTS_CSV_PATH, index=False)
print("Saved summary to:", RESULTS_CSV_PATH)


In [None]:

# ==== Plots ====
# One bar chart per summary metric, with methods in the order of `agg`.
for metric in ("MAP", "MRR"):
    plt.figure()
    plt.bar(agg["method"], agg[metric])
    plt.xticks(rotation=45, ha="right")
    plt.title(f"{metric} by Method")
    plt.tight_layout()
    plt.show()



## Notes
- Thay `SBERT_MODEL_NAME` để dùng các model mạnh hơn (VD: `BAAI/bge-m3`).  
- FAISS dùng `IndexFlatIP` (cosine khi embedding đã normalize). Với dữ liệu lớn, chuyển IVF/PQ.  
- **C6, C7, C8** đã có đủ. **C8** là ML reranker (LogReg) học từ toàn bộ feature.  
- Có thể tách theo thời gian thay vì GroupKFold nếu dữ liệu có timestamp.
