In [5]:
# Mount Google Drive at /content/drive; the artifact paths configured later in
# this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
! pip install faiss-cpu sentence_transformers tokenizers transformers -q

In [7]:
import os
# This cell runs before the cell that imports `transformers`.
# NOTE(review): presumably intended to disable the Mistral tokenizer regex
# patch/warning — confirm the library actually reads this variable; the
# recorded output of the model-loading cell still shows the regex warning.
os.environ["TRANSFORMERS_NO_MISTRAL_REGEX_PATCH"] = "1"

In [8]:
import json
import os
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

In [9]:
# Folder on the mounted Drive that holds every RAG artifact used below.
OUT_DIR = "/content/drive/MyDrive/rag"

# Input: chunk records (text + metadata) written by the original indexing run.
OLD_METADATA_PATH = os.path.join(OUT_DIR, "metadata.json")

# Outputs: the index rebuilt with the fine-tuned embedder, and the chunk
# records saved alongside it.
NEW_INDEX_PATH = os.path.join(OUT_DIR, "rag_finetuned.index")
NEW_METADATA_PATH = os.path.join(OUT_DIR, "metadata_finetuned.json")

# Derived from OUT_DIR like the other paths, so relocating the artifact folder
# requires editing a single constant.
FINETUNED_MODEL_PATH = os.path.join(OUT_DIR, "bge-m3-rag-finetuned")

# Number of chunks passed to the embedder per encode() call.
BATCH_SIZE = 4

In [10]:
# Read the chunk records saved by the previous indexing run.
with open(OLD_METADATA_PATH, encoding="utf-8") as metadata_file:
    docs = json.load(metadata_file)

# Only the "text" field of each record is embedded; `docs` keeps the full
# records so search hits can be mapped back by position.
texts = [doc["text"] for doc in docs]

print(f"Loaded {len(texts)} chunks")

Loaded 681 chunks


In [11]:
# Load the fine-tuned embedding model from Drive and read the size of the
# vectors it produces; `dim` is used to size the FAISS index.
embedder = SentenceTransformer(FINETUNED_MODEL_PATH)
dim = embedder.get_sentence_embedding_dimension()

The tokenizer you are loading from '/content/drive/MyDrive/rag/bge-m3-rag-finetuned' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


In [12]:
# Flat (exhaustive) inner-product index. Both the chunk embeddings and the
# query embeddings are encoded with normalize_embeddings=True, so the
# inner-product scores it returns are cosine similarities.
index = faiss.IndexFlatIP(dim)

In [13]:
# Start from an empty index so that re-running this cell rebuilds it instead
# of appending a second copy of every vector. Retrieval maps index ids to
# `docs` by position, so duplicated vectors would yield ids past the end of
# `docs`.
index.reset()

for i in range(0, len(texts), BATCH_SIZE):
    batch = texts[i:i + BATCH_SIZE]

    # Normalized embeddings + inner-product index => cosine similarity scores.
    emb = embedder.encode(
        batch,
        normalize_embeddings=True,
        show_progress_bar=False
    )

    # FAISS expects float32 input.
    index.add(np.asarray(emb, dtype="float32"))

    # Report progress every 10 batches.
    if i % (BATCH_SIZE * 10) == 0:
        print(f"Re-embedded {min(i + BATCH_SIZE, len(texts))}/{len(texts)}")

# The index must hold exactly one vector per chunk for id -> doc lookups.
assert index.ntotal == len(texts), (
    f"index holds {index.ntotal} vectors but there are {len(texts)} chunks"
)

Re-embedded 4/681
Re-embedded 44/681
Re-embedded 84/681
Re-embedded 124/681
Re-embedded 164/681
Re-embedded 204/681
Re-embedded 244/681
Re-embedded 284/681
Re-embedded 324/681
Re-embedded 364/681
Re-embedded 404/681
Re-embedded 444/681
Re-embedded 484/681
Re-embedded 524/681
Re-embedded 564/681
Re-embedded 604/681
Re-embedded 644/681
Re-embedded 681/681


In [None]:
# Persist the rebuilt index, then the chunk records it refers to by position.
faiss.write_index(index, NEW_INDEX_PATH)

with open(NEW_METADATA_PATH, "w", encoding="utf-8") as metadata_out:
    # ensure_ascii=False writes non-ASCII text as-is rather than as \u escapes.
    json.dump(docs, metadata_out, indent=2, ensure_ascii=False)

In [15]:
def retrieve(query, embedder, index, all_docs, top_k=5):
    """Return the chunks most similar to ``query`` according to ``index``.

    Args:
        query: Query text; it is encoded as a one-element batch.
        embedder: Object whose ``encode(list_of_str, normalize_embeddings=True)``
            returns one embedding row per input string.
        index: Object whose ``search(vectors, k)`` returns a
            ``(scores, indices)`` pair with one row per query vector, using
            ``-1`` as the id of a missing hit.
        all_docs: Sequence of records with ``"text"`` and ``"metadata"`` keys,
            ordered so that position ``i`` corresponds to index id ``i``.
        top_k: Maximum number of hits to return.

    Returns:
        A list of ``{"score", "text", "metadata"}`` dicts in the order the
        index returned them. The list is empty when ``top_k`` is not positive.
    """
    # FAISS asserts k > 0 inside search(); answer a non-positive request with
    # an empty result instead of failing with an opaque assertion.
    if top_k <= 0:
        return []

    query_emb = embedder.encode(
        [query],
        normalize_embeddings=True
    )

    scores, indices = index.search(
        np.asarray(query_emb, dtype="float32"),
        top_k
    )

    results = []
    # Row 0 holds the hits for the single query vector.
    for score, idx in zip(scores[0], indices[0]):
        # Negative ids are padding for "fewer than top_k hits"; indexing
        # all_docs with -1 would silently return the last record.
        if idx < 0:
            continue
        doc = all_docs[int(idx)]
        results.append({
            "score": float(score),
            "text": doc["text"],
            "metadata": doc["metadata"]
        })

    return results

In [16]:
# Smoke-test the rebuilt index with a sample Vietnamese question.
query = "Cần chứng chỉ tiếng anh gì để được miễn học phần tiếng anh"

results = retrieve(query=query, embedder=embedder, index=index, all_docs=docs, top_k=40)

# Print each hit with its rank (1-based), similarity score and provenance.
for rank, hit in enumerate(results, start=1):
    meta = hit["metadata"]
    print(f"\n--- Result {rank} ---")
    print(f"Score   : {hit['score']:.4f}")
    print(f"Section : {meta['section']}")
    print(f"Type    : {meta['type']}")
    print(f"Source  : {meta['source']}")
    print(hit["text"])


--- Result 1 ---
Score   : 0.9853
Section : ROOT
Type    : table
Source  : QD_ngoai_ngu_tu_K68_CQ_final.pdf
Bảng 3.2 Yêu cầu chuẩn tiếng Anh theo số tín chỉ tích lũy và chuẩn đầu ra

|Số tín chỉ tích lũy| Trình độ tiếng Anh yêu cầu                                                                  |
|-----------------------|------------------------------------------------------------------------------------------------|
|Đến 63TC          |+Đạt tất cả học phần Tiếng Anh cơ sở gồm: FL1131, FL1132, FL1133+ Hoặc đạt tối thiểu Bậc 2.1|

--- Result 2 ---
Score   : 0.9842
Section : ROOT
Type    : table
Source  : 06_%20Quy%20%C4%91%E1%BB%8Bnh%20ngo%E1%BA%A1i%20ng%E1%BB%AF%20t%E1%BB%AB%20K70_ch%C3%ADnh%20quy_final.pdf
| H ạ ngm ụ c            |Yêu cầu trình độ ngoại ngữ tối thiểu                                                                                           |
|--------------------------|--------------------------------------------------------------------------------------------------

In [17]:
# Release the Colab runtime once the notebook is done.
# NOTE(review): in-memory objects (including `index`) are presumably lost when
# the runtime is released, and the save cell's execution count is recorded as
# `None` — confirm the index/metadata were written before running this.
from google.colab import runtime
runtime.unassign()