In [None]:
 from google.colab import drive
# Mount Google Drive at /content/drive; the document folder (DOC_FILEPATH) and
# the output folder (OUT_DIR) configured further down both live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the third-party packages that the import cell below pulls in
# (faiss, pdfplumber, docling).
! pip install faiss-cpu pdfplumber docling -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m92.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m136.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.9/276.9 kB[0m [31m26.3 MB/s[0m eta 

In [None]:
import json
import os
import unicodedata
from typing import List, Dict

import faiss
import numpy as np
import pdfplumber
from docling.document_converter import DocumentConverter
from docling_core.types.doc import DocItemLabel
from docx import Document
from sentence_transformers import SentenceTransformer

In [None]:
# Number of texts handed to the embedder per encode() call in build_index.
BATCH_SIZE = 4
# Upper bound, in characters, for the joined text of one section chunk (see iter_blocks).
MAX_CHUNK_SIZE = 1000
# Folder that is scanned for the .pdf / .docx files to index.
DOC_FILEPATH = '/content/drive/MyDrive/hust_docs'
# Folder that receives rag.index and metadata.json.
OUT_DIR = '/content/drive/MyDrive/rag'
# Model name passed to SentenceTransformer for both indexing and querying.
EMBED_MODEL = 'BAAI/bge-m3'

In [None]:
def normalize_token(w: str) -> str:
    """Reduce a raw token to the key used for dictionary lookups.

    The token is lower-cased, composed to Unicode NFC, and stripped of every
    character that is not alphanumeric.

    The NFC step matters for Vietnamese text: in decomposed (NFD) input the
    vowel and tone marks are separate combining characters, which
    ``str.isalnum`` rejects. Without composing first, a decomposed "kế" would
    collapse to "ke" and stop matching the same word coming from a source that
    emits precomposed characters.

    Args:
        w: Raw token, possibly carrying punctuation or upper-case letters.

    Returns:
        The normalized key, or an empty string when the token contains no
        alphanumeric character.
    """
    composed = unicodedata.normalize("NFC", w.lower())
    return ''.join(c for c in composed if c.isalnum())


def pdf_to_dict(pdf_path: str) -> set:
    """Collect the short normalized words found in a PDF's extracted text.

    Each page is read through pdfplumber, split on whitespace, and every token
    is passed through ``normalize_token``. Only results that are 2 to 8
    characters long are kept.

    Args:
        pdf_path: Path of the PDF file to scan.

    Returns:
        Set of normalized words, each 2-8 characters long.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Fall back to an empty string when extract_text() returns a falsy value.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    vocabulary = set()
    for page_text in page_texts:
        normalized = (normalize_token(raw) for raw in page_text.split())
        vocabulary.update(tok for tok in normalized if 2 <= len(tok) <= 8)
    return vocabulary


def docx_to_dict(docx_path: str) -> set:
    """Collect the short normalized words found in a DOCX file's paragraphs.

    Only the paragraphs exposed by ``Document(...).paragraphs`` are scanned.
    Each paragraph is split on whitespace, every token goes through
    ``normalize_token``, and only results of 2 to 8 characters are kept.

    Args:
        docx_path: Path of the DOCX file to scan.

    Returns:
        Set of normalized words, each 2-8 characters long.
    """
    vocabulary = set()
    for paragraph in Document(docx_path).paragraphs:
        normalized = map(normalize_token, (paragraph.text or "").split())
        vocabulary |= {tok for tok in normalized if 2 <= len(tok) <= 8}
    return vocabulary


def repair_raw(line: str, dictionary: set) -> str:
    """Re-join words of a single line that were split by stray spaces.

    The line is split on single spaces. At each position the function tries to
    glue the next 4, then 3, then 2 tokens together; the first combination
    whose normalized form (``normalize_token``) is in ``dictionary`` replaces
    those tokens with their concatenation. Tokens that take part in no merge
    are kept unchanged, so a line without merges is returned exactly as given.

    A window is only eligible when every token in it contains at least one
    alphanumeric character. Without that guard, empty tokens (from repeated
    spaces) and punctuation-only tokens such as the ``|`` of a markdown table
    were glued onto a neighbouring word, because the merged key was simply
    that word itself -- which removed padding and produced cells like
    ``|word|``.

    Args:
        line: One line of text without its trailing newline.
        dictionary: Set of normalized words considered valid merge results.

    Returns:
        The line with the accepted merges applied, tokens joined by one space.
    """
    tokens = line.split(' ')
    # Normalized once per token; an empty key means "no alphanumeric content".
    keys = [normalize_token(tok) for tok in tokens]
    total = len(tokens)
    repaired = []
    i = 0

    while i < total:
        # Longest window first, matching the original 4 -> 3 -> 2 priority.
        for width in (4, 3, 2):
            end = i + width
            if end > total:
                continue
            # Every piece must be a genuine word fragment.
            if not all(keys[i:end]):
                continue
            merged = ''.join(tokens[i:end])
            # Normalize the concatenation itself so the key is built exactly
            # the way dictionary entries are.
            if normalize_token(merged) in dictionary:
                repaired.append(merged)
                i = end
                break
        else:
            # No window matched: keep the token as it is.
            repaired.append(tokens[i])
            i += 1

    return " ".join(repaired)


def repair(text: str, dictionary: set) -> str:
    """Run ``repair_raw`` over every line of ``text`` while keeping line breaks.

    Args:
        text: Possibly multi-line text.
        dictionary: Set of normalized words forwarded to ``repair_raw``.

    Returns:
        The text with each line repaired; a line that ended in "\\n" still
        ends in "\\n".
    """
    def fix_line(raw_line: str) -> str:
        # Take the trailing "\n" off before repairing and put it back afterwards.
        ending = "\n" if raw_line.endswith("\n") else ""
        return repair_raw(raw_line.rstrip("\n"), dictionary) + ending

    return "".join(fix_line(raw_line) for raw_line in text.splitlines(keepends=True))

def iter_blocks(document, max_chunk_size: int) -> List[Dict]:
    """Split a converted document into section chunks and table blocks.

    Items are consumed in the order ``document.iterate_items()`` yields them:

    * An accepted title or section header closes the running chunk and starts
      a new one whose first entry is the header text; that text also becomes
      the ``section`` label of the following blocks (initially ``"ROOT"``).
    * Paragraph, text and list-item texts are accumulated and joined with
      blank lines. The running chunk is closed before a text would push its
      joined length past ``max_chunk_size``. A single text longer than the
      limit is still emitted whole, as its own chunk.
    * A table closes the running chunk and is emitted as a separate block
      holding its markdown export.

    Args:
        document: Object exposing ``iterate_items()``; items are expected to
            carry ``label`` and ``text`` (tables: ``export_to_markdown``).
        max_chunk_size: Maximum length in characters of a section chunk's
            joined content.

    Returns:
        List of dicts with the keys ``type`` ("section" or "table"),
        ``section`` and ``content``.
    """
    blocks = []

    current_section = "ROOT"
    current_content = []
    current_length = 0

    def flush():
        """Emit the running chunk, if any, and reset the accumulator."""
        nonlocal current_content, current_length
        if current_content:
            blocks.append({
                "type": "section",
                "section": current_section,
                "content": "\n\n".join(current_content)
            })
            current_content = []
            current_length = 0

    for item, level in document.iterate_items():
        label = item.label

        if label in (DocItemLabel.TITLE, DocItemLabel.SECTION_HEADER):
            text = item.text.strip()
            # NOTE(review): a header only starts a new section when the value
            # yielded next to it is <= 0; any other header is skipped outright
            # and its text appears in no block. Docling describes that value as
            # the item's nesting level -- confirm which level top-level headings
            # actually receive, otherwise every block stays in "ROOT".
            if text and level <= 0:
                flush()
                current_section = text
                current_content = [text]
                current_length = len(text)
            continue

        if label in (DocItemLabel.PARAGRAPH, DocItemLabel.TEXT, DocItemLabel.LIST_ITEM):
            text = item.text.strip()
            if not text:
                continue

            # The "\n\n" joiner only costs characters when something precedes it.
            separator_len = 2 if current_content else 0

            if current_length + separator_len + len(text) > max_chunk_size:
                flush()
                # The text now opens a fresh chunk, so no separator is charged.
                # (Previously the 2 characters were still added, which made
                # chunk lengths drift upwards after every overflow.)
                separator_len = 0

            current_content.append(text)
            current_length += separator_len + len(text)
            continue

        if label == DocItemLabel.TABLE:
            table_md = item.export_to_markdown(document)
            # Close the running text first so the block order follows the document.
            flush()
            blocks.append({
                "type": "table",
                "section": current_section,
                "content": table_md
            })
            continue

    flush()
    return blocks

_SHARED_CONVERTER = None


def _get_converter():
    """Return the shared DocumentConverter, creating it on first use."""
    global _SHARED_CONVERTER
    if _SHARED_CONVERTER is None:
        _SHARED_CONVERTER = DocumentConverter()
    return _SHARED_CONVERTER


def convert_file(file_path, max_chunk_size = MAX_CHUNK_SIZE) -> List[Dict]:
    """Convert one PDF or DOCX file into a list of indexable chunks.

    The file is converted with docling, split into blocks by ``iter_blocks``,
    and every block's text and section title are passed through ``repair``
    using a word dictionary built from the same file (``pdf_to_dict`` or
    ``docx_to_dict``).

    The extension is checked before any conversion work starts, so unsupported
    files are rejected without paying for a docling run. One converter is
    shared across calls instead of being rebuilt for every file.

    Args:
        file_path: Path of the file to convert.
        max_chunk_size: Maximum length in characters of a section chunk.

    Returns:
        List of ``{"text": ..., "metadata": {"source", "section", "type"}}``
        dicts. An empty list is returned for unsupported extensions and when
        processing fails; failures are reported on stdout, not raised.
    """
    try:
        ext = os.path.splitext(file_path)[1].lower()
        dictionary_builders = {".pdf": pdf_to_dict, ".docx": docx_to_dict}
        build_dictionary = dictionary_builders.get(ext)
        if build_dictionary is None:
            return []

        print(f"Converting: {file_path}")
        result = _get_converter().convert(file_path)

        dictionary = build_dictionary(file_path)
        source_name = os.path.basename(file_path)

        return [
            {
                "text": repair(block["content"], dictionary),
                "metadata": {
                    "source": source_name,
                    "section": repair(block["section"], dictionary),
                    "type": block["type"]
                }
            }
            for block in iter_blocks(result.document, max_chunk_size)
        ]

    except Exception as e:
        # Deliberate best effort: one unreadable file must not abort the whole
        # indexing run, so the failure is reported and the file contributes nothing.
        print(f"[ERROR] Failed processing: {file_path}")
        print(str(e))
        return []


def build_index(all_docs: List[Dict], embedder, batch_size=None) -> faiss.Index:
    """Embed every chunk and store the vectors in a flat inner-product index.

    Texts are encoded in batches with ``normalize_embeddings=True`` and added
    in order, so the vector at position ``i`` belongs to ``all_docs[i]``.
    Progress is printed every ten batches and always for the final batch.

    Args:
        all_docs: Chunks as produced by ``convert_file``; only ``"text"`` is read.
        embedder: Object providing ``get_sentence_embedding_dimension()`` and
            ``encode(...)``.
        batch_size: Number of texts per ``encode`` call. ``None`` (the default)
            uses the module-level ``BATCH_SIZE``.

    Returns:
        A ``faiss.IndexFlatIP`` holding one vector per entry of ``all_docs``.
    """
    if batch_size is None:
        # Looked up at call time so a changed BATCH_SIZE takes effect without
        # re-running this definition.
        batch_size = BATCH_SIZE

    dim = embedder.get_sentence_embedding_dimension()
    index = faiss.IndexFlatIP(dim)

    texts = [d["text"] for d in all_docs]
    total = len(texts)
    report_every = batch_size * 10

    for start in range(0, total, batch_size):
        batch = texts[start:start + batch_size]
        emb = embedder.encode(
            batch,
            normalize_embeddings=True,
            show_progress_bar=False
        )
        index.add(np.asarray(emb, dtype="float32"))

        done = min(start + batch_size, total)
        # Also report the last batch; it used to be printed only when its
        # start offset happened to fall on a reporting boundary.
        if start % report_every == 0 or done == total:
            print(f"Indexed {done}/{total}")

    return index

In [None]:
# Convert every PDF/DOCX in the corpus folder, in sorted order so that the
# chunk order (and therefore the index row order) is stable between runs.
docs: List[Dict] = []

source_names = [
    name
    for name in sorted(os.listdir(DOC_FILEPATH))
    if name.lower().endswith((".pdf", ".docx"))
]
for name in source_names:
    docs += convert_file(os.path.join(DOC_FILEPATH, name), MAX_CHUNK_SIZE)

print(f"Total blocks: {len(docs)}")

# Embed all chunks and build the vector index.
embedder = SentenceTransformer(EMBED_MODEL)
index = build_index(docs, embedder)

# Persist the index together with the chunk list it was built from;
# row i of the index corresponds to entry i of metadata.json.
os.makedirs(OUT_DIR, exist_ok=True)
index_path = os.path.join(OUT_DIR, "rag.index")
metadata_path = os.path.join(OUT_DIR, "metadata.json")

faiss.write_index(index, index_path)
with open(metadata_path, "w", encoding="utf-8") as metadata_file:
    json.dump(docs, metadata_file, ensure_ascii=False, indent=2)

print("Index and metadata saved")

Converting: /content/drive/MyDrive/hust_docs/01_1%202015%20TT%20Lien%20tich_QD%20danh%20gia%20QP-AN.pdf


[32m[INFO] 2026-01-03 08:20:15,093 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:20:15,108 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:20:15,117 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:20:16,509 [RapidOCR] download_file.py:82: Download size: 13.83MB[0m
[32m[INFO] 2026-01-03 08:20:16,980 [RapidOCR] download_file.py:95: Successfully saved to: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:20:16,984 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:20:18,007 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:20:18,015 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0

Converting: /content/drive/MyDrive/hust_docs/01_3%20HD%20hoc%20chuyen%20tiep%20ky%20su%20180%20TC_Final.pdf


[32m[INFO] 2026-01-03 08:20:35,589 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:20:35,590 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:20:35,631 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:20:35,632 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:20:35,866 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:20:35,867 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:20:35,871 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:20:35,872 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/02_%20Th%E1%BB%A7%20th%E1%BB%A5c_Bi%E1%BB%83u%20m%E1%BA%ABu%20(k%C3%A8m%20Quy%20ch%E1%BA%BF%20%C4%90T).pdf


[32m[INFO] 2026-01-03 08:20:39,153 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:20:39,155 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:20:39,199 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:20:39,200 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:20:39,430 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:20:39,431 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:20:39,436 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:20:39,436 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/02_gioi%20thieu%20NoteBookLM.pdf


[32m[INFO] 2026-01-03 08:20:46,996 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:20:46,997 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:20:47,046 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:20:47,047 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:20:47,270 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:20:47,271 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:20:47,275 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:20:47,276 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/03_1%20HUST's%20ACADEMIC%20REGULATIONS-2025_Final.pdf


[32m[INFO] 2026-01-03 08:20:58,325 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:20:58,326 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:20:58,368 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:20:58,369 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:20:58,599 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:20:58,600 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:20:58,605 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:20:58,606 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/06_%20Quy%20%C4%91%E1%BB%8Bnh%20ngo%E1%BA%A1i%20ng%E1%BB%AF%20t%E1%BB%AB%20K70_ch%C3%ADnh%20quy_final.pdf


[32m[INFO] 2026-01-03 08:21:23,486 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:21:23,488 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:21:23,565 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:21:23,566 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:21:23,937 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:21:23,938 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:21:23,943 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:21:23,944 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/11-Mẫu CNHB KKHT 2025.docx




Converting: /content/drive/MyDrive/hust_docs/2-Giay chung nhan sinh vien 2025.docx




Converting: /content/drive/MyDrive/hust_docs/23_%20%C4%90%C6%A1n%20xin%20c%E1%BA%A5p%20b%E1%BA%A3n%20sao%20v%C4%83n%20b%E1%BA%B1ng.docx




Converting: /content/drive/MyDrive/hust_docs/23_%20Don%20xin%20cap%20ban%20sao%20bang%20TN.docx




Converting: /content/drive/MyDrive/hust_docs/3-Giay gioi thieu sinh vien 2025.docx




Converting: /content/drive/MyDrive/hust_docs/Diem%20moi%20cua%20QCDT%202025.pdf


[32m[INFO] 2026-01-03 08:21:59,000 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:21:59,001 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:21:59,043 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:21:59,044 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:21:59,271 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:21:59,272 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:21:59,277 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:21:59,277 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/ERR1zxfXQIFOgaAM4mULdi8B_M3a5qSmf1g04v1L5Zyabg.docx


ERROR:docling.datamodel.document:Input document ERR1zxfXQIFOgaAM4mULdi8B_M3a5qSmf1g04v1L5Zyabg.docx with format None does not match any allowed format: (dict_keys([<InputFormat.DOCX: 'docx'>, <InputFormat.PPTX: 'pptx'>, <InputFormat.HTML: 'html'>, <InputFormat.IMAGE: 'image'>, <InputFormat.PDF: 'pdf'>, <InputFormat.ASCIIDOC: 'asciidoc'>, <InputFormat.MD: 'md'>, <InputFormat.CSV: 'csv'>, <InputFormat.XLSX: 'xlsx'>, <InputFormat.XML_USPTO: 'xml_uspto'>, <InputFormat.XML_JATS: 'xml_jats'>, <InputFormat.METS_GBS: 'mets_gbs'>, <InputFormat.JSON_DOCLING: 'json_docling'>, <InputFormat.AUDIO: 'audio'>, <InputFormat.VTT: 'vtt'>]))


[ERROR] Failed processing: /content/drive/MyDrive/hust_docs/ERR1zxfXQIFOgaAM4mULdi8B_M3a5qSmf1g04v1L5Zyabg.docx
File format not allowed: ERR1zxfXQIFOgaAM4mULdi8B_M3a5qSmf1g04v1L5Zyabg.docx
Converting: /content/drive/MyDrive/hust_docs/ERczoBK8xr9PutLrx9PkEAABP7iFQRhguBqzkg1jQcFJWQ.docx




Converting: /content/drive/MyDrive/hust_docs/HD%20chuyen%20truong%20SV%20hoc%20nuoc%20ngoai%20ve%20VN.pdf


[32m[INFO] 2026-01-03 08:22:09,141 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:22:09,142 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:22:09,186 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:09,187 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:09,414 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:22:09,415 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:22:09,420 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:09,420 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/KH%20nam%20hoc%202025-2026_CN_KS.pdf


[32m[INFO] 2026-01-03 08:22:13,160 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:22:13,161 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:22:13,202 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:13,203 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:13,436 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:22:13,437 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:22:13,442 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:13,443 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/Khung%20KH%202024-2025_CN_KS.pdf


[32m[INFO] 2026-01-03 08:22:17,604 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:22:17,605 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:22:17,680 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:17,681 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:18,068 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:22:18,069 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:22:18,075 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:18,077 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/Khung%20Ke%20hoach%20thoi%20gian%20nam%20hco%202023-2024.pdf


[32m[INFO] 2026-01-03 08:22:21,239 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:22:21,240 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:22:21,281 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:21,282 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:21,513 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:22:21,514 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:22:21,518 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:21,520 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/Khung%20ke%CC%82%CC%81%20hoa%CC%A3ch%20th%E1%BB%9Di%20gian%20n%C4%83m%20h%E1%BB%8Dc%202022-2023%20-%2011092022.pdf


[32m[INFO] 2026-01-03 08:22:30,052 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:22:30,053 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:22:30,130 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:30,130 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:30,481 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:22:30,483 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:22:30,490 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:30,491 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/Khung-DGRL-2020-2021.pdf


[32m[INFO] 2026-01-03 08:22:37,758 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:22:37,759 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:22:37,803 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:37,804 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:38,030 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:22:38,031 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:22:38,036 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:38,037 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/M%E1%BA%ABu%20%C4%90ATN_2019_version%201_1.docx




Converting: /content/drive/MyDrive/hust_docs/M%E1%BA%ABu%20%C4%91%C6%A1n.docx




Converting: /content/drive/MyDrive/hust_docs/M%E1%BA%ABu%20Gi%E1%BA%A5y%20x%C3%A1c%20nh%E1%BA%ADn.docx




Converting: /content/drive/MyDrive/hust_docs/Mẫu Giấy xác nhận.docx




Converting: /content/drive/MyDrive/hust_docs/Mẫu đơn đk HB TĐN.docx




Converting: /content/drive/MyDrive/hust_docs/Mẫu đơn.docx




Converting: /content/drive/MyDrive/hust_docs/QCDT-2023-upload.pdf


[32m[INFO] 2026-01-03 08:22:52,483 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:22:52,484 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:22:52,526 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:52,527 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:52,750 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:22:52,751 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:22:52,756 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:22:52,756 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/QCDT_2025_5445_QD-DHBK.pdf


[32m[INFO] 2026-01-03 08:23:17,747 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:23:17,748 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:23:17,794 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:23:17,794 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:23:18,713 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:23:18,714 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:23:18,718 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:23:18,719 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/QD%20HOC%20PHI%20-%202025-2026-final.pdf


[32m[INFO] 2026-01-03 08:23:43,336 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:23:43,337 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:23:43,378 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:23:43,379 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:23:43,615 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:23:43,616 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:23:43,622 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:23:43,624 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/QD%20NN%20DHCQ-2020-2021-1501.pdf


[32m[INFO] 2026-01-03 08:23:53,173 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:23:53,174 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:23:53,217 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:23:53,218 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:23:53,451 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:23:53,452 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:23:53,457 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:23:53,458 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/QD%20ban%20hanh%20QD%20chuyen%20doi%20hoc%20phan%20tuong%20duong.pdf


[32m[INFO] 2026-01-03 08:24:50,273 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:24:50,274 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:24:50,317 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:24:50,318 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:24:50,564 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:24:50,565 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:24:50,569 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:24:50,570 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/QD%20ban%20hanh%20QD%20to%20chuc%20day%20hoc%20tren%20nen%20tang%20CN%20ket%20noi%20-%20truc%20tuyen.pdf


[32m[INFO] 2026-01-03 08:25:05,602 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:25:05,603 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:25:05,654 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:25:05,655 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:25:05,906 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:25:05,908 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:25:05,913 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:25:05,914 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/QD%20ban%20hanh%20QD%20to%20chuc%20thi%20Truc%20tuyen.pdf


[32m[INFO] 2026-01-03 08:25:21,832 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:25:21,833 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:25:21,879 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:25:21,880 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:25:22,121 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:25:22,123 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:25:22,129 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:25:22,130 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Converting: /content/drive/MyDrive/hust_docs/QD_ngoai_ngu_tu_K68_CQ_final.pdf


[32m[INFO] 2026-01-03 08:25:40,131 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:25:40,134 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:25:40,207 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:25:40,209 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-03 08:25:40,600 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-03 08:25:40,601 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-03 08:25:40,605 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-03 08:25:40,606 [RapidOCR] main.py:50: Using /usr/local/lib/python3.12/dist-packages

Total blocks: 681
Indexed 4/681
Indexed 44/681
Indexed 84/681
Indexed 124/681
Indexed 164/681
Indexed 204/681
Indexed 244/681
Indexed 284/681
Indexed 324/681
Indexed 364/681
Indexed 404/681
Indexed 444/681
Indexed 484/681
Indexed 524/681
Indexed 564/681
Indexed 604/681
Indexed 644/681
Indexed 681/681
Index and metadata saved


In [None]:
def retrieve(query, embedder, index, all_docs, top_k=5):
    """Return the chunks whose embeddings score highest against ``query``.

    Args:
        query: Query text.
        embedder: Object whose ``encode`` turns a list of texts into vectors.
        index: Object whose ``search(vectors, k)`` returns ``(scores, indices)``.
        all_docs: Chunk list aligned with the index rows; each entry provides
            ``"text"`` and ``"metadata"``.
        top_k: Number of neighbours requested from the index.

    Returns:
        List of ``{"score", "text", "metadata"}`` dicts in the order the index
        returned them. Positions whose index is -1 are left out.
    """
    query_vec = np.array(
        embedder.encode([query], normalize_embeddings=True),
        dtype="float32",
    )
    scores, indices = index.search(query_vec, top_k)

    # Only one query was submitted, so row 0 holds all of its hits.
    return [
        {
            "score": float(score),
            "text": all_docs[doc_id]["text"],
            "metadata": all_docs[doc_id]["metadata"],
        }
        for score, doc_id in zip(scores[0], indices[0])
        if doc_id != -1
    ]

In [None]:
from google.colab import drive
# Mount Drive again for the query section; as the recorded output shows, the
# call only reports "already mounted" when the earlier mount is still active.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# faiss is the only extra package the index-loading cell below imports.
! pip install faiss-cpu -q

In [None]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Reload the artifacts written by the indexing step.
# OUT_DIR and EMBED_MODEL come from the configuration cell near the top.
index_path = f"{OUT_DIR}/rag.index"
metadata_path = f"{OUT_DIR}/metadata.json"

index = faiss.read_index(index_path)

with open(metadata_path, "r", encoding="utf-8") as metadata_file:
    docs = json.load(metadata_file)

print("Index size :", index.ntotal)
print("Docs count :", len(docs))

# retrieve() looks chunks up by index position, so both must have the same length.
assert index.ntotal == len(docs)

# Queries must be embedded with the same model that built the index.
embedder = SentenceTransformer(EMBED_MODEL)

Index size : 681
Docs count : 681


In [None]:
# Example query against the loaded index; the 40 best-scoring chunks are printed.
query = "Cần chứng chỉ tiếng anh gì để được miễn học phần tiếng anh"

results = retrieve(query, embedder, index, docs, top_k=40)

for rank, hit in enumerate(results, start=1):
    meta = hit["metadata"]
    print(f"\n--- Result {rank} ---")
    print(f"Score   : {hit['score']:.4f}")
    print(f"Section : {meta['section']}")
    print(f"Type    : {meta['type']}")
    print(f"Source  : {meta['source']}")
    print(hit["text"])



--- Result 1 ---
Score   : 0.7023
Section : ROOT
Type    : table
Source  : QD_ngoai_ngu_tu_K68_CQ_final.pdf
Bảng 6.1 Danh mục các học phần tiếng Anh yêu cầu

|TT |Mã học phần| Tên học phần            |Thời lượng| Tính tín chỉ trong CTĐT| Học phần được miễn/cần học theo chứng chỉ đạt được| Học phần được miễn/cần học theo chứng chỉ đạt được| Học phần được miễn/cần học theo chứng chỉ đạt được| Học phần được miễn/cần học theo chứng chỉ đạt được| Học phần được miễn/cần học theo chứng chỉ đạt được|
|------|---------------|------------------------------|--------------|---------------------------|------------------------------------------------------|------------------------------------------------------|------------------------------------------------------|------------------------------------------------------|------------------------------------------------------|
|      |               |                              |              |                           | [1]                         

In [None]:
from google.colab import runtime
# Ask Colab to unassign the current runtime now that all cells have run
# (presumably to free the GPU session -- confirm against the google.colab docs).
runtime.unassign()