*Code chạy trên Google Colab*

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# =========================
# 0) CÀI THƯ VIỆN
# =========================
!pip -q install --upgrade pip
!pip -q install "sentence-transformers>=3.0.0" faiss-cpu "rank_bm25>=0.2.2" datasets
!pip -q install "transformers>=4.41.0" "accelerate>=0.30.0"
!pip -q install fastapi uvicorn nest_asyncio
!pip install ijson



In [3]:
# =========================
# 1) CẤU HÌNH & IMPORT
# =========================
import os, json, math, gzip, pickle, textwrap, re, uuid, time
from typing import List, Dict, Tuple, Any

import numpy as np
import faiss

from sentence_transformers import SentenceTransformer, CrossEncoder
from rank_bm25 import BM25Okapi

from collections import defaultdict
import re

# --- Đường dẫn làm việc trên Colab ---
WORK_DIR = "/content/drive/MyDrive/eGov-Bot/ITB"
os.makedirs(WORK_DIR, exist_ok=True)

RAW_JSON_PATH   = f"{WORK_DIR}/toan_bo_du_lieu.json"      # dữ liệu gốc (list các thủ tục)
CHUNKS_JSONL    = f"{WORK_DIR}/chunks.jsonl"  # dữ liệu đã chunk (mỗi dòng 1 chunk)
FAISS_INDEX     = f"{WORK_DIR}/index.faiss"   # file chỉ mục FAISS
METAS_PKL_GZ    = f"{WORK_DIR}/metas.pkl.gz"  # metadata kèm text
BM25_PICKLE     = f"{WORK_DIR}/bm25.pkl.gz"   # bm25 corpus (tuỳ chọn)

# --- Model embedding & reranker ---
EMBED_MODEL_NAME   = "AITeamVN/Vietnamese_Embedding"


In [None]:
# =========================
# 3) HÀM TIỀN XỬ LÝ & CHUNKING (ít RAM)
# =========================
import json
import re
import uuid
import gc
import os
import sys
from typing import Generator, Dict, Any, List
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing

FIELD_ORDER = [
    "ten_thu_tuc", "yeu_cau_dieu_kien", "thanh_phan_ho_so",
    "trinh_tu_thuc_hien", "cach_thuc_thuc_hien",
    "co_quan_thuc_hien", "thu_tuc_lien_quan"
]

FIELD_VN_NAME = {
    "ten_thu_tuc": "Tên thủ tục",
    "yeu_cau_dieu_kien": "Yêu cầu, điều kiện",
    "thanh_phan_ho_so": "Thành phần hồ sơ",
    "trinh_tu_thuc_hien": "Trình tự thực hiện",
    "cach_thuc_thuc_hien": "Cách thức thực hiện",
    "co_quan_thuc_hien": "Cơ quan thực hiện",
    "thu_tuc_lien_quan": "Thủ tục liên quan"
}


def normalize_text(s: str) -> str:
    """Chuẩn hóa text"""
    if not s:
        return ""
    s = str(s).replace("\r\n", "\n").replace("\t", " ").strip()
    s = re.sub(r"[ \u00A0]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s

def split_text_rcts(
    text: str,
    max_chars: int = 800,
    overlap: int = 50,
    separators: List[str] = None
) -> Generator[str, None, None]:
    """Recursive Text Character Split (RCTS)"""
    text = normalize_text(text)
    if not text:
        return

    if separators is None:
        separators = ["\n\n", "\n", ". ", ", ", " "]

    def _split_recursive(txt: str) -> List[str]:
        if len(txt) <= max_chars:
            return [txt]

        # tìm separator phù hợp nhất
        for sep in separators:
            idx = txt.rfind(sep, 0, max_chars)
            if idx != -1 and idx > max_chars * 0.3:  # tránh cắt quá sớm
                left = txt[:idx + len(sep)].strip()
                right = txt[idx + len(sep):].strip()
                return _split_recursive(left) + _split_recursive(right)

        # nếu không tìm được -> cắt cứng
        left = txt[:max_chars]
        right = txt[max_chars:]
        return [left] + _split_recursive(right)

    chunks = _split_recursive(text)

    # thêm overlap giữa các chunk
    final_chunks = []
    for i, chunk in enumerate(chunks):
        if i > 0 and overlap > 0:
            prefix = chunks[i-1][-overlap:]
            chunk = prefix + " " + chunk
        final_chunks.append(chunk.strip())

    return (c for c in final_chunks if c.strip())

def create_chunk(record: Dict[str, Any], field: str, piece: str, piece_idx: int) -> str:
    """Tạo một chunk JSON"""
    parent_id = str(record.get("nguon", str(uuid.uuid4())[:8]))
    title = normalize_text(record.get("ten_thu_tuc", ""))
    source = str(record.get("nguon", ""))

    field_name = FIELD_VN_NAME.get(field, field)
    text_content = f"Thủ tục: {title}\nMục: {field_name}\nNội dung: {piece}"

    chunk = {
        "id": f"{parent_id}#{field}#{piece_idx}",
        "parent_id": parent_id,
        "ten_thu_tuc": title,
        "field": field,
        "text": text_content,
        "raw": piece,
        "nguon": source
    }

    return json.dumps(chunk, ensure_ascii=False)

def process_record_streaming(record: Dict[str, Any], outfile) -> int:
    """Xử lý một record và ghi trực tiếp ra file"""
    chunk_count = 0

    for field in FIELD_ORDER:
        raw_value = record.get(field)
        if not raw_value:
            continue

        piece_idx = 0
        for piece in split_text_rcts(str(raw_value)):
            if not piece.strip():
                continue

            try:
                chunk_json = create_chunk(record, field, piece, piece_idx)
                outfile.write(chunk_json + "\n")
                chunk_count += 1
                piece_idx += 1
            except Exception as e:
                print(f"❌ Lỗi tạo chunk cho field {field}: {e}")
                continue

    return chunk_count

def safe_json_stream_parser(filepath: str) -> Generator[Dict[str, Any], None, None]:
    """Parser JSON streaming an toàn hơn"""

    print("🔄 Đang parse JSON stream...")

    try:
        with open(filepath, 'r', encoding='utf-8', buffering=8192) as f:
            # Đọc và kiểm tra ký tự đầu
            first_char = f.read(1)
            if first_char != '[':
                print(f"❌ File không bắt đầu bằng '[', mà là '{first_char}'")
                return

            # Reset về đầu và bỏ qua '['
            f.seek(1)

            buffer = ""
            brace_count = 0
            bracket_count = 0
            in_string = False
            escape_next = False
            char_count = 0

            while True:
                char = f.read(1)
                if not char:
                    break

                char_count += 1

                # Xử lý escape characters
                if escape_next:
                    buffer += char
                    escape_next = False
                    continue

                if char == '\\' and in_string:
                    buffer += char
                    escape_next = True
                    continue

                # Xử lý strings
                if char == '"':
                    in_string = not in_string
                    buffer += char
                    continue

                if in_string:
                    buffer += char
                    continue

                # Xử lý cấu trúc JSON
                if char == '{':
                    brace_count += 1
                    buffer += char
                elif char == '}':
                    brace_count -= 1
                    buffer += char

                    # Hoàn thành một object
                    if brace_count == 0 and bracket_count == 0:
                        try:
                            # Clean buffer trước khi parse
                            clean_buffer = buffer.strip().rstrip(',')
                            if clean_buffer:
                                obj = json.loads(clean_buffer)
                                yield obj
                                del obj
                        except json.JSONDecodeError as e:
                            print(f"❌ JSON decode error tại ký tự {char_count}: {e}")
                            print(f"   Buffer: {buffer[:100]}...")

                        # Reset buffer
                        buffer = ""
                        gc.collect()

                elif char == '[':
                    bracket_count += 1
                    buffer += char
                elif char == ']':
                    if bracket_count > 0:
                        bracket_count -= 1
                        buffer += char
                    else:
                        # Kết thúc main array
                        break
                elif char == ',':
                    if brace_count == 0 and bracket_count == 0:
                        # Dấu phẩy giữa các objects chính
                        continue
                    else:
                        buffer += char
                elif char in ' \t\n\r':
                    # Whitespace
                    if buffer.strip():  # Chỉ thêm nếu buffer không rỗng
                        buffer += ' '
                else:
                    buffer += char

                # Progress indicator
                if char_count % 100000 == 0:
                    print(f"📖 Đã đọc {char_count} ký tự...")

    except Exception as e:
        print(f"❌ Lỗi đọc file: {e}")

def process_one_record(record: Dict[str, Any]) -> List[str]:
    """Xử lý 1 record và trả về danh sách JSON string (chunk)."""
    chunks = []
    for field in FIELD_ORDER:
        raw_value = record.get(field)
        if not raw_value:
            continue

        piece_idx = 0
        for piece in split_text_rcts(str(raw_value)):
            if not piece.strip():
                continue
            try:
                chunk_json = create_chunk(record, field, piece, piece_idx)
                chunks.append(chunk_json)
                piece_idx += 1
            except Exception as e:
                print(f"❌ Lỗi tạo chunk cho field {field}: {e}")
                continue
    return chunks


def chunking_main():
    """Chunking chính – chạy song song bằng toàn bộ CPU core"""
    print("=== BẮT ĐẦU CHUNKING (MULTICORE) ===")

    if not os.path.exists(RAW_JSON_PATH):
        print(f"❌ File không tồn tại: {RAW_JSON_PATH}")
        return

    file_size_mb = os.path.getsize(RAW_JSON_PATH) / (1024 * 1024)
    print(f"📁 Kích thước file: {file_size_mb:.2f} MB")

    total_chunks = 0
    processed_records = 0
    num_cores = multiprocessing.cpu_count()
    print(f"🖥️ Phát hiện {num_cores} CPU core – dùng tối đa")

    try:
        with open(CHUNKS_JSONL, 'w', encoding='utf-8', buffering=1024) as outfile, \
             ProcessPoolExecutor(max_workers=num_cores) as executor:

            futures = []
            for record in safe_json_stream_parser(RAW_JSON_PATH):
                futures.append(executor.submit(process_one_record, record))
                processed_records += 1

                # để tránh dồn quá nhiều future -> flush dần
                if len(futures) >= 100:
                    for f in as_completed(futures):
                        chunks = f.result()
                        for c in chunks:
                            outfile.write(c + "\n")
                        total_chunks += len(chunks)
                    outfile.flush()
                    futures = []
                    gc.collect()

                    if processed_records % 50 == 0:
                        print(f"✅ {processed_records} records → {total_chunks} chunks")

            # Xử lý nốt các future còn lại
            for f in as_completed(futures):
                chunks = f.result()
                for c in chunks:
                    outfile.write(c + "\n")
                total_chunks += len(chunks)

    except Exception as e:
        print(f"❌ Lỗi fatal: {e}")
        return

    print(f"\n🎉 HOÀN THÀNH!")
    print(f"📊 Tổng kết: {processed_records} records → {total_chunks} chunks")
    print(f"💾 Output: {CHUNKS_JSONL}")

def fallback_simple_load():
    """Phương án dự phòng: load trực tiếp (cho file nhỏ)"""
    print("🔄 Thử phương án load trực tiếp...")

    try:
        with open(RAW_JSON_PATH, 'r', encoding='utf-8') as f:
            data = json.load(f)

        print(f"✅ Load thành công! Tổng records: {len(data)}")

        total_chunks = 0

        with open(CHUNKS_JSONL, 'w', encoding='utf-8') as outfile:
            for i, record in enumerate(data):
                try:
                    chunks_created = process_record_streaming(record, outfile)
                    total_chunks += chunks_created

                    if (i + 1) % 100 == 0:
                        print(f"✅ {i + 1}/{len(data)} → {total_chunks} chunks")
                        outfile.flush()
                        gc.collect()

                except Exception as e:
                    print(f"❌ Lỗi record {i}: {e}")
                    continue

        print(f"🎉 Fallback hoàn thành: {len(data)} records → {total_chunks} chunks")

    except Exception as e:
        print(f"❌ Fallback thất bại: {e}")

def main():
    """Main function"""
    print("=== ROBUST JSON CHUNKING ===")

    try:
        chunking_main()
    except Exception as e:
        print(f"❌ Phương pháp chính thất bại: {e}")
        print("🔄 Thử phương án dự phong...")
        fallback_simple_load()

if __name__ == "__main__":
    main()

=== ROBUST JSON CHUNKING ===
=== BẮT ĐẦU CHUNKING (MULTICORE) ===
📁 Kích thước file: 70.24 MB
🖥️ Phát hiện 2 CPU core – dùng tối đa
🔄 Đang parse JSON stream...
📖 Đã đọc 400000 ký tự...
✅ 100 records → 1023 chunks
📖 Đã đọc 600000 ký tự...
✅ 200 records → 2003 chunks
✅ 300 records → 2970 chunks
✅ 400 records → 3939 chunks
✅ 500 records → 5151 chunks
✅ 600 records → 6190 chunks
✅ 700 records → 7233 chunks
✅ 800 records → 8299 chunks
✅ 900 records → 9375 chunks
✅ 1000 records → 10358 chunks
✅ 1100 records → 11580 chunks
✅ 1200 records → 12596 chunks
📖 Đã đọc 5300000 ký tự...
✅ 1300 records → 13613 chunks
✅ 1400 records → 14614 chunks
✅ 1500 records → 15551 chunks
📖 Đã đọc 6600000 ký tự...
✅ 1600 records → 16597 chunks
✅ 1700 records → 17643 chunks
✅ 1800 records → 18674 chunks
📖 Đã đọc 7800000 ký tự...
✅ 1900 records → 19661 chunks
✅ 2000 records → 20723 chunks
✅ 2100 records → 21719 chunks
📖 Đã đọc 8800000 ký tự...
✅ 2200 records → 22998 chunks
✅ 2300 records → 24211 chunks
✅ 2400 records