In [1]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

# Ingest a single text file into the Qdrant collection used as the
# Vietnamese knowledge base. Running this cell drops and rebuilds the
# collection, so any vectors previously stored in it are discarded.
COLLECTION_NAME = "viet_kb"

# 1. Load the document and split it into overlapping chunks.
loader = TextLoader("KH001_Don_Xin_Vay_Von.txt", encoding="utf-8")
docs = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", " "]
).split_documents(loader.load())

# 2. Initialise the embedding model: paraphrase-multilingual-MiniLM-L12-v2
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    model_kwargs={"device": "cpu"}  # or "cuda"
)

# 3. Connect to Qdrant
client = QdrantClient(host="localhost", port=6333)

# 4. Drop and re-create the collection with the vector size (384) and
#    distance metric configured below.
#    `recreate_collection` is deprecated in qdrant-client, so the same effect
#    is spelled out as exists -> delete -> create.
if client.collection_exists(collection_name=COLLECTION_NAME):
    client.delete_collection(collection_name=COLLECTION_NAME)
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=384, distance=Distance.COSINE)
)

# 5. Wrap the collection in a LangChain Qdrant vector store
qdrant = Qdrant(
    client=client,
    collection_name=COLLECTION_NAME,
    embeddings=embedding_model
)

# 6. Embed the chunks and add them to the vector store
qdrant.add_documents(documents=docs, batch_size=64)

print("‚úÖ ƒê√£ l∆∞u embedding Vietnamese KB v√†o Qdrant v·ªõi m√¥ h√¨nh MiniLM!")


‚úÖ ƒê√£ l∆∞u embedding Vietnamese KB v√†o Qdrant v·ªõi m√¥ h√¨nh MiniLM!


In [5]:
import os
import glob
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from tqdm import tqdm # Th∆∞ vi·ªán ƒë·ªÉ hi·ªÉn th·ªã thanh ti·∫øn tr√¨nh ƒë·∫πp m·∫Øt

# --- 1. Configuration ---
DATA_DIRECTORY = "data/data1"
# NOTE: this is the same collection name the single-file ingestion cell uses,
# so running this cell drops and replaces that collection.
COLLECTION_NAME = "viet_kb"
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
CHUNK_SIZE = 1200  # larger chunks so each one carries more context
CHUNK_OVERLAP = 200  # larger overlap to avoid cutting sentences apart

# --- 2. Load every file from the data directory ---
print(f"üîç ƒêang t√¨m v√† t·∫£i c√°c file t·ª´ th∆∞ m·ª•c '{DATA_DIRECTORY}'...")
all_documents = []
# Collect every .txt file directly inside the data directory. glob returns
# paths in arbitrary order, so sort them to keep chunk order reproducible.
file_paths = sorted(glob.glob(os.path.join(DATA_DIRECTORY, "*.txt")))

if not file_paths:
    print(f"‚ùå Kh√¥ng t√¨m th·∫•y file .txt n√†o trong th∆∞ m·ª•c '{DATA_DIRECTORY}'. Vui l√≤ng ki·ªÉm tra l·∫°i.")
else:
    for file_path in tqdm(file_paths, desc="ƒêang t·∫£i c√°c file"):
        # Best-effort load: a file that fails to load is reported and skipped.
        try:
            loader = TextLoader(file_path, encoding="utf-8")
            all_documents.extend(loader.load())
        except Exception as e:
            print(f"L·ªói khi t·∫£i file {file_path}: {e}")

    print(f"‚úÖ ƒê√£ t·∫£i th√†nh c√¥ng {len(all_documents)} document t·ª´ {len(file_paths)} file.")

    # --- 3. Split all loaded text into chunks ---
    print(f"üîÑ ƒêang chia nh·ªè vƒÉn b·∫£n v·ªõi chunk_size={CHUNK_SIZE} v√† chunk_overlap={CHUNK_OVERLAP}...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", "---", ".", " "]  # '---' added as an extra separator
    )
    docs_for_db = text_splitter.split_documents(all_documents)
    print(f"‚úÖ ƒê√£ chia th√†nh {len(docs_for_db)} chunk ƒë·ªÉ chu·∫©n b·ªã embedding.")

    # --- 4. Initialise the embedding model ---
    print(f"üß† ƒêang kh·ªüi t·∫°o embedding model: {MODEL_NAME}...")
    embedding_model = HuggingFaceEmbeddings(
        model_name=MODEL_NAME,
        model_kwargs={"device": "cpu"}  # switch to "cuda" if a GPU is available
    )
    # Ask the model for its vector size instead of hard-coding it.
    embed_dim = embedding_model.client.get_sentence_embedding_dimension()
    print(f"‚úÖ Model ƒë√£ s·∫µn s√†ng. K√≠ch th∆∞·ªõc vector: {embed_dim}")

    # --- 5. Connect to Qdrant and re-create the collection ---
    print("‚òÅÔ∏è ƒêang k·∫øt n·ªëi t·ªõi Qdrant v√† t·∫°o collection...")
    client = QdrantClient(host="localhost", port=6333)

    # Drop the old collection (if any) and create it again with the new
    # configuration. `recreate_collection` is deprecated in qdrant-client, so
    # the same effect is spelled out as exists -> delete -> create.
    if client.collection_exists(collection_name=COLLECTION_NAME):
        client.delete_collection(collection_name=COLLECTION_NAME)
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=embed_dim, distance=Distance.COSINE)
    )
    print(f"‚úÖ Collection '{COLLECTION_NAME}' ƒë√£ ƒë∆∞·ª£c t·∫°o th√†nh c√¥ng.")

    # --- 6. Embed the chunks and store them in Qdrant ---
    print("‚è≥ ƒêang th·ª±c hi·ªán embedding v√† l∆∞u v√†o Qdrant. Qu√° tr√¨nh n√†y c√≥ th·ªÉ m·∫•t v√†i ph√∫t...")
    # The LangChain Qdrant wrapper calls embedding_model itself.
    qdrant = Qdrant(
        client=client,
        collection_name=COLLECTION_NAME,
        embeddings=embedding_model
    )

    qdrant.add_documents(documents=docs_for_db, batch_size=64)

    print("\nüéâ HO√ÄN T·∫§T! To√†n b·ªô d·ªØ li·ªáu ƒë√£ ƒë∆∞·ª£c embedding v√† l∆∞u v√†o Qdrant.")
    print(f"üëâ B√¢y gi·ªù b·∫°n c√≥ th·ªÉ s·ª≠ d·ª•ng collection '{COLLECTION_NAME}' trong Langflow.")

üîç ƒêang t√¨m v√† t·∫£i c√°c file t·ª´ th∆∞ m·ª•c 'data/data1'...


ƒêang t·∫£i c√°c file: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 104.19it/s]

‚úÖ ƒê√£ t·∫£i th√†nh c√¥ng 4 document t·ª´ 4 file.
üîÑ ƒêang chia nh·ªè vƒÉn b·∫£n v·ªõi chunk_size=1200 v√† chunk_overlap=200...
‚úÖ ƒê√£ chia th√†nh 11 chunk ƒë·ªÉ chu·∫©n b·ªã embedding.
üß† ƒêang kh·ªüi t·∫°o embedding model: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2...





‚úÖ Model ƒë√£ s·∫µn s√†ng. K√≠ch th∆∞·ªõc vector: 384
‚òÅÔ∏è ƒêang k·∫øt n·ªëi t·ªõi Qdrant v√† t·∫°o collection...


  client.recreate_collection(


‚úÖ Collection 'viet_kb' ƒë√£ ƒë∆∞·ª£c t·∫°o th√†nh c√¥ng.
‚è≥ ƒêang th·ª±c hi·ªán embedding v√† l∆∞u v√†o Qdrant. Qu√° tr√¨nh n√†y c√≥ th·ªÉ m·∫•t v√†i ph√∫t...

üéâ HO√ÄN T·∫§T! To√†n b·ªô d·ªØ li·ªáu ƒë√£ ƒë∆∞·ª£c embedding v√† l∆∞u v√†o Qdrant.
üëâ B√¢y gi·ªù b·∫°n c√≥ th·ªÉ s·ª≠ d·ª•ng collection 'viet_kb' trong Langflow.


In [4]:
import requests
import json
import time
from pprint import pprint

# --- 1. CONFIGURATION ---

# API URL of the Langflow pipeline (run endpoint of one specific flow)
LANGFLOW_URL = "http://localhost:7860/api/v1/run/70017ce4-caba-479c-a03c-067faebf3c6c"

# Request headers; the charset is declared explicitly as UTF-8
HEADERS = {"Content-Type": "application/json; charset=utf-8"}

# Parameters controlling the extraction loop
MAX_ITERATIONS = 20      # upper bound on API calls; batches may be split several times
INITIAL_BATCH_SIZE = 6 # number of fields asked for in each initial batch
MIN_BATCH_SIZE_TO_SPLIT = 2 # only split a failed batch that still has more than 1 field

# Fields to extract, grouped by report section. Each section maps an internal
# key to the Vietnamese label that is sent to the LLM.
NEW_TEMPLATE_MAPPING = {
    "headerInfo": {
        "bbc": "BBC (B√°o c√°o b·ªüi)",
        "cbc": "CBC (C√°n b·ªô ch√≠nh)",
        "idDeXuat": "ID ƒë·ªÅ xu·∫•t",
        "ngayBaoCao": "Ng√†y b√°o c√°o",
        "ngayCapNhat": "Ng√†y c·∫≠p nh·∫≠t",
        "mucDichThamDinh": "M·ª•c ƒë√≠ch th·∫©m ƒë·ªãnh",
        "capNoi": "C·∫•p n∆°i",
    },
    "creditInfo": {
        "idT24": "ID T24",
        "xepHangTinDung": "X·∫øp h·∫°ng t√≠n d·ª•ng",
        "ngayXepHang": "Ng√†y x·∫øp h·∫°ng",
        "ketQuaPhanNhomTiepCan": "K·∫øt qu·∫£ ph√¢n nh√≥m ti·∫øp c·∫≠n",
        "nganh": "Ng√†nh",
        "phanNhomRuiRo": "Ph√¢n nh√≥m r·ªßi ro",
        "phanNhomUngXu": "Ph√¢n nh√≥m ·ª©ng x·ª≠",
        "khacBietHDTD": "Kh√°c bi·ªát HƒêTD",
        "ketQuaPhanLuong": "K·∫øt qu·∫£ ph√¢n lu·ªìng",
        "loaiKhoanVay": "Lo·∫°i kho·∫£n vay",
        "tongGiaTriCapTD": "T·ªïng gi√° tr·ªã c·∫•p TD",
        "tongGiaTriCoBPHD": "T·ªïng gi√° tr·ªã c√≥ BPHƒê",
        "xepHangRuiRo": "X·∫øp h·∫°ng r·ªßi ro",
        "ruiRoNganh": "R·ªßi ro ng√†nh",
        "mucDoPhucTap": "M·ª©c ƒë·ªô ph·ª©c t·∫°p",
        "tieuChiTaiChinh": "Ti√™u ch√≠ t√†i ch√≠nh",
        "mucDoRuiRo": "M·ª©c ƒë·ªô r·ªßi ro",
    },
    "businessInfo": {
        "tenDayDu": "T√™n ƒë·∫ßy ƒë·ªß c·ªßa doanh nghi·ªáp",
        "ngayThanhLap": "Ng√†y th√†nh l·∫≠p",
        "loaiHinhCongTy": "Lo·∫°i h√¨nh c√¥ng ty",
        "hoatDongKinhDoanhMoTa": "M√¥ t·∫£ ho·∫°t ƒë·ªông kinh doanh",
        "tienDoVanXuat": "Ti·∫øn ƒë·ªô v·∫≠n xu·∫•t",
        "khaNangLapKeHoach": "Kh·∫£ nƒÉng l·∫≠p k·∫ø ho·∫°ch",
    },
    "legalInfo": {
        "tinhHinhPhapLy": "T√¨nh h√¨nh ph√°p l√Ω",
        "kinhNghiemChuSoHuu": "Kinh nghi·ªám ch·ªß s·ªü h·ªØu",
    },
    "tcbRelationship": {
        "chatLuongQuanHeTD": "Ch·∫•t l∆∞·ª£ng quan h·ªá TD",
        # NOTE(review): the key reads as "no violation" while the label appears
        # to ask "is there a violation?" - confirm the intended polarity.
        "khongViPham": "C√≥ vi ph·∫°m kh√¥ng",
        "chiTietViPham": "Chi ti·∫øt vi ph·∫°m",
        "soThangTuongTacT24": "S·ªë th√°ng t∆∞∆°ng t√°c T24",
        "soDuTienGui12Thang": "S·ªë d∆∞ ti·ªÅn g·ª≠i 12 th√°ng",
        "soLanPhatSinhGiaoDich": "S·ªë l·∫ßn ph√°t sinh giao d·ªãch",
        "tiLeCoSuDungSPDVKhac": "T·ª∑ l·ªá c√≥ s·ª≠ d·ª•ng SPDV kh√°c",
    },
    # Complex (array) fields are to be handled later; these sections are
    # empty, so they contribute no fields to the extraction list.
    "management": {},
    "financialStatus": {},
}

# Flat list of every label to ask the LLM for, in template order.
FIELDS_TO_EXTRACT = []
# Reverse map: internal key -> name of the section it belongs to.
KEY_TO_SECTION = {}
for section_name, section_fields in NEW_TEMPLATE_MAPPING.items():
    FIELDS_TO_EXTRACT.extend(section_fields.values())
    KEY_TO_SECTION.update(dict.fromkeys(section_fields, section_name))

# --- 2. HELPER FUNCTIONS ---

def create_prompt(fields_list: list) -> str:
    """Build the prompt text for a batch of field labels.

    Returns an empty string for an empty batch and the bare label for a
    single-field batch. For several fields the result is a bullet list, one
    "- label" per line, preceded and followed by a newline.
    """
    if not fields_list:
        return ""

    bullet_separator = "\n- "
    joined_labels = bullet_separator.join(fields_list)

    # A lone field is sent as-is so the prompt focuses on that one field.
    if len(fields_list) == 1:
        return joined_labels
    return f"{bullet_separator}{joined_labels}\n"

def is_valid_value(value) -> bool:
    """Return True unless *value* is None or its string form is blank."""
    if value is None:
        return False
    return bool(str(value).strip())

def query_langflow_for_json(question_prompt: str) -> dict:
    """Send a prompt to Langflow and extract a JSON object from the reply.

    The reply text is read from
    ``outputs[0].outputs[0].results.message.text`` of the Langflow response,
    and the span from the first ``{`` to the last ``}`` in that text is
    parsed as JSON.

    Args:
        question_prompt: Prompt text to send. An empty prompt short-circuits
            without calling the API.

    Returns:
        The parsed JSON object, or an empty dict when the prompt is empty,
        the request fails, the response does not have the expected shape, or
        no parseable JSON object is found. Failures are printed, never raised.
    """
    if not question_prompt:
        return {}
    payload = {
        "input_value": question_prompt, "output_type": "chat", "input_type": "chat"
    }

    # Each stage has its own try block so a handler only touches variables
    # that are guaranteed to be bound when it runs.
    try:
        response = requests.post(LANGFLOW_URL, json=payload, headers=HEADERS, timeout=120)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"  - L·ªói k·∫øt n·ªëi API: {e}")
        return {}

    # The HTTP body itself may not be JSON (e.g. an HTML error page).
    try:
        langflow_data = json.loads(response.text)
    except json.JSONDecodeError as e:
        print(f"  - L·ªói: Kh√¥ng th·ªÉ ph√¢n t√≠ch JSON t·ª´ ph·∫£n h·ªìi c·ªßa LLM. {e}")
        print(f"  - Ph·∫£n h·ªìi g·ªëc t·ª´ LLM: {response.text}")
        return {}

    # TypeError covers a level that is null or not subscriptable as expected.
    try:
        llm_response_text = langflow_data['outputs'][0]['outputs'][0]['results']['message']['text']
    except (KeyError, IndexError, TypeError):
        print("  - L·ªói: C·∫•u tr√∫c JSON tr·∫£ v·ªÅ t·ª´ Langflow kh√¥ng nh∆∞ mong ƒë·ª£i.")
        pprint(langflow_data)
        return {}

    start = llm_response_text.find('{')
    end = llm_response_text.rfind('}')
    # `end < start` also covers a missing '}' (-1) and a '}' that appears
    # before the first '{'.
    if start == -1 or end < start:
        print("  - L·ªói: Kh√¥ng t√¨m th·∫•y ƒë·ªëi t∆∞·ª£ng JSON h·ª£p l·ªá trong ph·∫£n h·ªìi c·ªßa LLM.")
        return {}

    try:
        return json.loads(llm_response_text[start : end + 1])
    except json.JSONDecodeError as e:
        print(f"  - L·ªói: Kh√¥ng th·ªÉ ph√¢n t√≠ch JSON t·ª´ ph·∫£n h·ªìi c·ªßa LLM. {e}")
        print(f"  - Ph·∫£n h·ªìi g·ªëc t·ª´ LLM: {llm_response_text}")
        return {}


# --- 3. "DIVIDE AND CONQUER" EXTRACTION LOGIC ---

def main():
    """Drive the batched extraction and print the collected result.

    The field labels are split into batches of ``INITIAL_BATCH_SIZE`` and
    kept in a work queue. Each iteration sends one batch to Langflow and
    stores every field that came back with a valid value. Fields that did
    not come back are handled as follows:

    * ``MIN_BATCH_SIZE_TO_SPLIT`` or more failures: split in half and put
      both halves at the front of the queue;
    * fewer failures than the batch had fields: retry just those fields in
      a smaller batch;
    * otherwise (the whole small batch failed): give up on them.

    The loop stops when the queue is empty or after ``MAX_ITERATIONS`` API
    calls. Fields without a value are listed before the final JSON dump.
    """
    # Local import keeps the block self-contained; deque gives O(1) pops and
    # pushes at the front of the queue.
    from collections import deque

    final_result = {}

    # Seed the queue with consecutive slices of INITIAL_BATCH_SIZE labels.
    work_queue = deque(
        FIELDS_TO_EXTRACT[i:i + INITIAL_BATCH_SIZE]
        for i in range(0, len(FIELDS_TO_EXTRACT), INITIAL_BATCH_SIZE)
    )

    current_iteration = 0
    print("üöÄ B·∫Øt ƒë·∫ßu qu√° tr√¨nh tr√≠ch xu·∫•t th√¥ng tin v·ªõi logic 'Chia ƒë·ªÉ tr·ªã'...")

    while work_queue and current_iteration < MAX_ITERATIONS:
        current_iteration += 1

        # Take the next batch from the front of the queue.
        current_batch = work_queue.popleft()

        print(f"\n--- V√íNG L·∫∂P {current_iteration}/{MAX_ITERATIONS} | L√¥ c√≤n l·∫°i: {len(work_queue)} ---")
        print(f"ƒêang x·ª≠ l√Ω l√¥ g·ªìm {len(current_batch)} tr∆∞·ªùng: {current_batch}")

        prompt = create_prompt(current_batch)
        response_json = query_langflow_for_json(prompt)

        # Partition the batch into fields that came back with a usable value
        # and fields that did not.
        failed_fields = []
        for field in current_batch:
            value = response_json.get(field)
            if is_valid_value(value):
                print(f"    ‚úÖ ƒê√£ t√¨m th·∫•y '{field}': {value}")
                final_result[field] = value
            else:
                failed_fields.append(field)

        if failed_fields:
            print(f"  - Kh√¥ng t√¨m th·∫•y {len(failed_fields)} tr∆∞·ªùng: {failed_fields}")

            if len(failed_fields) >= MIN_BATCH_SIZE_TO_SPLIT:
                print("  -> splitting_batch Chia nh·ªè l√¥ th·∫•t b·∫°i v√† th√™m l·∫°i v√†o h√†ng ƒë·ª£i.")
                mid_point = len(failed_fields) // 2
                # Push the second half first so the first half ends up at the
                # very front and is processed next.
                work_queue.appendleft(failed_fields[mid_point:])
                work_queue.appendleft(failed_fields[:mid_point])
            elif len(failed_fields) < len(current_batch):
                # Too few failures to split, but they were asked together with
                # other fields: give them one more try in a smaller prompt.
                # The retry batch is strictly smaller, so this terminates.
                print("  -> Retrying the unresolved field(s) in a smaller batch.")
                work_queue.appendleft(failed_fields)
            else:
                # The whole batch failed and is too small to split: final.
                print("  -> L√¥ qu√° nh·ªè, kh√¥ng chia n·ªØa. Ghi nh·∫≠n l√† kh√¥ng t√¨m th·∫•y.")

        # Pause between API calls, but not after the last one.
        if work_queue and current_iteration < MAX_ITERATIONS:
            time.sleep(3)

    # --- 4. SUMMARY ---
    print("\n\n‚úÖ Qu√° tr√¨nh tr√≠ch xu·∫•t ho√†n t·∫•t!")

    # Everything without a value: fields that failed for good and fields
    # still queued when the iteration cap was hit. Listed in template order.
    remaining_fields = [f for f in FIELDS_TO_EXTRACT if f not in final_result]

    if remaining_fields:
        print(f"‚ö†Ô∏è Kh√¥ng th·ªÉ tr√≠ch xu·∫•t {len(remaining_fields)} tr∆∞·ªùng sau:")
        pprint(remaining_fields)

    print("-----------------------------------------")
    print("          K·∫æT QU·∫¢ JSON CU·ªêI C√ôNG         ")
    print("-----------------------------------------")

    print(json.dumps(final_result, indent=4, ensure_ascii=False))


# Run the extraction only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()
    

üöÄ B·∫Øt ƒë·∫ßu qu√° tr√¨nh tr√≠ch xu·∫•t th√¥ng tin v·ªõi logic 'Chia ƒë·ªÉ tr·ªã'...

--- V√íNG L·∫∂P 1/20 | L√¥ c√≤n l·∫°i: 6 ---
ƒêang x·ª≠ l√Ω l√¥ g·ªìm 6 tr∆∞·ªùng: ['BBC (B√°o c√°o b·ªüi)', 'CBC (C√°n b·ªô ch√≠nh)', 'ID ƒë·ªÅ xu·∫•t', 'Ng√†y b√°o c√°o', 'Ng√†y c·∫≠p nh·∫≠t', 'M·ª•c ƒë√≠ch th·∫©m ƒë·ªãnh']
    ‚úÖ ƒê√£ t√¨m th·∫•y 'BBC (B√°o c√°o b·ªüi)': BBC-DN-HANOI
    ‚úÖ ƒê√£ t√¨m th·∫•y 'CBC (C√°n b·ªô ch√≠nh)': CBC-KHDN-LON
    ‚úÖ ƒê√£ t√¨m th·∫•y 'ID ƒë·ªÅ xu·∫•t': DX-2024-8868
    ‚úÖ ƒê√£ t√¨m th·∫•y 'Ng√†y b√°o c√°o': 05 th√°ng 10 nƒÉm 2024
    ‚úÖ ƒê√£ t√¨m th·∫•y 'Ng√†y c·∫≠p nh·∫≠t': 02/10/2024
    ‚úÖ ƒê√£ t√¨m th·∫•y 'M·ª•c ƒë√≠ch th·∫©m ƒë·ªãnh': C·∫•p m·ªõi

--- V√íNG L·∫∂P 2/20 | L√¥ c√≤n l·∫°i: 5 ---
ƒêang x·ª≠ l√Ω l√¥ g·ªìm 6 tr∆∞·ªùng: ['C·∫•p n∆°i', 'ID T24', 'X·∫øp h·∫°ng t√≠n d·ª•ng', 'Ng√†y x·∫øp h·∫°ng', 'K·∫øt qu·∫£ ph√¢n nh√≥m ti·∫øp c·∫≠n', 'Ng√†nh']
    ‚úÖ ƒê√£ t√¨m th·∫•y 'C·∫•p n∆°i': H·ªôi s·ªü ch√≠nh
    ‚úÖ ƒê√£ t√¨m th·∫•y 'ID T24': 100345