In [1]:
from milvus_db import MilvusDB, MilvusQADB# step4_vector_store
from utils import convert_df_to_documents
from model_config import VietnameseEmbeddings
import os
import pandas as pd
def load_final_df(final_df_path=r"D:\DATN\QA_System\data_analyze\finaldf0.pkl"):
    """
    Load the pickled ``final_df`` DataFrame from disk.

    Args:
        final_df_path: Location of the pickle file. Defaults to the path the
            notebook has always used, so existing zero-argument calls behave
            as before.

    Returns:
        The object returned by ``pd.read_pickle`` when the file exists,
        otherwise ``None`` (after printing a notice).
    """
    # isfile (not exists): a directory at this path cannot be unpickled, so
    # treat it the same as a missing file instead of crashing in read_pickle.
    if not os.path.isfile(final_df_path):
        print("File final_df chưa tồn tại.")
        return None

    # SECURITY NOTE: unpickling executes arbitrary code embedded in the file.
    # Only point this at pickle files produced by a trusted pipeline.
    final_df = pd.read_pickle(final_df_path)
    print(f"Đã load final_df từ file: {final_df_path}")
    return final_df
def step4_vector_store(collection_name: str = "noraptor"):
    """
    Store document vectors.

    Loads ``final_df``, converts it to documents, and writes them to a Milvus
    collection through ``MilvusDB`` (``create_collection`` followed by
    ``insert_documents``).

    Args:
        collection_name: Name of the Milvus collection to create and fill.
            Defaults to ``"noraptor"``, the name previously hard-coded here.

    Raises:
        FileNotFoundError: If ``load_final_df()`` returns ``None`` because the
            pickle file is missing. Failing here gives a clear message instead
            of an opaque error further down the pipeline.
    """
    final_df = load_final_df()
    if final_df is None:
        raise FileNotFoundError(
            "final_df could not be loaded; cannot build the vector store."
        )

    documents = convert_df_to_documents(final_df)
    # The raw texts are handed to MilvusDB alongside the collection name.
    corpus = [doc.page_content for doc in documents]

    # Create the Milvus database wrapper, then the collection, then insert.
    milvus_db = MilvusDB(collection_name=collection_name, corpus=corpus)
    milvus_db.create_collection()
    milvus_db.insert_documents(documents)
# Optimization 1: use a singleton to cache the DataFrame and the corpus
class DataManager:
    """
    Process-wide holder for ``final_df`` and the text corpus derived from it.

    Obtain the shared object through ``DataManager.get_instance()``. Both
    values are computed on first request and reused afterwards.
    """

    _instance = None   # the shared DataManager, created on first get_instance()
    _final_df = None   # cached result of load_final_df()
    _corpus = None     # cached list of page_content strings

    @classmethod
    def get_instance(cls):
        """Return the shared instance, creating it on the first call."""
        if cls._instance is not None:
            return cls._instance
        cls._instance = cls()
        return cls._instance

    def get_final_df(self):
        """
        Return the cached DataFrame, loading it via ``load_final_df()`` if
        nothing is cached yet. A ``None`` result is not cached, so the load is
        attempted again on the next call.
        """
        if self._final_df is not None:
            return self._final_df
        print("Loading final_df from file...")
        self._final_df = load_final_df()
        return self._final_df

    def get_corpus(self):
        """
        Return the cached corpus: the ``page_content`` of every document
        produced by ``convert_df_to_documents`` for the cached DataFrame.
        """
        if self._corpus is not None:
            return self._corpus
        texts = []
        for document in convert_df_to_documents(self.get_final_df()):
            texts.append(document.page_content)
        self._corpus = texts
        return self._corpus

# Optimization 2: singleton for the Milvus handles to avoid re-initialising them
class MilvusManager:
    """
    Process-wide cache of Milvus handles, keyed by collection name.

    Obtain the shared object through ``MilvusManager.get_instance()``.
    A separate handle is kept for each collection name, so asking for a second
    collection no longer returns the handle built for the first one.
    """

    _instance = None  # the shared MilvusManager, created on first get_instance()

    def __init__(self):
        # Per-instance dicts (not class-level) so the caches are never shared
        # by accident between separately constructed managers.
        self._milvus_dbs = {}      # collection name -> loaded MilvusDB
        self._milvus_qa_dbs = {}   # collection name -> loaded MilvusQADB

    @classmethod
    def get_instance(cls):
        """Return the shared instance, creating it on the first call."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def get_milvus_db(self, collection_name):
        """
        Return the ``MilvusDB`` for ``collection_name``, building and loading
        it on first use.

        The corpus passed to ``MilvusDB`` comes from the shared ``DataManager``.
        """
        milvus_db = self._milvus_dbs.get(collection_name)
        if milvus_db is None:
            corpus = DataManager.get_instance().get_corpus()
            milvus_db = MilvusDB(collection_name=collection_name, corpus=corpus)
            milvus_db.load_collection()
            # Cache only after load_collection() succeeded, so a failed load
            # is retried next time instead of handing out an unloaded handle.
            self._milvus_dbs[collection_name] = milvus_db
        return milvus_db

    def get_milvus_qa_db(self, qa_collection_name):
        """
        Return the ``MilvusQADB`` for ``qa_collection_name``, building and
        loading it on first use.
        """
        milvus_qa_db = self._milvus_qa_dbs.get(qa_collection_name)
        if milvus_qa_db is None:
            # Initialise the QA collection handle.
            milvus_qa_db = MilvusQADB(collection_name=qa_collection_name)
            milvus_qa_db.load_qa_collection()
            # Same rule as above: remember the handle only once it is loaded.
            self._milvus_qa_dbs[qa_collection_name] = milvus_qa_db
        return milvus_qa_db

# Optimization 3: step5_retrieval now goes through the shared managers
def step5_retrieval(collection_name: str):
    """
    Return ``(corpus, milvus_db)`` for retrieval.

    The corpus comes from the shared ``DataManager`` and the database handle
    from the shared ``MilvusManager``, requested with ``collection_name``.
    """
    dm = DataManager.get_instance()
    mm = MilvusManager.get_instance()
    texts = dm.get_corpus()
    return texts, mm.get_milvus_db(collection_name)
def step6_qa_db(collection_name: str):
    """
    Return the QA database handle that the shared ``MilvusManager`` provides
    when asked for ``collection_name``.
    """
    return MilvusManager.get_instance().get_milvus_qa_db(collection_name)
    

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from pymilvus import (
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
    connections,
    utility,
    MilvusException,
)
from typing import List, Dict

In [3]:
# Connect to Milvus
def connect_to_milvus(uri="http://localhost:19530"):
    """
    Open a pymilvus connection to ``uri`` and print the existing collections.

    Raises:
        MilvusException: Logged with an ``[ERROR]`` line and then re-raised.
    """
    try:
        connections.connect(uri=uri)
        print("[INFO] Connected to Milvus.")
        existing = utility.list_collections()
        print("list_collections: ", existing)
    except MilvusException as err:
        print(f"[ERROR] Error connecting to Milvus: {err}")
        raise err

# Report whether a collection exists and how many entities it holds
def check_collection_status(col_name: str):
    """
    Print an ``[INFO]`` line saying whether ``col_name`` exists and, if it
    does, its ``num_entities``.

    Raises:
        MilvusException: Logged with an ``[ERROR]`` line and then re-raised.
    """
    try:
        if not utility.has_collection(col_name):
            print(f"[INFO] Collection '{col_name}' does not exist.")
        else:
            collection = Collection(name=col_name)
            print(f"[INFO] Collection '{col_name}' exists. Total entities: {collection.num_entities}")
    except MilvusException as err:
        print(f"[ERROR] Error checking collection '{col_name}': {err}")
        raise err
# Main logic: connect, fill the vector store, then report collection status.
if __name__ == "__main__":
    connect_to_milvus()
    step4_vector_store()

    # Collection name
    col_name = "hybrid_demo"

    # Check the status of each collection, in the same order as before.
    for _target in (col_name, "base_qa", "raptor"):
        check_collection_status(col_name=_target)
    

[INFO] Connected to Milvus.
list_collections:  ['base_qa', 'raptor', 'hybrid_demo']
Đã load final_df từ file: D:\DATN\QA_System\data_analyze\finaldf0.pkl
Processed 981 out of 981 documents.
Initializing Vietnamese embedding model: keepitreal/vietnamese-sbert




[INFO] Collection 'hybrid_demo' exists. Total entities: 27
[INFO] Collection 'base_qa' exists. Total entities: 2
[INFO] Collection 'raptor' exists. Total entities: 1198


In [4]:
utility.list_collections()

['base_qa', 'noraptor', 'hybrid_demo', 'raptor']

In [5]:
check_collection_status(col_name="noraptor")

[INFO] Collection 'noraptor' exists. Total entities: 981
