In [1]:
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType
from pymilvus import utility
connections.connect(
    host="localhost", 
    port="19530"
)
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('keepitreal/vietnamese-sbert')
agent_short_name = 'TranHungDao'

  from tqdm.autonotebook import tqdm, trange


In [2]:
import json

# Đọc file JSON và nạp dữ liệu vào biến
with open("./data/qa/qa_TranHungDao.json", "r", encoding="utf-8") as f:
    documents = json.load(f)

In [15]:
max_question_length = max(len(pair['question']) for pair in documents if pair['question'] is not None)
max_answer_length = max(len(pair['answer']) for pair in documents  if pair['answer'] is not None)

In [16]:
print(max_question_length, max_answer_length)

10247 2922


In [3]:
agent_short_name = 'TranHungDao'

In [5]:
def count_words_in_documents(docs):
    total_words = 0
    for doc in docs:
        for key, text in doc.items():
            total_words += len(text.split())
    return total_words

# Count the total number of words in the documents
total_word_count = count_words_in_documents(documents)
total_word_count

AttributeError: 'NoneType' object has no attribute 'split'

In [6]:
documents[-1]

{'subject': 'Thời trẻ: Tuổi thơ dữ dội',
 'question': 'Mục đích cha nuôi Trần Quốc Tuấn là gì ? Cần đáp án ngắn gọn, xúc tích, và chính xác.',
 'answer': 'Trở thành người có khả năng'}

In [17]:
utility.drop_collection(f"{agent_short_name}_info")

In [18]:

fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    FieldSchema(name="subject", dtype=DataType.VARCHAR, max_length=200),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=3000),
    FieldSchema(name="question", dtype=DataType.VARCHAR, max_length=11000),
    FieldSchema(name="question_vector", dtype=DataType.FLOAT_VECTOR, dim=768), 
    FieldSchema(name="text_vector", dtype=DataType.FLOAT_VECTOR, dim=768), 
    FieldSchema(name="question_text_vector", dtype=DataType.FLOAT_VECTOR, dim=768)]

schema = CollectionSchema(fields=fields,enable_dynamic_field=True)

collection = Collection(name=f"{agent_short_name}_info", schema=schema)

index_params = {
    "metric_type": "IP",
    "params": {},
}

collection.create_index("question_vector", index_params)
collection.create_index("text_vector", index_params)
collection.create_index("question_text_vector", index_params)


entities = []
for i, doc in enumerate(documents): 
    try: 
        text = doc['answer'] if doc['answer'] else ""
        question = doc['question'] if doc['question'] else ""
        subject =  doc['subject'] 
        qt = question + ' ' + text
        doc['id'] = i+1
        doc['question_vector'] = model.encode(question)
        doc['text_vector'] = model.encode(text)
        doc['question_text_vector'] = model.encode(qt)
        entity = {
            "id": doc['id'],
            "subject": subject ,
            "text": text,
            "question": question,
            "question_vector": doc['question_vector'],
            "text_vector": doc['text_vector'], 
            "question_text_vector": doc['question_text_vector'], 
        }
        entities.append(entity)
    except KeyError as e:
        print(f"Missing key {e} in document {doc['id']}")
        continue
    
collection.insert(entities)

(insert count: 1507, delete count: 0, upsert count: 0, timestamp: 453567028105838594, success count: 1507, err count: 0

In [19]:
collection.release()
collection.load()

In [20]:
collection = Collection(name=f"{agent_short_name}_info")