In [101]:

api_key = "aXY4NkpKUUJaNVNfUEdnZHdVZ186MExmVk1iclZRWWlrS1hpeDRhOWRGUQ=="
ELASTIC_URL = "https://my-elasticsearch-project-fc9fd1.es.ap-southeast-1.aws.elastic.cloud:443"

In [7]:
import tiktoken
from sentence_transformers import SentenceTransformer
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from transformers import AutoTokenizer
from dotenv import load_dotenv
import os
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from pyvi.ViTokenizer import tokenize
from typing import List, Optional
from langchain_core.embeddings import Embeddings
import numpy as np
# Tạo wrapper class cho SentenceTransformer

class VietnameseEmbeddings(Embeddings):
    """Singleton Embeddings for Vietnamese using SentenceTransformer."""
    _instance: Optional['VietnameseEmbeddings'] = None

    def __new__(cls, model_name: str = "keepitreal/vietnamese-sbert"): #"dangvantuan/vietnamese-embedding" or "keepitreal/vietnamese-sbert"
        # Nếu chưa có instance, tạo mới
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            # Khởi tạo model chỉ một lần
            cls._instance._initialize_model(model_name)
        return cls._instance

    def _initialize_model(self, model_name: str):
        try:
            print(f"Initializing Vietnamese embedding model: {model_name}")
            self.model = SentenceTransformer(model_name)
        except Exception as e:
            print(f"Error initializing embedding model: {e}")
            raise

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode(texts, convert_to_numpy=True).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode(text, convert_to_numpy=True).tolist()
    def embed_text(self, text: str)-> np.ndarray:  #using in RAPTOR (np.ndarray)
        return self.model.encode(text, convert_to_numpy=True)

  from tqdm.autonotebook import tqdm, trange


In [8]:
embeddings = VietnameseEmbeddings()

Initializing Vietnamese embedding model: keepitreal/vietnamese-sbert




In [104]:
from langchain_elasticsearch import ElasticsearchStore

In [105]:
elastic_vector_search = ElasticsearchStore(
    es_url=ELASTIC_URL,
    index_name = "base",
    embedding=embeddings,
    es_api_key = api_key
)

In [106]:
from uuid import uuid4
import pandas as pd
from langchain_core.documents import Document

def load_final_df():
    final_df_path = r"D:\DATN\QA_System\data_analyze\finaldf0.xlsx"
    if os.path.exists(final_df_path):
        final_df = pd.read_excel(final_df_path)
        print(f"Đã load final_df từ file: {final_df_path}")
        return final_df
    else:
        print("File final_df chưa tồn tại.")
        return None
def convert_df_to_documents(final_df):
    documents = []
    for _, row in final_df.iterrows():
        # Handle metadata - convert to dict if it's a string
        metadata = row["metadata"]
        if isinstance(metadata, str):
            source = metadata
        else:
            source = metadata.get("source", "unknown") if isinstance(metadata, dict) else "unknown"

        # Create Document with properly handled metadata
        doc = Document(
            page_content=row["text"],
            metadata={
                "source": source,
                "level": row["level"]
            }
        )
        documents.append(doc)

    # Process documents
    processed_documents = []
    for doc in documents:
        # Handle source field
        if isinstance(doc.metadata["source"], list):
            doc.metadata["source"] = " ".join(doc.metadata["source"]) if doc.metadata["source"] else "..."
        elif doc.metadata["source"] is None:
            doc.metadata["source"] = "..."

        # Skip empty content
        if not doc.page_content.strip():
            print(f"Warning: Page content is empty for document: {doc.metadata['source']}")
            continue

        processed_documents.append(doc)

    print(f"Processed {len(processed_documents)} out of {len(documents)} documents.")
    return processed_documents

final_df = load_final_df()
documents = convert_df_to_documents(final_df)
def add_documents_in_batches(documents, batch_size=100):
    total_documents = len(documents)
    for start_idx in range(0, total_documents, batch_size):
        end_idx = min(start_idx + batch_size, total_documents)
        batch = documents[start_idx:end_idx]
        uuids = [str(uuid4()) for _ in range(len(batch))]
        print(f"Adding documents {start_idx + 1} to {end_idx}...")
        elastic_vector_search.add_documents(documents=batch, ids=uuids)


Đã load final_df từ file: D:\DATN\QA_System\data_analyze\finaldf0.xlsx
Processed 981 out of 981 documents.


In [108]:
batch_size = 100
add_documents_in_batches(documents, batch_size)

Adding documents 1 to 100...
Adding documents 101 to 200...
Adding documents 201 to 300...
Adding documents 301 to 400...
Adding documents 401 to 500...
Adding documents 501 to 600...
Adding documents 601 to 700...
Adding documents 701 to 800...
Adding documents 801 to 900...
Adding documents 901 to 981...


In [19]:
retriever = elastic_vector_search.as_retriever(
    search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.2}
)
retriever.invoke("Stealing from the bank is a crime")

[Document(metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.'),
 Document(metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :('),
 Document(metadata={'source': 'news'}, page_content='The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.'),
 Document(metadata={'source': 'news'}, page_content='The stock market is down 500 points today due to fears of a recession.')]

In [22]:
from langchain_elasticsearch import ElasticsearchRetriever
from typing import List, Optional, Dict

In [None]:
def hybrid_query(search_query: str) -> Dict:
    vector = embeddings.embed_query(search_query)  # same embeddings as for indexing
    return {
        "retriever": {
            "rrf": {
                "retrievers": [
                    {
                        "standard": {
                            "query": {
                                "match": {
                                    "text": search_query,
                                }
                            }
                        }
                    },
                    {
                        "knn": {
                            "field": "vector",
                            "query_vector": vector,
                            "k": 5,
                            "num_candidates": 10,
                        }
                    },
                ]
            }
        }
    }


hybrid_retriever = ElasticsearchRetriever.from_es_params(
    index_name="base",
    body_func=hybrid_query,
    content_field="text",
    url=  ELASTIC_URL,
    api_key = api_key
)

hybrid_retriever.invoke("giám đốc đại học là ai?")

[Document(metadata={'_index': 'base', '_id': '646343bf-060a-4f32-ab1f-b70831cbf17e', '_score': 0.016393442, '_ignored': ['text.keyword'], '_source': {'metadata': {'source': "{'source': 'Trung tâm Nghiên cứu Quốc tế về Trí tuệ nhân tạo.txt', 'id': '1'}", 'level': 0}, 'vector': [0.0737982839345932, -0.0686984732747078, -0.26467686891555786, 0.125117689371109, 0.01067876722663641, 0.4421777129173279, 0.1759449690580368, 0.21361233294010162, -0.15112701058387756, -0.15989230573177338, -0.216285839676857, 0.138368621468544, 0.6064616441726685, 0.030330242589116096, 0.23193198442459106, -0.010087798349559307, 0.503338098526001, 0.2628445625305176, -0.2959718406200409, -0.19173690676689148, 0.12053704261779785, 0.11710254102945328, 0.11125349253416061, 0.5049293637275696, -0.44178637862205505, 0.03628509119153023, 0.08750002086162567, 0.2895301580429077, -0.1871057003736496, 0.013034234754741192, 0.26485300064086914, -0.06918840110301971, 0.04624909162521362, -0.2528960406780243, 0.2167456299

In [11]:
from elasticsearch import Elasticsearch, helpers
import json
from time import time

# Kết nối tới Elasticsearch
es = Elasticsearch(
    "https://my-elasticsearch-project-fc9fd1.es.ap-southeast-1.aws.elastic.cloud:443",
    api_key="aXY4NkpKUUJaNVNfUEdnZHdVZ186MExmVk1iclZRWWlrS1hpeDRhOWRGUQ=="
)

# Tạo chỉ mục với mapping (chỉ tạo nếu chưa tồn tại)
index_name = "faq_data"
mapping = {
    "mappings": {
        "properties": {
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "category": {"type": "keyword"},
            "vector": {"type": "dense_vector", "dims": 768, "similarity": "cosine", "index": True}  # Số chiều của vector embedding
        }
    }
}

# Kiểm tra và tạo chỉ mục nếu chưa tồn tại
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=mapping)

embeddings = VietnameseEmbeddings()

# Đọc dữ liệu từ file JSON và chuẩn bị các tác vụ bulk
json_path = r"D:\DATN\chatbot\code\output.json"
with open(json_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Chuẩn bị các doc cho bulk
actions = []
for record in data:
    question = record["question"]
    embedding = embeddings.embed_query(question)
    doc = {
        "_op_type": "index",  # Chỉ mục bản ghi
        "_index": index_name,
        "_source": {
            "question": question,
            "answer": record["answer"],
            "category": record["category"],
            "vector": embedding
        }
    }
    actions.append(doc)

# Thực hiện bulk index
start_time = time()
helpers.bulk(es, actions)
print(f"Đã chèn {len(data)} bản ghi vào Elasticsearch. Thời gian thực hiện: {time() - start_time:.2f} giây.")


Đã chèn 2560 bản ghi vào Elasticsearch. Thời gian thực hiện: 20.97 giây.


In [4]:
import pandas as pd
import json

# Đọc file Excel với encoding UTF-8
df = pd.read_excel(r"D:\DATN\chatbot\code\final_dataset_qa.xlsx")

# Thêm cột "category" với giá trị mặc định "base"
df['category'] = 'base'
df.drop(columns=["text", "metadata"], inplace=True)
# Chuyển DataFrame thành dictionary
data = df.to_dict(orient='records')

# Ghi dữ liệu vào file JSON với encoding UTF-8
with open('output.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4, ensure_ascii=False)