In [1]:
from core.database.database import Database
from core.config.config import get_config

In [2]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (python-dotenv); the LLM setup cell below reads its credentials with os.getenv.
load_dotenv()

True

In [3]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI

# Global LLM used by LlamaIndex. Temperature 0.0 is requested for the chat
# model; the credentials are read from the environment variables named
# "api_key" and "api_version" (os.getenv yields None when one is unset).
llm_options = {
    "model": "gpt-3.5-turbo",
    "temperature": 0.0,
    "api_key": os.getenv("api_key"),
    "api_version": os.getenv("api_version"),
}
Settings.llm = OpenAI(**llm_options)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from llama_index.embeddings.huggingface_optimum import OptimumEmbedding
from pathlib import Path
from transformers import AutoTokenizer

# Embedding model folder: config value Path/model joined with Model/embed.
embed_path = str(Path(get_config("Path", "model")) / get_config("Model", "embed"))

tokenizer = AutoTokenizer.from_pretrained(embed_path)
# Only these two inputs are declared for the model.
# NOTE(review): presumably this keeps token_type_ids out of the encoded batch
# for the exported model - confirm.
tokenizer.model_input_names = ["input_ids", "attention_mask"]

# Global embedding model used by LlamaIndex: mean pooling, inputs capped at
# 256 tokens, executed on CPU.
Settings.embed_model = OptimumEmbedding(
    folder_name=embed_path,
    tokenizer=tokenizer,
    pooling="mean",
    max_length=256,
    device="cpu",
)

2024-05-15 00:38:29.023910: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-15 00:38:29.054432: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-15 00:38:29.054467: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-15 00:38:29.055729: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-15 00:38:29.061277: I tensorflow/core/platform/cpu_feature_guar

In [5]:
# Build the Database from the two configured locations: the keyword arguments
# are path_indomain and path_outdomain, each read from the "Path" config section
# under the matching key.
database = Database(**{
    f"path_{domain}": get_config("Path", domain)
    for domain in ("indomain", "outdomain")
})

[32m00:38:31 [ INFO      ] CHATBOT_UIT: [0m [34mReading file[0m
[32m00:38:31 [ INFO      ] CHATBOT_UIT: [0m [34mReading share knowledge[0m
[32m00:38:31 [ INFO      ] CHATBOT_UIT: [0m [34mReading answer database[0m
  warn(msg)
[32m00:38:31 [ INFO      ] CHATBOT_UIT: [0m [34mReading product[0m
  warn(msg)


In [6]:
from core.utils.preprocessing import clean_text

# Nested lookup assembled from the answer and question tables:
#   data[intent][pattern] = {'answer': <answer text>, 'question': [<cleaned question>, ...]}
data = {}

# Pass 1: one entry per answer row. The intent is the first "|"-separated field
# of the pattern template; a repeated pattern keeps the last answer seen.
for _, row in database.answer.iterrows():
    pattern = row["Pattern Template"]
    intent = pattern.split("|")[0]

    data.setdefault(intent, {})[pattern] = {
        'answer': row["Answer"],
        'question': [],
    }

# Pass 2: attach every cleaned question to its (intent, pattern) entry.
for _, row in database.question.iterrows():
    entry = data.get(row['Intent'], {}).get(row["Pattern Template"])

    if entry is None:
        # No answer was registered for this intent/pattern pair: skip the row.
        continue

    entry['question'].append(
        clean_text(row["Question"], database.synonyms_dictionary, tokenizer=False)
    )

# Pass 3: drop duplicate questions while keeping first-seen order.
# dict.fromkeys is used instead of set() because set iteration order over
# strings depends on the per-process hash seed, which would make the pickled
# data and the order of the nodes built from it differ between kernel runs.
for patterns in data.values():
    for entry in patterns.values():
        entry['question'] = list(dict.fromkeys(entry['question']))

In [7]:
import joblib, pickle
from os import path

# Persist the cleaned intent/pattern/question mapping as clean_data.pkl inside
# the configured data directory, using the highest available pickle protocol.
clean_data_path = path.join(get_config("Path", "data"), "clean_data.pkl")
joblib.dump(data, clean_data_path, protocol=pickle.HIGHEST_PROTOCOL)

['Data/clean_data.pkl']

In [8]:
from llama_index.core.schema import TextNode, IndexNode

# nodes_intent: intent -> flat list of every node created for that intent.
# nodes_dict:   intent -> {node_id: node} over that same list.
nodes_intent = {}
nodes_dict = {}

for intent, pattern_data in data.items():
    intent_nodes = []

    for pattern, entry in pattern_data.items():
        # The answer text is the only TextNode; its pattern is kept as metadata
        # but listed in excluded_llm_metadata_keys.
        answer_node = TextNode(
            text=entry['answer'],
            metadata={"pattern": pattern},
            excluded_llm_metadata_keys=["pattern"],
        )
        intent_nodes.append(answer_node)

        # Every alternative text (the pattern, the intent name, then each
        # cleaned question) becomes an IndexNode whose index_id is the answer
        # node's id.
        for alias_text in (pattern, intent, *entry['question']):
            intent_nodes.append(
                IndexNode(text=alias_text, index_id=answer_node.node_id)
            )

    nodes_intent[intent] = intent_nodes
    nodes_dict[intent] = {n.node_id: n for n in intent_nodes}

# Displayed as the cell output: number of intents that received nodes.
len(nodes_intent)

11

In [9]:
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.storage.docstore import SimpleDocumentStore
import chromadb

In [10]:
from typing import Tuple, Optional
from llama_index.core.schema import BaseNode
from llama_index.core.storage.docstore.types import RefDocInfo

class CustomDocumentStore(SimpleDocumentStore):
    """SimpleDocumentStore variant that produces no key/value pairs for IndexNode.

    Any other node type is handled by the parent implementation unchanged.
    """

    def _get_kv_pairs_for_insert(
        self, node: BaseNode, ref_doc_info: Optional[RefDocInfo], store_text: bool
    ) -> Tuple[
        Optional[Tuple[str, dict]],
        Optional[Tuple[str, dict]],
        Optional[Tuple[str, dict]],
    ]:
        """Return the three insert pairs for ``node``, or all ``None`` for an IndexNode.

        NOTE(review): presumably the caller skips ``None`` pairs, so IndexNodes
        never reach the docstore - confirm against the installed llama_index.
        """
        if not isinstance(node, IndexNode):
            return super()._get_kv_pairs_for_insert(node, ref_doc_info, store_text)

        return None, None, None

In [11]:
from os import path
from unidecode import unidecode
import joblib, pickle

# Directory holding the persistent Chroma database, the per-intent storage
# contexts and nodes_dict.pkl.
index_dir = get_config("Path", "Index")
chroma_client = chromadb.PersistentClient(index_dir)

for intent, nodes in nodes_intent.items():
    # Collection and sub-directory names are the lower-cased, ASCII-transliterated
    # intent; get_retriever applies the same normalisation when loading.
    collection_name = unidecode(intent.lower())

    chroma_collection = chroma_client.get_or_create_collection(collection_name)

    # The Chroma client is persistent, while the nodes get fresh ids every time
    # they are rebuilt and nodes_dict.pkl is overwritten below. Vectors left over
    # from an earlier run would therefore reference ids that no longer exist in
    # nodes_dict, so clear the collection before indexing to keep re-runs
    # idempotent.
    stale_ids = chroma_collection.get(include=[])["ids"]
    if stale_ids:
        chroma_collection.delete(ids=stale_ids)

    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    vector_store.stores_text = False

    storage_context = StorageContext.from_defaults(
        vector_store=vector_store,
        docstore=CustomDocumentStore(),
    )

    # Building the index embeds the nodes and writes them to the vector store.
    index = VectorStoreIndex(
        nodes=nodes,
        storage_context=storage_context,
        show_progress=True,
        insert_batch_size=256,
    )

    storage_context.persist(path.join(index_dir, collection_name))

# Save the id -> node mapping (keyed by the original intent names) next to the
# indices; the retrieval cell passes it to RecursiveRetriever as node_dict.
joblib.dump(nodes_dict, path.join(index_dir, "nodes_dict.pkl"), protocol=pickle.HIGHEST_PROTOCOL)

[32m00:38:49 [ INFO      ] chromadb.telemetry.product.posthog: [0m [34mAnonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.[0m
Generating embeddings: 100%|██████████| 256/256 [00:08<00:00, 29.85it/s]
Generating embeddings: 100%|██████████| 256/256 [00:08<00:00, 28.72it/s]
Generating embeddings: 100%|██████████| 74/74 [00:04<00:00, 18.37it/s]
Generating embeddings: 100%|██████████| 256/256 [00:02<00:00, 123.61it/s]
Generating embeddings: 100%|██████████| 256/256 [00:01<00:00, 144.30it/s]
Generating embeddings: 100%|██████████| 84/84 [00:00<00:00, 118.06it/s]
Generating embeddings: 100%|██████████| 256/256 [00:02<00:00, 126.51it/s]
Generating embeddings: 100%|██████████| 256/256 [00:01<00:00, 128.70it/s]
Generating embeddings: 100%|██████████| 256/256 [00:02<00:00, 92.41it/s] 
Generating embeddings: 100%|██████████| 256/256 [00:02<00:00, 97.97it/s] 
Generating embeddings: 100%|██████████| 82/82 [00:00<00:00, 116.18it/s]
Gene

['Index/nodes_dict.pkl']

In [12]:
def get_retriever(intent, **kwargs):
    """Return a retriever over the persisted vector index of one intent.

    Args:
        intent: Intent name; it is lower-cased and transliterated with
            ``unidecode`` to obtain the Chroma collection name and the
            persisted storage sub-directory.
        **kwargs: Forwarded unchanged to ``VectorStoreIndex.as_retriever``
            (for example ``similarity_top_k``).

    Returns:
        The object produced by ``index.as_retriever(**kwargs)``.
    """
    normalized = unidecode(intent.lower())
    persist_dir = path.join(get_config("Path", "Index"), normalized)

    collection = chroma_client.get_or_create_collection(normalized)
    vector_store = ChromaVectorStore(chroma_collection=collection)

    storage_context = StorageContext.from_defaults(
        vector_store=vector_store,
        persist_dir=persist_dir,
    )

    # No nodes are passed: the index is only a handle on the existing stores.
    index = VectorStoreIndex(nodes=[], storage_context=storage_context)

    return index.as_retriever(**kwargs)

In [13]:
from llama_index.core.retrievers import RecursiveRetriever

# Smoke test: query one intent's index through a RecursiveRetriever.
intent = "hỏi_đáp_điểm_chuẩn"
sample_query = "Điểm chuẩn ngành Khoa học máy tính 2022"

vector_retriever = get_retriever(intent, similarity_top_k=10)

# "vector" is the root retriever id; node_dict is this intent's id -> node
# mapping, keyed by the original (non-transliterated) intent name.
retriever = RecursiveRetriever(
    root_id="vector",
    retriever_dict={"vector": vector_retriever},
    node_dict=nodes_dict[intent],
    verbose=True,
)

retriever.retrieve(sample_query)

[1;3;34mRetrieving with query id None: Điểm chuẩn ngành Khoa học máy tính 2022
[0m[1;3;38;5;200mRetrieved node with id, entering: 6896ce97-cf8f-4dad-a0e6-22a50604ecb0
[0m[1;3;34mRetrieving with query id 6896ce97-cf8f-4dad-a0e6-22a50604ecb0: Điểm chuẩn ngành Khoa học máy tính 2022
[0m[1;3;38;5;200mRetrieved node with id, entering: b96a06d4-1cce-4aaf-8dee-993119efafd2
[0m[1;3;34mRetrieving with query id b96a06d4-1cce-4aaf-8dee-993119efafd2: Điểm chuẩn ngành Khoa học máy tính 2022
[0m[1;3;38;5;200mRetrieved node with id, entering: fa34cd77-6b6b-44c5-809f-4255c40e5c75
[0m[1;3;34mRetrieving with query id fa34cd77-6b6b-44c5-809f-4255c40e5c75: Điểm chuẩn ngành Khoa học máy tính 2022
[0m[1;3;38;5;200mRetrieved node with id, entering: bfeecf33-4ad5-4e56-bf5a-3422e55d5a05
[0m[1;3;34mRetrieving with query id bfeecf33-4ad5-4e56-bf5a-3422e55d5a05: Điểm chuẩn ngành Khoa học máy tính 2022
[0m

[NodeWithScore(node=TextNode(id_='6896ce97-cf8f-4dad-a0e6-22a50604ecb0', embedding=None, metadata={'pattern': 'hỏi_đáp_điểm_chuẩn|dgnl|khmt|năm_2022'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=['pattern'], relationships={}, text='Điểm chuẩn ĐGNL năm 2022 ngành Khoa học máy tính là 888', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.7072114388823445),
 NodeWithScore(node=TextNode(id_='b96a06d4-1cce-4aaf-8dee-993119efafd2', embedding=None, metadata={'pattern': 'hỏi_đáp_điểm_chuẩn|thpt|khmt|năm_2021'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=['pattern'], relationships={}, text='điểm chuẩn ngành Khoa học máy tính năm 2021 là 27.3', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.7024271121756573),
 NodeWithScore(node=TextNode(id_='fa34cd77