In [1]:
## Configs
# All tunable settings for this notebook live in this cell.
LLM_URL = "http://localhost:12239/v1"  # passed as base_url to openai_complete_if_cache
LLM_AK = "empty"  # passed as api_key; presumably a placeholder for a local server that ignores keys -- TODO confirm
LLM_NAME = "base"  # model name sent with each completion request and given to MiniRAG as llm_model_name
EMBED_MODEL_PATH = "../../resources/open_models/bge-large-zh-v1.5"  # loaded via AutoTokenizer / AutoModel.from_pretrained
DATA_PATH = "../../resources/data/LiHua-World/data/"  # data path (searched recursively for .txt files)
QUERY_PATH = "../../resources/data/LiHua-World/qa/query_set.csv"  # query path
WORKING_DIR = "../../resources/data/rag_outputs" # working directory (created if missing, then handed to MiniRAG)


In [None]:
## Initialization
import os
import csv
import logging
import numpy as np
from functools import partial
from dataclasses import asdict

from tqdm import trange
from minirag.prompt import PROMPTS
from minirag import MiniRAG, QueryParam
from minirag.utils import EmbeddingFunc, list_of_list_to_csv
from minirag.llm import openai_complete_if_cache, hf_embedding

import sys; sys.path.append("../..")
from utils.rag import prompts,retrieval,get_keyword

from transformers import AutoTokenizer, AutoModel
import nest_asyncio; nest_asyncio.apply() # 在notebook中使用async所需

# Merge the templates imported from utils.rag into minirag's PROMPTS mapping.
PROMPTS.update(prompts) 

# Set the log level
logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)

# Load the embedding tokenizer and model from EMBED_MODEL_PATH.
# The tokenizer is configured with model_max_length=512.
embed_tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_PATH, model_max_length=512) 
embed_model = AutoModel.from_pretrained(EMBED_MODEL_PATH)

async def llm_model_func(
    prompt, system_prompt=None, history_messages=None, keyword_extraction=False, **kwargs
) -> str:
    """Send a completion request through ``openai_complete_if_cache``.

    The model name, API key and base URL come from the notebook-level
    ``LLM_NAME``, ``LLM_AK`` and ``LLM_URL`` settings.

    Args:
        prompt: The user prompt.
        system_prompt: Optional system prompt.
        history_messages: Optional list of earlier messages. ``None`` (the
            default) is treated as an empty history.
        keyword_extraction: Accepted but not used by this wrapper. Naming it
            in the signature keeps it out of ``**kwargs`` so it is not
            forwarded to ``openai_complete_if_cache``.
        **kwargs: Forwarded unchanged to ``openai_complete_if_cache``.

    Returns:
        Whatever ``openai_complete_if_cache`` returns (annotated as ``str``).
    """
    # A fresh list per call avoids the shared-mutable-default pitfall.
    if history_messages is None:
        history_messages = []
    return await openai_complete_if_cache(
        model=LLM_NAME, api_key=LLM_AK, base_url=LLM_URL,
        prompt=prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        **kwargs
    )

# Make sure the working directory exists before it is handed to MiniRAG.
os.makedirs(WORKING_DIR,exist_ok=True)

# Build the MiniRAG instance used by the following cells.
rag = MiniRAG(
    working_dir=WORKING_DIR,
    llm_model_func=llm_model_func, llm_model_max_token_size=1000, llm_model_name=LLM_NAME,
    embedding_func=EmbeddingFunc(
        # Embedding width and maximum input length are read from the loaded model's config.
        embedding_dim=embed_model.config.hidden_size,
        max_token_size=embed_model.config.max_position_embeddings,
        # Pre-bind the model and tokenizer to hf_embedding via functools.partial.
        func=partial(hf_embedding, embed_model=embed_model, tokenizer=embed_tokenizer)
    )
)

INFO:minirag:Logger initialized for working directory: ../../resources/data/rag_outputs
INFO:minirag:Load KV llm_response_cache with 20 data
INFO:minirag:Load KV full_docs with 5 data
INFO:minirag:Load KV text_chunks with 5 data
INFO:minirag:Loaded graph from ../../resources/data/rag_outputs/graph_chunk_entity_relation.graphml with 7 nodes, 5 edges
INFO:nano-vectordb:Load (5, 1024) data
INFO:nano-vectordb:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': '../../resources/data/rag_outputs/vdb_entities.json'} 5 data
INFO:nano-vectordb:Load (5, 1024) data
INFO:nano-vectordb:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': '../../resources/data/rag_outputs/vdb_entities_name.json'} 5 data
INFO:nano-vectordb:Load (4, 1024) data
INFO:nano-vectordb:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': '../../resources/data/rag_outputs/vdb_relationships.json'} 4 data
INFO:nano-vectordb:Load (5, 1024) data
INFO:nano-vectordb:Init {'embedding_dim': 1024, 

In [None]:
## Build the index
def find_txt_files(root_path):
    """Recursively collect the paths of all ``.txt`` files under ``root_path``.

    Args:
        root_path: Directory to search. A non-existent directory yields an
            empty list, because ``os.walk`` ignores listing errors by default.

    Returns:
        A sorted list of file paths, each formed by joining the directory
        reported by ``os.walk`` with the file name. Only names ending in
        the lowercase suffix ``.txt`` are included.
    """
    txt_files = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(root_path)
        for file in files
        if file.endswith('.txt')
    ]
    # os.walk reports entries in arbitrary (filesystem-dependent) order;
    # sorting makes the result, and any slice of it, reproducible.
    return sorted(txt_files)

# Collect every .txt file found under DATA_PATH.
WEEK_LIST = find_txt_files(DATA_PATH)
# Only the first five files are inserted into the index.
# enumerate() supplies the position directly, instead of a per-iteration
# list.index() lookup stored in a variable that shadowed the builtin `id`.
for week_idx, WEEK in enumerate(WEEK_LIST[:5]):
    print(f"{week_idx}/{len(WEEK_LIST)}")
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale default encoding.
    with open(WEEK, encoding="utf-8") as f:
        content = f.read()
    # Insert after the file has been closed; the handle need not stay open
    # while the insert runs.
    rag.insert(content)


In [None]:
## example
# End-to-end walk-through for one question: keywords -> retrieval -> prompt -> answer.
query = "What does LiHua predict will happen in \"The Rings of Power\"?"
# get_keyword is given the query, the RAG graph, and the rag dataclass fields as a dict;
# it returns a pair unpacked here as (type_kw, ent_kw).
type_kw,ent_kw = await get_keyword(query,rag.chunk_entity_relation_graph,asdict(rag))
# retrieval is given the query, both keyword results, the graph and three vector stores;
# it returns a pair unpacked here as (ent_recall, chunk_recall).
ent_recall, chunk_recall = await retrieval(
    query,type_kw,ent_kw, 
    rag.chunk_entity_relation_graph,rag.entity_name_vdb,rag.relationships_vdb,rag.chunks_vdb,
)
# Fill the answer template: a header row is prepended to each recall list before
# it is passed through list_of_list_to_csv.
sys_prompt = PROMPTS['sys_prompt_for_rag_answer'].format(
        entities_context=list_of_list_to_csv([["entity", "score", "description"]]+ent_recall), 
        text_units_context=list_of_list_to_csv([["id", "content"]]+chunk_recall)
    )
# Ask the LLM the original question with the retrieved context as the system prompt.
response = await llm_model_func(query, system_prompt=sys_prompt)
# Print every intermediate result followed by the final answer.
print(type_kw,ent_kw,ent_recall,chunk_recall,response)

INFO:httpx:HTTP Request: POST http://localhost:12239/v1/chat/completions "HTTP/1.1 200 OK"


['PERSON'] ['LiHua', 'The Rings of Power'] [['"THE APARTMENT"', 0.44848663, '"The apartment is the specific location that has a broken water tab."|>"specific location, issue location"'], ['"LIHUA"', 0.41985768, '"LiHua is a person initiating communication about Wi-Fi password and house rules."<SEP>"LiHua is a person who reaches out to AdamSmith for the Wi-Fi password and house rules."<SEP>"LiHua is a traveler who has arrived in the city and is planning a lunch meeting with WolfgangSchulz."<SEP>"LiHua is the person who communicates a problem and thanks Adam for help when they offer to assist."<SEP>"LiHua is trying to communicate a problem with a water tab to Adam Smith, indicating her proactive nature and request for assistance."<SEP>"LiHua is the second person in the text who responds to the appointment confirmation from AdamSmith and makes arrangements to be ready for the plumber."<SEP>"LiHua is an individual checking in with AdamSmith about the move-in timing and confirming a visit d

In [None]:
# `retrivals` was never defined or imported in this notebook, so the original
# line raised NameError on a fresh kernel. Show the retrieval results computed
# in the example cell instead.
print(ent_recall, chunk_recall)