In [1]:
## Config
# Shared configuration for this notebook: prompt templates plus the model
# names and filesystem paths used by the cells in this file.
from minirag.prompt import PROMPTS
# Add the parent directory to the import path; presumably this is what makes
# the `utils` package below importable -- TODO confirm against the repo layout.
import sys; sys.path.append("..")
from utils.prompts import rag_prompts

PROMPTS.update(rag_prompts) # merge the project's prompt templates into minirag's PROMPTS
LLM_MODEL = "test"  # LLM model name (passed to MiniRAG as llm_model_name)
DATA_PATH = "../resources/data/LiHua-World/data/"  # root directory of the source data
QUERY_PATH = "../resources/data/LiHua-World/qa/query_set.csv"  # path of the query set CSV
WORKING_DIR = "../resources/data/rag_outputs" # MiniRAG working directory
OUTPUT_PATH = "./logs/Default_output.csv"  # output CSV path
EMBED_MODEL_PATH = "../resources/open_models/bge-large-zh-v1.5" # local path of the embedding model

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
## 初始化
import os
import csv
from tqdm import trange
from minirag import MiniRAG, QueryParam
from minirag.llm import openai_complete_if_cache, hf_embedding
from minirag.utils import EmbeddingFunc
import logging
import numpy as np
from transformers import AutoTokenizer, AutoModel
import nest_asyncio; nest_asyncio.apply() # 在notebook中使用async所需

# Configure root logging: INFO level, "LEVEL:message" line format.
logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(message)s")

# Load the embedding model and its tokenizer from the same local path.
# The tokenizer is created with model_max_length=512.
embed_model = AutoModel.from_pretrained(EMBED_MODEL_PATH)
embed_tokenizer = AutoTokenizer.from_pretrained(
    EMBED_MODEL_PATH,
    model_max_length=512,
)

async def llm_model_func(
    prompt, system_prompt=None, history_messages=None, keyword_extraction=False, **kwargs
) -> str:
    """Send one completion request through ``openai_complete_if_cache``.

    The request targets the endpoint hard-coded below
    (``http://localhost:12239/v1``) with the placeholder API key ``"empty"``.

    Args:
        prompt: Prompt text, forwarded unchanged.
        system_prompt: Optional system prompt, forwarded unchanged.
        history_messages: Earlier conversation messages. ``None`` (the
            default) means no history; a fresh empty list is used per call.
        keyword_extraction: Accepted but not forwarded to
            ``openai_complete_if_cache``.
        **kwargs: Extra keyword arguments passed straight through to
            ``openai_complete_if_cache``.

    Returns:
        The awaited result of ``openai_complete_if_cache``.
    """
    # A literal `[]` default is built once at definition time and shared by
    # every call, so any in-place mutation downstream would leak history
    # from one request into the next. Build a new list per call instead.
    if history_messages is None:
        history_messages = []

    # NOTE(review): the model name sent to the server is the literal "base",
    # not the LLM_MODEL constant from the config cell -- confirm this is intended.
    return await openai_complete_if_cache(
        model="base",
        api_key="empty",
        base_url="http://localhost:12239/v1",
        prompt=prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        **kwargs
    )

async def embedding_func(texts: list[str]) -> np.ndarray:
    """Embed ``texts`` using the module-level ``embed_model`` and ``embed_tokenizer``.

    Args:
        texts: Strings to embed; passed positionally to ``hf_embedding``.

    Returns:
        The awaited result of ``hf_embedding``.
    """
    vectors = await hf_embedding(
        texts,
        tokenizer=embed_tokenizer,
        embed_model=embed_model,
    )
    return vectors

# Create the working directory if it does not exist yet.
os.makedirs(WORKING_DIR, exist_ok=True)

# Build the MiniRAG instance. The embedding dimension and maximum token size
# are read from the loaded embedding model's config rather than hard-coded.
rag = MiniRAG(
    working_dir=WORKING_DIR,
    llm_model_name=LLM_MODEL,
    llm_model_func=llm_model_func,
    llm_model_max_token_size=1000,
    embedding_func=EmbeddingFunc(
        func=embedding_func,
        embedding_dim=embed_model.config.hidden_size,
        max_token_size=embed_model.config.max_position_embeddings,
    ),
)

INFO:minirag:Logger initialized for working directory: ../resources/data/rag_outputs
INFO:minirag:Load KV llm_response_cache with 14 data
INFO:minirag:Load KV full_docs with 3 data
INFO:minirag:Load KV text_chunks with 3 data
INFO:minirag:Loaded graph from ../resources/data/rag_outputs/graph_chunk_entity_relation.graphml with 6 nodes, 3 edges
INFO:nano-vectordb:Load (5, 1024) data
INFO:nano-vectordb:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': '../resources/data/rag_outputs/vdb_entities.json'} 5 data
INFO:nano-vectordb:Load (5, 1024) data
INFO:nano-vectordb:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': '../resources/data/rag_outputs/vdb_entities_name.json'} 5 data
INFO:nano-vectordb:Load (2, 1024) data
INFO:nano-vectordb:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': '../resources/data/rag_outputs/vdb_relationships.json'} 2 data
INFO:nano-vectordb:Load (5, 1024) data
INFO:nano-vectordb:Init {'embedding_dim': 1024, 'metric': 'cosi

In [5]:
## Build the index
def find_txt_files(root_path):
    """Recursively collect the paths of all ``.txt`` files under ``root_path``.

    Args:
        root_path: Directory to walk with ``os.walk``.

    Returns:
        A sorted list of paths, each built as ``os.path.join(root, name)``.
        The list is empty when nothing matches.
    """
    txt_files = [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(root_path)
        for name in files
        if name.endswith('.txt')
    ]
    # os.walk yields directory entries in arbitrary, filesystem-dependent
    # order; sort so that slicing the result selects the same files on
    # every machine and every run.
    txt_files.sort()
    return txt_files

WEEK_LIST = find_txt_files(DATA_PATH)
# Only the first five files are inserted; the progress line still shows the
# total number of files that were found.
for idx, week_path in enumerate(WEEK_LIST[:5]):
    # enumerate supplies the position directly: no linear WEEK_LIST.index()
    # lookup per iteration and no shadowing of the builtin `id`.
    print(f"{idx}/{len(WEEK_LIST)}")
    # Read with an explicit encoding so the result does not depend on the
    # platform default -- assumes the data files are UTF-8; TODO confirm.
    with open(week_path, encoding="utf-8") as f:
        content = f.read()
    # Insert after the file has been closed; the handle is not needed here.
    rag.insert(content)


INFO:minirag:[New Docs] inserting 1 docs
INFO:minirag:[New Chunks] inserting 1 chunks
INFO:minirag:Inserting 1 vectors to chunks
INFO:minirag:[Entity Extraction]...


0/442
1/442
2/442
3/442


INFO:httpx:HTTP Request: POST http://localhost:12237/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:12237/v1/chat/completions "HTTP/1.1 200 OK"


⠙ Processed 1 chunks, 2 entities(duplicated), 1 relations(duplicated)

INFO:minirag:Inserting 2 vectors to entities
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
INFO:minirag:Inserting 2 vectors to entities_name
INFO:minirag:Inserting 1 vectors to relationships
INFO:minirag:Writing graph with 6 nodes, 3 edges
INFO:minirag:[New Docs] inserting 1 docs
INFO:minirag:[New Chunks] inserting 1 chunks
INFO:minirag:Inserting 1 vectors to chunks



4/442


INFO:minirag:[Entity Extraction]...
INFO:httpx:HTTP Request: POST http://localhost:12237/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:12237/v1/chat/completions "HTTP/1.1 200 OK"


⠙ Processed 1 chunks, 2 entities(duplicated), 3 relations(duplicated)

INFO:minirag:Inserting 2 vectors to entities
INFO:minirag:Inserting 2 vectors to entities_name
INFO:minirag:Inserting 3 vectors to relationships
INFO:minirag:Writing graph with 7 nodes, 5 edges





In [6]:
## Query and answer
query = "What does LiHua predict will happen in \"The Rings of Power\"?"
# Run the query in "mini" mode, then delete every newline and carriage return
# so the answer prints as a single line.
# NOTE(review): line breaks are removed, not replaced by spaces, so words on
# either side of a break are joined -- confirm this is intended.
answer = rag.query(query, param=QueryParam(mode="mini"))
answer = answer.translate(str.maketrans("", "", "\n\r"))
print(answer)

INFO:httpx:HTTP Request: POST http://localhost:12237/v1/chat/completions "HTTP/1.1 200 OK"


LiHua's prediction about events in "The Rings of Power" is not provided in the given information. The provided data mainly covers a dialogue on the issue of a broken water tap in an apartment, the confirmation that a plumber will arrive, and the interaction with Adam Smith who provides assistance and updates. There is no mention or indication regarding any predictions or storyline developments in "The Rings of Power" from which to derive LiHua's forecast.
