启动LLM及文本嵌入模型的vllm服务

vllm serve resources/open_models/Qwen2.5-3B-Instruct --trust-remote-code \
--served-model-name llm \
--max-model-len 10000 --max-num-seqs 16 \
--tensor-parallel-size 4 --pipeline-parallel-size 2 --gpu-memory-utilization 0.15 \
--quantization fp8 \
--port 12236

vllm serve resources/open_models/bge-large-zh-v1.5 --trust-remote-code \
--served-model-name ebd \
--tensor-parallel-size 8 \
--task embedding \
--port 12237

In [7]:
import os
import csv
from tqdm import trange
from minirag import MiniRAG, QueryParam
from minirag.llm import openai_complete_if_cache, openai_embedding
from minirag.utils import EmbeddingFunc
import logging
import numpy as np
from transformers import AutoTokenizer
import nest_asyncio; nest_asyncio.apply() # 在notebook中使用async所需

# 设置日志级别
logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)

async def llm_model_func(
    prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
) -> str:
    return await openai_complete_if_cache(
        model="llm",
        api_key="empty",
        base_url="http://localhost:12236/v1",
        prompt=prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        **kwargs
    )

async def embedding_func(texts: list[str]) -> np.ndarray:
    return await openai_embedding(
        texts,
        model="ebd",
        api_key="empty",
        base_url="http://localhost:12237/v1"
    )

LLM_MODEL = "test"  # 模型名称
DATA_PATH = "../resources/data/cleaned/LiHua-World/data/"  # 数据路径
QUERY_PATH = "../resources/data/cleaned/LiHua-World/qa/query_set.csv"  # 查询路径
WORKING_DIR = "./rag_ouputs" # 工作目录
OUTPUT_PATH = "./logs/Default_output.csv"  # 输出路径
ENCODER = AutoTokenizer.from_pretrained("../resources/open_models/bge-large-zh-v1.5") # 用于检查token长度


if not os.path.exists(WORKING_DIR):
    os.mkdir(WORKING_DIR)

rag = MiniRAG(
    working_dir=WORKING_DIR,
    llm_model_func=llm_model_func,
    llm_model_max_token_size=200,
    llm_model_name=LLM_MODEL,
    embedding_func=EmbeddingFunc(
        embedding_dim=1024,
        max_token_size=512,
        func=embedding_func
    )
)

# Now indexing
def find_txt_files(root_path):
    txt_files = []
    for root, dirs, files in os.walk(root_path):
        for file in files:
            if file.endswith('.txt'):
                txt_files.append(os.path.join(root, file))
    return txt_files

WEEK_LIST = find_txt_files(DATA_PATH)
for WEEK in WEEK_LIST:
    id = WEEK_LIST.index(WEEK)
    print(f"{id}/{len(WEEK_LIST)}")
    with open(WEEK) as f:
        content = f.read()
        input_ids = ENCODER.encode(content)
        truncated_content = ENCODER.decode(input_ids[:512])
        print(len(truncated_content),len(ENCODER.encode(truncated_content)))
        rag.insert(truncated_content)

# A toy query
query = "What does LiHua predict will happen in \"The Rings of Power\"?"
answer = rag.query(query, param=QueryParam(mode="mini")).replace("\n", "").replace("\r", "")
print(answer)

INFO:minirag:Logger initialized for working directory: ./rag_ouputs
INFO:minirag:Load KV llm_response_cache with 24 data
INFO:minirag:Load KV full_docs with 8 data
INFO:minirag:Load KV text_chunks with 8 data
INFO:minirag:Loaded graph from ./rag_ouputs/graph_chunk_entity_relation.graphml with 45 nodes, 22 edges
INFO:nano-vectordb:Load (39, 1024) data
INFO:nano-vectordb:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': './rag_ouputs/vdb_entities.json'} 39 data
INFO:nano-vectordb:Load (39, 1024) data
INFO:nano-vectordb:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': './rag_ouputs/vdb_entities_name.json'} 39 data
INFO:nano-vectordb:Load (20, 1024) data
INFO:nano-vectordb:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': './rag_ouputs/vdb_relationships.json'} 20 data
INFO:nano-vectordb:Load (12, 1024) data
INFO:nano-vectordb:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': './rag_ouputs/vdb_chunks.json'} 12 data
INFO:minirag:[N

0/442
1107 345


INFO:minirag:[Entity Extraction]...


⠙ Processed 1 chunks, 6 entities(duplicated), 0 relations(duplicated)

INFO:minirag:Writing graph with 45 nodes, 22 edges
INFO:minirag:[New Docs] inserting 1 docs
INFO:minirag:[New Chunks] inserting 1 chunks
INFO:minirag:Inserting 1 vectors to chunks
INFO:httpx:HTTP Request: POST http://localhost:12237/v1/embeddings "HTTP/1.1 200 OK"
INFO:minirag:[Entity Extraction]...



1/442
1087 370
2/442
1108 345
⠙ Processed 1 chunks, 4 entities(duplicated), 1 relations(duplicated)

INFO:minirag:Inserting 4 vectors to entities
INFO:httpx:HTTP Request: POST http://localhost:12237/v1/embeddings "HTTP/1.1 200 OK"
INFO:minirag:Inserting 4 vectors to entities_name





INFO:httpx:HTTP Request: POST http://localhost:12237/v1/embeddings "HTTP/1.1 200 OK"
INFO:minirag:Inserting 1 vectors to relationships
INFO:httpx:HTTP Request: POST http://localhost:12237/v1/embeddings "HTTP/1.1 400 Bad Request"
INFO:minirag:Writing graph with 45 nodes, 22 edges


BadRequestError: Error code: 400 - {'object': 'error', 'message': "This model's maximum context length is 512 tokens. However, you requested 851 tokens in the input for embedding generation. Please reduce the length of the input.", 'type': 'BadRequestError', 'param': None, 'code': 400}