In [2]:
import json
from glob import glob
from pymilvus import model as milvus_model
from pymilvus import MilvusClient
from tqdm import tqdm
from openai import OpenAI
import os

# 从环境变量获取 DeepSeek API Key
api_key = os.getenv("DEEPSEEK_API_KEY")
deepseek_client = OpenAI(
    api_key=api_key,
    base_url="https://api.deepseek.com/v1",  # DeepSeek API 的基地址
)

text_lines = []

for file_path in glob("mfd.md", recursive=True):
    with open(file_path, "r") as file:
        file_text = file.read()

    text_lines += file_text.split("** ")

embedding_model = milvus_model.DefaultEmbeddingFunction()
# 测试
test_embedding = embedding_model.encode_queries(["This is a test"])[0]
embedding_dim = len(test_embedding)
print(embedding_dim)
print(test_embedding[:10])
# 创建客户发的
milvus_client = MilvusClient(uri="./milvus_mfd.db")

collection_name = "mfd_collection"
if milvus_client.has_collection(collection_name):
    print(f"存在{collection_name}")
    milvus_client.drop_collection(collection_name)
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=embedding_dim,
    metric_type="IP",  # 内积距离
    consistency_level="Strong",  # 支持的值为 (`"Strong"`, `"Session"`, `"Bounded"`, `"Eventually"`)。更多详情请参见 https://milvus.io/docs/consistency.md#Consistency-Level。
)
# 插入数据
print("插入开始")
data = []
doc_embeddings = embedding_model.encode_documents(text_lines)
for i, line in enumerate(tqdm(text_lines, desc="Creating embeddings")):
    data.append({"id": i, "vector": doc_embeddings[i], "text": line})
milvus_client.insert(collection_name=collection_name, data=data)
print("插入成功")
# 检索数据
print("question1--------------------------------")
question = "物权登记机构应当履行?"
search_res = milvus_client.search(
    collection_name=collection_name,
    data=embedding_model.encode_queries(
        [question]
    ),  # 将问题转换为嵌入向量
    limit=3,  # 返回前3个结果
    search_params={"metric_type": "IP", "params": {}},  # 内积距离
    output_fields=["text"],  # 返回 text 字段
)

retrieved_lines_with_distances = [
    (res["entity"]["text"], res["distance"]) for res in search_res[0]
]
print(json.dumps(retrieved_lines_with_distances, indent=4))

context = "\n".join(
    [line_with_distance[0] for line_with_distance in retrieved_lines_with_distances]
)

SYSTEM_PROMPT = """
Human: 你是一个 AI 助手。你能够从提供的上下文段落片段中找到问题的答案。
"""
USER_PROMPT = f"""
请使用以下用 <context> 标签括起来的信息片段来回答用 <question> 标签括起来的问题。最后追加原始回答的中文翻译，并用 <translated>和</translated> 标签标注。
<context>
{context}
</context>
<question>
{question}
</question>
<translated>
</translated>
"""
response = deepseek_client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
)
print(response.choices[0].message.content)

768
[-0.04836059  0.07163021 -0.01130063 -0.03789341 -0.03320651 -0.01318453
 -0.03041721 -0.02269495 -0.02317858 -0.00426026]
存在mfd_collection
插入开始


Creating embeddings: 100%|██████████| 388/388 [00:00<00:00, 362528.39it/s]


插入成功
question1--------------------------------
[
    [
        "\u4e0d\u52a8\u4ea7\u767b\u8bb0\u7c3f\u7531\u767b\u8bb0\u673a\u6784\u7ba1\u7406\u3002\n\u4e0d\u52a8\u4ea7\u767b\u8bb0\u7c3f\u5e94\u5f53\u91c7\u7528\u7eb8\u8d28\u5f62\u5f0f\u6216\u8005\u7535\u5b50\u5f62\u5f0f\u3002\n\u4e0d\u52a8\u4ea7\u767b\u8bb0\u7c3f\u91c7\u7528\u7535\u5b50\u5f62\u5f0f\u7684\uff0c\u5e94\u5f53\u5907\u4efd\u3002\n\n**\u7b2c\u4e8c\u767e\u4e00\u5341\u516d\u6761",
        0.6870908737182617
    ],
    [
        "\u5171\u540c\u5171\u6709\u4eba\u5bf9\u5171\u6709\u7684\u4e0d\u52a8\u4ea7\u6216\u8005\u52a8\u4ea7\u5171\u540c\u4eab\u6709\u6240\u6709\u6743\u3002\n\u5171\u540c\u5171\u6709\u4eba\u5bf9\u5171\u6709\u8d22\u4ea7\u4eab\u6709\u5e73\u7b49\u7684\u5360\u6709\u3001\u4f7f\u7528\u3001\u6536\u76ca\u548c\u5904\u5206\u7684\u6743\u5229\u3002\n\n**\u7b2c\u4e09\u767e\u4e00\u5341\u4e09\u6761",
        0.6671562194824219
    ],
    [
        "\u4e1a\u4e3b\u5e94\u5f53\u9075\u5b88\u6cd5\u5f8b\u3001\u6cd5\u89c4\u4ee5\u53ca\u7b

In [16]:
question = "共同共有人对共有财产享有什么权利"
search_res_1 = milvus_client.search(
    collection_name=collection_name,
    data=embedding_model.encode_queries(
        [question]
    ),  # 将问题转换为嵌入向量
    limit=3,  # 返回前3个结果
    search_params={"metric_type": "IP", "params": {}},  # 内积距离
    output_fields=["text"],  # 返回 text 字段
)
retrieved_lines_with_distances_1 = [
    (res["entity"]["text"], res["distance"]) for res in search_res_1[0]
]
print(json.dumps(retrieved_lines_with_distances_1, indent=4))

context = "\n".join(
    [line_with_distance[0] for line_with_distance in retrieved_lines_with_distances_1]
)

SYSTEM_PROMPT = """
Human: 你是一个 AI 助手。你能够从提供的上下文段落片段中找到问题的答案。
"""
USER_PROMPT = f"""
请使用以下用 <context> 标签括起来的信息片段来回答用 <question> 标签括起来的问题。最后追加原始回答的中文翻译，并用 <translated>和</translated> 标签标注。
<context>
{context}
</context>
<question>
{question}
</question>
<translated>
</translated>
"""
response = deepseek_client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
)
print(response.choices[0].message.content)

[
    [
        "\u4e0d\u52a8\u4ea7\u767b\u8bb0\u7c3f\u7531\u767b\u8bb0\u673a\u6784\u7ba1\u7406\u3002\n\u4e0d\u52a8\u4ea7\u767b\u8bb0\u7c3f\u5e94\u5f53\u91c7\u7528\u7eb8\u8d28\u5f62\u5f0f\u6216\u8005\u7535\u5b50\u5f62\u5f0f\u3002\n\u4e0d\u52a8\u4ea7\u767b\u8bb0\u7c3f\u91c7\u7528\u7535\u5b50\u5f62\u5f0f\u7684\uff0c\u5e94\u5f53\u5907\u4efd\u3002\n\n**\u7b2c\u4e8c\u767e\u4e00\u5341\u516d\u6761",
        0.6827887892723083
    ],
    [
        "\u5171\u540c\u5171\u6709\u4eba\u5bf9\u5171\u6709\u7684\u4e0d\u52a8\u4ea7\u6216\u8005\u52a8\u4ea7\u5171\u540c\u4eab\u6709\u6240\u6709\u6743\u3002\n\u5171\u540c\u5171\u6709\u4eba\u5bf9\u5171\u6709\u8d22\u4ea7\u4eab\u6709\u5e73\u7b49\u7684\u5360\u6709\u3001\u4f7f\u7528\u3001\u6536\u76ca\u548c\u5904\u5206\u7684\u6743\u5229\u3002\n\n**\u7b2c\u4e09\u767e\u4e00\u5341\u4e09\u6761",
        0.6824849843978882
    ],
    [
        "\u4e1a\u4e3b\u5e94\u5f53\u9075\u5b88\u6cd5\u5f8b\u3001\u6cd5\u89c4\u4ee5\u53ca\u7ba1\u7406\u89c4\u7ea6\u3002\n\u7ba1\u7406\u89c4\