In [4]:
import os
from openai import OpenAI
from pymilvus import model as milvus_model
from pymilvus import MilvusClient
from tqdm import tqdm
import json
from sentence_transformers import SentenceTransformer

api_key = os.getenv("DEEPSEEK_API_KEY")

In [5]:
text_lines = []
with open("mfd.md", "r") as f:
    file_text = f.read()
    text_lines += file_text.split("\n")
    #text_lines += file_text.split("\n")

len(text_lines)

1113

In [6]:
deepseek_client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com/v1")

#embedding_model = milvus_model.DefaultEmbeddingFunction()
model_name = "Qwen/Qwen3-Embedding-0.6B"
model = SentenceTransformer(model_name)
embedding_model = SentenceTransformer(model_name)
#test_embedding = embedding_model.encode_queries(["这是一个测试句子", "这是另一个示例文本"])[0]
test_embedding = embedding_model.encode(["这是一个测试句子", "这是另一个示例文本"])[0]
embedding_dim = len(test_embedding)

test_embedding[:10]

array([-0.02845293, -0.07847408, -0.0102826 , -0.05129922,  0.0151234 ,
        0.05308735, -0.01762475,  0.0274948 , -0.08045564,  0.01283308],
      dtype=float32)

In [7]:
milvus_client = MilvusClient(uri="./mfd.db")
collection_name = "my_rag_collection"
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

milvus_client.create_collection(
    collection_name=collection_name,
    dimension=embedding_dim,
    metric_type="IP",
    consistency_level="Strong"
)

data = []

#doc_embeddings = embedding_model.encode_documents(text_lines)
doc_embeddings = embedding_model.encode(text_lines)

for i, line in enumerate(tqdm(text_lines, desc="Creating embeddings")):
    data.append({"id": i, "vector": doc_embeddings[i], "text": line})

milvus_client.insert(collection_name=collection_name, data=data)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Creating embeddings: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1113/1113 [00:00<00:00, 2631488.36it/s]


{'insert_count': 1113, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215,

In [8]:
SYSTEM_PROMPT = """
Human: 你是一个 AI 助手。你能够从提供的上下文段落片段中找到问题的答案。
"""

USER_PROMPT = """
请使用以下用 <context> 标签括起来的信息片段来回答用 <question> 标签括起来的问题。
<context>
{context}
</context>
<question>
{question}
</question>
"""

In [9]:
def search_and_question(q):
    search_res = milvus_client.search(collection_name=collection_name,
                                      data=embedding_model.encode([q]),
                                      limit=10,
                                      search_params={"metric_type": "IP", "params": {}},
                                      output_fields=["text"]
                                      )

    retrieved_lines_with_distances = [
        (res["entity"]["text"], res["distance"]) for res in search_res[0]
    ]
    print(json.dumps(retrieved_lines_with_distances, indent=4))

    context = "\n".join(
        [line_with_distance[0] for line_with_distance in retrieved_lines_with_distances]
    )

    print(context)
    print(q)

    user_prompt = USER_PROMPT.format(context=context, question=q)
    print(user_prompt)

    response = deepseek_client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
    )

    return response.choices[0].message.content

In [10]:
question = "宅基地能卖吗？"
search_and_question(question)

[
    [
        "**\u7b2c\u4e09\u767e\u516b\u5341\u6761** \u5b85\u57fa\u5730\u4f7f\u7528\u6743\u53ef\u4ee5\u8f6c\u8ba9\u3001\u4e92\u6362\u3001\u7ee7\u627f\u3002",
        0.6853860020637512
    ],
    [
        "\u5b85\u57fa\u5730\u4f7f\u7528\u6743\u4eba\u6709\u6743\u5728\u5b85\u57fa\u5730\u4e0a\u5efa\u9020\u623f\u5c4b\u53ca\u5176\u9644\u5c5e\u8bbe\u65bd\u3002",
        0.6617163419723511
    ],
    [
        "**\u7b2c\u4e09\u767e\u4e03\u5341\u516b\u6761** \u5b85\u57fa\u5730\u4f7f\u7528\u6743\u7684\u8bbe\u7acb\u3001\u884c\u4f7f\u548c\u8f6c\u8ba9\uff0c\u9002\u7528\u6cd5\u5f8b\u3001\u6cd5\u89c4\u7684\u89c4\u5b9a\u3002",
        0.6331154108047485
    ],
    [
        "\u8f6c\u8ba9\u3001\u4e92\u6362\u5b85\u57fa\u5730\u4f7f\u7528\u6743\u7684\uff0c\u5e94\u5f53\u4f9d\u7167\u6cd5\u5f8b\u89c4\u5b9a\u529e\u7406\u767b\u8bb0\u3002",
        0.63249671459198
    ],
    [
        "**\u7b2c\u4e09\u767e\u4e03\u5341\u4e03\u6761** \u5b85\u57fa\u5730\u4f7f\u7528\u6743\u4eba\u4f9d\u6cd5\u5bf9\u96c6\u4f53

'根据提供的法律条文内容，宅基地使用权是可以转让的（第三百八十条明确规定"宅基地使用权可以转让、互换、继承"），但需要特别注意以下几点法律要求：\n\n1. 转让必须符合法律法规的规定（第三百七十八条）\n2. 转让需要依法办理登记手续（第三百七十八条）\n3. 宅基地属于集体所有（第三百七十九条），因此转让的是使用权而非所有权\n\n所以准确来说，宅基地的使用权可以依法转让（即通常所说的"卖"），但土地所有权仍归集体所有。转让时需要严格遵守相关法律规定并办理登记手续。'

In [11]:
question_2 = "高空砸物怎么办？"
search_and_question(question_2)

[
    [
        "**\u7b2c\u4e8c\u767e\u516b\u5341\u516d\u6761** \u8fdd\u53cd\u56fd\u5bb6\u6709\u5173\u89c4\u5b9a\uff0c\u5728\u5efa\u7b51\u7269\u53ca\u5176\u9644\u5c5e\u8bbe\u65bd\u4e0a\u6316\u6d1e\u3001\u51ff\u5b54\u3001\u64c5\u81ea\u6539\u53d8\u623f\u5c4b\u7ed3\u6784\u3001\u7528\u9014\uff0c\u6216\u8005\u8bbe\u7f6e\u59a8\u788d\u4ed6\u4eba\u901a\u884c\u3001\u91c7\u5149\u3001\u901a\u98ce\u7684\u8bbe\u65bd\u7b49\uff0c\u635f\u5bb3\u4ed6\u4eba\u5408\u6cd5\u6743\u76ca\u7684\uff0c\u5e94\u5f53\u4f9d\u6cd5\u627f\u62c5\u6c11\u4e8b\u8d23\u4efb\u3002",
        0.5241566896438599
    ],
    [
        "\u9020\u6210\u635f\u5bb3\u7684\uff0c\u5e94\u5f53\u8d54\u507f\u635f\u5931\u3002",
        0.5239578485488892
    ],
    [
        "\u9020\u6210\u635f\u5bb3\u7684\uff0c\u5e94\u5f53\u8d54\u507f\u635f\u5931\u3002",
        0.5239578485488892
    ],
    [
        "\u9020\u6210\u635f\u5bb3\u7684\uff0c\u5e94\u5f53\u8d54\u507f\u635f\u5931\u3002",
        0.5239578485488892
    ],
    [
        "\u9020\u6210\u

'根据提供的上下文，该法律条文（第二百八十六条）主要针对的是在建筑物上擅自施工或设置障碍物损害他人权益的行为，并未直接提及高空抛物问题。不过可以类推适用以下原则：\n\n1. 如果高空砸物是因"擅自改变房屋结构/设置设施"导致（如违规安装外置物），责任人需依法承担民事责任并赔偿损失。\n\n2. 对于占有物（如阳台花盆）坠落造成的毁损灭失，占有人有权请求赔偿，同时也意味着物品所有人可能需要承担相应责任。\n\n注：更准确的处理建议需参考《民法典》第1254条等专门针对高空抛物的法律规定，当前上下文提供的信息有限。'