In [11]:
from mpmath.calculus.extrapolation import limit
from pymilvus import MilvusClient

In [12]:
# Open a Milvus client backed by the local file "milvus_demo.db"; every later
# cell talks to the database through this `client` object.
client = MilvusClient("milvus_demo.db")

In [13]:
# Start from a clean slate: if "demo_collection" is left over from a previous
# run, drop it first so this cell can be re-executed safely.
if client.has_collection(collection_name="demo_collection"):
    client.drop_collection(collection_name="demo_collection")

# The vectors used in this demo have 768 dimensions.
client.create_collection(collection_name="demo_collection", dimension=768)

In [14]:
from pymilvus import model

In [16]:
# Instantiate the default embedding function from pymilvus's `model` module.
# Later cells use it to encode both documents and queries; the sanity-check
# cell further down reports a dimension of 768 for it.
embedding_fn = model.DefaultEmbeddingFunction()

In [17]:
# Sample corpus: three short sentences that are embedded and inserted under
# the "history" subject in the cells below.
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

In [18]:
# Encode the documents into embedding vectors. A later cell pairs
# `vectors[i]` with `docs[i]` when building the entities to insert.
vectors = embedding_fn.encode_documents(docs)

In [19]:
# Sanity check: the dimension reported by the embedding function and the shape
# of the first encoded vector should agree with the collection's dimension.
print("Dim:", embedding_fn.dim, vectors[0].shape)  # Dim: 768 (768,)

Dim: 768 (768,)


In [20]:
# Build one entity per encoded document: an integer primary key, the embedding,
# the original text, and a "subject" metadata field used later for filtering.
data = [
    dict(id=idx, vector=vec, text=docs[idx], subject="history")
    for idx, vec in enumerate(vectors)
]

In [21]:
# Display the assembled entities. NOTE(review): this echoes every full
# embedding vector, which produces a very long output; the next cell prints a
# compact summary (entity count, field names, vector length) instead.
data

[{'id': 0,
  'vector': array([ 1.07278393e-02, -3.58951503e-02,  1.87497373e-02,  1.63487666e-02,
          3.65169027e-02,  3.58817212e-03, -4.00470114e-04,  2.85293872e-02,
          2.27454913e-03,  1.83625009e-03,  4.22585015e-03,  2.71739583e-02,
         -3.68434636e-03,  3.07915635e-02,  4.50543150e-03,  4.42281506e-02,
          1.05038375e-02, -2.94945268e-02, -6.70733939e-02, -2.05264402e-02,
          1.53227571e-02, -6.00494759e-03, -6.22854787e-02, -3.96147320e-02,
          1.42062952e-02,  3.27076629e-02, -2.08345880e-02, -4.41742957e-02,
         -2.83398640e-02,  2.94244666e-02, -2.80872147e-02, -2.08090396e-02,
          1.71597434e-02,  2.11165568e-03,  2.18237611e-02, -1.57762666e-03,
         -3.76967229e-02,  4.14607458e-02, -2.50563712e-02,  8.33367671e-02,
         -1.59791573e-02,  9.81381329e-03, -2.66053944e-02,  6.18992948e-04,
          3.73590060e-03, -3.41551657e-02,  5.87082563e-02, -2.37219631e-02,
          6.74593579e-03, -3.58418540e-02, -1.75600166e

In [22]:
# Compact summary of what is about to be inserted: number of entities, the
# field names of one entity, and the length of its embedding vector.
first_entity = data[0]
print("Data has", len(data), "entities, each with fields: ", first_entity.keys())
print("Vector dim:", len(first_entity["vector"]))


Data has 3 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'subject'])
Vector dim: 768


In [23]:
# Insert the prepared entities into the collection and show the server's reply
# (insert count and the assigned primary keys).
res = client.insert(
    collection_name="demo_collection",
    data=data,
)
print(res)

{'insert_count': 3, 'ids': [0, 1, 2]}


In [24]:
# Vector search: embed the question, then ask Milvus for the closest matches.
query_vectors = embedding_fn.encode_queries(["Who is Alan Turing?"])
# If the embedding function is unavailable, a random stand-in vector is enough
# to finish the demo (this needs `import random`):
# query_vectors = [ [ random.uniform(-1, 1) for _ in range(768) ] ]

res = client.search(
    collection_name="demo_collection",  # collection to search in
    data=query_vectors,  # one or more query vectors
    output_fields=["text", "subject"],  # fields to include with each hit
    limit=2,  # number of entities to return
)

print(res)

data: ["[{'id': 2, 'distance': 0.58599454164505, 'entity': {'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}}, {'id': 1, 'distance': 0.5118259191513062, 'entity': {'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}}]"]


In [25]:
# Vector search with metadata filtering.
# Insert a second batch of documents under a different subject. Distinct names
# (bio_*) are used on purpose: rebinding `docs` / `vectors` / `data` here would
# silently change what the earlier "history" cells build and insert if any of
# them were re-run after this cell.
bio_docs = [
    "Machine learning has been used for drug design.",
    "Computational synthesis with AI algorithms predicts molecular properties.",
    "DDR1 is involved in cancers and fibrosis.",
]
bio_vectors = embedding_fn.encode_documents(bio_docs)
# Primary keys continue after the three history entities (ids 0-2) inserted earlier.
bio_data = [
    {"id": 3 + i, "vector": bio_vectors[i], "text": bio_docs[i], "subject": "biology"}
    for i in range(len(bio_vectors))
]

client.insert(collection_name="demo_collection", data=bio_data)

# The filter restricts hits to the "biology" subject, so "history" entities are
# excluded even when they are close to the query vector.
res = client.search(
    collection_name="demo_collection",
    data=embedding_fn.encode_queries(["tell me AI related information"]),
    filter="subject == 'biology'",
    limit=2,
    output_fields=["text", "subject"],
)

print(res)

data: ["[{'id': 4, 'distance': 0.2703055739402771, 'entity': {'text': 'Computational synthesis with AI algorithms predicts molecular properties.', 'subject': 'biology'}}, {'id': 3, 'distance': 0.16425904631614685, 'entity': {'text': 'Machine learning has been used for drug design.', 'subject': 'biology'}}]"]


In [30]:
# Scalar query (no vector involved): fetch every entity whose metadata matches
# the filter expression, returning only the requested fields.
res = client.query(
    collection_name="demo_collection",
    output_fields=["text", "subject"],
    filter="subject == 'history'",
)

print(len(res))
print(res)

3
data: ["{'id': 0, 'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'subject': 'history'}", "{'id': 1, 'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}", "{'id': 2, 'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}"]


In [33]:
# Retrieve entities directly by primary key; the stored vector is requested
# alongside the text and subject fields.
res = client.query(
    collection_name="demo_collection",
    output_fields=["vector", "text", "subject"],
    ids=[0, 2],
)

print(len(res))
print(res)

2
data: ["{'id': 0, 'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'subject': 'history', 'vector': [np.float32(0.01072784), np.float32(-0.03589515), np.float32(0.018749738), np.float32(0.016348766), np.float32(0.0365169), np.float32(0.0035881721), np.float32(-0.0004004701), np.float32(0.028529387), np.float32(0.002274549), np.float32(0.0018362501), np.float32(0.00422585), np.float32(0.027173959), np.float32(-0.0036843463), np.float32(0.030791564), np.float32(0.0045054313), np.float32(0.04422815), np.float32(0.010503838), np.float32(-0.029494528), np.float32(-0.0670734), np.float32(-0.02052644), np.float32(0.015322757), np.float32(-0.0060049477), np.float32(-0.06228548), np.float32(-0.039614733), np.float32(0.014206295), np.float32(0.03270766), np.float32(-0.020834588), np.float32(-0.044174295), np.float32(-0.028339865), np.float32(0.029424466), np.float32(-0.028087215), np.float32(-0.02080904), np.float32(0.017159743), np.float32(0.0021116557), np.flo

In [36]:
# Delete entities.
# 1) By primary key.
res = client.delete(
    collection_name="demo_collection",
    ids=[0, 2],
)
print(res)

# 2) By filter expression.
res = client.delete(collection_name="demo_collection", filter="subject == 'biology'")
print(res)

{}
[3, 4, 5]


In [37]:
# Load existing data.
# Milvus Lite keeps all of its data in a local file, so even after the program
# has terminated you can load all the data into memory again by creating a
# MilvusClient that points at the existing file. For example, this restores the
# collections stored in "milvus_demo.db" and lets you keep writing data to them.
from pymilvus import MilvusClient

client = MilvusClient("milvus_demo.db")  # load the local vector-database file

# Drop the collection.
client.drop_collection(collection_name="demo_collection")