# setup

In [3]:
# !proxychains pip install -U pymilvus

# client

In [183]:
from pymilvus import MilvusClient

base_url = "host.docker.internal"
client = MilvusClient(
    uri=f"http://{base_url}:19530",
)

client.list_databases()

['default', 'milvus_quick_start']

# create and use database

In [None]:
db_name="milvus_quick_start"
try:
    client.create_database(db_name=db_name)
except Exception as e:
    print(e)

In [None]:
client.use_database(
    db_name=db_name
)

2025-05-25 06:40:59,634 [ERROR][handler]: RPC error: [create_database], <MilvusException: (code=65535, message=database already exist: milvus_quick_start)>, <Time:{'RPC start': '2025-05-25 06:40:59.630445', 'RPC error': '2025-05-25 06:40:59.634343'}> (decorators.py:140)


<MilvusException: (code=65535, message=database already exist: milvus_quick_start)>


drop database

In [184]:
# client.drop_database(
#     db_name="milvus_quick_start"
# )

# create collection

In [53]:
if client.has_collection(collection_name="demo_collection"):
    client.drop_collection(collection_name="demo_collection")
client.create_collection(
    collection_name="demo_collection",
    dimension=768,  # The vectors we will use in this demo has 768 dimensions
)

# text embedding

In [23]:
from pymilvus import model


embedding_fn = model.DefaultEmbeddingFunction()

docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

vectors = embedding_fn.encode_documents(docs)
print("Dim:", embedding_fn.dim, vectors[0].shape)  # Dim: 768 (768,)

data = [
    {"id": i, "vector": vectors[i], "text": docs[i], "subject": "history"}
    for i in range(len(vectors))
]

print("Data has", len(data), "entities, each with fields: ", data[0].keys())
print("Vector dim:", len(data[0]["vector"]))


Dim: 768 (768,)
Data has 3 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'subject'])
Vector dim: 768


In [24]:
res = client.insert(collection_name="demo_collection", data=data)
print(res)

{'insert_count': 3, 'ids': [0, 1, 2]}


In [30]:
res = client.insert(collection_name="demo_collection", data=data)
print(res)

{'insert_count': 3, 'ids': [0, 1, 2]}


# semantic search

In [37]:
query_vectors = embedding_fn.encode_queries(["Who is Alan Turing?", "Artificial intelligence", "Alan Turing"])

res = client.search(
    collection_name="demo_collection",  # target collection
    data=query_vectors,  # query vectors
    limit=2,  # number of returned entities
    output_fields=["text", "subject"],  # specifies fields to be returned
)

print(res)


data: [[{'id': 2, 'distance': 0.5859946012496948, 'entity': {'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}}], [{'id': 0, 'distance': 0.47026658058166504, 'entity': {'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'subject': 'history'}}], [{'id': 2, 'distance': 0.5865286588668823, 'entity': {'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}}]]


# meta data filtering

In [None]:
docs = [
    "Machine learning has been used for drug design.",
    "Computational synthesis with AI algorithms predicts molecular properties.",
    "DDR1 is involved in cancers and fibrosis.",
]
vectors = embedding_fn.encode_documents(docs)
data = [
    {"id": 3 + i, "vector": vectors[i], "text": docs[i], "subject": "biology"}
    for i in range(len(vectors))
]

client.insert(collection_name="demo_collection", data=data)

In [41]:
res = client.search(
    collection_name="demo_collection",
    data=embedding_fn.encode_queries(["tell me AI related information"]),
    filter="subject == 'biology'",
    limit=2,
    output_fields=["text", "subject"],
)

print(res)

data: [[{'id': 4, 'distance': 0.2703055739402771, 'entity': {'text': 'Computational synthesis with AI algorithms predicts molecular properties.', 'subject': 'biology'}}, {'id': 3, 'distance': 0.16425904631614685, 'entity': {'text': 'Machine learning has been used for drug design.', 'subject': 'biology'}}]]


In [43]:
res = client.search(
    collection_name="demo_collection",
    data=embedding_fn.encode_queries(["tell me AI related information"]),
    # filter="subject == 'biology'",
    limit=10,
    output_fields=["text", "subject"],
)

print(res)

data: [[{'id': 1, 'distance': 0.44280391931533813, 'entity': {'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}}, {'id': 4, 'distance': 0.2703055739402771, 'entity': {'text': 'Computational synthesis with AI algorithms predicts molecular properties.', 'subject': 'biology'}}, {'id': 0, 'distance': 0.23993626236915588, 'entity': {'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'subject': 'history'}}]]


# query

In [45]:
res = client.query(
    collection_name="demo_collection",
    filter="subject == 'history'",
    output_fields=["text", "subject"],
)

res

data: ["{'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'subject': 'history', 'id': 0}", "{'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history', 'id': 1}", "{'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history', 'id': 2}"]

In [47]:
res = client.query(
    collection_name="demo_collection",
    ids=[0, 2],
    output_fields=["text", "subject"],
)

res

data: ["{'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'subject': 'history', 'id': 0}", "{'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history', 'id': 2}"]

# delete entities

In [48]:
res = client.delete(collection_name="demo_collection", ids=[0, 2])

print(res)

res = client.delete(
    collection_name="demo_collection",
    filter="subject == 'biology'",
)

print(res)

{'delete_count': 2}
{'delete_count': 3}


# drop collection

In [49]:
client.drop_collection(collection_name="demo_collection")

In [50]:
client.list_collections()

[]

# test duplicate insertion

In [54]:
if client.has_collection(collection_name="demo_collection"):
    client.drop_collection(collection_name="demo_collection")
client.create_collection(
    collection_name="demo_collection",
    dimension=768,
    auto_id=True
)

In [56]:
docs = [
    "Machine learning has been used for drug design.",
    "Computational synthesis with AI algorithms predicts molecular properties.",
    "DDR1 is involved in cancers and fibrosis.",
]
vectors = embedding_fn.encode_documents(docs)
data = [
    {"vector": vectors[i], "text": docs[i], "subject": "biology"}
    for i in range(len(vectors))
]

client.insert(collection_name="demo_collection", data=data)

{'insert_count': 3, 'ids': [458266833003755594, 458266833003755595, 458266833003755596], 'cost': 0}

In [57]:
docs = [
    "Machine learning has been used for drug design.",
    "Computational synthesis with AI algorithms predicts molecular properties.",
    "DDR1 is involved in cancers and fibrosis.",
]
vectors = embedding_fn.encode_documents(docs)
data = [
    {"vector": vectors[i], "text": docs[i], "subject": "biology"}
    for i in range(len(vectors))
]

client.insert(collection_name="demo_collection", data=data)

{'insert_count': 3, 'ids': [458266833003755598, 458266833003755599, 458266833003755600], 'cost': 0}

In [58]:
res = client.search(
    collection_name="demo_collection",
    data=embedding_fn.encode_queries(["tell me AI related information"]),
    filter="subject == 'biology'",
    limit=2,
    output_fields=["text", "subject"],
)

print(res)

data: [[{'id': 458266833003755599, 'distance': 0.2703055739402771, 'entity': {'subject': 'biology', 'text': 'Computational synthesis with AI algorithms predicts molecular properties.'}}, {'id': 458266833003755595, 'distance': 0.2703055739402771, 'entity': {'subject': 'biology', 'text': 'Computational synthesis with AI algorithms predicts molecular properties.'}}]]


In [92]:
client.drop_collection(collection_name="demo_collection")

# handle duplicate insertion by id

In [167]:
from pymilvus import MilvusClient, DataType

if client.has_collection(collection_name="demo_collection"):
    client.drop_collection(collection_name="demo_collection")
client.create_collection(
    collection_name="demo_collection",
    dimension=768,
    # id_type="str",
    # max_length=64,
)

In [168]:
import hashlib
import numpy as np

def get_vector_hash(vector: list[float]) -> str:
    vec_bytes = np.array(vector, dtype=np.float32).tobytes()
    return hashlib.sha256(vec_bytes).hexdigest()


def get_vector_hash_as_int(vector: list[float]) -> int:
    vec_bytes = np.array(vector, dtype=np.float32).tobytes()
    hash_object = hashlib.sha256(vec_bytes)
    hash_int = int(hash_object.hexdigest(), 16)
    hash_int = hash_int & ((1 << 63) - 1)
    
    return np.int64(hash_int)

In [177]:
docs = [
    "Machine learning has been used for drug design.",
    "Computational synthesis with AI algorithms predicts molecular properties.",
    "DDR1 is involved in cancers and fibrosis.",
]
vectors = embedding_fn.encode_documents(docs)
data = [
    {"id": get_vector_hash_as_int(vectors[i]), "vector": vectors[i], "text": docs[i], "subject": "biology"}
    for i in range(len(vectors))
]

client.insert(collection_name="demo_collection", data=data)

{'insert_count': 3, 'ids': [4491577877253948629, 3150481097735110647, 5568380074954272606], 'cost': 0}

In [181]:
client.get_collection_stats("demo_collection")

{'row_count': 3}

In [178]:
client.close()

# generate file_id

In [191]:
import uuid
import random
import string

def generate_file_id(prefix="file-"):
    # Generate a random UUID and convert it to a string
    random_uuid = uuid.uuid4().hex[:16]  # 12 characters from UUID
    # Generate a random string of 6 characters (could be alphanumeric)
    random_str = ''.join(random.choices(string.ascii_letters + string.digits, k=6))
    # Combine the prefix, the UUID segment, and the random string
    return f"{prefix}{random_uuid}{random_str}"

# Example usage
file_id = generate_file_id()
print(file_id)

file-ecaa7994348e4444YGEE8q
