In [25]:
from pymilvus import Collection, connections
import pandas as pd
import numpy as np
import time

In [26]:
# load the existing collection
connections.connect(host='localhost', port='19530')
collection_name = 'text_embeddings'
collection = Collection(name=collection_name)

## String Query

In [27]:
collection.release()
collection.drop_index()
index_params = {
    "metric_type": "L2",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128},
    "index_name": "embedding_index"
}
collection.create_index(field_name="embedding", index_params=index_params)
collection.load()

In [28]:
expr = f"company_name == 'TheHersheyCompany'"
t0 = time.time()
query_results = collection.query(expr=expr, output_fields=['company_name', 'document_name'])
print("Query workload:")
print(time.time() - t0, 's')

Query workload:
0.0305941104888916 s


## Single vector search 

In [29]:
# generate query vectors
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
sentences = ["Does Apple prioritize the protection of user data?",
             "Will Apple share non-personal data with third parties?"]
query_vectors = model.encode(sentences)
query_vector = [query_vectors[0]]

In [30]:
# single-vector
t0 = time.time()
results = collection.search(data=query_vector, limit=3, 
                            anns_field="embedding", param={"metric_type": "L2", "params": {}},
                            output_fields=['company_name', 'document_name'])
print("Query workload:")
print(time.time() - t0, 's')
print("Query output:")
for result in results[0]:
    print(result)

Query workload:
0.06738924980163574 s
Query output:
id: 449090663332650857, distance: 137.30517578125, entity: {'company_name': 'iCloud', 'document_name': 'Privacypolicy'}
id: 449090663332650848, distance: 147.77349853515625, entity: {'company_name': 'iCloud', 'document_name': 'Privacypolicy'}
id: 449090663332650781, distance: 155.76712036132812, entity: {'company_name': 'iCloud', 'document_name': 'Privacypolicy'}


## Bulk vector search 

In [31]:
# bulk-vector
t0 = time.time()
results = collection.search(data=query_vectors, limit=3, 
                            anns_field="embedding", param={"metric_type": "L2", "params": {}},
                            output_fields=['company_name', 'document_name'])
print("Query workload:")
print(time.time() - t0, 's')
print("Query output:")
for result in results[0]:
    print(result)

Query workload:
0.0232999324798584 s
Query output:
id: 449090663332650857, distance: 137.30517578125, entity: {'company_name': 'iCloud', 'document_name': 'Privacypolicy'}
id: 449090663332650848, distance: 147.77349853515625, entity: {'company_name': 'iCloud', 'document_name': 'Privacypolicy'}
id: 449090663332650781, distance: 155.76712036132812, entity: {'company_name': 'iCloud', 'document_name': 'Privacypolicy'}


## Range search 

In [32]:
# range search
search_params = {
    "metric_type": "L2",
    "params": {
        "radius": 150 # Radius of the search circle
    }
}
t0 = time.time()
results = collection.search(data=query_vector, limit=5, 
                            anns_field="embedding", param=search_params,
                            output_fields=['company_name', 'document_name'])
print("Query workload:")
print(time.time() - t0, 's')
print("Query output:")
for result in results[0]:
    print(result)

Query workload:
0.004111051559448242 s
Query output:
id: 449090663332650857, distance: 137.30517578125, entity: {'company_name': 'iCloud', 'document_name': 'Privacypolicy'}
id: 449090663332650848, distance: 147.77349853515625, entity: {'company_name': 'iCloud', 'document_name': 'Privacypolicy'}
