In [27]:
from pymilvus import Collection, connections
import pandas as pd
import numpy as np
import time

In [28]:
# Connect to the Milvus server on localhost and open a handle to the
# already-existing collection (no schema is passed, so nothing is created here).
MILVUS_HOST = 'localhost'
MILVUS_PORT = '19530'
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)

collection_name = 'text_embeddings'
collection = Collection(name=collection_name)

In [29]:
# Rebuild the vector index on the "embedding" field: release the collection,
# drop its current index, create an IVF_FLAT / L2 index, then load the
# collection again before the queries below.
index_params = dict(
    metric_type="L2",
    index_type="IVF_FLAT",
    params={"nlist": 128},
    # NOTE(review): pymilvus examples usually pass index_name as a keyword
    # argument of create_index() rather than inside this dict -- confirm the
    # name is actually applied to the index.
    index_name="embedding_index",
)

collection.release()
collection.drop_index()
collection.create_index(field_name="embedding", index_params=index_params)
collection.load()

## String query

In [30]:
# Benchmark a scalar (non-vector) query: fetch the rows whose company_name
# equals the given value and report the mean latency per call.
expr = "company_name == 'TheHersheyCompany'"  # plain string: nothing to interpolate
n_runs = 1000  # repetitions used to average the per-call latency

# perf_counter() is monotonic and high-resolution, unlike time.time(), which
# follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
for _ in range(n_runs):
    query_results = collection.query(expr=expr, output_fields=['company_name', 'document_name'])
elapsed = time.perf_counter() - t0  # stop the clock before any printing

print("Query Timing:")
print(elapsed / n_runs, 's')
print("Query output:")
# Only the rows returned by the final iteration are kept and shown.
for result in query_results:
    print(result)

Query Timing:
0.007655658960342407 s
Query output:
{'document_name': 'PRIVACYPOLICY', 'sentence_id': 449109165435814132, 'company_name': 'TheHersheyCompany'}
{'document_name': 'PRIVACYPOLICY', 'sentence_id': 449109165435814133, 'company_name': 'TheHersheyCompany'}
{'document_name': 'PRIVACYPOLICY', 'sentence_id': 449109165435814134, 'company_name': 'TheHersheyCompany'}
{'document_name': 'PRIVACYPOLICY', 'sentence_id': 449109165435814135, 'company_name': 'TheHersheyCompany'}
{'document_name': 'PRIVACYPOLICY', 'sentence_id': 449109165435814136, 'company_name': 'TheHersheyCompany'}
{'document_name': 'PRIVACYPOLICY', 'sentence_id': 449109165435814137, 'company_name': 'TheHersheyCompany'}
{'document_name': 'PRIVACYPOLICY', 'sentence_id': 449109165435814138, 'company_name': 'TheHersheyCompany'}
{'document_name': 'PRIVACYPOLICY', 'sentence_id': 449109165435814139, 'company_name': 'TheHersheyCompany'}
{'document_name': 'PRIVACYPOLICY', 'sentence_id': 449109165435814140, 'company_name': 'TheHer

## Single vector search 

In [31]:
# Encode two example questions into embeddings that serve as search queries.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
sentences = [
    "Does Apple prioritize the protection of user data?",
    "Will Apple share non-personal data with third parties?",
]
query_vectors = model.encode(sentences)

# Single-query variant: only the first embedding, wrapped in a list because
# the search calls below take a batch of vectors and index results by query.
query_vector = [query_vectors[0]]

In [32]:
# Benchmark a top-3 similarity search for a single query vector and report
# the mean latency per call.
n_runs = 1000  # repetitions used to average the per-call latency

# perf_counter() is monotonic and high-resolution, unlike time.time(), which
# follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
for _ in range(n_runs):
    results = collection.search(data=query_vector, limit=3,
                                anns_field="embedding", param={"metric_type": "L2", "params": {}},
                                output_fields=['company_name', 'document_name'])
elapsed = time.perf_counter() - t0  # stop the clock before any printing

print("Query timing:")
print(elapsed / n_runs, 's')
print("Query output:")
# results holds one hit list per query vector; there is only one query here.
for result in results[0]:
    print(result)

Query timing:
0.0036761140823364257 s
Query output:
id: 449109165436250921, distance: 137.30517578125, entity: {'document_name': 'Privacypolicy', 'company_name': 'iCloud'}
id: 449109165436250912, distance: 147.77349853515625, entity: {'document_name': 'Privacypolicy', 'company_name': 'iCloud'}
id: 449109165436250845, distance: 155.76712036132812, entity: {'document_name': 'Privacypolicy', 'company_name': 'iCloud'}


## Bulk vector search 

In [33]:
# Benchmark a top-3 similarity search that sends all query vectors in one
# call and report the mean latency per call (not per query vector).
n_runs = 1000  # repetitions used to average the per-call latency

# perf_counter() is monotonic and high-resolution, unlike time.time(), which
# follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
for _ in range(n_runs):
    results = collection.search(data=query_vectors, limit=3,
                                anns_field="embedding", param={"metric_type": "L2", "params": {}},
                                output_fields=['company_name', 'document_name'])
elapsed = time.perf_counter() - t0  # stop the clock before any printing

print("Query timing:")
print(elapsed / n_runs, 's')
print("Query output:")
# Only the hits for the first query vector are printed; the hits for the
# remaining query vectors are in results[1:], and are not shown.
for result in results[0]:
    print(result)

Query timing:
0.005069957971572876 s
Query output:
id: 449109165436250921, distance: 137.30517578125, entity: {'document_name': 'Privacypolicy', 'company_name': 'iCloud'}
id: 449109165436250912, distance: 147.77349853515625, entity: {'document_name': 'Privacypolicy', 'company_name': 'iCloud'}
id: 449109165436250845, distance: 155.76712036132812, entity: {'document_name': 'Privacypolicy', 'company_name': 'iCloud'}


## Filtered search

In [34]:
# Benchmark a top-3 similarity search restricted by a scalar filter
# (company_name must equal 'iCloud') and report the mean latency per call.
n_runs = 1000  # repetitions used to average the per-call latency

# perf_counter() is monotonic and high-resolution, unlike time.time(), which
# follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
for _ in range(n_runs):
    results = collection.search(data=query_vector, limit=3,
                                anns_field="embedding", param={"metric_type": "L2", "params": {}},
                                output_fields=['company_name', 'document_name'],
                                expr="company_name == 'iCloud'")
elapsed = time.perf_counter() - t0  # stop the clock before any printing

print("Query timing:")
print(elapsed / n_runs, 's')
print("Query output:")
# results holds one hit list per query vector; there is only one query here.
for result in results[0]:
    print(result)

Query timing:
0.0014749870300292968 s
Query output:
id: 449109165436250921, distance: 137.30517578125, entity: {'company_name': 'iCloud', 'document_name': 'Privacypolicy'}
id: 449109165436250912, distance: 147.77349853515625, entity: {'company_name': 'iCloud', 'document_name': 'Privacypolicy'}
id: 449109165436250845, distance: 155.76712036132812, entity: {'company_name': 'iCloud', 'document_name': 'Privacypolicy'}


## Range search 

In [35]:
# Benchmark a range search: a similarity search bounded by a distance radius
# (at most 5 hits requested), reporting the mean latency per call.
search_params = {
    "metric_type": "L2",
    "params": {
        # Radius of the search circle.
        # NOTE(review): with L2 this presumably acts as an upper bound on the
        # returned distances -- confirm against the Milvus range-search docs.
        "radius": 150
    }
}
n_runs = 1000  # repetitions used to average the per-call latency

# perf_counter() is monotonic and high-resolution, unlike time.time(), which
# follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
for _ in range(n_runs):
    results = collection.search(data=query_vector, limit=5,
                                anns_field="embedding", param=search_params,
                                output_fields=['company_name', 'document_name'])
elapsed = time.perf_counter() - t0  # stop the clock before any printing

print("Query timing:")
print(elapsed / n_runs, 's')
print("Query output:")
# results holds one hit list per query vector; there is only one query here.
for result in results[0]:
    print(result)

Query timing:
0.0013716180324554444 s
Query output:
id: 449109165436250921, distance: 137.30517578125, entity: {'company_name': 'iCloud', 'document_name': 'Privacypolicy'}
id: 449109165436250912, distance: 147.77349853515625, entity: {'company_name': 'iCloud', 'document_name': 'Privacypolicy'}
