In [5]:
from pymilvus import Collection, CollectionSchema, FieldSchema, DataType, utility , connections
from transformers import BertTokenizer, BertModel
import numpy as np
import torch


In [6]:
# Define Milvus collection schema
def create_milvus_collection():
    fields = [
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=768),
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True)
    ]
    schema = CollectionSchema(fields, "Multi-modal similarity search")
    collection = Collection("multi_modal_collection", schema)
    return collection

In [7]:
# Define function to get BERT embeddings
def get_bert_embeddings(texts):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    return embeddings

In [9]:
connections.connect(alias="default", host='localhost', port='19530')

In [10]:

# Initialize Milvus collection
collection = create_milvus_collection()

In [15]:
# Sample text data
texts = ["Machine learning", "Natural language processing", "Programming"]
text_embeddings = get_bert_embeddings(texts)

In [12]:
# Insert embeddings into Milvus
ids = [i for i in range(len(texts))]
collection.insert([text_embeddings.tolist(), ids])


(insert count: 3, delete count: 0, upsert count: 0, timestamp: 452593085762240515, success count: 3, err count: 0

In [20]:
# **Create an index on the 'embedding' field**
index_param = {
    "index_type": "IVF_FLAT",   # You can also try "HNSW", "IVF_SQ8", etc.
    "metric_type": "L2",        # L2 distance (Euclidean) or "IP" for inner product
    "params": {"nlist": 100}    # nlist is the number of clusters for IVF index
}

In [21]:
collection.create_index("embedding", index_param)

Status(code=0, message=)

In [22]:
# Load the collection into memory
collection.load()

In [23]:
# Define search parameters
search_param = {"metric_type": "L2", "params": {"nprobe": 10}}

In [24]:
# Query for similar embeddings
query_text = ["AI and machine learning"]
query_embedding = get_bert_embeddings(query_text)

In [25]:
# Perform the search
results = collection.search(
    query_embedding.tolist(),  # Query embeddings
    anns_field="embedding",    # Field name of the embedding in the collection
    param=search_param,        # Search parameters (index and metric)
    limit=2                    # Limit the number of results
)


In [26]:
# Output search results
print("Search results:", results)

Search results: data: ["['id: 0, distance: 26.06183433532715, entity: {}', 'id: 1, distance: 52.87938690185547, entity: {}']"]
