In [1]:
## this notebook is using Jina Model to avoid using jina api and use jina model locally
%pip install --upgrade pymilvus
%pip install numpy!=1.24.0 seaborn 
%pip install pymilvus transformers torch

# Import necessary libraries
import os
from pymilvus import connections, Collection, utility, CollectionSchema, FieldSchema, DataType
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import itertools
#import pdfplumber

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Connect to Milvus (Hardcoded Values)
MILVUS_HOST = "milvus-standalone"
MILVUS_PORT = "19530"
MILVUS_ALIAS = "default"    
def connect_milvus():
    try:
        connections.connect(
            alias=MILVUS_ALIAS,
            host=MILVUS_HOST,
            port=MILVUS_PORT
        )
        print(f"Connected to Milvus at {MILVUS_HOST}:{MILVUS_PORT}")
    except Exception as e:
        print(f"Failed to connect to Milvus: {e}")

def disconnect_milvus():
    try:
        connections.disconnect(MILVUS_ALIAS)
        print("Disconnected from Milvus")
    except Exception as e:
        print(f"Failed to disconnect: {e}")

# Connect to Milvus
connect_milvus()

Connected to Milvus at milvus-standalone:19530


In [3]:
def create_document_schema():
    fields = [
        FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=100),
        FieldSchema(name="chunk_id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=768),  # Adjust dimension based on model
        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535)
    ]
    schema = CollectionSchema(fields=fields, description="Document embeddings")
    return schema

In [4]:
# Create Milvus Collection
def create_document_schema():
    fields = [
        FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=100),
        FieldSchema(name="chunk_id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),  # 384 based on the model output
        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535)
    ]
    schema = CollectionSchema(fields=fields, description="Document embeddings")
    return schema

def create_document_collection():
    schema = create_document_schema()
    collection_name = "document_embeddings"
    
    if not utility.has_collection(collection_name):
        collection = Collection(name=collection_name, schema=schema)
        index_params = {
            "index_type": "IVF_FLAT",
            "metric_type": "IP",
            "params": {"nlist": 1024}
        }
        collection.create_index(field_name="embedding", index_params=index_params)
        print(f"Collection '{collection_name}' and index created!")
    else:
        collection = Collection(name=collection_name)
        print(f"Collection '{collection_name}' already exists.")
    
    collection.load()
    return collection

# Create the collection
collection = create_document_collection()


Collection 'document_embeddings' and index created!


In [5]:
#load model and tokenizer locally (using a Hugging Face model)
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Change to your preferred model
#tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=False)
model = AutoModel.from_pretrained(model_name)

# Function to batch the inputs
def batched(iterable, n):
    it = iter(iterable)
    while batch := list(itertools.islice(it, n)):
        yield batch

# Generate embeddings locally
def generate_embeddings_local(texts, max_batch_size=32):
    embeddings = []
    for batch in batched(texts, max_batch_size):
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1)
        embeddings.append(batch_embeddings)
    
    # Normalize embeddings
    embeddings = torch.cat(embeddings)
    embeddings = embeddings / torch.norm(embeddings, dim=1, keepdim=True)
    return embeddings.numpy()

In [None]:
# Load text from PDF for now we are not using the pdf since we are inserting just dummy data
#def load_pdf(file_path):
    #with pdfplumber.open(file_path) as pdf:
        #text = ""
        #for page in pdf.pages:
            #text += page.extract_text()
    #return text

In [6]:
# Insert document embeddings into Milvus
def insert_data_into_collection(collection, doc_ids, embeddings, texts):
    try:
        entities = [doc_ids, embeddings, texts]
        collection.insert(entities)
        print(f"Data inserted into collection: {collection.name}")
    except Exception as e:
        print(f"Failed to insert data: {e}")

In [7]:
docs = [
    "This is a sample document.",
    "Testing embeddings with local model.",
    "Integrating RAG with LLMs and vector databases."
]

# Generate embeddings for the documents
embeddings = generate_embeddings_local(docs)
print("Generated embeddings:", embeddings)

# Insert embeddings into Milvus
doc_ids = ["doc1", "doc2", "doc3"]
insert_data_into_collection(collection, doc_ids, embeddings, docs)

# Disconnect from Milvus
disconnect_milvus()

Generated embeddings: [[-0.03427842  0.09732049  0.01290111 ...  0.1057302   0.02004655
  -0.01950257]
 [ 0.02049447 -0.08148539  0.06382781 ...  0.0004151   0.01097185
  -0.00851156]
 [-0.02256461 -0.02677649  0.01252299 ... -0.07487489 -0.03987329
  -0.01576395]]
Data inserted into collection: document_embeddings
Disconnected from Milvus
