`upsert` operation with Milvus within the LangChain framework.

In [None]:
import os
from langchain_community.vectorstores import Milvus
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from pymilvus import utility, connections, Collection
from getpass import getpass
import time

# --- 0. Configuration and API Key Setup ---
# Use the OpenAI API key already present in the environment. Prompt for it only
# when it is missing, so an exported key is never overwritten and no placeholder
# string is ever sent to the API.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API Key: ")

# Milvus connection details
MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"
COLLECTION_NAME = "langchain_milvus_upsert_demo"
VECTOR_DIMENSION = 1536  # text-embedding-ada-002 produces 1536-dimensional vectors

# Establish the default pymilvus connection used by `utility` and `Collection`
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
print(f"Connected to Milvus at {MILVUS_HOST}:{MILVUS_PORT}")

# Start from a clean slate so repeated runs do not accumulate old rows
if utility.has_collection(COLLECTION_NAME):
    utility.drop_collection(COLLECTION_NAME)
    print(f"Dropped existing collection: {COLLECTION_NAME}")
    time.sleep(2) # Give Milvus a moment to clean up


# --- 1. Initialize Embeddings and Milvus Vector Store ---
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Create a Milvus vector store instance.
# LangChain attaches to the collection if it already exists; otherwise it
# creates it on the first insert (the schema is derived from the first batch of
# documents), so the collection may not exist in Milvus yet at this point.
milvus_vector_store = Milvus(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
    # 'pk' is the default primary-key field name used by LangChain's Milvus
    # integration; it is spelled out here because the upsert below relies on
    # caller-managed primary keys.
    primary_field="pk",
)

print(f"Initialized Milvus vector store for collection: {COLLECTION_NAME}")

# Only open the collection when it really exists: `Collection(name)` without a
# schema raises if the collection has not been created yet.
if utility.has_collection(COLLECTION_NAME):
    collection = Collection(COLLECTION_NAME)
    if collection.is_empty:
        print("Collection is empty, will load on first insert/upsert.")
    else:
        # Milvus requires a collection to be loaded before search/query
        collection.load()
        print("Collection loaded.")
else:
    collection = None
    print("Collection does not exist yet; it will be created on first insert.")


# --- 2. Initial Data Insertion (Using add_documents for initial load) ---
print("\n--- Phase 1: Initial Data Insertion ---")

# Seed data kept as parallel tuples: page text, business doc_id, and the
# caller-managed primary key that is stored in the Milvus 'pk' field.
_seed_texts = (
    "Apple is a tech company known for iPhones.",
    "Microsoft produces Windows operating systems.",
    "Google develops the Android mobile OS.",
)
_seed_doc_ids = ("101", "102", "103")

initial_docs = [
    Document(page_content=text, metadata={"source": "wiki", "doc_id": doc_id})
    for text, doc_id in zip(_seed_texts, _seed_doc_ids)
]

# Primary keys are supplied explicitly (rather than generated) so that the
# later upsert step can address the same rows again.
initial_ids = [f"pk_{doc_id}" for doc_id in _seed_doc_ids]
milvus_vector_store.add_documents(initial_docs, ids=initial_ids)
print(f"Inserted initial documents with IDs: {initial_ids}")

# Sanity-check the initial load with a similarity search
results_after_initial = milvus_vector_store.similarity_search("Apple company", k=2)
print("\nResults after initial insert for 'Apple company':")
for res in results_after_initial:
    print(
        f"  Content: {res.page_content}, "
        f"ID (via metadata): {res.metadata.get('pk')}, "
        f"doc_id: {res.metadata.get('doc_id')}"
    )


# --- 3. Perform Upsert Operation ---
print("\n--- Phase 2: Performing Upsert ---")

# NOTE(review): 'new_field' is not part of the metadata of the first inserted
# batch. LangChain derives the collection schema from that first batch, so this
# key may be silently dropped on insert — confirm it is persisted (e.g. by
# configuring a JSON metadata field on the store) before relying on it.
upsert_docs = [
    # Case 1: Update an existing document (pk_101 for Apple)
    Document(page_content="Apple is a multinational technology company, creator of iOS and macOS.",
             metadata={"source": "wiki_updated", "doc_id": "101", "new_field": "test_update"}),
    # Case 2: Insert a new document (pk_104 for Amazon)
    Document(page_content="Amazon is an e-commerce giant and cloud provider (AWS).",
             metadata={"source": "company_data", "doc_id": "104", "new_field": "test_insert"}),
]

upsert_ids = ["pk_101", "pk_104"]

# `add_documents` performs a plain insert, and Milvus does not de-duplicate
# primary keys on insert: re-adding pk_101 on its own would leave BOTH the old
# and the new Apple rows in the collection. Emulate an upsert by deleting the
# target primary keys first (ids that do not exist yet, such as pk_104, are
# simply not matched) and then inserting the new versions.
milvus_vector_store.delete(ids=upsert_ids)
milvus_vector_store.add_documents(upsert_docs, ids=upsert_ids)
print(f"Performed upsert operation for IDs: {upsert_ids}")


# --- 4. Verify Upsert Results ---
print("\n--- Phase 3: Verifying Upsert Results ---")

# Each check first asserts that the search returned something: the per-result
# assertions live inside a loop, so an empty result list would otherwise let
# the verification pass without checking anything.

# Verify the updated document (pk_101 for Apple)
results_after_upsert_apple = milvus_vector_store.similarity_search("Apple's operating systems", k=1)
assert results_after_upsert_apple, "No results returned for the Apple query!"
print("\nResults after upsert for 'Apple' (should show updated content):")
for res in results_after_upsert_apple:
    print(f"  Content: {res.page_content}")
    print(f"  Metadata: {res.metadata}")
    assert "iOS and macOS" in res.page_content, "Apple document was not updated correctly!"
    assert res.metadata.get("source") == "wiki_updated", "Apple document metadata was not updated!"
    # NOTE(review): this depends on 'new_field' actually being stored by the
    # vector store; confirm the collection schema can hold it.
    assert res.metadata.get("new_field") == "test_update", "New field was not added to Apple document!"


# Verify the newly inserted document (pk_104 for Amazon)
results_after_upsert_amazon = milvus_vector_store.similarity_search("AWS cloud services", k=1)
assert results_after_upsert_amazon, "No results returned for the Amazon query!"
print("\nResults after upsert for 'Amazon' (should show new content):")
for res in results_after_upsert_amazon:
    print(f"  Content: {res.page_content}")
    print(f"  Metadata: {res.metadata}")
    assert "e-commerce giant and cloud provider (AWS)" in res.page_content, "Amazon document was not inserted correctly!"
    assert res.metadata.get("source") == "company_data", "Amazon document metadata was not inserted!"
    assert res.metadata.get("new_field") == "test_insert", "New field was not added to Amazon document!"

# Query for the document that was not affected by upsert (Microsoft)
results_after_upsert_microsoft = milvus_vector_store.similarity_search("Microsoft Windows", k=1)
assert results_after_upsert_microsoft, "No results returned for the Microsoft query!"
print("\nResults after upsert for 'Microsoft' (should show original content):")
for res in results_after_upsert_microsoft:
    print(f"  Content: {res.page_content}")
    print(f"  Metadata: {res.metadata}")
    assert "Windows operating systems" in res.page_content, "Microsoft document was unexpectedly changed!"
    assert res.metadata.get("source") == "wiki", "Microsoft document metadata was unexpectedly changed!"

print("\nMilvus upsert operation successfully demonstrated with LangChain!")

# Optional: Clean up the collection at the end
# utility.drop_collection(COLLECTION_NAME)
# print(f"Cleaned up collection: {COLLECTION_NAME}")