# ETL Pipeline: Store documents as embeddings in Milvus

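This notebook tests the individual stages of the document import pipeline that prepares documents for Retrieval-Augmented Generation (RAG): loading and chunking, metadata enrichment, embedding, and storage in Milvus. Each chunk is tagged with commonly used metadata fields to support robust traceability, filtering, and retrieval.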

## 1) Setup Imports

In [4]:
import os
import time
from pathlib import Path
from typing import List
import json
import hashlib

from langchain_community.document_loaders import PyPDFLoader
from langchain_unstructured import UnstructuredLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_community.vectorstores import Milvus
from langchain_core.documents import Document
from pymilvus import connections, utility, Collection


print("Environment setup complete.")

Environment setup complete.


## 2) Configuration

In [5]:
# --- Input file ---------------------------------------------------------------
# test_file_name = "RAG2005.11401v4.pdf"  # Change as needed
test_file_name = "Oxford Collocations Dictionary for Students of English.pdf"  # Change as needed
staging_folder = "../staging"
test_file_path = os.path.join(staging_folder, test_file_name)


def _env_flag(name: str, default: str) -> bool:
    """Return True when the environment variable (or its default) equals 'true', ignoring case."""
    return os.getenv(name, default).lower() == 'true'


# --- Embeddings served by Ollama ----------------------------------------------
EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL', 'mxbai-embed-large')
OLLAMA_HOST = os.getenv('OLLAMA_HOST', 'http://localhost:11434')

# --- Chat LLM used for metadata enrichment (falls back to the Ollama host) ----
CHAT_MODEL = os.getenv('CHAT_MODEL', 'mistral:latest')
CHAT_BASE_URL = os.getenv('CHAT_BASE_URL', OLLAMA_HOST)
CHAT_TEMPERATURE = float(os.getenv('CHAT_TEMPERATURE', '0.1'))

# --- Chunking options handed to UnstructuredLoader ----------------------------
UNSTRUCTURED_CHUNKING_STRATEGY = os.getenv('UNSTRUCTURED_CHUNKING_STRATEGY', 'basic')
UNSTRUCTURED_MAX_CHARACTERS = int(os.getenv('UNSTRUCTURED_MAX_CHARACTERS', '1000'))
UNSTRUCTURED_OVERLAP = int(os.getenv('UNSTRUCTURED_OVERLAP', '200'))
UNSTRUCTURED_INCLUDE_ORIG = _env_flag('UNSTRUCTURED_INCLUDE_ORIG', 'false')

# --- Milvus target ------------------------------------------------------------
MILVUS_HOST = os.getenv('MILVUS_HOST', 'localhost')
MILVUS_PORT = int(os.getenv('MILVUS_PORT', '19530'))
COLLECTION_NAME = os.getenv('COLLECTION_NAME', 'documents')
MILVUS_DROP_COLLECTION = _env_flag('MILVUS_DROP_COLLECTION', 'false')
MILVUS_PARTITION_BY_SOURCE = _env_flag('MILVUS_PARTITION_BY_SOURCE', 'true')

# --- Document ID --------------------------------------------------------------
# The ID is the first 16 hex characters of a SHA-1 over "<absolute path>|<size>|<mtime>".
# Size and mtime fall back to 0 when the file is not present on disk.
abs_path = str(Path(test_file_path).resolve())
file_stat = os.stat(test_file_path) if os.path.exists(test_file_path) else None
if file_stat:
    size_part, mtime_part = file_stat.st_size, int(file_stat.st_mtime)
else:
    size_part, mtime_part = 0, 0
file_sig = f"{abs_path}|{size_part}|{mtime_part}"
document_id = hashlib.sha1(file_sig.encode('utf-8')).hexdigest()[:16]

# --- Echo the effective configuration -----------------------------------------
for config_line in (
    "Config:",
    f"  File: {test_file_path}",
    f"  Embedding model: {EMBEDDING_MODEL} @ {OLLAMA_HOST}",
    f"  Chat model: {CHAT_MODEL} @ {CHAT_BASE_URL}",
    f"  Unstructured chunking: strategy={UNSTRUCTURED_CHUNKING_STRATEGY}, max_chars={UNSTRUCTURED_MAX_CHARACTERS}, overlap={UNSTRUCTURED_OVERLAP}, include_orig={UNSTRUCTURED_INCLUDE_ORIG}",
    f"  Milvus: {MILVUS_HOST}:{MILVUS_PORT}, collection={COLLECTION_NAME}, drop={MILVUS_DROP_COLLECTION}, partition_by_source={MILVUS_PARTITION_BY_SOURCE}",
    f"  Document ID (stable): {document_id}",
):
    print(config_line)

Config:
  File: ../staging/Oxford Collocations Dictionary for Students of English.pdf
  Embedding model: mxbai-embed-large @ 10.0.10.100
  Chat model: llama3:latest @ 10.0.10.100
  Unstructured chunking: strategy=basic, max_chars=1000, overlap=200, include_orig=False
  Milvus: localhost:19530, collection=documents, drop=False, partition_by_source=True
  Document ID (stable): 50c8e45ebb4c603a


## 3) Load Document

In [None]:
# Load the staged file through UnstructuredLoader, passing the UNSTRUCTURED_*
# chunking settings from the configuration cell.
documents: List[Document] = []
used_unstructured = False
loader_chunked = False

if not os.path.exists(test_file_path):
    raise FileNotFoundError(f'File not found: {test_file_path}')

try:
    print("Using UnstructuredLoader with chunking...")
    loader = UnstructuredLoader(
        test_file_path,
        chunking_strategy=UNSTRUCTURED_CHUNKING_STRATEGY,
        max_characters=UNSTRUCTURED_MAX_CHARACTERS,
        overlap=UNSTRUCTURED_OVERLAP,
        include_orig_elements=UNSTRUCTURED_INCLUDE_ORIG,
    )
    documents = loader.load()
except Exception as e:
    print(f"UnstructuredLoader failed: {e}")
    # Raise instead of calling exit(): an exception stops this cell (and a
    # "Run All") with a full traceback while keeping the kernel and its state
    # available for debugging.
    raise RuntimeError(f"UnstructuredLoader failed for {test_file_path}") from e

# Only reached when loading succeeded.
used_unstructured = True
loader_chunked = bool(UNSTRUCTURED_CHUNKING_STRATEGY)
print(f"Loaded {len(documents)} elements via UnstructuredLoader (chunked={loader_chunked})")

# Basic stats
total_len = sum(len(d.page_content or '') for d in documents)
print(f"Total content length: {total_len:,} chars")

In [None]:
# Preview a bounded sample of the loaded elements. Printing every element of a
# large document floods the cell output and bloats the saved notebook.
preview_limit = 5
print(len(documents))
for doc in documents[:preview_limit]:
    print(f"{doc}\n")
if len(documents) > preview_limit:
    print(f"... {len(documents) - preview_limit} more elements not shown")

## 4) Setup Metadata and Classification of Topic

In [None]:
# Turn the UnstructuredLoader output into the chunk list and attach
# traceability metadata (source, page, deterministic IDs, extraction details).
chunks: List[Document] = []

print("Using chunks from UnstructuredLoader).")
for i, d in enumerate(documents):
    text = d.page_content or ''
    meta = d.metadata or {}
    # Prefer the loader's page information; fall back to the element's 1-based position.
    page = meta.get('page') or meta.get('page_number') or (i + 1)
    # Deterministic IDs: the chunk ID combines the document ID with a hash of the chunk text.
    content_hash = hashlib.sha1(text.encode('utf-8')).hexdigest()[:16]
    meta.update({
        'source': test_file_name,
        'source_path': abs_path,
        'page': page,
        'document_id': document_id,
        'chunk_id': f"{document_id}:{content_hash}",
        'content_hash': content_hash,
        'content_length': len(text),
        'extraction_method': 'unstructured',
        'loader_name': 'UnstructuredLoader',
        'ocr_used': bool(meta.get('is_ocr') or meta.get('ocr_confidence')),
        'processing_timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        'has_embedding': False,
    })
    d.metadata = meta
    chunks.append(d)

print(f"Total chunks: {len(chunks)}")

# === LLM-based metadata enrichment (topic classification) ===
print("\n🤖 Enriching chunk metadata with LLM classification (topic)...")
chat_llm = ChatOllama(model=CHAT_MODEL, base_url=CHAT_BASE_URL, temperature=CHAT_TEMPERATURE)

classification_prompt_tpl = (
    "You are classifying a text chunk for RAG metadata.\n"
    "Return ONLY compact JSON with keys: topic.\n"
    "topic: concise subject title (3-6 words).\n"
    "Text:\n{chunk}\n"
)


def _extract_topic(raw):
    """Return the non-empty 'topic' value found in the model reply, or None.

    The reply is expected to contain a JSON object; everything from the first
    '{' to the last '}' is parsed. Raises ValueError (json.JSONDecodeError)
    when that span is not valid JSON.
    """
    start = raw.find('{')
    end = raw.rfind('}') + 1
    if start == -1 or end <= start:
        return None
    parsed = json.loads(raw[start:end])
    if isinstance(parsed, dict) and parsed.get('topic'):
        return parsed['topic']
    return None


failed_classifications = 0
for c in chunks:
    # Only the first 800 characters are sent to keep the prompt small.
    snippet = (c.page_content or '')[:800]
    prompt = classification_prompt_tpl.format(chunk=snippet)
    topic = None
    try:
        resp = chat_llm.invoke(prompt).content.strip()
        topic = _extract_topic(resp)
    except Exception as exc:
        # Best-effort enrichment: keep going, but surface the first error so a
        # misconfigured or unreachable chat model does not go unnoticed.
        failed_classifications += 1
        if failed_classifications == 1:
            print(f"Topic classification error (first occurrence): {exc!r}")
    if topic:
        c.metadata['topic'] = topic
    else:
        # Also covers replies that parsed fine but carried no usable topic,
        # so every chunk ends up with a 'topic' key.
        c.metadata.setdefault('topic', 'unknown')

if failed_classifications:
    print(f"Topic classification failed for {failed_classifications} of {len(chunks)} chunks.")
print("LLM enrichment complete.")

In [None]:
# Preview the enriched metadata for a bounded sample of chunks. Iterating
# `chunks` (the output of the enrichment step) rather than `documents` keeps
# the preview tied to the data that is embedded and stored later.
preview_limit = 5
for doc in chunks[:preview_limit]:
    print(f"{doc.metadata}\n")
if len(chunks) > preview_limit:
    print(f"... {len(chunks) - preview_limit} more chunks not shown")

## 5) Generate Embeddings

In [None]:
# Embed every chunk once to validate the embedding model's output before storage.
if not chunks:
    raise RuntimeError('No chunks to embed')

embeddings_model = OllamaEmbeddings(model=EMBEDDING_MODEL, base_url=OLLAMA_HOST)
texts = [c.page_content for c in chunks]
embs = embeddings_model.embed_documents(texts)
print(f"Embeddings generated: {len(embs)}")

# Every chunk must have received a vector; otherwise flagging all chunks with
# has_embedding=True below would be wrong.
if len(embs) != len(chunks):
    raise ValueError(f"Expected {len(chunks)} embeddings, got {len(embs)}")
missing_vectors = sum(1 for vec in embs if vec is None or len(vec) == 0)
if missing_vectors:
    raise ValueError(f"{missing_vectors} chunks did not receive an embedding vector")

# Validate dimensions and attach embedding metadata
dims = {len(vec) for vec in embs}
if len(dims) != 1:
    raise ValueError(f"Inconsistent embedding dimensions found: {dims}")
embedding_dim = dims.pop()

for c in chunks:
    c.metadata['embedding_model'] = EMBEDDING_MODEL
    c.metadata['embedding_dimension'] = embedding_dim
    c.metadata['has_embedding'] = True
print(f"Embedding metadata attached to chunks (dim={embedding_dim}).")
# NOTE(review): `embs` is used only for validation here; the storage step passes
# `embeddings_model` to Milvus.from_texts, which presumably embeds the texts
# again. Confirm whether that duplicate work is acceptable.

## 6) Store in Milvus

In [None]:

print(f"Milvus target: {MILVUS_HOST}:{MILVUS_PORT}, collection={COLLECTION_NAME}")
connections.connect('default', host=MILVUS_HOST, port=MILVUS_PORT)

# Collection settings shared by the insert step that follows.
collection_name = COLLECTION_NAME
metric_type = 'COSINE'
index_params = dict(index_type="AUTOINDEX", metric_type=metric_type)

print(f"Chunks available before dedupe: {len(chunks)}")

# Dev-only: drop the collection when the flag is set and the collection exists.
# The existence check is skipped entirely when the flag is off.
drop_requested = MILVUS_DROP_COLLECTION and utility.has_collection(collection_name)
if drop_requested:
    print(f"Dropping existing collection: {collection_name}")
    utility.drop_collection(collection_name)

print(f"Creating and populating collection: {collection_name}")

# Keep the first chunk seen for each content_hash; chunks without a hash are skipped.
seen = set()
unique_docs = []
for chunk in chunks:
    digest = (chunk.metadata or {}).get('content_hash')
    if digest and digest not in seen:
        seen.add(digest)
        unique_docs.append(chunk)

removed_count = len(chunks) - len(unique_docs)
print(f"Unique chunks after dedupe: {len(unique_docs)}")
if removed_count:
    print(f"Deduped {removed_count} duplicate chunks by content_hash.")

# Parallel lists consumed by the vector store insert.
texts = [doc.page_content for doc in unique_docs]
metas = [doc.metadata for doc in unique_docs]

# Sanitize metadata: convert lists/dicts to strings
def _sanitize_meta(m):
    clean = {}
    for k, v in (m or {}).items():
        if isinstance(v, (str, int, float, bool)) or v is None:
            clean[k] = v
        elif isinstance(v, (list, tuple)):
            clean[k] = json.dumps(v, ensure_ascii=False)
        elif isinstance(v, dict):
            clean[k] = json.dumps(v, ensure_ascii=False)
        else:
            clean[k] = str(v)
    return clean

# Run every metadata dict through the sanitizer and rebind `metas` to the cleaned copies.
metas = [_sanitize_meta(m) for m in metas]

# Project metadata to a fixed lean schema and fill defaults
def _project_meta(m):
    return {
        'document_id': str(m.get('document_id', '')),
        'source': str(m.get('source', '')),
        'page': int(m.get('page', 0) or 0),
        'chunk_id': str(m.get('chunk_id', '')),
        'topic': str(m.get('topic', '')),
        'category': str(m.get('category', '')),
        'content_hash': str(m.get('content_hash', '')),
        'content_length': int(m.get('content_length', 0) or 0),
    }

# Reduce each metadata dict to the fixed field set before insertion.
metas = [_project_meta(m) for m in metas]

if not texts:
    raise RuntimeError("No texts to insert into Milvus. Verify earlier steps produced chunks with content.")

# Quick preview (texts is non-empty here, and metas has one entry per text)
print("Sample meta:", metas[0] if metas else {})
print("Sample text length:", len(texts[0]))

# Create and populate collection in a single step to ensure schema matches metadatas
try:
    vectorstore = Milvus.from_texts(
        texts=texts,
        embedding=embeddings_model,
        metadatas=metas,
        collection_name=collection_name,
        connection_args={'host': MILVUS_HOST, 'port': MILVUS_PORT},
        index_params=index_params,
    )
except Exception as e:
    print("Milvus.from_texts failed:", repr(e))
    raise

# Ensure collection is flushed, loaded and report stats
col = Collection(collection_name)
try:
    col.flush()
except Exception as e:
    # A failed flush is reported but not fatal; the count below may then lag.
    print("Flush warning:", repr(e))

col.load()
# num_entities is read from the collection as a whole, so with a pre-existing
# collection it can differ from the number of chunks submitted in this run.
print(f"Stored {col.num_entities} chunks in Milvus collection '{collection_name}'.")

# If zero, retry insertion explicitly via add_texts
if col.num_entities == 0:
    print("Insertion resulted in 0 entities. Retrying with add_texts()...")
    try:
        vectorstore.add_texts(texts=texts, metadatas=metas)
        try:
            col.flush()
        except Exception as e:
            # Report instead of silently ignoring, consistent with the first flush.
            print("Flush warning:", repr(e))
        col.load()
        print(f"After retry, stored {col.num_entities} chunks in Milvus collection '{collection_name}'.")
    except Exception as e:
        print("Retry with add_texts failed:", repr(e))

# Show index info (informational only, so a failure is reported rather than raised)
try:
    indexes = col.indexes
    print(f"Indexes: {[i.params for i in indexes]}")
except Exception as e:
    print("Could not read index info:", repr(e))


## 7) Validate Retrieval

In [23]:
# Smoke-test retrieval: run a few free-text queries and show the top 3 hits for each.
queries = [
    'retrieval augmented generation',
    'vector databases',
    'what is a verb in the structure of a sentence'
]

# (label, metadata key) pairs printed for each hit, in display order.
shown_fields = (
    ('Source', 'source'),
    ('Page', 'page'),
    ('Chunk ID', 'chunk_id'),
    ('Topic', 'topic'),
    ('Category', 'category'),
    ('Embedding Model', 'embedding_model'),
)

for query in queries:
    print(f"\nQuery: {query}\n{'-'*60}")
    results = vectorstore.similarity_search(query, k=3)
    for rank, hit in enumerate(results, start=1):
        print(f"Result {rank}:")
        # Only the first 160 characters of the chunk are shown.
        print(f"  Content: {hit.page_content[:160]}...")
        meta = hit.metadata or {}
        for label, key in shown_fields:
            print(f"  {label}: {meta.get(key)}")


Query: retrieval augmented generation
------------------------------------------------------------


INFO: HTTP Request: POST http://10.0.10.100:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://10.0.10.100:11434/api/embed "HTTP/1.1 200 OK"
INFO: HTTP Request: POST http://10.0.10.100:11434/api/embed "HTTP/1.1 200 OK"


Result 1:
  Content: © VERB + GENERATE help (to) the opportunity to help generate ideas | be used to The wind turbines are used to generate electricity, | be expected to, be likely ...
  Source: Oxford Collocations Dictionary for Students of English.pdf
  Page: 356
  Chunk ID: 50c8e45ebb4c603a:779ba18696b6fa41
  Topic: Idea Generation
  Category: CompositeElement
  Embedding Model: None
Result 2:
  Content: @ PREP. in a/tha~ I decided to show the results in a bar graph. on ajthe~ We can see on this graph how the com- pany has grown over the last year

@ PHRASES in ...
  Source: Oxford Collocations Dictionary for Students of English.pdf
  Page: 365
  Chunk ID: 50c8e45ebb4c603a:6df2d4625dec23d8
  Topic: Computer Graphics
  Category: CompositeElement
  Embedding Model: None
Result 3:
  Content: In recent years, teachers and students have become increasingly aware of the importance of collocation in English language learning. However, no matter how conv...
  Source: Oxford Collocations Dic

## 8) Remove Document

In [7]:
def _quote_filter_value(value):
    """Return ``value`` as a double-quoted string literal for a filter expression.

    Backslashes and double quotes are escaped so that a file name containing
    them cannot terminate the literal early and corrupt the expression.
    Assumes the filter grammar accepts backslash escapes inside string
    literals; confirm against the Milvus boolean-expression documentation.
    """
    escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
    return f'"{escaped}"'


# Connect to Milvus (reuse existing connection)
connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)

# Load the collection
collection = Collection(COLLECTION_NAME)
collection.load()

# Check entities before deletion
print(f"Entities before deletion: {collection.num_entities}")

# Delete records matching either document_id or source.
# The source clause also removes chunks ingested from the same file name under
# a different document_id (for example after the file's size or mtime changed).
source_clause = f'source == {_quote_filter_value(test_file_name)}'
delete_expr = f'document_id == {_quote_filter_value(document_id)} or {source_clause}'
print(f"Delete expression: {delete_expr}")

# Perform deletion
try:
    delete_result = collection.delete(expr=delete_expr)
    print(f"Delete result: {delete_result}")

    # Flush to ensure deletion is persisted
    collection.flush()

    # Reload and verify deletion
    collection.load()
    # NOTE(review): num_entities may not reflect deletions immediately; the
    # verification query below is the check that decides success. Confirm.
    print(f"Entities after deletion: {collection.num_entities}")

    # Verify no records remain for this document
    verification_results = collection.query(
        expr=delete_expr,
        output_fields=["document_id", "source", "chunk_id"],
        limit=10
    )
    print(f"Verification query results: {verification_results}")

    if not verification_results:
        print(f"✅ Successfully removed all chunks for document: {test_file_name} (ID: {document_id})")
    else:
        print(f"⚠️ Warning: {len(verification_results)} chunks still remain")

except Exception as e:
    print(f"❌ Deletion failed: {e}")
    # Show what records exist for debugging
    try:
        existing = collection.query(
            expr=source_clause,
            output_fields=["document_id", "source", "chunk_id"],
            limit=5
        )
        print(f"Existing records for {test_file_name}: {existing}")
    except Exception as debug_e:
        print(f"Debug query also failed: {debug_e}")

Entities before deletion: 0
Delete expression: document_id == "50c8e45ebb4c603a" or source == "Oxford Collocations Dictionary for Students of English.pdf"
Delete result: (insert count: 0, delete count: 0, upsert count: 0, timestamp: 0, success count: 0, err count: 0
Entities after deletion: 0
Verification query results: data: [], extra_info: {}
✅ Successfully removed all chunks for document: Oxford Collocations Dictionary for Students of English.pdf (ID: 50c8e45ebb4c603a)
Entities after deletion: 0
Verification query results: data: [], extra_info: {}
✅ Successfully removed all chunks for document: Oxford Collocations Dictionary for Students of English.pdf (ID: 50c8e45ebb4c603a)
