# Notebook: Connect and Write to Milvus DB in watsonx.data

This notebook demonstrates how to interact with an existing Milvus database within the watsonx.data

**Key Steps:**

- Fetching Wikipedia articles from the Wikipedia API
- Embedding text using the Multilingual-E5-Large model
- Writing text with embeddings to Milvus

**Extending the Notebook:**

This notebook serves as a starting point for building a custom data ingestion pipeline. You can enhance it by:

- Scraping a website of your choice
- Using alternative embedding models available in WatsonX

**Important Note:**

When using different embedding models, ensure that they are available in WatsonX.

## Imports

In [1]:
from pymilvus import connections, DataType, Collection, FieldSchema, CollectionSchema, utility, db
from llama_index.core import StorageContext
import pandas as pd
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings("ignore")
import dotenv
from dotenv import load_dotenv
import wikipedia
import requests
import re
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

# Credentials 
Define the paratmers in a .env

For the Lab, the credentials might be provided for you.

- **ProjectID and Access Token**: From a watsonx.ai project. You **often** need them but **not** for this notebook.
- **API-Key**: IBM Cloud/ IAM Access Management
- **Miluvs-Password**: API Key
- **Milvus-Host**: From watsonx.data/infrastructure/miluvs-instance

In [2]:
import os
host = os.getenv("MILVUS_HOST", None)
port = os.getenv("MILVUS_PORT", None)
password = os.getenv("MILVUS_PASSWORD", None)
user = "ibmlhapikey"
print(f"Host: {host}, Port: {port}, Password: {password}, User: {user}")

Host: 6c4f0bde-3b6a-4c3d-8e8d-8fa6a4754b37.cpltu27f0ccoute4fi6g.lakehouse.appdomain.cloud, Port: 32302, Password: j5tszlJQ4Hd6UnGt4wRy9nFBw1Em3OESzihAvYA2gVrt, User: ibmlhapikey


# Connect to Milvus

In [5]:
connections.connect(user=user, password=password, host=host, port=port, secure=True)
# test connection
print(connections.list_connections())
print(connections.get_connection_addr(alias="default"))
print(db.list_database())

MilvusException: <MilvusException: (code=2, message=Fail connecting to server on 6c4f0bde-3b6a-4c3d-8e8d-8fa6a4754b37.cpltu27f0ccoute4fi6g.lakehouse.appdomain.cloud:32302, illegal connection params or server unavailable)>

## To Disconnect

In [4]:
# connections.disconnect("default")
# print(connections.list_connections())

## To Delete Collection

In [5]:
# utility.drop_collection('collection_name')

---
# Set Collection Name and Get Wiki Data

Lets check all the existing collections in the default database

In [7]:
collections = utility.list_collections()
for collection in collections:
    # Initialize the Collection object
    collection = Collection(collection)
    # Print collection name
    print(f"Collection: {collection.name}\n")
    # Print collection schema
    print(f"Schema: {collection.schema}\n")

ConnectionNotExistException: <ConnectionNotExistException: (code=1, message=should create connection first.)>

Name the collection and define the wiki-article you want to fetch.

In [18]:
# Set collection name
COLLECTION_NAME = "docling_helvetia"  #name
# Set wiki article name
# wiki_title = 'wiki-page-title' # 'wiki-page-title'

- Some patterns are removed from the article
- You can use the print statements to see how the article looks


In [25]:
# Demonstration of combining Google Cloud Storage and Qdrant
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, AcceleratorOptions, \
    AcceleratorDevice, TesseractCliOcrOptions
from docling.document_converter import DocumentConverter
from docling.document_converter import PdfFormatOption
from docling_core.transforms.chunker import HierarchicalChunker
from docling.datamodel.settings import settings
from dotenv import load_dotenv
from langchain_docling import DoclingLoader

# Docling loader
def docling_loader(file_path, use_ocr_override: bool = False):
    pipeline_options = PdfPipelineOptions()
    pipeline_options.accelerator_options = AcceleratorOptions(
        num_threads=8, device=AcceleratorDevice.CUDA
    )
    # Set OCR based on the parameter (default: OCR disabled)
    pipeline_options.do_ocr = use_ocr_override

    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True  # uses text cells predicted from table structure model
    pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
    # Enable the profiling to measure the time spent
    # settings.debug.profile_pipeline_timings = True

    # return DoclingLoader(chunker=HybridChunker(tokenizer="intfloat/multilingual-e5-large", max_tokens=512), file_path=file_path, converter=doc_converter)
    return DoclingLoader(chunker=HierarchicalChunker(), file_path=file_path, converter=doc_converter)

In [26]:
# wiki_page = wikipedia.page(wiki_title)
# body = wiki_page.content
# #print(body)
# #remove headings
# cleaned_body = re.sub(r'(?m)^\s*=+.*=+\s*$', '', body)
# #print(cleaned_body)
# print(wiki_page.title)
# print(wiki_page.url)

- The article will be split into chunks, which will be vectorized using an embedding model before writing it into Milvus. 
- This process is necessary even when you write your own web scraping script, as you'll need to split the retrieved text before embedding it

In [27]:
# Split the article into chunks
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=510,
#     chunk_overlap=0,
#     # Define the separators to split the text
#     separators=["\n\n", "\n", ". ", "? ", "! "]
# )

# chunks = text_splitter.split_text(cleaned_body)
# print(f"Number of chunks: {len(chunks)}")

def load_document_with_conditional_ocr(file_path, text_length_threshold: int = 10):
    # First, try loading without OCR.
    loader = docling_loader(file_path, use_ocr_override=False)
    chunks = loader.load()

    # Check if we got any chunks and whether they contain enough text.
    total_text_length = sum(
        len(chunk.page_content) for chunk in chunks if hasattr(chunk, "page_content") and chunk.page_content)
    if not chunks or total_text_length < text_length_threshold:
        print("Detected empty or minimal text output. Reprocessing with OCR enabled...")
        # Re-run with OCR enabled.
        loader = docling_loader(file_path, use_ocr_override=True)
        chunks = loader.load()
    else:
        print("Text detected without OCR; skipping OCR step.")

    return chunks


from pathlib import Path

path = Path(r"../.data/AVB_NATIONAL/A12_DE.pdf")  #A01-S_DE.pdf")
files_path = []
# for file in path.iterdir():
chunks = load_document_with_conditional_ocr(path)
print(chunks)
# Print the chunks that will be embedded later
for i, chunk in enumerate(chunks):
    print(f"Chunk {i + 1}:")
    print(chunk)
    print("-" * 40)

Token indices sequence length is longer than the specified maximum sequence length for this model (595 > 512). Running this sequence through the model will result in indexing errors


Text detected without OCR; skipping OCR step.
[Document(metadata={'source': WindowsPath('../.data/AVB_NATIONAL/A12_DE.pdf'), 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/3', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 56.64, 't': 607.8, 'r': 73.8, 'b': 599.591, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 4]}]}, {'self_ref': '#/texts/4', 'parent': {'$ref': '#/groups/0'}, 'children': [], 'content_layer': 'body', 'label': 'list_item', 'prov': [{'page_no': 1, 'bbox': {'l': 56.64, 't': 596.76, 'r': 257.895, 'b': 577.361, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 46]}]}, {'self_ref': '#/texts/5', 'parent': {'$ref': '#/groups/0'}, 'children': [], 'content_layer': 'body', 'label': 'list_item', 'prov': [{'page_no': 1, 'bbox': {'l': 56.64, 't': 573.57, 'r': 159.236, 'b': 565.361, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 2

# Prepare Chunks and Create Embeddings

intfloat/multilingual-e5-large expects a "passage: " prefix before each text passage.

In [13]:
# Add  "passage: " to beginning of each chunk for e5-large
input_texts = ["passage: " + chunk.page_content for chunk in chunks]
metadata = [chunk.metadata for chunk in chunks]

# Embed the Chunks using SentenceTransformer with intfloat/multilingual-e5-large
model = SentenceTransformer("intfloat/multilingual-e5-large")
embeddings = model.encode(input_texts, normalize_embeddings=True)
dim = embeddings.shape[1]

NameError: name 'chunks' is not defined

---
# Write Data into Milvus

## Check if Collection exists, Define Schema

Before proceeding, we need to check if the collection already exists in the Milvus database.

**Check if Collection Existst:**

If the collection already exists, you can drop it using the command provided earlier in this notebook.

**Define Collection Schema:**

If the collection does not exist, we define the schema for the collection.
The embeddings will be stored in vector and the assosiated text chunks in text.

watsonx.data **expects** the following fields:**"vector"** (index) and **"text"**.

In [19]:
# Check if collection already exists and create fields
if COLLECTION_NAME in utility.list_collections():
    collection = Collection(COLLECTION_NAME)
    print(f"Collection '{COLLECTION_NAME}' already exists. Rename or drop the collection first.")
else:
    print(f"Collection '{COLLECTION_NAME}' does not exist. Proceed.")
    id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
    text_field = FieldSchema(name="text", dtype=DataType.VARCHAR)
    vector_field = FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=dim)
    page_number = FieldSchema(name="page_number", dtype=DataType.INT64)
    file_name = FieldSchema(name="file_name", dtype=DataType.VARCHAR)
    product_name = FieldSchema(name="product_name", dtype=DataType.VARCHAR, max_length=256)
    product_year = FieldSchema(name="product_year", dtype=DataType.VARCHAR, max_length=256)
    chapter = FieldSchema(name="chapter", dtype=DataType.VARCHAR)
    metadata_field = FieldSchema(name="metadata", dtype=DataType.JSON)
    schema = CollectionSchema(
        fields=[id_field, text_field, vector_field, page_number, file_name, product_name, product_year, metadata_field])
    collection = Collection(name=COLLECTION_NAME, schema=schema)

Collection 'docling_helvetia' already exists. Rename or drop the collection first.


In [None]:
#try to identify product name from front page using llm
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai import Credentials

creds = Credentials(
    url="https://eu-de.ml.cloud.ibm.com",
    api_key="YgK-ik0a_-zHtOyMj-iy8LW-i1aB5XOxMOgdndNv3j9B"
)


def extract_product_name(text, creds):
    prompt = f"""
    You are an assistant that extracts metadata from insurance documents.

    The following text is the first page of an insurance policy document.
    Your task is to identify the insurance product name mentioned in it and to identify the year of the version

    Return ONLY the product name as well as the year, no explanations.
    Respond only with a JSON object in the format:
    {{"product_name": "...", "year": "..."}}

    Text:
    \"\"\"
    {text}
    \"\"\"
    """

    model = ModelInference(
        model_id="ibm/granite-3-8b-instruct",
        credentials=creds,
        project_id="bf4840a1-a94d-47dc-824b-cdf9f215997e",
        params={
            "decoding_method": "greedy",
            "max_new_tokens": 100
        }
    )

    response = model.generate(prompt=prompt)
    return response["results"][0]["generated_text"].strip()

## Insert the Data, Create Index

Now we insert the embeddings and the text into milvus and define the index parameters. 

Refer to the pymilvus documentation for the index parameters.

In [15]:
# Insert .data
import json

# Convert each metadata dict to a JSON string, letting non-serializable objects be turned into strings.
def insert_into_milvus(collection, embeddings, text_chunks, product_name, product_year, file_name):
    data = [
        embeddings,
        text_chunks,
        [product_name] * len(text_chunks),
        [product_year] * len(text_chunks),
        [file_name] * len(text_chunks)
    ]
    collection.insert(data)
    collection.flush()

# Define the index 
index_params = {
    "metric_type": "COSINE",  # or L2 for example
    "index_type": "IVF_FLAT",
    "params": {"nlist": 1024}
}
# Create the index for the vector
collection.create_index(field_name="vector", index_params=index_params)
print(f"Index created on field 'vector' with params: {index_params}")

NameError: name 'metadata' is not defined

---
# Test the Collection

In [22]:
collection.load()
user_query = "Worum geht's im Art. 70?"
model = SentenceTransformer("intfloat/multilingual-e5-large")
query_embedding = model.encode(user_query).tolist()
search_results = collection.search(
    data=[query_embedding],
    anns_field="vector",
    param={"metric_type": "COSINE", "params": {"nprobe": 10}},
    limit=5,
    output_fields=["text", "metadata"]
)

for hits in search_results:
    for hit in hits:
        print(f"Chunk content: {hit.entity.get('text')}")
        print(f"Metadata: {hit.entity.get('metadata')}")
        print(f"Distance: {hit.distance}")
        print("---")

Chunk content: passage: Militärdienst und Krieg
Art.
70 Regelung zur Deckung der Risiken aus Militärdienst und Krieg
Metadata: {"source": "..\\.data\\AVB_NATIONAL\\A12_DE.pdf", "dl_meta": {"schema_name": "docling_core.transforms.chunker.DocMeta", "version": "1.0.0", "doc_items": [{"self_ref": "#/texts/67", "parent": {"$ref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 315.41, "t": 152.08000000000004, "r": 332.566, "b": 143.87099999999998, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 4]}]}, {"self_ref": "#/texts/68", "parent": {"$ref": "#/groups/5"}, "children": [], "content_layer": "body", "label": "list_item", "prov": [{"page_no": 1, "bbox": {"l": 315.41, "t": 141.03999999999996, "r": 519.425, "b": 121.91100000000006, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 63]}]}], "headings": ["Milit\u00e4rdienst und Krieg"], "origin": {"mimetype": "application/pdf", "binary_hash": 5349271543704535424, "filename": "A12_DE.pdf"}}