# Notebook: Connect and Write to Milvus DB in watsonx.data

This notebook demonstrates how to interact with an existing Milvus database within the watsonx.data

**Key Steps:**

- Fetching Wikipedia articles from the Wikipedia API
- Embedding text using the Multilingual-E5-Large model
- Writing text with embeddings to Milvus

**Extending the Notebook:**

This notebook serves as a starting point for building a custom data ingestion pipeline. You can enhance it by:

- Scraping a website of your choice
- Using alternative embedding models available in WatsonX

**Important Note:**

When using different embedding models, ensure that they are available in WatsonX.

## Imports

In [2]:
from pymilvus import connections, DataType, Collection, FieldSchema, CollectionSchema, utility, db, drop_collection
from llama_index.core import StorageContext
import pandas as pd
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import warnings

warnings.filterwarnings("ignore")
import dotenv
from dotenv import load_dotenv
import wikipedia
import requests
import re

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

# Credentials 
Define the paratmers in a .env

For the Lab, the credentials might be provided for you.

- **ProjectID and Access Token**: From a watsonx.ai project. You **often** need them but **not** for this notebook.
- **API-Key**: IBM Cloud/ IAM Access Management
- **Miluvs-Password**: API Key
- **Milvus-Host**: From watsonx.data/infrastructure/miluvs-instance

In [3]:
import os

host = os.getenv("MILVUS_HOST", None)
port = os.getenv("MILVUS_PORT", None)
password = os.getenv("MILVUS_PASSWORD", None)
user = os.getenv("MILVUS_USER", None)
print(f"Host: {host}, Port: {port}, Password: {password}, User: {user}")

Host: 102092af-5474-4a42-8dc2-35bb05ffdd0e.cvgfjtof0l91rq0joaj0.lakehouse.appdomain.cloud, Port: 31574, Password: oLDjQJic9EgszqIbNKUCR4_pxvr1L-DGKlpxkQ7p7zJK, User: ibmlhapikey


# Connect to Milvus

In [4]:
connections.connect(user=user, password=password, host=host, port=port, secure=True)
# test connection
print(connections.list_connections())
print(connections.get_connection_addr(alias="default"))
print(db.list_database())

[('default', <pymilvus.client.grpc_handler.GrpcHandler object at 0x0000020128249DF0>)]
{'secure': True, 'address': '102092af-5474-4a42-8dc2-35bb05ffdd0e.cvgfjtof0l91rq0joaj0.lakehouse.appdomain.cloud:31574', 'user': 'ibmlhapikey'}
['group_02', 'group_03', 'default', 'group_01']


## To Disconnect

In [5]:
# connections.disconnect("default")
print(connections.list_connections())

[('default', <pymilvus.client.grpc_handler.GrpcHandler object at 0x0000020128249DF0>)]


## To Delete Collection

In [113]:
#utility.drop_collection('docling_helvetia')

---
# Set Collection Name and Get Wiki Data

Lets check all the existing collections in the default database

In [161]:
collections = utility.list_collections()
for collection in collections:
    # Initialize the Collection object
    collection = Collection(collection)
    # Print collection name
    print(f"Collection: {collection.name}\n")
    # Print collection schema
    print(f"Schema: {collection.schema}\n")
    if collection.name == "docling_helvetia":
        # Print collection statistics
        print(f"Statistics: {collection.num_entities} entities\n")
        # Print collection description
        print(f"Description: {collection.description}\n")
        print(f"Number of shards: {collection.num_shards}\n")

Collection: ibm_based_embeddings

Schema: {'auto_id': True, 'description': 'Document embeddings with metadata', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'product_name', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 256}}, {'name': 'year', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 256}}, {'name': 'file_name', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 256}}], 'enable_dynamic_field': True}

Collection: docling_helvetia

Schema: {'auto_id': True, 'description': 'Collection for storing text chunks and their embeddings', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 

Name the collection and define the wiki-article you want to fetch.

In [6]:
# Set collection name
COLLECTION_NAME = "docling_helvetia"  #name
# Set wiki article name
# wiki_title = 'wiki-page-title' # 'wiki-page-title'

- Some patterns are removed from the article
- You can use the print statements to see how the article looks


In [8]:
# Demonstration of combining Google Cloud Storage and Qdrant
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, AcceleratorOptions, \
    AcceleratorDevice, TesseractCliOcrOptions, EasyOcrOptions
from docling.document_converter import DocumentConverter
from docling.document_converter import PdfFormatOption
from docling_core.transforms.chunker import HierarchicalChunker
from docling.datamodel.settings import settings
from dotenv import load_dotenv
from langchain_docling import DoclingLoader


# Docling loader
def docling_loader(file_path, use_ocr_override: bool = False):
    pipeline_options = PdfPipelineOptions()
    pipeline_options.accelerator_options = AcceleratorOptions(
        num_threads=8, device=AcceleratorDevice.CUDA
    )
    # Set OCR based on the parameter (default: OCR disabled)
    pipeline_options.do_ocr = use_ocr_override
    pipeline_options.ocr_options = EasyOcrOptions(
        lang=["fr", "de", "es", "en"],
        force_full_page_ocr=True,
        use_gpu=True,
    )

    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True  # uses text cells predicted from table structure model
    pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
    # Enable the profiling to measure the time spent
    # settings.debug.profile_pipeline_timings = True

    # return DoclingLoader(chunker=HybridChunker(tokenizer="intfloat/multilingual-e5-large", max_tokens=512), file_path=file_path, converter=doc_converter)
    return DoclingLoader(chunker=HierarchicalChunker(), file_path=file_path, converter=doc_converter)

- The docs will be split into chunks, which will be vectorized using an embedding model before writing it into Milvus.
- This process is necessary even when you write your own web scraping script, as you'll need to split the retrieved text before embedding it

In [9]:
def load_document_with_conditional_ocr(file_path, text_length_threshold: int = 10):
    # First, try loading without OCR.
    loader = docling_loader(file_path, use_ocr_override=False)
    chunks = loader.load()

    # Check if we got any chunks and whether they contain enough text.
    total_text_length = sum(
        len(chunk.page_content) for chunk in chunks if hasattr(chunk, "page_content") and chunk.page_content)
    if not chunks or total_text_length < text_length_threshold:
        print("Detected empty or minimal text output. Reprocessing with OCR enabled...")
        # Re-run with OCR enabled.
        loader = docling_loader(file_path, use_ocr_override=True)
        chunks = loader.load()
    else:
        print("Text detected without OCR; skipping OCR step.")

    return chunks


from pathlib import Path

all_chunks = []
directory_path = Path(r"../.data/AVB_HELVETIA/")  #A01-S_DE.pdf")
files_path = []
# Iterate over all files in the specified directory
for file_path in directory_path.iterdir():
    if file_path.is_file() and file_path.suffix.lower() == '.pdf':
        print(f"Processing file: {file_path.name}")
        try:
            # Load and chunk the current file
            chunks = load_document_with_conditional_ocr(str(file_path))
            # Add the chunks from this file to the main list
            all_chunks.extend(chunks)
            print(f"Finished processing {file_path.name}. Found {len(chunks)} chunks.")
        except Exception as e:
            print(f"Error processing file {file_path.name}: {e}")
        print("-" * 40)
# Print all the collected chunks (optional)
print(f"\nTotal chunks collected from all files: {len(all_chunks)}")

Processing file: 1001 01.71_DE.pdf
Text detected without OCR; skipping OCR step.
Finished processing 1001 01.71_DE.pdf. Found 89 chunks.
----------------------------------------
Processing file: 1001 01.83_DE.pdf
Text detected without OCR; skipping OCR step.
Finished processing 1001 01.83_DE.pdf. Found 232 chunks.
----------------------------------------
Processing file: 1001 02.47_DE.pdf
Detected empty or minimal text output. Reprocessing with OCR enabled...
Finished processing 1001 02.47_DE.pdf. Found 40 chunks.
----------------------------------------
Processing file: 1001 05.80_DE.pdf
Text detected without OCR; skipping OCR step.
Finished processing 1001 05.80_DE.pdf. Found 128 chunks.
----------------------------------------
Processing file: 1001 05.81_DE.pdf
Detected empty or minimal text output. Reprocessing with OCR enabled...
Finished processing 1001 05.81_DE.pdf. Found 396 chunks.
----------------------------------------
Processing file: 1001 06.73a_DE.pdf
Detected empty or m

In [74]:
print(len(all_chunks))
filtered_chunks = []
# for chunk in all_chunks:
#     file_name = chunk.metadata.get("dl_meta", {}).get("origin", {}).get("filename", "Unknown")
#     correct_filename = "SEV-Zusatzbedingungen Jugendvorsorge-Versicherung_Terminvers_01.2010_DE.pdf"
#     if file_name == "SEV-Zusatzbedingungen Jugendvorsorge-Versicherung _Terminvers_01.2010_DE.pdf":
#         chunk.metadata["dl_meta"]["origin"]["filename"] = correct_filename
#         print(f"Corrected filename: {correct_filename}")
#     if file_name != "Raiffeisen EB Erwerbsausfall 1996 K od U_DE.pdf":
#         filtered_chunks.append(chunk)
# print(f"Filtered chunks: {len(filtered_chunks)}")
# all_chunks = filtered_chunks

70775
Corrected filename: SEV-Zusatzbedingungen Jugendvorsorge-Versicherung_Terminvers_01.2010_DE.pdf
Corrected filename: SEV-Zusatzbedingungen Jugendvorsorge-Versicherung_Terminvers_01.2010_DE.pdf
Corrected filename: SEV-Zusatzbedingungen Jugendvorsorge-Versicherung_Terminvers_01.2010_DE.pdf
Corrected filename: SEV-Zusatzbedingungen Jugendvorsorge-Versicherung_Terminvers_01.2010_DE.pdf
Corrected filename: SEV-Zusatzbedingungen Jugendvorsorge-Versicherung_Terminvers_01.2010_DE.pdf
Corrected filename: SEV-Zusatzbedingungen Jugendvorsorge-Versicherung_Terminvers_01.2010_DE.pdf


# Prepare Chunks and Create Embeddings

intfloat/multilingual-e5-large expects a "passage: " prefix before each text passage.

In [75]:
import json
from pathlib import Path


# Function to make metadata JSON serializable (reuse from previous context)
def make_serializable(metadata_dict):
    serializable_meta = {}
    if not isinstance(metadata_dict, dict):  # Handle cases where input isn't a dict
        return str(metadata_dict) if isinstance(metadata_dict, Path) else metadata_dict

    for key, value in metadata_dict.items():
        if isinstance(value, Path):
            serializable_meta[key] = str(value)  # Convert Path to string
        elif isinstance(value, dict):
            serializable_meta[key] = make_serializable(value)  # Recursively process nested dicts
        elif isinstance(value, list):
            serializable_meta[key] = [make_serializable(item) for item in value]  # Recursively process list items
        else:
            # Assume other types are directly serializable
            serializable_meta[key] = value
    return serializable_meta


# Add  "passage: " to beginning of each chunk for e5-large
input_texts = ["passage: " + chunk.page_content for chunk in all_chunks]
docling_metadata = [chunk.metadata for chunk in all_chunks]
# Make metadata serializable
serializable_metadata = [make_serializable(meta) for meta in docling_metadata]

page_numbers = []
chapters = []  # Example: Extract from 'headings' if relevant
file_names = []
for meta in serializable_metadata:
    # Page Number Extraction (Example)
    page_no = -1  # Default value
    try:
        # Adjust path based on your actual metadata structure
        page_no = meta.get('dl_meta', {}).get('doc_items', [{}])[0].get('prov', [{}])[0].get('page_no', -1)
    except (IndexError, KeyError, AttributeError):
        pass  # Keep default if extraction fails
    page_numbers.append(page_no)

    # Chapter Extraction (Example - using 'headings')
    chapter_name = "Unknown"  # Default value
    try:
        # Assumes 'headings' list exists and contains the relevant chapter
        headings = meta.get('dl_meta', {}).get('headings', [])
        if headings:
            chapter_name = headings[0]  # Take the first heading as chapter
    except (KeyError, AttributeError):
        pass
    # Ensure chapter name does not exceed max_length if defined in schema
    chapters.append(chapter_name)  # Use schema max_length

    file_name = "Unknown"  # Default value
    try:
        # Adjust path based on your actual metadata structure
        file_name = meta.get('dl_meta', {}).get('origin', {}).get('filename', "Unknown")
    except (IndexError, KeyError, AttributeError):
        pass  # Keep default if extraction fails
    file_names.append(file_name)

### Backup just in case

In [147]:
import pickle

# Define the backup file path
backup_file = "all_chunks_backup.pkl"

# Save all_chunks to file
with open(backup_file, "wb") as f:
    pickle.dump(all_chunks, f)

print(f"✓ Successfully saved {len(all_chunks)} chunks to {backup_file}")


✓ Successfully saved 70775 chunks to all_chunks_backup.pkl


In [152]:
# Load backup data
with open("all_chunks_backup.pkl", "rb") as f:
    all_chunks_restored = pickle.load(f)

print(f"✓ Loaded {len(all_chunks_restored)} chunks from backup file")
all_chunks_restored = None

✓ Loaded 70775 chunks from backup file


In [170]:
# Extract from Excel
# Placeholder values - replace with actual extraction logic
product_names = []
product_years = []
company_entity_names = []
try:
    df = pd.read_excel("../.data/Uebersicht_AVB_formatted.xlsx", sheet_name="Helvetia", dtype={'Year': str})
    for file_name in file_names:
        matching_row = df[df['File'] == file_name]
        if not matching_row.empty:
            # Get the first match (assuming filenames are unique)
            product_name = matching_row.iloc[0]['Produkt']
            split_date = str(matching_row.iloc[0]['Year']).strip()
            month, _sep, year = split_date.rpartition(".")
            product_month = month if _sep else 1
            product_year = year if _sep else split_date
            if product_year == "nan":
                product_year = None

            print(f"Filename: {file_name}")
            print(f"Found Product Name: {product_name}")
            print(f"Found Product Month: {product_month}")
            print(f"Found Year: {product_year}")

            product_names.append(product_name)
            product_years.append(str(product_year))
            company_entity_names.append("Helvetia")
        else:
            print(f"Filename '{file_name}' not found in the Excel file.")
            raise Exception(f"Filename '{file_name}' not found in the Excel file.")
except FileNotFoundError:
    print(f"Error: Excel file not found at '{df.path}'")
except KeyError as e:
    print(f"Error: Column {e} not found in the Excel file. Check column names.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Filename: 1001 01.71_DE.pdf
Found Product Name: Branche 1230
Found Year: 1971
Filename: 1001 01.71_DE.pdf
Found Product Name: Branche 1230
Found Year: 1971
Filename: 1001 01.71_DE.pdf
Found Product Name: Branche 1230
Found Year: 1971
Filename: 1001 01.71_DE.pdf
Found Product Name: Branche 1230
Found Year: 1971
Filename: 1001 01.71_DE.pdf
Found Product Name: Branche 1230
Found Year: 1971
Filename: 1001 01.71_DE.pdf
Found Product Name: Branche 1230
Found Year: 1971
Filename: 1001 01.71_DE.pdf
Found Product Name: Branche 1230
Found Year: 1971
Filename: 1001 01.71_DE.pdf
Found Product Name: Branche 1230
Found Year: 1971
Filename: 1001 01.71_DE.pdf
Found Product Name: Branche 1230
Found Year: 1971
Filename: 1001 01.71_DE.pdf
Found Product Name: Branche 1230
Found Year: 1971
Filename: 1001 01.71_DE.pdf
Found Product Name: Branche 1230
Found Year: 1971
Filename: 1001 01.71_DE.pdf
Found Product Name: Branche 1230
Found Year: 1971
Filename: 1001 01.71_DE.pdf
Found Product Name: Branche 1230
Fou

In [8]:
print(
    f"Input texts: {input_texts}\n\nPage: {page_numbers}\nChapters:{chapters}\nFile_name:{file_names}\n Metadata: {docling_metadata}\n\n")
# Embed the Chunks using SentenceTransformer with intfloat/multilingual-e5-large
model = SentenceTransformer("intfloat/multilingual-e5-large")
embeddings = model.encode(input_texts, normalize_embeddings=True, device="cuda")
dim = embeddings.shape[1]

NameError: name 'input_texts' is not defined

---
# Write Data into Milvus

## Check if Collection exists, Define Schema

Before proceeding, we need to check if the collection already exists in the Milvus database.

**Check if Collection Existst:**

If the collection already exists, you can drop it using the command provided earlier in this notebook.

**Define Collection Schema:**

If the collection does not exist, we define the schema for the collection.
The embeddings will be stored in vector and the assosiated text chunks in text.

watsonx.data **expects** the following fields:**"vector"** (index) and **"text"**.

In [145]:
# Check if collection already exists and create fields
#utility.drop_collection('docling_helvetia')
if COLLECTION_NAME in utility.list_collections():
    collection = Collection(COLLECTION_NAME)
    print(f"Collection '{COLLECTION_NAME}' already exists. Rename or drop the collection first.")
else:
    print(f"Collection '{COLLECTION_NAME}' does not exist. Proceed.")
    id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
    vector_field = FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=dim)
    text_field = FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535)
    metadata_field = FieldSchema(name="metadata", dtype=DataType.JSON)
    page_number = FieldSchema(name="page_number", dtype=DataType.INT64)
    file_name = FieldSchema(name="file_name", dtype=DataType.VARCHAR, max_length=2048)
    product_name = FieldSchema(name="product_name", dtype=DataType.VARCHAR, max_length=256)
    product_month = FieldSchema(name="product_month", dtype=DataType.INT64)
    product_year = FieldSchema(name="product_year", dtype=DataType.INT64)
    chapter = FieldSchema(name="chapter", dtype=DataType.VARCHAR, max_length=2048)
    company_entity = FieldSchema(name="company_entity", dtype=DataType.VARCHAR,
                                 max_length=256)  #Helvetia, National, Phoenix
    schema = CollectionSchema(
        fields=[id_field, vector_field, text_field, metadata_field, page_number, file_name, product_name, product_month,
                product_year, chapter, company_entity],
        description="Collection for storing text chunks and their embeddings",
        enable_dynamic_field=True)
    collection = Collection(name=COLLECTION_NAME, schema=schema)

Collection 'docling_helvetia_v2' does not exist. Proceed.


In [52]:
def safe_year(val) -> int | None:
    """
    Return a clean integer year or None.
    Accepts: 2025, '2025', '05.2024', 'nan', np.nan, None, 'None'
    """
    # real Python None
    if val is None:
        print("Year is None")
        return None

    # numpy NaN (float) – math.isnan handles both float and np.float64
    if isinstance(val, float) and math.isnan(val):
        return None

    # numeric int already
    if isinstance(val, (int, np.integer)):
        return int(val)

    # normalise to trimmed string
    s = str(val).strip().lower()
    if s in {"", "nan", "none"}:
        return None

    # dotted month.year pattern → keep part after the last dot
    _, sep, tail = s.rpartition(".")
    candidate = tail if sep else s

    # final check: exactly 4 digits?
    return int(candidate) if candidate.isdigit() and len(candidate) == 4 else None

## Insert the Data, Create Index

Now we insert the embeddings and the text into milvus and define the index parameters. 

Refer to the pymilvus documentation for the index parameters.

In [179]:
import math
# Insert .data
import json


# Convert each metadata dict to a JSON string, letting non-serializable objects be turned into strings.
def insert_into_milvus(collection, text_chunks, embeddings, serializable_metadata, page_numbers, file_names,
                       product_names, product_years, chapters, company_entities):
    BATCH_SIZE = 2_000  # keep each gRPC message << 64 MiB
    total = len(text_chunks)
    if not all(len(lst) == total for lst in [text_chunks,
                                             embeddings, serializable_metadata, page_numbers, file_names,
                                             product_names, product_years, chapters, company_entities
                                             ]):
        raise ValueError("All input lists must have the same length")
    else:
        print(f"✓ All input lists have the same length: {total}")
    for start in range(0, total, BATCH_SIZE):
        end = min(start + BATCH_SIZE, total)
        nulls = sum(1 for y in product_years[start:end] if safe_year(y) is None)
        print(f"Batch {start}-{end}: {nulls} product_year values are None")
        data_to_insert = []
        for i in range(start, end):
            year = safe_year(product_years[i])
            row = {
                "vector": embeddings[i].tolist(),  # Ensure individual vector is a list
                "text": text_chunks[i],
                "metadata": serializable_metadata[i],  # Already serialized JSON-compatible dict
                "page_number": page_numbers[i],
                "file_name": file_names[i],
                "product_name": product_names[i],
                "product_year": year if year else 0,
                "chapter": chapters[i],
                "company_entity": company_entities[i]
            }
            data_to_insert.append(row)

        collection.insert(data_to_insert)
        collection.flush()
        print(f"Inserted rows {start}–{end - 1}")
    print(f"✓ Finished: {total} rows ingested in {math.ceil(total / BATCH_SIZE)} batch(es)")


insert_into_milvus(collection, input_texts, embeddings, serializable_metadata, page_numbers, file_names, product_names,
                   product_years, chapters, company_entity_names)

collection.create_index(
    field_name="vector",
    index_params={"index_type": "IVF_FLAT", "metric_type": "COSINE", "params": {"nlist": 1024}}
)

collection.create_index(
    field_name="product_year",
    index_params={"index_type": "INVERTED"}
)

collection.create_index(
    field_name="product_month",
    index_params={"index_type": "INVERTED"}
)

collection.create_index(
    field_name="product_name",
    index_params={"index_type": "INVERTED"}
)
# (Optional) JSON‑path index is no longer needed because product_month is fixed.

print(f"Index created on field 'vector' with params: {index_params}")

✓ All input lists have the same length: 70775
Batch 0-2000: 0 product_year values are None
Inserted rows 0–1999
Batch 2000-4000: 0 product_year values are None
Inserted rows 2000–3999
Batch 4000-6000: 0 product_year values are None
Inserted rows 4000–5999
Batch 6000-8000: 0 product_year values are None
Inserted rows 6000–7999
Batch 8000-10000: 0 product_year values are None
Inserted rows 8000–9999
Batch 10000-12000: 0 product_year values are None
Inserted rows 10000–11999
Batch 12000-14000: 0 product_year values are None
Inserted rows 12000–13999
Batch 14000-16000: 0 product_year values are None
Inserted rows 14000–15999
Batch 16000-18000: 0 product_year values are None
Inserted rows 16000–17999
Batch 18000-20000: 0 product_year values are None
Inserted rows 18000–19999
Batch 20000-22000: 0 product_year values are None
Inserted rows 20000–21999
Batch 22000-24000: 0 product_year values are None
Inserted rows 22000–23999
Batch 24000-26000: 0 product_year values are None
Inserted rows 240

---
# Test the Collection

In [159]:
collection.load()
user_query = '''Die Auszahlung des Rückkaufswertes ist nur in den folgenden Fällen gestattet:
- in den letzten fünf Jahren vor Erreichen des ordentlichen Rentenalters der AHV
- wenn Sie eine ganze Invalidenrente der Eidgenössischen Invalidenversicherung'''
model = SentenceTransformer("intfloat/multilingual-e5-large")
query_embedding = model.encode(user_query).tolist()
search_results = OLD_COLL.search(
    data=[query_embedding],
    anns_field="vector",
    param={"metric_type": "COSINE", "params": {"nprobe": 10}},
    limit=5,
    output_fields=["text", "metadata", "page_number", "file_name", "product_name", "product_month","product_year", "chapter",
                   "company_entity"],
)
print(f"Number of rows: {collection.num_entities}")

for hits in search_results:
    for hit in hits:
        print(f"ID: {hit.entity.get('id')}")
        # print(f"Chunk content: {hit.entity.get('text')}")
        # print(f"Page Number: {hit.entity.get('page_number')}")
        # print(f"File Name: {hit.entity.get('file_name')}")
        # print(f"Product Name: {hit.entity.get('product_name')}")
        print(f"Product Month: {hit.entity.get('product_month')}")
        print(f"Product Year: {hit.entity.get('product_year')}")
        print(f"Chapter: {hit.entity.get('chapter')}")
        print(f"Company Entity: {hit.entity.get('company_entity')}")
        # print(f"Metadata: {hit.entity.get('metadata')}")  # Metadata is already requested
        # print(f"Distance: {hit.distance}")
        print("---")

Number of rows: 70775
ID: 457658653665384596
Product Month: None
Product Year: 2006
Chapter: 5. Kündigung
Company Entity: Helvetia
---
ID: 457658653665384614
Product Month: None
Product Year: 2003
Chapter: 5. Kündigung
Company Entity: Helvetia
---
ID: 457658653665384562
Product Month: None
Product Year: 2010
Chapter: 5. Kündigung
Company Entity: Helvetia
---
ID: 457658653664837998
Product Month: None
Product Year: 2020
Chapter: 7.11 Rückkauf
Company Entity: Helvetia
---
ID: 457658653664823958
Product Month: None
Product Year: 2017
Chapter: 6.10 Rückkauf
Company Entity: Helvetia
---
