Install the necessary libraries using pip.



In [None]:
%pip install -q llama-index transformers sentence-transformers llama-index-embeddings-huggingface duckdb llama-index-vector-stores-duckdb "pyarrow==20.*"

# Create vector DB

# Setup

Set up llama-index:
* Set the embedding model (either "all-MiniLM-L6-v2" or "bge-small-en-v1.5")
* Set the splitter, chunk_size, overlap, etc.

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser

# Embedding model: "all-MiniLM-L6-v2" is active; "BAAI/bge-small-en-v1.5" is the alternative.
embed_model = HuggingFaceEmbedding(model_name="all-MiniLM-L6-v2")

# 256 is the max chunk size for all-MiniLM; bge-small could handle 512.
splitter = SentenceSplitter(
    chunk_size=256,
    chunk_overlap=32,
    paragraph_separator="\n\n",
)

# Make both available through the llama-index global Settings object.
Settings.embed_model = embed_model
Settings.text_splitter = splitter

Load the DICOM standard manual (converted to Markdown); for now, only the first part.

In [None]:
import os
from llama_index.core import SimpleDirectoryReader

# Folder that holds the markdown version of the documents
docs_dir = "markdown"

# Read everything in that folder into llama-index documents
documents = SimpleDirectoryReader(docs_dir).load_data()

# Split the documents into chunks (nodes) with the configured splitter
nodes = splitter.get_nodes_from_documents(documents)

# Verification: report how many documents were loaded and preview the first one
print(f"Loaded {len(documents)} documents.")
if documents:
    print("First document content:")
    preview = documents[0].text[:500]  # first 500 characters only
    print(preview + "...")

Loaded 1 documents.
First document content:
# PS3.1

## DICOM PS3.1 2025b - Introduction and Overview

# 1 Scope and Field of Application

PS3.1 provides an overview of the entire Digital Imaging and Communications in Medicine
 (DICOM) Standard. It describes the history, scope, goals, and structure of the Standard. In
 particular, it contains a brief description of the contents of each Part of the
 Standard.

## 1.1 Scope of DICOM

Digital Imaging and Communications in Medicine (DICOM) is the standard for the communication and management ...


## Add AI-generated context to chunks

Using a bigger LLM to add summaries and/or context to the chunks can help retrieval.

In [4]:
%pip install --upgrade --quiet google-genai

import sys

# The presence of google.colab among the loaded modules is used as the signal
# that the notebook runs inside Colab; only then is the user authenticated.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    from google.colab import auth

    auth.authenticate_user()

In [None]:
from google import genai
import os
from dotenv import load_dotenv
load_dotenv()

# GOOGLE_API_KEY should be provided in a .env file (or the process environment)
API_KEY = os.getenv("GOOGLE_API_KEY")

# Fail fast with a specific error type and a readable message.
# A blank or whitespace-only key is treated the same as a missing one.
if not API_KEY or not API_KEY.strip():
    raise RuntimeError("You must provide a Google API key (set GOOGLE_API_KEY)!")

# Authenticate with the API key rather than through Vertex AI
client = genai.Client(vertexai=False, api_key=API_KEY)

# Model used for the summary and chunk-context prompts
MODEL_ID = "gemini-2.0-flash-lite"


def get_ai_summary(doc):
    """Return a 1-2 sentence model-written summary of ``doc``.

    Only the first 1500 characters of ``doc.text`` are included in the prompt.
    The request goes through the module-level ``client`` using ``MODEL_ID``.
    """
    excerpt = doc.text[:1500]
    prompt = (
        "Summarize the following document in 1-2 sentences. Return only the summary. \n"
        f"```{excerpt}```"
    )
    result = client.models.generate_content(model=MODEL_ID, contents=prompt)
    return result.text

def get_chunck_context(doc, node):
    """Generate a small context description to situate the chunk within its document.

    Args:
        doc: The source document, either a plain string or an object that
            exposes its full text as ``.text``.
        node: The chunk, either a plain string or an object that exposes its
            text as ``.text``.

    Returns:
        The text of the model's response.
    """
    # Interpolate the raw text, not the objects: str() of a llama-index
    # Document/TextNode is a truncated, line-wrapped preview, so formatting the
    # objects directly sends only a short excerpt instead of the whole
    # document and the full chunk.
    whole_document = doc if isinstance(doc, str) else doc.text
    chunk_content = node if isinstance(node, str) else node.text

    prompt = """<document>
    {WHOLE_DOCUMENT}
    </document>

    Here is the chunk we want to situate within the whole document:
    <chunk>
    {CHUNK_CONTENT}
    </chunk>
    Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk.
    Answer only with the succinct context and nothing else.
    """

    response = client.models.generate_content(
        model=MODEL_ID,
        contents=prompt.format(
            WHOLE_DOCUMENT=whole_document, CHUNK_CONTENT=chunk_content
        ),
    )
    return response.text

In [None]:
# split the documents into nodes and keep a reference (aka id) of the containing document
nodes_with_doc = [
    (doc_id, node)
    for doc_id, doc in enumerate(documents)
    for node in splitter.get_nodes_from_documents(documents=[doc])
]
nodes_with_doc[0]

(0,
 TextNode(id_='63f6d140-2f98-4b8e-becb-3c2f37f55c85', embedding=None, metadata={'file_path': '/content/markdown/part01.md', 'file_name': 'part01.md', 'file_type': 'text/markdown', 'file_size': 50877, 'creation_date': '2025-07-22', 'last_modified_date': '2025-07-22', 'summary': "DICOM PS3.1 is an introduction and overview of the DICOM standard, which governs the communication and management of medical imaging information. It outlines the standard's history, scope, goals, and structure, including its network and media communication protocols, file formats, and conformance requirements.\n"}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='ab1fd8aa-3474-4cf2-acfc-9f0f6265b1c4', node_type=<ObjectType.DOCUMEN

In [None]:
import time

# Enrich documents and chunks with LLM-generated metadata. Each step is
# skipped when its key is already present, so re-running this cell does not
# repeat API calls for items that were annotated before.
# NOTE(review): the summary is stored on the document only; nodes that were
# split before this cell presumably do not pick it up — confirm if the nodes
# are expected to carry it.
for doc_id, node in nodes_with_doc:
    doc = documents[doc_id]
    if "summary" not in doc.metadata:
        doc.metadata['summary'] = get_ai_summary(doc)

    if "context" not in node.metadata:
        node.metadata['context'] = get_chunck_context(doc, node)
        time.sleep(1)  # pause between requests so Google's per-minute API limit is not exhausted

# Keep only the nodes. Unlike `_, nodes = zip(*nodes_with_doc)`, this also
# works for an empty list and does not overwrite IPython's `_`.
nodes = [node for _, node in nodes_with_doc]

Print example context / summary

In [None]:
# Show the AI-generated context stored in the first node's metadata
print(nodes[0].metadata["context"])
# Alternative: show the "summary" entry instead (only works if the node's metadata has one)
# print(nodes[0].metadata["summary"])

The chunk is the beginning of DICOM standard document PS3.1, providing an introduction and overview, including the scope and field of application.



## Create vector store

DuckDB is used as a lightweight, file-based vector database. This is quite easy with llama-index: just feed in the documents and/or nodes.

In [35]:
from llama_index.vector_stores.duckdb import DuckDBVectorStore
from llama_index.core import StorageContext, VectorStoreIndex
from pathlib import Path

# Location of the DuckDB database file
db_path = Path("./embeddings.db")

# Record whether the file exists BEFORE the store is opened
# (presumably opening the store creates the file — confirm).
clear_needed = db_path.exists()

# The file name is passed as the database name and its parent directory as the persist dir.
# embed_dim shows up as the FLOAT[384] embedding column; it presumably has to match
# the output size of the embedding model in use — confirm when switching models.
vector_store = DuckDBVectorStore(db_path.name, persist_dir=str(db_path.parent), embed_dim=384)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# A pre-existing database is emptied first, presumably so a re-run does not
# add the same chunks a second time.
if clear_needed:
    vector_store.clear()

# Create the index from the nodes, backed by the DuckDB storage context
index = VectorStoreIndex(nodes, storage_context=storage_context)

# Save DB: checkpoint through the store's DuckDB connection
vector_store.client.checkpoint()
# Alternative: vector_store.client.execute("CHECKPOINT")

print("Vector index created successfully.")

Vector index created successfully.


Show DB schema and first row

In [36]:
from duckdb import duckdb

# Reuse the DuckDB connection held by the vector store
con = vector_store.client

# Table the vector store writes to
table_name = vector_store.table_name

# Show the schema of the table
print(f"Schema for table: {table_name}")
schema = con.execute(f"DESCRIBE {table_name}").fetchall()
for col in schema:
    print(col)

# Show the first row of the table. An empty table is reported instead of
# raising IndexError, and a NULL text column (the column is nullable) is
# treated as an empty string instead of raising TypeError in len().
print(f"\nFirst row of table: {table_name}")
rows = con.execute(f"SELECT * FROM {table_name} LIMIT 1").fetchall()
if rows:
    first_row = rows[0]
    first_text = first_row[1] or ""
    print(f"ID: {first_row[0]}, Text-length: {len(first_text)}")
    print(first_text)
else:
    print("(table is empty)")

Schema for table: documents
('node_id', 'VARCHAR', 'NO', 'PRI', None, None)
('text', 'VARCHAR', 'YES', None, None, None)
('embedding', 'FLOAT[384]', 'YES', None, None, None)
('metadata_', 'JSON', 'YES', None, None, None)

First row of table: documents
ID: 63f6d140-2f98-4b8e-becb-3c2f37f55c85, Text-length: 642
# PS3.1

## DICOM PS3.1 2025b - Introduction and Overview

# 1 Scope and Field of Application

PS3.1 provides an overview of the entire Digital Imaging and Communications in Medicine
 (DICOM) Standard. It describes the history, scope, goals, and structure of the Standard. In
 particular, it contains a brief description of the contents of each Part of the
 Standard.

## 1.1 Scope of DICOM

Digital Imaging and Communications in Medicine (DICOM) is the standard for the communication and management of medical imaging information and related data.

The DICOM Standard facilitates interoperability of medical imaging equipment by
 specifying:


## Test retrieval

Retrieve the top 3 matches for an example query:

In [None]:
query = "Who is responsible for DICOM standardization?"

# Fetch the 3 most similar chunks for the query
retriever = index.as_retriever(similarity_top_k=3)

# Store the results under their own name: `nodes` is the chunk list the index
# was built from, and rebinding it here would make a re-run of the
# index-building cell operate on retrieval results instead of the chunks.
retrieved_nodes = retriever.retrieve(query)
[node.text for node in retrieved_nodes]

['### 1.4.2\xa0Continuous Maintenance\n\nThe DICOM Standard is an evolving standard and it is maintained in accordance with\n the Procedures of the DICOM Standards Committee.\n Proposals for enhancements are welcome from all users of the Standard, and may be submitted to the Secretariat.\n Supplements and corrections to the Standard are balloted and approved several times a year.\n When approved as Final Text, each change becomes official, is published separately, and goes into effect immediately.\n At intervals, all of the approved Final Text changes are consolidated and published in an updated edition of the Standard.\n Once changes are consolidated into an updated edition of the Standard, the individual change documents are not maintained;\n readers are directed to use the consolidated edition of the Standard.\n\nA requirement in updating the Standard is to maintain effective compatibility with previous editions.\n\nThe maintenance process may involve retirement of sections of the S

Test RAG with Gemini as the LLM

In [11]:
%pip -q install llama-index-llms-google-genai

In [12]:
from IPython.display import Markdown, display
from llama_index.vector_stores.duckdb import DuckDBVectorStore
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.llms.google_genai import GoogleGenAI

# Re-open the DuckDB file written earlier and build an index on top of the stored vectors
vector_store = DuckDBVectorStore.from_local("./embeddings.db")
index = VectorStoreIndex.from_vector_store(vector_store)

# Gemini is the LLM behind the query engine
llm = GoogleGenAI(model="gemini-2.0-flash")
query_engine = index.as_query_engine(llm=llm)

# Ask a sample question and render the answer in bold
response = query_engine.query("What is DICOM used for?")
answer_markup = f"<b>{response}</b>"
display(Markdown(answer_markup))

<b>DICOM is a world-wide standard that can be used in every locale. It provides mechanisms to handle data that support cultural requirements, such as different writing systems, character sets, languages, and structures for addresses and person names. It supports the variety of workflows, processes, and policies used for biomedical imaging in different geographic regions, medical specialties, and local practices.
</b>