# Building RAG with Docling and LlamaIndex

In [None]:
!pip install llama-index-core

In [None]:
!pip install llama-index-readers-docling
!pip install llama-index-node-parser-docling
!pip install llama-index-readers-file
!pip install llama-index-llm-openai
!pip install llama-index-embeddings-openai

In [1]:
# setting up the llm
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.readers.docling import DoclingReader
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core import VectorStoreIndex


In [2]:
llm = OpenAI(model='gpt-4o')
embedding_model = OpenAIEmbedding()

In [3]:
reader = DoclingReader()
parser = MarkdownNodeParser()
docs = reader.load_data('/workspaces/Implementing-RAG/data/Guardian Vision Insurance.pdf')
for doc in docs:
    doc.metadata = {"source": doc.doc_id}

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [5]:
print(docs)

[Document(id_='9e217547-b494-443a-b322-9f630f0b2259', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text="## Vision Benefit Summary\n\n## Why choose Guardian for your Vision insurance:\n\nFor just a few dollars a month, this coverage saves you money on optical wellness, as well as providing discounts on eyewear, contacts, and corrective vision services\n\n- · Extensive network of vision specialists and medical professionals\n- · Affordable coverage\n- · Quick and easy claim payments\n\n## About Your Benefits:\n\nOption 1: Significant out-of-pocket savings available with your Full Feature plan by visiting one of VSP's network locations, including one of the largest private practice provider networks, Visionworks and contracted Pearle Vision locations.\n\n| Your Vision Plan                                                                 | VSP Vision Premier       

In [66]:
index = VectorStoreIndex.from_documents(documents  = docs, transformation = [parser], embed_model = embedding_model)

In [23]:
print(index.index_id)

ec8752ea-38b0-4267-8b9b-f02e3a012e39


In [67]:
query_engine = index.as_query_engine(llm=llm,return_source=True)

In [68]:
response = query_engine.query("What discount is provided for laser correction surgery?")
response.response.strip()
# Extract top 2 source content
top_sources = response.source_nodes[:2] 
for i, source_node in enumerate(top_sources, start=1):
    print(f"Source {i}:")
    print(source_node.node.text.strip())  # Print the text content of the source
    print("-" * 50)

Source 1:
This document is a summary of the major features of the referenced insurance coverage.  It is intended for illustrative purposes only and does not constitute a contract. The insurance plan documents, including the policy and certificate, comprise the contract for coverage. The full plan description, including the benefits and all terms, limitations and exclusions that apply will be contained in your insurance certificate. The plan documents are the final arbiter of coverage.  Coverage terms may vary by state and actual sold plan. The premium amounts reflected in this summary are an approximation; if there is a discrepancy between this amount and the premium actually billed, the latter prevails.

## Manage Your Benefits:

Go to www.GuardianAnytime.com to access secure information about your Guardian benefits including access to an image of your ID Card. Your on-line account will be set up within 30 days after your plan effective date.

## EXCLUSIONS AND LIMITATIONS

Important 

## Buidling RAG

In [None]:
!pip install langchain-milvus

In [None]:
!pip install langchain-huggingface

In [None]:
!pip install langchain-text-splitters

In [4]:
import os 
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
from typing import Iterator
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument
from docling.document_converter import DocumentConverter

In [6]:
from typing import Iterator

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument
from docling.document_converter import DocumentConverter

class DoclingPDFLoader(BaseLoader):

    def __init__(self, file_path: str | list[str]) -> None:
        self._file_paths = file_path if isinstance(file_path, list) else [file_path]
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        for source in self._file_paths:
            dl_doc = self._converter.convert(source).document
            text = dl_doc.export_to_markdown()
            yield LCDocument(page_content=text)

In [7]:
FILE_PATH = '/workspaces/Implementing-RAG/data/Guardian Vision Insurance.pdf'

In [8]:

from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = DoclingPDFLoader(file_path=FILE_PATH)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

In [9]:

docs = loader.load()
splits = text_splitter.split_documents(docs)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [10]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)

In [15]:
from tempfile import TemporaryDirectory

from langchain_milvus import Milvus

MILVUS_URI = os.environ.get(
    "MILVUS_URI", f"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db"
)

vectorstore = Milvus.from_documents(
    splits,
    embeddings,
    connection_args={"uri": MILVUS_URI},
    drop_old=True,
)

In [16]:
from langchain_huggingface import HuggingFaceEndpoint

HF_API_KEY = os.environ.get("HF_API_KEY")
HF_LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

llm = HuggingFaceEndpoint(
    repo_id=HF_LLM_MODEL_ID,
    huggingfacehub_api_token=HF_API_KEY,
)

In [17]:
from typing import Iterable

from langchain_core.documents import Document as LCDocument
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs: Iterable[LCDocument]):
    return "\n\n".join(doc.page_content for doc in docs)


retriever = vectorstore.as_retriever()

prompt = PromptTemplate.from_template(
    "Context information is below.\n---------------------\n{context}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {question}\nAnswer:\n"
)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [18]:
rag_chain.invoke("What is the copay for an eye exam in the VSP Network Signature Plan?")

BadRequestError: (Request ID: WOKDQzfaPcccjSUBnxewb)

Bad request:
Authorization header is correct, but the token seems invalid

: 

# Evaluating

In [None]:
# Evaluating
!pip install ragas

In [9]:
# need to provide question, answer, context, ground truth
# answer and context will be evaluated
from datasets import Dataset
import os
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness

In [41]:
data_sample = {
    'question': [
        'What is the copay for an eye exam in the VSP Network Signature Plan?',
        'What is the contact lens allowance for medically necessary lenses under the plan?',
        'How often are eye exams covered under the vision plan?',
        'What discount is provided for laser correction surgery?'
    ],
    'answer': [
        'The copay for an eye exam in the VSP Network Signature Plan is $10',
        'The contact lens allowance for medically necessary lenses under the plan is $0 in-network, meaning there is no out-of-pocket cost for the member. For out-of-network services, the member would pay the amount over $210.',
        'Eye exams are covered every calendar year under the vision plan.',
        'The discount for laser correction surgery is an average of 10-20% off the usual and customary charge or 5% off the promotional price. Members out-of-pocket costs are limited to $1,800 per eye for LASIK, $1,500 per eye for PRK, or $2,300 per eye for Custom LASIK, Custom PRK, or Bladeless LASIK.'
    ],
    'contexts': [
        [
        """VISION PLAN C 180 Benefit Summary

- · 1 Extra $20 on select brands
- · Members can use their in network benefits on line at Eyeconic.com.

This document is a summary of the major features of the referenced insurance coverage.  It is intended for illustrative purposes only and does not constitute a contract. The insurance plan documents, including the policy and certificate, comprise the contract for coverage. The full plan description, including the benefits and all terms, limitations and exclusions that apply will be contained in your insurance certificate. The plan documents are the final arbiter of coverage.  Coverage terms may vary by state and actual sold plan. The premium amounts reflected in this summary are an approximation; if there is a discrepancy between this amount and the premium actually billed, the latter prevails.

## Manage Your Benefits:

Go to www.GuardianAnytime.com to access secure information about your Guardian benefits including access to an image of your ID Card. Your on-line account will be set up within 30 days after your plan effective date.

## EXCLUSIONS AND LIMITATIONS

Important Information: This policy provides vision care limited benefits health insurance only. It does not provide basic hospital, basic medical or major medical insurance as defined by the New York State Insurance Department. Coverage is limited to those charges that are necessary for a routine vision examination. Co-pays apply. The plan does not pay for: orthoptics or vision training and any associated supplemental testing; medical or surgical treatment of the eye; and eye examination or corrective eyewear required by an employer as a condition of employment; replacement of lenses and frames that are furnished under this plan, which are lost or broken (except at normal intervals when services are otherwise available or a warranty exists). The plan limits benefits for blended lenses, oversized lenses, photochromic lenses, tinted lenses, progressive multifocal lenses, coated or laminated lenses, a frame that exceeds plan allowance, cosmetic lenses; U-V protected lenses and optional cosmetic processes.

The services, exclusions and limitations listed above do not constitute a contract and are a summary only. The Guardian plan documents are the final arbiter of coverage. Contract #GP-1-VSN-96-VIS et al.

## Laser Correction Surgery:

Discounts on average of 10-20% off usual and customary charge or 5% off promotional price for vision laser Surgery. Members out-of-pocket costs are limited to $1,800 per eye for LASIK or $1,500 per eye for PRK or $2300 per eye for Custom LASIK, Custom PRK, or Bladeless LASIK.

Laser surgery is not an insured benefit. The surgery is available at a discounted fee. The covered person must pay the entire discounted fee. In addition, the laser surgery discount may not be available in all states.
"""
        ],
        [""" VISION PLAN C 180 Benefit Summary

- · 1 Extra $20 on select brands
- · Members can use their in network benefits on line at Eyeconic.com.

This document is a summary of the major features of the referenced insurance coverage.  It is intended for illustrative purposes only and does not constitute a contract. The insurance plan documents, including the policy and certificate, comprise the contract for coverage. The full plan description, including the benefits and all terms, limitations and exclusions that apply will be contained in your insurance certificate. The plan documents are the final arbiter of coverage.  Coverage terms may vary by state and actual sold plan. The premium amounts reflected in this summary are an approximation; if there is a discrepancy between this amount and the premium actually billed, the latter prevails.

## Manage Your Benefits:

Go to www.GuardianAnytime.com to access secure information about your Guardian benefits including access to an image of your ID Card. Your on-line account will be set up within 30 days after your plan effective date.

## EXCLUSIONS AND LIMITATIONS

Important Information: This policy provides vision care limited benefits health insurance only. It does not provide basic hospital, basic medical or major medical insurance as defined by the New York State Insurance Department. Coverage is limited to those charges that are necessary for a routine vision examination. Co-pays apply. The plan does not pay for: orthoptics or vision training and any associated supplemental testing; medical or surgical treatment of the eye; and eye examination or corrective eyewear required by an employer as a condition of employment; replacement of lenses and frames that are furnished under this plan, which are lost or broken (except at normal intervals when services are otherwise available or a warranty exists). The plan limits benefits for blended lenses, oversized lenses, photochromic lenses, tinted lenses, progressive multifocal lenses, coated or laminated lenses, a frame that exceeds plan allowance, cosmetic lenses; U-V protected lenses and optional cosmetic processes.

The services, exclusions and limitations listed above do not constitute a contract and are a summary only. The Guardian plan documents are the final arbiter of coverage. Contract #GP-1-VSN-96-VIS et al.

## Laser Correction Surgery:

Discounts on average of 10-20% off usual and customary charge or 5% off promotional price for vision laser Surgery. Members out-of-pocket costs are limited to $1,800 per eye for LASIK or $1,500 per eye for PRK or $2300 per eye for Custom LASIK, Custom PRK, or Bladeless LASIK.

Laser surgery is not an insured benefit. The surgery is available at a discounted fee. The covered person must pay the entire discounted fee. In addition, the laser surgery discount may not be available in all states.
"""],
        [
            """
VISION PLAN C 180 Benefit Summary

- · 1 Extra $20 on select brands
- · Members can use their in network benefits on line at Eyeconic.com.

This document is a summary of the major features of the referenced insurance coverage.  It is intended for illustrative purposes only and does not constitute a contract. The insurance plan documents, including the policy and certificate, comprise the contract for coverage. The full plan description, including the benefits and all terms, limitations and exclusions that apply will be contained in your insurance certificate. The plan documents are the final arbiter of coverage.  Coverage terms may vary by state and actual sold plan. The premium amounts reflected in this summary are an approximation; if there is a discrepancy between this amount and the premium actually billed, the latter prevails.

## Manage Your Benefits:

Go to www.GuardianAnytime.com to access secure information about your Guardian benefits including access to an image of your ID Card. Your on-line account will be set up within 30 days after your plan effective date.

## EXCLUSIONS AND LIMITATIONS

Important Information: This policy provides vision care limited benefits health insurance only. It does not provide basic hospital, basic medical or major medical insurance as defined by the New York State Insurance Department. Coverage is limited to those charges that are necessary for a routine vision examination. Co-pays apply. The plan does not pay for: orthoptics or vision training and any associated supplemental testing; medical or surgical treatment of the eye; and eye examination or corrective eyewear required by an employer as a condition of employment; replacement of lenses and frames that are furnished under this plan, which are lost or broken (except at normal intervals when services are otherwise available or a warranty exists). The plan limits benefits for blended lenses, oversized lenses, photochromic lenses, tinted lenses, progressive multifocal lenses, coated or laminated lenses, a frame that exceeds plan allowance, cosmetic lenses; U-V protected lenses and optional cosmetic processes.

The services, exclusions and limitations listed above do not constitute a contract and are a summary only. The Guardian plan documents are the final arbiter of coverage. Contract #GP-1-VSN-96-VIS et al.

## Laser Correction Surgery:

Discounts on average of 10-20% off usual and customary charge or 5% off promotional price for vision laser Surgery. Members out-of-pocket costs are limited to $1,800 per eye for LASIK or $1,500 per eye for PRK or $2300 per eye for Custom LASIK, Custom PRK, or Bladeless LASIK.

...
"""
        ],
        [
            """VISION PLAN C 180 Benefit Summary

- · 1 Extra $20 on select brands
- · Members can use their in network benefits on line at Eyeconic.com.

This document is a summary of the major features of the referenced insurance coverage.  It is intended for illustrative purposes only and does not constitute a contract. The insurance plan documents, including the policy and certificate, comprise the contract for coverage. The full plan description, including the benefits and all terms, limitations and exclusions that apply will be contained in your insurance certificate. The plan documents are the final arbiter of coverage.  Coverage terms may vary by state and actual sold plan. The premium amounts reflected in this summary are an approximation; if there is a discrepancy between this amount and the premium actually billed, the latter prevails.

## Manage Your Benefits:

Go to www.GuardianAnytime.com to access secure information about your Guardian benefits including access to an image of your ID Card. Your on-line account will be set up within 30 days after your plan effective date.

## EXCLUSIONS AND LIMITATIONS

Important Information: This policy provides vision care limited benefits health insurance only. It does not provide basic hospital, basic medical or major medical insurance as defined by the New York State Insurance Department. Coverage is limited to those charges that are necessary for a routine vision examination. Co-pays apply. The plan does not pay for: orthoptics or vision training and any associated supplemental testing; medical or surgical treatment of the eye; and eye examination or corrective eyewear required by an employer as a condition of employment; replacement of lenses and frames that are furnished under this plan, which are lost or broken (except at normal intervals when services are otherwise available or a warranty exists). The plan limits benefits for blended lenses, oversized lenses, photochromic lenses, tinted lenses, progressive multifocal lenses, coated or laminated lenses, a frame that exceeds plan allowance, cosmetic lenses; U-V protected lenses and optional cosmetic processes.

The services, exclusions and limitations listed above do not constitute a contract and are a summary only. The Guardian plan documents are the final arbiter of coverage. Contract #GP-1-VSN-96-VIS et al.

## Laser Correction Surgery:

Discounts on average of 10-20% off usual and customary charge or 5% off promotional price for vision laser Surgery. Members out-of-pocket costs are limited to $1,800 per eye for LASIK or $1,500 per eye for PRK or $2300 per eye for Custom LASIK, Custom PRK, or Bladeless LASIK.

Laser surgery is not an insured benefit. The surgery is available at a discounted fee"""
        ]
    ],
    'ground_truth': [
       '$10',
        'Amount over $210',
        'Every calendar year',
        'Average of 10-20% off usual and customary charges or 5% off promotional prices'
    ]
}


In [42]:
dataset = Dataset.from_dict(data_sample)

Faithfulness - factual consistenciy of the generated answer against the given context. 

In [43]:
score = evaluate(dataset, metrics=[faithfulness])

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

In [44]:
score

{'faithfulness': 0.3393}

In [None]:
from ragas import SingleTurnSample
from ragas.metrics import NonLLMContextPrecisionWithReference

context_precision = NonLLMContextPrecisionWithReference()

sample = SingleTurnSample(
    retrieved_contexts=[""], 
    reference_contexts=["Paris is the capital of France.", "The Eiffel Tower is one of the most famous landmarks in Paris."]
)

await context_precision.single_turn_ascore(sample)

0.9999999999

In [54]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import NonLLMContextRecall

sample = SingleTurnSample(
    retrieved_contexts=["Paris is the capital of France."], 
    reference_contexts=["Paris is the capital of France.", "The Eiffel Tower is one of the most famous landmarks in Paris."]
)

context_recall = NonLLMContextRecall()
await context_recall.single_turn_ascore(sample)

0.5

AssertionError: LLM is not set