https://github.com/DS4SD/docling/blob/main/docs/examples/rag_langchain.ipynb

In [None]:
!pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/226.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.4/226.4 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os

from dotenv import load_dotenv

# Load variables from a local .env file (when one is present) into os.environ,
# so later cells can read settings such as MILVUS_URI via os.environ.get(...).
load_dotenv()

False

In [None]:
from typing import Iterator

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument

from docling.document_converter import DocumentConverter

class DoclingPDFLoader(BaseLoader):
    """LangChain loader that turns each source into Markdown via Docling.

    Every source is run through one shared ``DocumentConverter`` and emitted as
    a single ``LCDocument`` whose ``page_content`` is the Markdown export.
    """

    def __init__(self, file_path: str | list[str]) -> None:
        """Remember the source(s) to convert and create the converter.

        Args:
            file_path: One source, or a list of sources. Any value that is not
                a ``list`` is treated as a single source.
        """
        if isinstance(file_path, list):
            sources = file_path
        else:
            sources = [file_path]
        self._file_paths = sources
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        """Yield one Markdown ``LCDocument`` per source, converting on demand."""
        for path in self._file_paths:
            conversion = self._converter.convert(path)
            markdown = conversion.document.export_to_markdown()
            yield LCDocument(page_content=markdown)

In [None]:

# Source PDF handed to the loader below.
# NOTE(review): absolute, Colab-style path — presumably the file was uploaded to
# /content beforehand; adjust when running elsewhere.
FILE_PATH = "/content/Coal2022.pdf"

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters (in characters): target chunk length and the overlap shared
# by neighbouring chunks.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
loader = DoclingPDFLoader(file_path=FILE_PATH)

In [None]:
# Run the Docling loader over FILE_PATH; each source becomes one Markdown document.
docs = loader.load()
# Cut the Markdown documents into overlapping chunks that will be embedded and indexed.
splits = text_splitter.split_documents(docs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Hugging Face model id of the sentence-embedding model used to vectorise the chunks.
HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
# Embedding wrapper handed to the vector store below.
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)

In [None]:
from tempfile import TemporaryDirectory

from langchain_milvus import Milvus

# Where the Milvus index lives: an explicit MILVUS_URI from the environment, or a
# throw-away local database file inside a temporary directory. `tmp_dir` is bound
# at module level so the directory object stays referenced while the notebook runs.
MILVUS_URI = os.environ.get(
    "MILVUS_URI", f"{(tmp_dir := TemporaryDirectory()).name}/milvus_demo.db"
)

# Embed every chunk and store it in Milvus, replacing any existing collection.
# The index type is set explicitly: without `index_params` the store first tries
# an HNSW index, which local-mode Milvus rejects ("local mode only support FLAT
# IVF_FLAT AUTOINDEX") and logs as an RPC error. AUTOINDEX is accepted there.
vectorstore = Milvus.from_documents(
    splits,
    embeddings,
    connection_args={"uri": MILVUS_URI},
    index_params={"metric_type": "L2", "index_type": "AUTOINDEX", "params": {}},
    drop_old=True,
)

2024-12-31 23:06:07,759 [ERROR][handler]: RPC error: [create_index], <MilvusException: (code=65535, message=invalid index type: HNSW, local mode only support FLAT IVF_FLAT AUTOINDEX: )>, <Time:{'RPC start': '2024-12-31 23:06:07.757695', 'RPC error': '2024-12-31 23:06:07.759854'}> (decorators.py:140)


In [None]:
from langchain_huggingface import HuggingFaceEndpoint
import torch
# Read the Hugging Face access token from a *named* environment variable
# (HF_API_KEY, falling back to HF_TOKEN). The token value itself must never be
# written into the notebook; when neither variable is set this is None.
HF_API_KEY = os.environ.get("HF_API_KEY") or os.environ.get("HF_TOKEN")
HF_LLM_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"

# HuggingFaceEndpoint talks to a hosted inference endpoint, so local-execution
# options such as `device` / `torch_dtype` are not parameters of this class; they
# were only being shunted into `model_kwargs` with a warning and are omitted here.
# `max_new_tokens=50` caps the length of each generated answer.
llm = HuggingFaceEndpoint(
    repo_id=HF_LLM_MODEL_ID,
    huggingfacehub_api_token=HF_API_KEY,
    do_sample=True,
    max_new_tokens=50,
)

                    device was transferred to model_kwargs.
                    Please make sure that device is what you intended.
                    torch_dtype was transferred to model_kwargs.
                    Please make sure that torch_dtype is what you intended.


In [None]:
from typing import Iterable

from langchain_core.documents import Document as LCDocument
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs: Iterable[LCDocument]):
    """Return the ``page_content`` of every document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)


# Retriever view over the vector store, using its default search settings.
retriever = vectorstore.as_retriever()

# Prompt that tells the model to answer from the retrieved context only.
prompt = PromptTemplate.from_template(
    "Context information is below.\n"
    "---------------------\n"
    "{context}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, answer the query.\n"
    "Query: {question}\n"
    "Answer:\n"
)

# The incoming question is forwarded unchanged as {question} and is also sent to
# the retriever, whose hits are joined by format_docs into {context}.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# inputs -> filled prompt -> LLM completion -> plain string
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

In [None]:
# Smoke-test the chain with a broad question; the returned string is shown as the cell output.
rag_chain.invoke("What is the main topic of the book?")

'There is no book mentioned in the provided context information. The text appears to be a part of an article or a report discussing the international coal trade and the impact of Russia sanctions. Therefore, I cannot provide an answer to the query.'

In [None]:
# Ask about a property of the file (its page count) rather than about its content.
rag_chain.invoke("How many pages in the book /content/Coal2022.pdf")

'Unfortunately, the query cannot be answered with the provided information. The query asks for a page count of a PDF file, but the context information only provides an overview of the content of the Coal2022 report, including its title, publication history, and'

In [None]:
# Ask for a general overview of the document.
rag_chain.invoke("Can you tell me what you know about the book? /content/Coal2022.pdf")

'The text appears to be a report or a document related to the coal industry. The title of the report is "Coal 2022", and it seems to cover a range of topics such as coal demand, supply, trade, costs, and prices'

In [None]:
# Ask a topic-specific question (coal demand).
rag_chain.invoke("Can you show me the information in the book about the demand for coal? /content/Coal2022.pdf")

'The text does not provide the information in the book "Coal2022.pdf". However, according to the context information, the book "Coal2022" provides a thorough analysis of recent trends in coal demand, supply, trade, costs and prices against'

In [None]:
# Ask an open-ended, advice-style question about the profitability of coal trading.
rag_chain.invoke("Can you give advice on coal trading is it profitable or not from the book /content/Coal2022.pdf")

'Based on the provided context information, it appears that coal trading is not a highly profitable venture, especially outside of China and India. The high prices of coal since October 2021 have not led to a significant uptick in investment in coal mining projects'