# Indexing the pdf file into chunks

Docling Library Ref: https://docling-project.github.io/docling/examples/

In [1]:
import os
import io
import ast
import openai
import base64
import hashlib
import numpy as np
import pandas as pd

from PIL import Image
from pathlib import Path
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer

from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.types.doc import PictureItem

In [2]:
pdfFile = "./datasets/Dataset1.pdf"
mdFile = "./datasets/Dataset1.md"
embFile = "./data/largeEmbeddings.csv"
chunksOutFile = "./data/chunksDocling.txt"

imageDir = "./images/"

embedModelName = "BAAI/bge-large-en-v1.5"
embedModel = SentenceTransformer(embedModelName)

client = MongoClient("mongodb://localhost:27017/")
db = client["rag"]
collection = db["doc"]

## Builiding a .md File

Convert from pdf to docling format if not already done

In [3]:
if not os.path.exists(embFile):
    source = pdfFile
    
    # Docling Convertor
    # Pass in options to include images
    opts = PdfPipelineOptions()
    opts.generate_picture_images = True

    converter = DocumentConverter(
        format_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
    )
    result = converter.convert(source)

    # Convert and write pdf contents to .md File
    result_markdown = result.document.export_to_markdown()

    with open(mdFile, 'w', encoding='utf-8') as f:
        f.write(result_markdown)

    print(f"Markdown written to: {mdFile}")

## Text and Table Chunking

Uses the docling library's chunker to convert the pdf into chunks. Also converts tables into text format

In [4]:
if not os.path.exists(embFile):
    chunker = HybridChunker()
    chunk_iter = chunker.chunk(dl_doc=result.document)
    # Stores a list of all chunks
    chunkL = list(chunk_iter)

Store chunks in .txt

In [5]:
if not os.path.exists(embFile):
    dfL = []
    txtChunk = ""
    
    for idx, chunk in enumerate(chunkL):
        # Text to write in the chunks.txt file
        txtChunk += f"-- Chunk {idx} --\n{chunk.text}\n\n"

        # Rows to create dataframe
        row = {
            "Chunks" : chunk.text,
            "PageNo": chunk.model_dump()["meta"]["doc_items"][0]["prov"][0]["page_no"]
        }
        dfL.append(row)


    os.makedirs("data", exist_ok=True)
    output_file = os.path.join("data", "chunksDocling.txt")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(txtChunk)

    df = pd.DataFrame(dfL)

## Image Chunks

In [6]:
def clear_output_folder(output_folder):
    if os.path.exists(output_folder):
        for filename in os.listdir(output_folder):
            file_path = os.path.join(output_folder, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print(f'Failed to delete {file_path}. Reason: {e}')
    else:
        os.makedirs(output_folder)

In [7]:
if not os.path.exists(embFile):
    output_dir = Path(imageDir)
    output_dir.mkdir(exist_ok=True)
    clear_output_folder(imageDir)

    # Set to store unique hashes so that only unique images are stored
    seen_hashes = set()
    # Map filepath to page number
    image_metadata = []

    for idx, picture in enumerate(result.document.pictures):
        image_ref = picture.image
        if image_ref and image_ref.pil_image:
            pil_image = image_ref.pil_image

            # Convert image to bytes and compute hash
            img_byte_arr = io.BytesIO()
            pil_image.save(img_byte_arr, format='PNG')
            img_bytes = img_byte_arr.getvalue()
            img_hash = hashlib.sha256(img_bytes).hexdigest()

            # Save image if it's unique
            if img_hash not in seen_hashes:
                seen_hashes.add(img_hash)

                # Determine the page number
                page_number = -1
                if picture.prov and picture.prov[0].page_no is not None:
                    page_number = picture.prov[0].page_no

                image_filename = f"page-{page_number}_img{len(seen_hashes)}.png"
                image_path = output_dir / image_filename
                pil_image.save(image_path)
                
                # Append metadata
                image_metadata.append({
                    'imagePath': str(image_path),
                    'PageNo': page_number
                })

    # Store image paths and pageNo in DF
    imgDF = pd.DataFrame(image_metadata)
    
    display(imgDF.head(5))


## Get LLM generated summary of Image

Uses GPT-4o, requires OpenAI API Key

In [8]:
def getImgSummary(imgPath):
    if os.getenv("OPENAI_API_KEY") is None:
        return ""
    
    with open(imgPath, "rb") as img_file:
        image_bytes = img_file.read()
        # Base64 Encoded image to send to LLM
        image = base64.b64encode(image_bytes).decode("utf-8")

    # Prompt and message structure
    prompt_template = """Describe the image in detail. Be specific about diagrams, flowchards, graphs, etc."""

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_template},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{image}"},
                },
            ],
        }
    ]

    # Call the GPT-4o model
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=1000
    )
    
    return response.choices[0].message.content

In [9]:
if not os.path.exists(embFile):
    # Compute summary for each image
    imgDF["Chunks"] = imgDF["imagePath"].apply(getImgSummary)
    
    display(imgDF)

## Combine Image and Text Chunks in one DataFrame

In [10]:
if not os.path.exists(embFile):
    dfComb = pd.concat([df, imgDF], ignore_index=True)
    
    display(dfComb)

## Compute embeddings and store chunks in mongoDB

In [11]:
if not os.path.exists(embFile):
    # Compute embeddings of each chunk
    embeddings = embedModel.encode(dfComb["Chunks"].tolist(), normalize_embeddings=True)
    
    dfComb["embeddings"] = embeddings.tolist()
    # Write to .csv so doesnt have to be calculated again
    dfComb.to_csv(embFile)
    
    docs = dfComb.to_dict(orient="records")

    # Refresh mongoDB and write whole DataFrame to monogoDB
    collection.delete_many({})
    collection.insert_many(docs)
    print("Inserted")
    
else:
    # If embeddings.csv file already exists load from .csv file
    dfComb = pd.read_csv(embFile)
    dfComb["embeddings"] = dfComb["embeddings"].apply(ast.literal_eval)
    
dfComb

Unnamed: 0.1,Unnamed: 0,Chunks,PageNo,imagePath,embeddings
0,0,An Roinn Airgeadais Department of Finance\nTra...,1,,"[0.014186208136379719, -0.013681311160326004, ..."
1,1,PUBLIC WORKS CONTRACTS - CONTRACTORS,1,,"[-0.018307723104953766, -0.004139376804232597,..."
2,2,This training manual is both the course materi...,2,,"[-0.02164776809513569, -0.022553179413080215, ..."
3,3,"1., 1 = COURSE AGENDA. 1., 2 = COURSE AGENDA. ...",3,,"[0.035058051347732544, 0.0017086670268326998, ..."
4,4,"= 5.5. , 2 = Comparison of Risk Allocation Und...",3,,"[0.0200276467949152, -0.00019992324814666063, ..."
...,...,...,...,...,...
561,561,"Sure, please upload the image or provide detai...",77,images\page-77_img16.png,"[0.005667939316481352, -0.04764747992157936, -..."
562,562,The image shows a horizontal yellow line with ...,203,images\page-203_img17.png,"[-0.007596821524202824, 0.005338060203939676, ..."
563,563,The image contains a pie chart with four segme...,222,images\page-222_img18.png,"[0.01627928577363491, -0.012685469351708889, -..."
564,564,The image appears to be a text excerpt highlig...,226,images\page-226_img19.png,"[-0.0030819198582321405, 0.00860358402132988, ..."
