## 1. Use Unstructured to parse the PDF into text chunks, tables, and images

In [1]:
# ! pip install -U langchain openai langchain-chroma langchain-experimental
# ! pip install "unstructured[all-docs]" pillow pydantic lxml pillow matplotlib chromadb tiktoken
# ! pip install -U langchain-openai

In [2]:
# pip install protobuf==3.20.3

In [1]:
import os

# Both settings have to be in place before TensorFlow initializes, so they are
# set ahead of the `import tensorflow` line below.
# Hide all CUDA devices so that TensorFlow runs on CPU only.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# Disable oneDNN optimizations to avoid the kernel crash. A later cell also sets
# this flag, but by then TensorFlow is already imported, so it must be set here.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import tensorflow as tf
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.18.0-dev20240717


In [3]:
from langchain_text_splitters import CharacterTextSplitter
from unstructured.partition.pdf import partition_pdf
from langchain_openai import ChatOpenAI

  from .autonotebook import tqdm as notebook_tqdm


In [4]:

# Extract elements from PDF
def extract_pdf_elements(path, fname):
    """
    Extract images, tables, and chunk text from a PDF file.

    path: Directory that contains the PDF; it is also handed to Unstructured
        as the directory in which to dump images (.jpg)
    fname: File name of the PDF inside `path`

    Returns the result of `partition_pdf` unchanged (a list of Unstructured
    elements in this notebook).
    """
    # Join with the platform separator so that `path` works with or without a
    # trailing slash; plain `path + fname` silently produced "dirfile.pdf"
    # whenever the separator was missing.
    pdf_path = os.path.join(path, fname)

    return partition_pdf(
        filename=pdf_path,
        extract_images_in_pdf=True,
        infer_table_structure=True,
        # Chunking options are passed straight through to Unstructured;
        # see its chunking documentation for their exact meaning.
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
        image_output_dir_path=path,
    )


# Categorize elements by type
def categorize_elements(raw_pdf_elements):
    """
    Split partitioned PDF elements into text chunks and tables.

    raw_pdf_elements: List of unstructured.documents.elements

    Returns (texts, tables): two lists holding `str(element)` for every
    element whose type name matches CompositeElement or Table respectively.
    Elements of any other type are dropped.
    """
    table_marker = "unstructured.documents.elements.Table"
    text_marker = "unstructured.documents.elements.CompositeElement"

    texts, tables = [], []
    for item in raw_pdf_elements:
        # Matching is done on the printed type name, exactly as before.
        type_repr = str(type(item))
        if table_marker in type_repr:
            bucket = tables
        elif text_marker in type_repr:
            bucket = texts
        else:
            continue
        bucket.append(str(item))
    return texts, tables


In [5]:
# Folder and file name of the input PDF.
# NOTE(review): this is a machine-specific absolute path; change it to the
# folder that holds the PDF before running the notebook elsewhere.
fpath = "C:/Users/wudan/OneDrive - Langara College/DANA_4830/BERT/"
fname = "AttentionIsAllYouNeed.pdf"

# Partition the PDF into elements (kept for the inspection cells that follow)
raw_pdf_elements = extract_pdf_elements(fpath, fname)



Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
print(type(raw_pdf_elements),len(raw_pdf_elements))

<class 'list'> 20


In [7]:
raw_pdf_elements

[<unstructured.documents.elements.CompositeElement at 0x1c94113a350>,
 <unstructured.documents.elements.CompositeElement at 0x1c941890bd0>,
 <unstructured.documents.elements.CompositeElement at 0x1c9418902d0>,
 <unstructured.documents.elements.CompositeElement at 0x1c943aea250>,
 <unstructured.documents.elements.CompositeElement at 0x1c943aea190>,
 <unstructured.documents.elements.CompositeElement at 0x1c943c88e10>,
 <unstructured.documents.elements.Table at 0x1c95dfb5450>,
 <unstructured.documents.elements.CompositeElement at 0x1c941052410>,
 <unstructured.documents.elements.CompositeElement at 0x1c941283c10>,
 <unstructured.documents.elements.CompositeElement at 0x1c943b67ad0>,
 <unstructured.documents.elements.Table at 0x1c95da55150>,
 <unstructured.documents.elements.CompositeElement at 0x1c943b65690>,
 <unstructured.documents.elements.CompositeElement at 0x1c943b67190>,
 <unstructured.documents.elements.Table at 0x1c943b672d0>,
 <unstructured.documents.elements.CompositeElement at

In [8]:
type(raw_pdf_elements[0])

unstructured.documents.elements.CompositeElement

In [9]:
# Split the partitioned elements into text chunks and tables (both lists of str)
texts, tables = categorize_elements(raw_pdf_elements)

# # Optional: Enforce a specific token size for texts
# text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
#     chunk_size=4000, chunk_overlap=0
# )
# joined_texts = " ".join(texts)
# texts_4k_token = text_splitter.split_text(joined_texts)

In [10]:
len(texts)

16

In [11]:
texts

['3 2 0 2\n\ng u A 2 ] L C . s c [\n\n7 v 2 6 7 3 0 . 6 0 7 1 : v i X r a\n\nProvided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.\n\nAttention Is All You Need\n\nAshish Vaswani∗ Google Brain avaswani@google.com\n\nNoam Shazeer∗ Google Brain noam@google.com\n\nNiki Parmar∗ Google Research nikip@google.com\n\nJakob Uszkoreit∗ Google Research usz@google.com\n\nLlion Jones∗ Google Research llion@google.com\n\nAidan N. Gomez∗ † University of Toronto aidan@cs.toronto.edu\n\nŁukasz Kaiser∗ Google Brain lukaszkaiser@google.com\n\nIllia Polosukhin∗ ‡ illia.polosukhin@gmail.com\n\nAbstract\n\nThe dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, th

In [12]:
tables

['Layer Type Self-Attention Recurrent Convolutional Self-Attention (restricted) Complexity per Layer O(n2 · d) O(n · d2) O(k · n · d2) O(r · n · d) Sequential Maximum Path Length Operations O(1) O(n) O(1) O(1) O(1) O(n) O(logk(n)) O(n/r)',
 'Model ByteNet [18] Deep-Att + PosUnk [39] GNMT + RL [38] ConvS2S [9] MoE [32] Deep-Att + PosUnk Ensemble [39] GNMT + RL Ensemble [38] ConvS2S Ensemble [9] Transformer (base model) Transformer (big) BLEU EN-DE EN-FR 23.75 24.6 25.16 26.03 26.30 26.36 27.3 28.4 39.2 39.92 40.46 40.56 40.4 41.16 41.29 38.1 41.8 Training Cost (FLOPs) EN-DE EN-FR 2.3 · 1019 9.6 · 1018 2.0 · 1019 1.8 · 1020 7.7 · 1019 1.0 · 1020 1.4 · 1020 1.5 · 1020 1.2 · 1020 8.0 · 1020 1.1 · 1021 1.2 · 1021 3.3 · 1018 2.3 · 1019',
 'base (A) (B) (C) (D) N dmodel 6 512 2 4 8 256 1024 dff 2048 1024 4096 h 8 1 4 16 32 dk 64 512 128 32 16 16 32 32 128 dv 64 512 128 32 16 32 128 Pdrop 0.1 0.0 0.2 ϵls 0.1 0.0 0.2 PPL train steps (dev) 100K 4.92 5.29 5.00 4.91 5.01 5.16 5.01 6.11 5.19 4.88 5

### Partition conclusion:
(1) The Unstructured partitioning yields three kinds of output from the PDF: text, tables, and images.
(2) Text and table elements are stored in Python lists.
(3) Images are written as files to a folder on disk (`figures`).

# 2. Use a multimodal model to summarize and embed the parsed elements
A multi-vector retriever indexes the summaries of the image, text, and table elements.


## 2.1 Text and table summary

In [13]:
import os
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI



# Set your OpenAI API key.
# Do not hard-code the key in the notebook: reuse the value already present in
# the environment and only prompt (without echoing) when it is missing or blank.
if not os.environ.get("OPENAI_API_KEY", "").strip():
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


# Generate summaries of text elements
def generate_text_summaries(texts, tables, summarize_texts=False):
    """
    Summarize text and table elements for retrieval.

    texts: List of str
    tables: List of str
    summarize_texts: Bool to summarize texts; when False the texts are
        returned unchanged in place of summaries

    Returns (text_summaries, table_summaries), aligned one-to-one with
    `texts` and `tables`.
    """
    need_text_summaries = bool(texts) and summarize_texts

    # Nothing has to go through the model: skip building the client entirely.
    if not need_text_summaries and not tables:
        return (texts if texts else []), []

    # Prompt
    prompt_text = """You are an assistant tasked with summarizing tables and text for retrieval. \
    These summaries will be embedded and used to retrieve the raw text or table elements. \
    Give a concise summary of the table or text that is well optimized for retrieval. Table or text: {element} """

    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Text summary chain
    model = ChatOpenAI(temperature=0, model="gpt-4")

    # The chain is invoked with {"element": <str>}, which is exactly the mapping
    # the prompt expects. The earlier extra `{"element": lambda x: x}` step
    # wrapped that dict a second time, so the prompt was filled with the dict's
    # repr ("{'element': '...'}") instead of the raw text.
    summarize_chain = prompt | model | StrOutputParser()

    def summarize_all(elements):
        # One model call per element, in input order.
        return [summarize_chain.invoke({"element": element}) for element in elements]

    # Apply to text if texts are provided and summarization is requested
    if need_text_summaries:
        text_summaries = summarize_all(texts)
    elif texts:
        text_summaries = texts
    else:
        text_summaries = []

    # Apply to tables if tables are provided
    table_summaries = summarize_all(tables) if tables else []

    return text_summaries, table_summaries




# With summarize_texts=False the raw text chunks are reused as their own
# "summaries"; only the tables are sent to the model.
text_summaries, table_summaries = generate_text_summaries(
    texts, tables, summarize_texts=False
)



In [14]:
len(tables)

4

In [15]:
len(text_summaries)

16

In [16]:
len(table_summaries)

4

In [17]:
table_summaries

['The table presents a comparison of different layer types in terms of complexity per layer and sequential maximum path length operations. The layer types include Self-Attention, Recurrent, Convolutional, and Restricted Self-Attention. The complexity per layer for these types is O(n2 · d), O(n · d2), O(k · n · d2), and O(r · n · d) respectively. The sequential maximum path length operations for these types are O(1), O(n), O(1), and O(n/r) respectively.',
 'The table compares different machine translation models, including Model ByteNet, Deep-Att + PosUnk, GNMT + RL, ConvS2S, MoE, and Transformer (base and big models). The comparison is based on BLEU scores for English to German (EN-DE) and English to French (EN-FR) translations, and the training cost in FLOPs. The Transformer (big) model has the highest BLEU scores for both language pairs, while the Model ByteNet has the lowest training cost.',
 'The table presents various parameters and their corresponding values for different models 

In [18]:
# Show the first text "summary" (the raw first chunk when texts are not summarized)
text_summaries[0]

'3 2 0 2\n\ng u A 2 ] L C . s c [\n\n7 v 2 6 7 3 0 . 6 0 7 1 : v i X r a\n\nProvided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.\n\nAttention Is All You Need\n\nAshish Vaswani∗ Google Brain avaswani@google.com\n\nNoam Shazeer∗ Google Brain noam@google.com\n\nNiki Parmar∗ Google Research nikip@google.com\n\nJakob Uszkoreit∗ Google Research usz@google.com\n\nLlion Jones∗ Google Research llion@google.com\n\nAidan N. Gomez∗ † University of Toronto aidan@cs.toronto.edu\n\nŁukasz Kaiser∗ Google Brain lukaszkaiser@google.com\n\nIllia Polosukhin∗ ‡ illia.polosukhin@gmail.com\n\nAbstract\n\nThe dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the

## Open question: equation expressions

Use GPT-4o to generate image summaries.

Note: the image folder path in the cell below must be changed to match your machine.

## 2.2 Image summary

In [19]:
import base64
import os

from langchain_core.messages import HumanMessage


def encode_image(image_path):
    """Read the file at `image_path` and return its bytes as a base64 str."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def image_summarize(img_base64, prompt):
    """
    Ask the chat model for a summary of one base64-encoded image.

    img_base64: Base64 string of the image, embedded as a JPEG data URL
    prompt: Instruction text sent together with the image

    Returns the `content` of the model's reply.
    """
    chat = ChatOpenAI(model="gpt-4o", max_tokens=1024)

    # A single human message carrying two parts: the instruction and the image.
    text_part = {"type": "text", "text": prompt}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
    }
    request = HumanMessage(content=[text_part, image_part])

    reply = chat.invoke([request])
    return reply.content


def generate_img_summaries(path):
    """
    Build base64 strings and summaries for every .jpg file in a folder.

    path: Path to list of .jpg files extracted by Unstructured

    Returns (img_base64_list, image_summaries); both lists follow the sorted
    order of the file names.
    """
    # Prompt
    prompt = """You are an assistant tasked with summarizing images for retrieval. \
    These summaries will be embedded and used to retrieve the raw image. \
    Give a concise summary of the image that is well optimized for retrieval."""

    # Only files whose name ends in ".jpg" are processed, in sorted name order.
    jpg_names = [name for name in sorted(os.listdir(path)) if name.endswith(".jpg")]

    img_base64_list = []
    image_summaries = []
    for name in jpg_names:
        encoded = encode_image(os.path.join(path, name))
        img_base64_list.append(encoded)
        image_summaries.append(image_summarize(encoded, prompt))

    return img_base64_list, image_summaries


# Image summaries
# NOTE(review): machine-specific absolute path; point it at the folder that
# holds the .jpg files extracted from the PDF before running.
image_fpath = r'C:\Users\wudan\OneDrive - Langara College\DANA_4830\RAG_multimodal\Langchain_RAG_multimodal\figures'
img_base64_list, image_summaries = generate_img_summaries(image_fpath)

In [20]:
len(image_summaries)

8

In [21]:
image_summaries[3]

'Text alignment image showing the sentence: "The Law will never be perfect, but its application should be just - this is what we are missing, in my opinion." with visual connectors between words.'

# 3. Store the element embeddings in the database

Store the raw texts, tables, and images in the docstore.

Store the texts, table summaries, and image summaries in the vectorstore for efficient semantic retrieval.

https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/multi_vector/#summary

In [22]:
import tensorflow as tf

# Optional GPU memory cap: if TensorFlow reports at least one physical GPU,
# configure the first one as a single virtual device with memory_limit=4096.
# NOTE(review): the first code cell sets CUDA_VISIBLE_DEVICES to "-1", so `gpus`
# is presumably empty here and the whole `if` body is skipped — confirm.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only allocate a specific amount of memory on the first GPU
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)]) # Set limit as per your system
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Virtual devices must be set before GPUs have been initialized
        print(e)


In [23]:
import os
# Ask TensorFlow to disable oneDNN optimizations.
# NOTE(review): TensorFlow has already been imported by earlier cells; this flag
# is presumably only honoured when set before the first `import tensorflow` —
# confirm, and if so set it at the top of the notebook instead.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

In [24]:
print(os.environ['TF_ENABLE_ONEDNN_OPTS'])

0


In [25]:
# Set environment variable to disable oneDNN optimizations to avoid kernel crash


import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

In [26]:



def create_multi_vector_retriever(
    vectorstore, text_summaries, texts, table_summaries, tables, image_summaries, images
):
    """
    Create retriever that indexes summaries, but returns raw images or texts

    vectorstore: Vector store that receives the summary documents
    text_summaries, texts: Summaries and the raw text chunks they describe
    table_summaries, tables: Summaries and the raw tables they describe
    image_summaries, images: Summaries and the raw images they describe

    A modality whose summaries list is empty is skipped entirely.
    Returns the populated MultiVectorRetriever.
    Raises ValueError when a non-empty summaries list and its contents list
    have different lengths.
    """
    id_key = "doc_id"
    groups = (
        ("text", text_summaries, texts),
        ("table", table_summaries, tables),
        ("image", image_summaries, images),
    )

    # Check every pair before anything is written, so a mismatch cannot leave
    # the vectorstore and docstore half-populated. Previously a mismatch gave
    # either an IndexError or raw documents that no summary pointed to.
    for label, summaries, contents in groups:
        if summaries and len(summaries) != len(contents):
            raise ValueError(
                f"{label}: got {len(summaries)} summaries for {len(contents)} raw elements"
            )

    # Create the multi-vector retriever on top of an in-memory docstore
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=InMemoryStore(),
        id_key=id_key,
    )

    # Helper function to add documents to the vectorstore and docstore
    def add_documents(doc_summaries, doc_contents):
        # One fresh id per raw element links its summary to the raw content.
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=summary, metadata={id_key: doc_id})
            for doc_id, summary in zip(doc_ids, doc_summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    # Add texts, tables, and images; skip any modality without summaries
    for _label, summaries, contents in groups:
        if summaries:
            add_documents(summaries, contents)

    return retriever




In [27]:
# The vectorstore to use to index the summaries
# (embeddings come from OpenAIEmbeddings() with its default settings).
# NOTE(review): no persistence directory is passed, so the collection
# presumably lives only for this session — confirm.
vectorstore = Chroma(
    collection_name="mm_rag_cj_blog", embedding_function=OpenAIEmbeddings()
)



In [2]:
# NOTE(review): the recorded execution count (In [2]) and the NameError below show
# this cell was run on a fresh kernel, before the image-summary cell had defined
# img_base64_list. Run the notebook top to bottom so the name exists.
len(img_base64_list)

NameError: name 'img_base64_list' is not defined

In [28]:
# Create retriever
retriever_multi_vector_img = create_multi_vector_retriever(
    vectorstore,
    text_summaries,
    texts,
    table_summaries,
    tables,
    image_summaries,
    img_base64_list,
)

: 

# 4. Build the RAG retriever
# 5. Query the multimodal RAG retriever