# Semi-structured eval: Multi-modal

We will test retrival of table information from the `Semi-structured Reports` dataset using various methods.

In [None]:
! pip install -U langchain openai chromadb pypdfium2 open_clip pillow

## Load

In [2]:
import base64
import io
import os
import uuid
from io import BytesIO

import pypdfium2 as pdfium
from IPython.display import HTML, display
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.schema.messages import HumanMessage
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma
from langchain_experimental.open_clip import OpenCLIPEmbeddings
from PIL import Image

## Dataset

In [1]:
import os

from langchain_benchmarks import registry
from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names

# Task
task = registry["Semi-structured Reports"]

# Files used
paths = list(get_file_names())
files = [str(p) for p in paths]

### TODO: Replace when dataset is updated
dir = "/Users/rlm/Desktop/Eval_Sets/semi_structured_reports/"
files = [f for f in os.listdir(dir) if f.endswith(".pdf")]

## Load

In [4]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from unstructured.partition.pdf import partition_pdf

def generate_doc_summary(file):
    """
    Create a doc summary
    """

    # Prompt
    prompt_text = """You are an assistant tasked extracting two attributes \
    from financial documents. (1) Tell me the company that the document is \
    focused on. (2) Look at any tables in the document and tell me the units \ 
    of the table. Many table will have '(In thousands)' or '(in millions)' prior \
    to the table text. Provide these two  {document} """
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Text summary chain
    model = ChatOpenAI(temperature=0, model="gpt-4-1106-preview")
    summarize_chain = {"document": lambda x: x} | prompt | model | StrOutputParser()

    # Load doc
    loader = PyPDFLoader(file)
    pdf_pages = loader.load()
    texts = [t.page_content for t in pdf_pages]
    text_string = " ".join(texts)
    summary = summarize_chain.invoke({"document": text_string})
    return summary

def get_images(fpath, fname):
    """
    Get PIL images from PDF pages
    """
    pdf = pdfium.PdfDocument(fpath + fname)
    n_pages = len(pdf)
    pil_images = []
    for page_number in range(n_pages):
        page = pdf.get_page(page_number)
        bitmap = page.render(scale=1, rotation=0, crop=(0, 0, 0, 0))
        pil_image = bitmap.to_pil()
        pil_images.append(pil_image)
        pil_image.save(f"{fpath}img/{fname}_img_{page_number + 1}.jpg", format="JPEG")
    return pil_images


def resize_base64_image(base64_string, size=(128, 128)):
    """
    Resize an image encoded as a Base64 string
    """
    # Decode the Base64 string
    img_data = base64.b64decode(base64_string)
    img = Image.open(io.BytesIO(img_data))

    # Resize the image
    resized_img = img.resize(size, Image.LANCZOS)

    # Save the resized image to a bytes buffer
    buffered = io.BytesIO()
    resized_img.save(buffered, format=img.format)

    # Encode the resized image to Base64
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def convert_to_base64(pil_image):
    """
    Base64 encoded strings
    """

    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")  # You can change the format if needed
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    img_str = resize_base64_image(img_str, size=(595, 842))  # Size for PDF
    return img_str


def image_summarize(img_base64, prompt):
    """Make image summary"""
    chat = ChatOpenAI(model="gpt-4-vision-preview", max_tokens=1024)

    msg = chat.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                    },
                ]
            )
        ]
    )
    return msg.content


def generate_img_summaries(img_base64_list):
    """
    Generate summaries and base64 encoded strings for images
    img_base64_list: Base64 encoded images
    """

    # Store image summaries
    processed_images = []
    image_summaries = []

    # Prompt
    prompt = """You are an assistant tasked with summarizing a tables in images for retrieval. \
    The summary will be embedded and used to retrieve the raw table. Give a concise summary. \
    For tables, list as many of rows and column names as possible so we know what is captured in the table. \ 
    Do not worry about summarizing quantitative results in the table. If the image does not contain \
    a table, then just simply say 'No table present.' \n\n Image:"""

    # Apply to images
    for i, base64_image in enumerate(img_base64_list):
        print(f"Image {i+1} base64 size: {len(base64_image)} characters")
        try:
            image_summaries.append(image_summarize(base64_image, prompt))
            processed_images.append(base64_image)
        except:
            print(
                f"!!!Error processing image {i+1} w/ base64 size: {len(base64_image)} characters!!!"
            )

    return image_summaries, processed_images


images_base_64 = []
image_summaries = []

for fi in files:
    
    # Generate document summary
    doc_summary = generate_doc_summary(dir + fi)

    # Get images
    pil_images = get_images(dir, fi)
    doc_images_base_64 = [convert_to_base64(i) for i in pil_images]

    # Image summaries
    doc_image_summaries, doc_images_base_64_processed = generate_img_summaries(
        doc_images_base_64
    )

    # Add doc summary to table summary to preserve context
    doc_image_summaries = [
        doc_summary + "\n\n Here is a summary of a table within this doc: \n\n" + t
        for t in doc_image_summaries
    ]

    # Add to lists
    images_base_64.extend(doc_images_base_64)
    image_summaries.extend(doc_image_summaries)

Image 1 base64 size: 147760 characters
Image 2 base64 size: 162520 characters
Image 3 base64 size: 41216 characters
Image 1 base64 size: 83508 characters
Image 2 base64 size: 89800 characters
Image 3 base64 size: 30172 characters
Image 4 base64 size: 73840 characters
Image 5 base64 size: 76620 characters
Image 6 base64 size: 86604 characters
Image 7 base64 size: 31564 characters
Image 8 base64 size: 47936 characters
Image 9 base64 size: 11584 characters
!!!Error processing image 9 w/ base64 size: 11584 characters!!!
Image 10 base64 size: 29448 characters
Image 11 base64 size: 52888 characters
Image 12 base64 size: 41684 characters
Image 13 base64 size: 51312 characters
Image 14 base64 size: 57952 characters
Image 15 base64 size: 16744 characters
!!!Error processing image 15 w/ base64 size: 16744 characters!!!
Image 1 base64 size: 160820 characters
Image 2 base64 size: 174944 characters
Image 3 base64 size: 39336 characters
Image 1 base64 size: 94548 characters
Image 2 base64 size: 1582

## Make Retriever

### MV Retriever

Add raw docs and doc summaries to [Multi Vector Retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector#summary): 

* Store the raw images in the `docstore`.
* Store the image summaries in the `vectorstore` for semantic retrieval.

In [5]:
def create_multi_vector_retriever(vectorstore, image_summaries, images):
    """
    Create retriever that indexes summaries, but returns raw images or texts
    """

    # Initialize the storage layer
    store = InMemoryStore()
    id_key = "doc_id"

    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )

    # Helper function to add documents to the vectorstore and docstore
    def add_documents(retriever, doc_summaries, doc_contents):
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=s, metadata={id_key: doc_ids[i]})
            for i, s in enumerate(doc_summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    add_documents(retriever, image_summaries, images)

    return retriever


# The vectorstore to use to index the summaries
vectorstore = Chroma(
    collection_name="multi-modal-rag-mv", embedding_function=OpenAIEmbeddings()
)

# Create retriever
retriever_multi_vector_img = create_multi_vector_retriever(
    vectorstore,
    image_summaries,
    images_base_64,
)

### Multi modal embeddings

In [6]:
# Create chroma
vectorstore_mmembd = Chroma(
    collection_name="multi-modal-rag-deck-mmembd",
    embedding_function=OpenCLIPEmbeddings(),
)

# Get image URIs with .jpg extension only
image_uris = sorted(
    [
        os.path.join(dir + "/img/", image_name)
        for image_name in os.listdir(dir + "/img/")
        if image_name.endswith(".jpg")
    ]
)

# Add images
vectorstore_mmembd.add_images(uris=image_uris)

# Make retriever
retriever_mmembd = vectorstore_mmembd.as_retriever()

## RAG

In [7]:
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough


def prepare_images(docs):
    """
    Prepare iamges for prompt

    :param docs: A list of base64-encoded images from retriever.
    :return: Dict containing a list of base64-encoded strings.
    """
    b64_images = []
    for doc in docs:
        if isinstance(doc, Document):
            doc = doc.page_content
        b64_images.append(doc)
    return {"images": b64_images}


def img_prompt_func(data_dict, num_images=2):
    """
    GPT-4V prompt for image analysis.

    :param data_dict: A dict with images and a user-provided question.
    :param num_images: Number of images to include in the prompt.
    :return: A list containing message objects for each image and the text prompt.
    """
    messages = []
    if data_dict["context"]["images"]:
        for image in data_dict["context"]["images"][:num_images]:
            image_message = {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image}"},
            }
            messages.append(image_message)
    text_message = {
        "type": "text",
        "text": (
            "You are an analyst tasked with answering questions about visual content.\n"
            "You will be give a set of image(s) from a slide deck / presentation.\n"
            "Use this information to answer the user question. \n"
            f"User-provided question: {data_dict['question']}\n\n"
        ),
    }
    messages.append(text_message)
    return [HumanMessage(content=messages)]


def multi_modal_rag_chain(retriever):
    """
    Multi-modal RAG chain
    """

    # Multi-modal LLM
    model = ChatOpenAI(temperature=0, model="gpt-4-vision-preview", max_tokens=1024)

    # RAG pipeline
    chain = (
        {
            "context": retriever | RunnableLambda(prepare_images),
            "question": RunnablePassthrough(),
        }
        | RunnableLambda(img_prompt_func)
        | model
        | StrOutputParser()
    )

    return chain


# Create RAG chain
chain_multimodal_rag = multi_modal_rag_chain(retriever_multi_vector_img)
chain_multimodal_rag_mmembd = multi_modal_rag_chain(retriever_mmembd)

## Eval

In [None]:
import uuid
from langsmith.client import Client
from langchain.smith import RunEvalConfig

# Config
client = Client()
eval_config = RunEvalConfig(
    evaluators=["cot_qa"],
)

# Experiments
chain_map = {
    "image-summaries-mvr": chain_multimodal_rag,
    "multi-modal-embeddings": chain_multimodal_rag_mmembd,
}

# Run evaluation
run_id = uuid.uuid4().hex[:4]
test_runs = {}
for project_name, chain in chain_map.items():
    test_runs[project_name] = client.run_on_dataset(
        # dataset_name=task.name,
        dataset_name="Semi-Structured-Eval-v9",
        llm_or_chain_factory=lambda: (lambda x: x["question"]) | chain,
        evaluation=eval_config,
        verbose=True,
        project_name=f"{run_id}-{project_name}",
        project_metadata={"chain": project_name},
    )

View the evaluation results for project 'de84-image-summaries-mvr' at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/8ea15254-5fa3-4141-8cac-4d4eefc89614/compare?selectedSessions=48367506-b22c-4d99-aff7-2c8c10b187ff

View all tests for Dataset Semi-Structured-Eval-v9 at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/8ea15254-5fa3-4141-8cac-4d4eefc89614
[--------------------->                            ] 11/25



[----------------------------->                    ] 15/25

Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': "You uploaded an unsupported image. Please make sure your image is below 20 MB in size and is of one the following formats: ['png', 'jpeg', 'gif', 'webp'].", 'type': 'invalid_request_error', 'param': None, 'code': 'image_parse_error'}}


[------------------------------->                  ] 16/25

Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': "You uploaded an unsupported image. Please make sure your image is below 20 MB in size and is of one the following formats: ['png', 'jpeg', 'gif', 'webp'].", 'type': 'invalid_request_error', 'param': None, 'code': 'image_parse_error'}}


[------------------------------------------------->] 25/25

Unnamed: 0,output,feedback.COT Contextual Accuracy,error,execution_time
count,23,23.0,2,25.0
unique,23,,2,
top,"I'm sorry, but the images provided do not cont...",,"Error code: 400 - {'error': {'message': ""You u...",
freq,1,,1,
mean,,0.173913,,18.605202
std,,0.387553,,4.605097
min,,0.0,,12.127701
25%,,0.0,,15.619378
50%,,0.0,,17.681072
75%,,0.0,,20.110241


View the evaluation results for project 'de84-multi-modal-embeddings' at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/8ea15254-5fa3-4141-8cac-4d4eefc89614/compare?selectedSessions=c1cc16cb-859f-4a82-be00-17c922e4c036

View all tests for Dataset Semi-Structured-Eval-v9 at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/8ea15254-5fa3-4141-8cac-4d4eefc89614
[>                                                 ] 0/25