# Semi-structured eval: Unstructured + Multi-vector retriever

We will test retrieval of table information from the `Semi-structured Reports` dataset using various methods.

## Pre-requisites

In [1]:
# LangChain, the LangSmith client, and the benchmark registry (upgraded to latest via -U).
# NOTE(review): versions are not pinned, so a re-run may pull different releases — consider pinning.
%pip install --quiet -U langchain langsmith langchain_benchmarks
# Chroma vector store, OpenAI client, and unstructured with its "all-docs" extra for PDF partitioning.
%pip install --quiet chromadb openai "unstructured[all-docs]"

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Dataset

In [2]:
import os

from langchain_benchmarks import registry
from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names

# Task definition from the benchmark registry
task = registry["Semi-structured Reports"]

# Files shipped with the benchmark task. They are not consumed yet: the PDFs
# are read from a local directory until the dataset is updated.
paths = list(get_file_names())

### TODO: Replace when dataset is updated (then use `files = [str(p) for p in paths]`)
# Directory holding the report PDFs. Set SEMI_STRUCTURED_REPORTS_DIR to point
# at your own copy instead of editing the notebook; the fallback is the
# original author's local path.
# NOTE: `dir` shadows the builtin, but later cells reference it, so the name is
# kept. os.path.join(..., "") guarantees a trailing separator so that plain
# string concatenation with a file name also yields a valid path.
dir = os.path.join(
    os.environ.get(
        "SEMI_STRUCTURED_REPORTS_DIR",
        "/mnt/c/Users/taqi_/OneDrive/Desktop/semi_structured_reports/",
    ),
    "",
)

# Sorted so the processing order is reproducible (os.listdir order is arbitrary)
files = sorted(f for f in os.listdir(dir) if f.endswith(".pdf"))

### Load

Use table-aware splitting, following the Semi-Structured RAG cookbook [here](https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_Structured_RAG.ipynb).

In addition to the pip packages installed above, you will also need [poppler](https://pdf2image.readthedocs.io/en/latest/installation.html) and [tesseract](https://tesseract-ocr.github.io/tessdoc/Installation.html) installed on your system.

In [5]:
import hashlib

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.output_parser import StrOutputParser
from unstructured.partition.pdf import partition_pdf
from langchain.schema.output_parser import StrOutputParser


def categorize_elements(raw_pdf_elements):
    """
    Split partitioned PDF elements into text chunks and tables.

    An element is classified by looking for a fully qualified class name inside
    ``str(type(element))``; elements matching neither name are dropped.

    raw_pdf_elements: Iterable of unstructured.documents.elements objects
    Returns: (texts, tables), two lists holding ``str(element)`` for each match
    """
    table_marker = "unstructured.documents.elements.Table"
    text_marker = "unstructured.documents.elements.CompositeElement"

    texts, tables = [], []
    for item in raw_pdf_elements:
        type_name = str(type(item))
        # The table check takes precedence, as a type name could contain both markers
        if table_marker in type_name:
            tables.append(str(item))
            continue
        if text_marker in type_name:
            texts.append(str(item))
    return texts, tables


def hash_table(table):
    """
    Return the hexadecimal MD5 digest of the table's string form.
    """
    encoded = str(table).encode()
    digest = hashlib.md5(encoded)
    return digest.hexdigest()


def generate_doc_summary(file):
    """
    Create a short summary of a whole PDF document.

    file: Path to the PDF file
    Returns: The summary produced by the chain, as a string
    """

    # Prompt
    prompt_text = """You are an assistant tasked with summarizing a document. \
    Write a concise 1-2 sentence summary of the document: {document} """
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Summary chain: the raw chain input is mapped onto the prompt's {document} slot
    model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k")
    summarize_chain = {"document": lambda x: x} | prompt | model | StrOutputParser()

    # Load the PDF and join the text of all pages into a single string
    loader = PyPDFLoader(file)
    pdf_pages = loader.load()
    text_string = " ".join(page.page_content for page in pdf_pages)

    # Pass the text itself: the first chain step already wraps its input under
    # "document", so passing {"document": text} would render the dict's repr
    # (braces, key and quotes) into the prompt instead of the document text.
    return summarize_chain.invoke(text_string)


def generate_text_summaries(texts, tables, summarize_texts=False):
    """
    Produce retrieval-oriented summaries for text chunks and tables.

    texts: List of str
    tables: List of str
    summarize_texts: Bool; when False the texts themselves are returned in place of summaries
    Returns: (text_summaries, table_summaries)
    """

    # Summarization prompt shared by texts and tables
    prompt_text = """You are an assistant tasked with summarizing tables and text for retrieval. \
    These summaries will be embedded and used to retrieve the raw text or table elements. \
    Give a concise summary of the table or text that is well optimized for retrieval. Table or text: {element} """
    summary_prompt = ChatPromptTemplate.from_template(prompt_text)

    # Chain: raw element -> prompt -> chat model -> plain string
    llm = ChatOpenAI(temperature=0, model="gpt-4")
    chain = {"element": lambda x: x} | summary_prompt | llm | StrOutputParser()
    max_workers = 5

    # Texts: nothing to do, summarize, or pass through unchanged
    if not texts:
        text_summaries = []
    elif summarize_texts:
        text_summaries = chain.batch(texts, {"max_concurrency": max_workers})
    else:
        text_summaries = texts

    # Tables are always summarized when present
    if tables:
        table_summaries = chain.batch(tables, {"max_concurrency": max_workers})
    else:
        table_summaries = []

    return text_summaries, table_summaries


def parse_pdf_unstructured(file):
    """
    Partition a PDF with unstructured and return its texts and tables.

    file: Path to the PDF file
    Returns: (texts, tables) as produced by categorize_elements
    """
    # Partitioning options, collected in one place. Image extraction is off;
    # the image output directory is still set to the PDF's own folder.
    partition_options = dict(
        filename=file,
        extract_images_in_pdf=False,
        infer_table_structure=True,
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
        image_output_dir_path=os.path.dirname(file),
    )
    elements = partition_pdf(**partition_options)

    # Separate tables from text chunks and report the counts
    texts, tables = categorize_elements(elements)
    print(f"There are {len(tables)} tables")
    print(f"There are {len(texts)} text elements")
    return texts, tables


# Accumulators across all documents
texts = []
tables = []
text_summaries = []
table_summaries = []

# Token-based splitter used to re-chunk each document's texts into larger
# pieces before summarization. It does not depend on the file, so it is built
# once instead of on every iteration.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=4000, chunk_overlap=0
)

for fi in files:
    # Full path of the PDF; os.path.join works with or without a trailing
    # separator on the directory
    file_path = os.path.join(dir, fi)

    # Generate document summary
    doc_summary = generate_doc_summary(file_path)

    # Get texts, tables
    doc_texts, doc_tables = parse_pdf_unstructured(file_path)

    # Enforce a specific token size for texts to summarize larger chunks
    joined_doc_texts = " ".join(doc_texts)
    doc_texts = text_splitter.split_text(joined_doc_texts)

    # Get text, table summaries
    doc_text_summaries, doc_table_summaries = generate_text_summaries(
        doc_texts, doc_tables, summarize_texts=True
    )

    # Add doc summary to table summary to preserve context
    doc_table_summaries = [
        doc_summary + "\n here is a table summary in this doc:" + t
        for t in doc_table_summaries
    ]

    # Add to lists
    texts.extend(doc_texts)
    tables.extend(doc_tables)
    text_summaries.extend(doc_text_summaries)
    table_summaries.extend(doc_table_summaries)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


There are 14 tables
There are 21 text elements
There are 1 tables
There are 5 text elements
There are 9 tables
There are 11 text elements
There are 13 tables
There are 15 text elements
There are 12 tables
There are 14 text elements
There are 2 tables
There are 5 text elements


## Index

In [6]:
import uuid

from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document

from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma


def create_multi_vector_retriever(
    vectorstore,
    text_summaries=None,
    texts=None,
    table_summaries=None,
    tables=None,
    image_summaries=None,
    images=None,
):
    """
    Create a retriever that indexes summaries but returns the raw texts,
    tables, or images those summaries were generated from.

    vectorstore: Vector store that receives the summary documents
    text_summaries / texts: Parallel lists; summary i describes text i
    table_summaries / tables: Parallel lists; summary i describes table i
    image_summaries / images: Parallel lists; summary i describes image i
        A pair is skipped entirely when its summaries are None or empty.
    Returns: MultiVectorRetriever over the given vectorstore and a new InMemoryStore
    Raises: ValueError if summaries are given without contents, or if the two
        lists of a pair differ in length
    """

    # Only pairs with summaries are indexed
    requested = [
        (kind, summaries, contents)
        for kind, summaries, contents in (
            ("text", text_summaries, texts),
            ("table", table_summaries, tables),
            ("image", image_summaries, images),
        )
        if summaries
    ]

    # Validate every pair before touching any store so a bad argument cannot
    # leave the vectorstore partially populated
    for kind, summaries, contents in requested:
        if contents is None:
            raise ValueError(f"{kind} summaries were provided without {kind} contents")
        if len(summaries) != len(contents):
            raise ValueError(
                f"Got {len(summaries)} {kind} summaries for {len(contents)} {kind} contents; "
                "the two lists must have the same length"
            )

    # Initialize the storage layer
    store = InMemoryStore()
    id_key = "doc_id"

    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )

    # Helper function to add documents to the vectorstore and docstore
    def add_documents(retriever, doc_summaries, doc_contents):
        # One fresh id per raw content; the matching summary carries the same
        # id in its metadata so a summary hit can be resolved to its content
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=summary, metadata={id_key: doc_id})
            for summary, doc_id in zip(doc_summaries, doc_ids)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    # Add texts, tables, and images
    for _kind, summaries, contents in requested:
        add_documents(retriever, summaries, contents)

    return retriever

In [10]:
# Chroma collection that stores the embedded summaries
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    collection_name="unstructured-tables",
)

# Multi-vector retriever built from the table summaries and raw tables only
retriever_unstructured = create_multi_vector_retriever(
    vectorstore, tables=tables, table_summaries=table_summaries
)

## RAG

In [11]:
from langchain.chat_models import ChatOpenAI

from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough


def rag_chain(retriever):
    """
    Build a question-answering chain on top of the given retriever.

    The chain input is used both as the retriever query (filling {context})
    and, unchanged, as the {question}; the model output is parsed to a string.
    """

    # Prompt template
    template = """Answer the question based only on the following context, which can include text and tables:
    {context}
    Question: {question}
    """
    qa_prompt = ChatPromptTemplate.from_template(template)

    # LLM
    llm = ChatOpenAI(temperature=0, model="gpt-4")

    # Input mapping: retrieved context plus the untouched question
    inputs = {"context": retriever, "question": RunnablePassthrough()}

    # RAG pipeline
    return inputs | qa_prompt | llm | StrOutputParser()


# Create the RAG chain on top of the multi-vector retriever built in the Index section
chain = rag_chain(retriever_unstructured)

## Eval

In [12]:
import uuid
from langsmith.client import Client
from langchain.smith import RunEvalConfig

# Evaluation settings shared by every run: a single evaluator, "cot_qa"
# (its feedback shows up as "COT Contextual Accuracy" in the results below).
eval_config = RunEvalConfig(
    evaluators=["cot_qa"],
)

def run_eval(chain, eval_run_name, dataset_name="Semi-Structured-Eval-v5"):
    """
    Run a chain over a LangSmith dataset and grade it with eval_config.

    chain: Runnable that takes a question string
    eval_run_name: LangSmith project name under which the test run is recorded
    dataset_name: Name of the LangSmith dataset to evaluate on
    Returns: Whatever Client.run_on_dataset returns for the test run
    """
    client = Client()
    ### TODO: Replace the default dataset_name with the public dataset
    test_run = client.run_on_dataset(
        dataset_name=dataset_name,
        # The chain expects a plain string, so only the "question" field of
        # each example's inputs is forwarded to it
        llm_or_chain_factory=lambda: (lambda x: x["question"]) | chain,
        evaluation=eval_config,
        verbose=True,
        project_name=eval_run_name,
    )
    # Return the results so callers can inspect them instead of discarding them
    return test_run


# Experiments: project-name prefix -> chain to evaluate
chain_map = {
    "unstructured": chain,
}

# One shared suffix so all projects created by this run have unique names
run_id = str(uuid.uuid4())

# The loop variable is deliberately not called `chain`: that would rebind the
# notebook-level `chain` to whichever entry happened to be evaluated last.
for project_name, experiment_chain in chain_map.items():
    run_eval(experiment_chain, project_name + "_" + run_id)

View the evaluation results for project 'unstructured_40142856-2364-46b2-bee6-dd6175da958d' at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/projects/p/f900fab4-2ef0-420d-97d2-31172e9ce97e?eval=true

View all tests for Dataset Semi-Structured-Eval-v5 at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/2759f13d-c0c0-4d60-a8cf-0ce204750642
[------------------------------------------------->] 25/25
 Eval quantiles:
                                          inputs.question  \
count                                                  25   
unique                                                 25   
top     What is Datadog's Non-GAAP gross margin for th...   
freq                                                    1   
mean                                                  NaN   
std                                                   NaN   
min                                                   NaN   
25%                                           

Unnamed: 0,output,feedback.COT Contextual Accuracy,error,execution_time
count,20,20.0,0.0,20.0
unique,20,,0.0,
top,The text does not provide information on the p...,,,
freq,1,,,
mean,,0.4,,5.202974
std,,0.502625,,1.212825
min,,0.0,,3.637487
25%,,0.0,,4.431906
50%,,0.0,,4.952879
75%,,1.0,,5.655802
