## Data Loading

### Get Public Dataset

Get the LangSmith public dataset for semi-structured data, `Semi-structured Reports`.

In [16]:
import os
from pathlib import Path

from langchain_benchmarks import clone_public_dataset, registry
from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names

# Benchmark task looked up by name in the langchain_benchmarks registry
task = registry["Semi-structured Reports"]

# Files used: keep the entries returned by get_file_names() and their str() form
paths = [*get_file_names()]
files = list(map(str, paths))

### Base Case

PDF loader (naive to tables).

In [17]:
from langchain.document_loaders import PyPDFLoader


def load_and_split(file):
    """
    Load one PDF and split its pages into text chunks.

    file: path of the PDF handed to PyPDFLoader
    Returns a list with the page_content string of every chunk.
    Prints the number of chunks produced.
    """

    # Load pages
    pdf_pages = PyPDFLoader(file).load()

    # Split with the tiktoken-based recursive splitter
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=2000, chunk_overlap=50
    )

    # Keep only the text of each chunk
    texts = [chunk.page_content for chunk in splitter.split_documents(pdf_pages)]
    print(f"There are {len(texts)} text elements")
    return texts


# One list of chunks per file, in the same order as `files`
baseline_texts = [load_and_split(fi) for fi in files]

There are 11 text elements
There are 3 text elements


### Unstructured

Table-aware splitting following cookbook [here](https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_Structured_RAG.ipynb).

In addition to the pip packages below, you will also need [poppler](https://pdf2image.readthedocs.io/en/latest/installation.html) and [tesseract](https://tesseract-ocr.github.io/tessdoc/Installation.html) installed on your system.

In [None]:
! pip install "unstructured[all-docs]"

In [22]:
from unstructured.partition.pdf import partition_pdf


def categorize_elements(raw_pdf_elements):
    """
    Split parsed PDF elements into text strings and table strings.

    raw_pdf_elements: iterable of unstructured.documents.elements objects
    Returns (texts, tables), each a list of str(element) values. Elements
    whose type name matches neither marker below are dropped.
    """
    table_marker = "unstructured.documents.elements.Table"
    text_marker = "unstructured.documents.elements.CompositeElement"

    texts, tables = [], []
    for element in raw_pdf_elements:
        # Classification is done on the printed type name, not with isinstance
        type_name = str(type(element))
        if table_marker in type_name:
            tables.append(str(element))
        elif text_marker in type_name:
            texts.append(str(element))
    return texts, tables


def parse_pdf_unstructured(file):
    """
    Partition one PDF with unstructured and separate its texts from its tables.

    file: path of the PDF to partition
    Returns (texts, tables) as produced by categorize_elements.
    Prints the number of tables and text elements found.
    """
    # Partitioning options, gathered in one place
    partition_options = dict(
        filename=file,
        extract_images_in_pdf=False,
        infer_table_structure=True,
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
        image_output_dir_path=os.path.dirname(file),
    )
    elements = partition_pdf(**partition_options)

    # Categorize elements by type
    texts, tables = categorize_elements(elements)
    print(f"There are {len(tables)} tables")
    print(f"There are {len(texts)} text elements")
    return texts, tables


# Run: parse every file once, then split the (texts, tables) pairs into
# two parallel per-file lists
parsed_pdfs = [parse_pdf_unstructured(fi) for fi in files]
unstructured_texts = [file_texts for file_texts, _ in parsed_pdfs]
unstructured_tables = [file_tables for _, file_tables in parsed_pdfs]

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


There are 6 tables
There are 12 text elements
There are 1 tables
There are 5 text elements


### Docugami

Table-aware splitting following cookbook [here](https://github.com/langchain-ai/langchain/blob/master/cookbook/docugami_xml_kg_rag.ipynb).

In [None]:
! pip install docugami==0.0.4 dgml-utils==0.2.0

In [None]:
from docugami_processing import upload_files, wait_for_xml

# Load: hand the files to the docugami_processing helpers under one docset name
DOCSET_NAME = "Semi-Structured"
dg_docs = upload_files(files, DOCSET_NAME)
# Later cells index this result by file name (dgml_paths[fname])
dgml_paths = wait_for_xml(dg_docs)

In [None]:
def extract_docugami_file(dgml_path):
    """
    Chunk one Docugami XML file and separate table chunks from text chunks.

    dgml_path: path of the XML file to read
    Returns (texts, tables): two lists of the chunk objects returned by
    get_chunks_str, split on whether "table" appears in chunk.structure.
    Prints the number of tables and text elements found.
    """
    # Read the raw bytes instead of decoding with the platform default
    # encoding and re-encoding to UTF-8, which can corrupt non-ASCII content.
    with open(dgml_path, "rb") as file:
        contents = file.read()

    chunks = get_chunks_str(
        contents,
        include_xml_tags=True,
        min_text_length=1024 * 8,  # 8k chars are ~2k tokens
        max_text_length=1024 * 8,  # 8k chars are ~2k tokens
    )

    # Single pass: a chunk is a table when "table" is one of its structure tokens
    tables, texts = [], []
    for chunk in chunks:
        if "table" in chunk.structure.split():
            tables.append(chunk)
        else:
            texts.append(chunk)

    print(f"There are {len(tables)} tables")
    print(f"There are {len(texts)} text elements")
    return texts, tables


docugami_texts, docugami_tables = [], []
for fname in files:
    # Look up the XML generated for this file and extract its elements
    file_texts, file_tables = extract_docugami_file(dgml_paths[fname])
    docugami_texts.append(file_texts)
    docugami_tables.append(file_tables)

## Indexing

### Base case

In [None]:
# baseline_texts holds one list of chunks per file, while Chroma.from_texts
# expects a flat list of strings passed as `texts`, so flatten it first.
baseline_texts_flat = [
    text for file_texts in baseline_texts for text in file_texts
]

vectorstore_baseline = Chroma.from_texts(
    texts=baseline_texts_flat,
    collection_name="baseline",
    embedding=OpenAIEmbeddings(),
)

retriever_baseline = vectorstore_baseline.as_retriever()

### Multi-vector retriever

In [None]:
def generate_text_summaries(texts, tables, summarize_texts=False):
    """
    Produce the summaries that get embedded for texts and tables.

    texts: List of str
    tables: List of str
    summarize_texts: Bool; when False the texts are returned as their own summaries
    Returns (text_summaries, table_summaries); either is [] when its input is empty.
    """

    # Prompt
    prompt_text = """You are an assistant tasked with summarizing tables and text for retrieval. \
    These summaries will be embedded and used to retrieve the raw text or table elements. \
    Give a concise summary of the table or text that is well optimized for retrieval. Table or text: {element} """

    # Summary chain: raw element -> prompt -> GPT-4 -> plain string
    summarizer = (
        {"element": lambda x: x}
        | ChatPromptTemplate.from_template(prompt_text)
        | ChatOpenAI(temperature=0, model="gpt-4")
        | StrOutputParser()
    )
    batch_config = {"max_concurrency": 5}

    # Texts are summarized only on request; otherwise they pass through untouched
    if not texts:
        text_summaries = []
    elif summarize_texts:
        text_summaries = summarizer.batch(texts, batch_config)
    else:
        text_summaries = texts

    # Tables are always summarized when present
    table_summaries = summarizer.batch(tables, batch_config) if tables else []

    return text_summaries, table_summaries


def create_multi_vector_retriever(
    vectorstore,
    text_summaries,
    texts,
    table_summaries,
    tables,
    image_summaries=None,
    images=None,
):
    """
    Create retriever that indexes summaries, but returns raw images or texts

    vectorstore: vector store that receives the summary documents
    text_summaries / texts: summaries and the raw texts they stand for
    table_summaries / tables: summaries and the raw tables they stand for
    image_summaries / images: optional summaries and raw images; omit them
        (or pass None) when there are no images to index
    Returns the MultiVectorRetriever backed by an in-memory docstore.
    Raises ValueError when a summaries list and its contents list differ in length.
    """

    # Initialize the storage layer
    store = InMemoryStore()
    id_key = "doc_id"

    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )

    # Helper function to add documents to the vectorstore and docstore
    def add_documents(retriever, doc_summaries, doc_contents):
        # Each summary must map to exactly one raw element through a shared id
        if len(doc_summaries) != len(doc_contents):
            raise ValueError(
                f"Got {len(doc_summaries)} summaries for {len(doc_contents)} documents"
            )
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=summary, metadata={id_key: doc_id})
            for doc_id, summary in zip(doc_ids, doc_summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    # Add texts, tables, and images (each only when summaries were provided)
    if text_summaries:
        add_documents(retriever, text_summaries, texts)
    if table_summaries:
        add_documents(retriever, table_summaries, tables)
    if image_summaries:
        add_documents(retriever, image_summaries, images)

    return retriever

In [None]:
## Unstructured

# The parsing step stored one list per file; summarization and indexing
# need flat lists of strings, so flatten into new variables first.
unstructured_texts_flat = [
    text for file_texts in unstructured_texts for text in file_texts
]
unstructured_tables_flat = [
    table for file_tables in unstructured_tables for table in file_tables
]

# Get text, table summaries
unstructured_text_summaries, unstructured_table_summaries = generate_text_summaries(
    unstructured_texts_flat, unstructured_tables_flat, summarize_texts=False
)

# The vectorstore to use to index the summaries
unstructured_vectorstore = Chroma(
    collection_name="unstructured", embedding_function=OpenAIEmbeddings()
)

# Create retriever (no images in this experiment)
retriever_unstructured = create_multi_vector_retriever(
    unstructured_vectorstore,
    unstructured_text_summaries,
    unstructured_texts_flat,
    unstructured_table_summaries,
    unstructured_tables_flat,
    None,  # image_summaries
    None,  # images
)

In [None]:
## Docugami

# The extraction step stored one list of chunk objects per file; summarization
# and indexing need flat lists of strings.
# NOTE(review): assumes dgml_utils chunks expose their content as `.text`
# (as used in the Docugami cookbook) — confirm against the installed version.
docugami_texts_flat = [
    chunk.text for file_chunks in docugami_texts for chunk in file_chunks
]
docugami_tables_flat = [
    chunk.text for file_chunks in docugami_tables for chunk in file_chunks
]

# Get text, table summaries
docugami_text_summaries, docugami_table_summaries = generate_text_summaries(
    docugami_texts_flat, docugami_tables_flat, summarize_texts=False
)

# The vectorstore to use to index the summaries
docugami_vectorstore = Chroma(
    collection_name="docugami", embedding_function=OpenAIEmbeddings()
)

# Create retriever (no images in this experiment)
retriever_docugami = create_multi_vector_retriever(
    docugami_vectorstore,
    docugami_text_summaries,
    docugami_texts_flat,
    docugami_table_summaries,
    docugami_tables_flat,
    None,  # image_summaries
    None,  # images
)

## RAG

In [None]:
def rag_chain(retriever):
    """
    Build a RAG chain on top of the given retriever.

    retriever: runnable that supplies the context for a question
    Returns a chain that takes a question, fills the prompt with the
    retrieved context, calls GPT-4 and parses the reply to a string.
    """

    # Prompt template
    template = """Answer the question based only on the following context, which can include text and tables:
    {context}
    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)

    # LLM
    model = ChatOpenAI(temperature=0, model="gpt-4")

    # RAG pipeline
    chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | model
        | StrOutputParser()
    )
    # Return the chain built here; the previous code returned an unrelated
    # (possibly undefined) global named `chain`.
    return chain


# Create RAG chains: one per retriever, all sharing the prompt and model
# defined in rag_chain
chain_baseline = rag_chain(retriever_baseline)
chain_unstructured = rag_chain(retriever_unstructured)
chain_docugami = rag_chain(retriever_docugami)

## Eval

See guide [here](https://github.com/langchain-ai/langchain-benchmarks/blob/main/docs/source/notebooks/retrieval/semi_structured.ipynb).

In [None]:
from functools import partial

from langsmith.client import Client

from langchain_benchmarks.rag import get_eval_config


def run_eval(chain, eval_run_name):
    """
    Run eval

    chain: chain to evaluate on the task's dataset
    eval_run_name: project name under which the run is recorded
    Returns whatever client.run_on_dataset returns, so callers can inspect
    the results instead of having them discarded.
    """
    client = Client()
    test_run = client.run_on_dataset(
        dataset_name=task.name,
        llm_or_chain_factory=chain,
        evaluation=get_eval_config(),
        verbose=True,
        project_name=eval_run_name,
    )
    return test_run


# Experiments: project name -> chain under evaluation
chain_map = dict(
    baseline=chain_baseline,
    unstructured=chain_unstructured,
    docugami=chain_docugami,
    ### TODO: Add images
)

# Evaluate each chain under its own project name
for project_name, chain in chain_map.items():
    run_eval(chain, project_name)