# Semi-structured eval: Long-context

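We will test retrieval of table information from the `Semi-structured Reports` dataset. This notebook covers the long-context method, one of the various methods being compared: the text of every report is concatenated and passed directly into the prompt of a long-context LLM.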

In [1]:
# Install/upgrade the benchmark tooling and the LLM provider SDKs into the kernel's environment.
# NOTE(review): versions are unpinned (-U), so results may drift between runs — consider pinning.
%pip install --quiet -U langchain langsmith langchain_benchmarks
%pip install --quiet -U anthropic openai

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Dataset

In [2]:
import os
from pathlib import Path

from langchain_benchmarks import clone_public_dataset, registry
from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names

# Task
task = registry["Semi-structured Reports"]

# Files used by the benchmark task
paths = list(get_file_names())
files = [str(p) for p in paths]

### TODO: Replace when dataset is updated
# Until then the PDFs are read from a local directory, which overrides `files`
# from above. Point SEMI_STRUCTURED_REPORTS_DIR at your own copy of the
# reports; the fallback is the location used when this notebook was last run.
data_dir = Path(
    os.environ.get("SEMI_STRUCTURED_REPORTS_DIR")
    or "/mnt/c/Users/taqi_/OneDrive/Desktop/semi_structured_reports/"
)

# Directory listings come back in arbitrary order; sort so the documents are
# always concatenated into the prompt in the same order from run to run.
files = sorted(str(p) for p in data_dir.iterdir() if p.name.endswith(".pdf"))

## Load

In [3]:
from langchain.document_loaders import PyPDFLoader

# Read every PDF with PyPDFLoader and keep only the raw text of each loaded
# page, preserving the order of `files`.
texts = [
    page.page_content
    for pdf_path in files
    for page in PyPDFLoader(pdf_path).load()
]

# Collapse the whole corpus into a single string to be stuffed into the prompt.
# NOTE(review): the separator is inserted between every loaded page, not only
# between distinct PDF files, despite its "New Document" wording — confirm
# this is intended.
text_string = " /// New Document /// ".join(texts)

## Run

In [4]:
from langchain.chat_models import ChatAnthropic, ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough


def create_chain(model, context=None):
    """Build a QA chain that answers a question from one fixed block of context.

    The chain takes the question as its input, fills the prompt template with
    the context and that question, calls ``model``, and parses the model
    output to a plain string.

    Args:
        model: Chat model placed after the prompt in the chain.
        context: Text to insert into the prompt. When ``None`` (the default),
            the notebook-level ``text_string`` is used, looked up each time
            the chain is invoked, which matches the original behaviour.

    Returns:
        The composed runnable chain (question string in, answer string out).
    """
    # Prompt template (whitespace inside the string is part of the prompt)
    template = """Answer the question based only on the following context, which can include text and tables:
    {context}
    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)

    def _resolve_context(_question):
        # The context does not depend on the question; resolve the global
        # lazily so re-running the load cell is picked up by existing chains.
        return text_string if context is None else context

    chain = (
        {
            "context": _resolve_context,
            "question": RunnablePassthrough(),
        }
        | prompt
        | model
        | StrOutputParser()
    )

    return chain


# OAI 128k
# Chain under test: gpt-4-1106-preview with temperature=0.
# ("128k" presumably refers to the model's context window — confirm.)
model = ChatOpenAI(temperature=0, model="gpt-4-1106-preview")
chain_oai_128k = create_chain(model)

# Anthropic 100k
# Disabled alternative; re-enable together with the matching "claude2_100k_v2"
# entry in `chain_map` in the Eval section.
# model = ChatAnthropic(temperature=0, model="claude-2")
# chain_claude = create_chain(model)

## Eval

In [5]:
import uuid
from functools import partial

from langsmith.client import Client
from langchain.smith import RunEvalConfig
from langchain_benchmarks.rag import get_eval_config

# Grade each answer with the "cot_qa" evaluator only.
eval_config = RunEvalConfig(evaluators=["cot_qa"])

def run_eval(chain, eval_run_name, dataset_name="Semi-Structured-Eval"):
    """Run ``chain`` over a LangSmith dataset and return the run results.

    Args:
        chain: Runnable that accepts a question string and produces an answer.
        eval_run_name: LangSmith project name under which this run is logged.
        dataset_name: Name of the LangSmith dataset to evaluate on. Defaults
            to the dataset this notebook was written against.

    Returns:
        The value returned by ``Client.run_on_dataset`` (previously this was
        assigned to a local and dropped, so callers could not inspect it).
    """
    client = Client()
    return client.run_on_dataset(
        ### TODO: Replace with public dataset
        dataset_name=dataset_name,
        # Pull the "question" field out of each example's inputs before
        # handing it to the chain, which expects the bare question.
        llm_or_chain_factory=lambda: (lambda x: x["question"]) | chain,
        evaluation=eval_config,
        verbose=True,
        project_name=eval_run_name,
    )


# Experiments: project-name prefix -> chain to evaluate
chain_map = dict(
    oai_128k=chain_oai_128k,
    # claude2_100k_v2=chain_claude,
)

# A single random suffix is shared by every project created in this execution,
# so project names stay unique across re-runs of the notebook.
run_id = str(uuid.uuid4())
for project_name, chain in chain_map.items():
    run_eval(chain, f"{project_name}_{run_id}")

View the evaluation results for project 'oai_128k_ce4677bd-0103-4315-9133-c50e8080c011' at:
https://smith.langchain.com/o/530c4d06-5640-4c0f-94fe-0be7b769531f/datasets/7f6e64ae-218b-47cc-83a9-1991bd5fbfc8/compare?selectedSessions=aaad6363-6c20-4bbc-8510-41a8ee9599ac

View all tests for Dataset Semi-Structured-Eval at:
https://smith.langchain.com/o/530c4d06-5640-4c0f-94fe-0be7b769531f/datasets/7f6e64ae-218b-47cc-83a9-1991bd5fbfc8
[------------------------------------------------->] 20/20

Unnamed: 0,output,feedback.COT Contextual Accuracy,error,execution_time
count,20,20.0,0.0,20.0
unique,20,,0.0,
top,To calculate the percentage of total revenues ...,,,
freq,1,,,
mean,,0.5,,13.121374
std,,0.512989,,5.090399
min,,0.0,,6.733415
25%,,0.0,,10.459327
50%,,0.5,,11.664865
75%,,1.0,,13.316016
