# Semi-structured eval: Long-context

We will test retrieval of table information from the `Semi-structured Reports` dataset using various methods.

In [None]:
%pip install -U langchain langsmith langchain_benchmarks
%pip install -U anthropic openai

## Dataset

In [1]:
import os

from langchain_benchmarks import registry
from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names

# Task
task = registry["Semi-structured Reports"]

# Files used: start from the file names reported by get_file_names()
paths = list(get_file_names())
files = [str(p) for p in paths]

### TODO: Replace when dataset is updated
# Until then, prefer a local folder of PDFs when one is present. The folder can
# be overridden through the SEMI_STRUCTURED_REPORTS_DIR environment variable;
# the default is the original hard-coded location. If the folder does not
# exist, `files` keeps the registry-provided names instead of raising.
pdf_dir = os.environ.get(
    "SEMI_STRUCTURED_REPORTS_DIR",
    "/Users/rlm/Desktop/Eval_Sets/semi_structured_reports/",
)
if os.path.isdir(pdf_dir):
    # os.listdir() returns entries in arbitrary order; sort so that the
    # document order (and therefore the prompt built from it) is reproducible.
    files = [
        os.path.join(pdf_dir, f)
        for f in sorted(os.listdir(pdf_dir))
        if f.endswith(".pdf")
    ]

## Load

In [2]:
from langchain.document_loaders import PyPDFLoader

# Load every PDF page by page and keep only the extracted text of each page.
texts = [
    page.page_content
    for pdf_path in files
    for page in PyPDFLoader(pdf_path).load()
]

# Merge everything into one long context string. Note that the separator is
# inserted between consecutive *pages*, since `texts` holds one entry per page.
text_string = " /// New Document /// ".join(texts)

## Run

In [3]:
from langchain.chat_models import ChatOpenAI
from langchain.chat_models import ChatAnthropic
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough


def create_chain(model):
    """Build a question-answering chain that stuffs the whole corpus into the prompt.

    The chain takes a question, pairs it with the module-level ``text_string``
    as context, formats both into a chat prompt, sends the prompt to ``model``
    and parses the model output into a plain string.

    Args:
        model: The model placed after the prompt in the pipeline (the notebook
            passes ``ChatOpenAI`` and ``ChatAnthropic`` instances).

    Returns:
        The composed pipeline ``inputs | prompt | model | StrOutputParser()``.
    """
    prompt = ChatPromptTemplate.from_template(
        """Answer the question based only on the following context, which can include text and tables:
    {context}
    Question: {question}
    """
    )

    # "context" ignores the incoming question and always yields the full
    # document text; `text_string` is looked up when the chain runs, not when
    # it is built. "question" forwards the chain input unchanged.
    inputs = {
        "context": lambda _: text_string,
        "question": RunnablePassthrough(),
    }

    return inputs | prompt | model | StrOutputParser()


# OpenAI long-context model (128k), deterministic decoding
model = ChatOpenAI(
    model="gpt-4-1106-preview",
    temperature=0,
)
chain_oai_128k = create_chain(model)

# Anthropic long-context model (100k), deterministic decoding
model = ChatAnthropic(
    model="claude-2",
    temperature=0,
)
chain_claude = create_chain(model)

## Eval

In [7]:
import uuid
from langsmith.client import Client
from langchain.smith import RunEvalConfig

# LangSmith client and evaluator configuration
client = Client()
eval_config = RunEvalConfig(evaluators=["cot_qa"])

# One entry per experiment: project-name suffix -> chain under test
chain_map = {
    "oai_128k": chain_oai_128k,
    "claude2_100k_v2": chain_claude,
}

# Short random prefix so that repeated runs get distinct project names
run_id = uuid.uuid4().hex[:4]

# Evaluate every chain on the same dataset and keep the results per experiment
test_runs = {}
for experiment, experiment_chain in chain_map.items():
    run_kwargs = dict(
        # The dataset name is hard-coded; `task.name` was the disabled alternative.
        dataset_name="Semi-Structured-Eval-v8",
        # Each example is a dict; hand only its "question" field to the chain.
        llm_or_chain_factory=lambda: (lambda x: x["question"]) | experiment_chain,
        evaluation=eval_config,
        verbose=True,
        project_name=f"{run_id}-{experiment}",
        project_metadata={"chain": experiment},
    )
    test_runs[experiment] = client.run_on_dataset(**run_kwargs)

View the evaluation results for project 'f3b7-oai_128k' at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/8cff6883-909b-4014-86e2-7a5445ebdff5/compare?selectedSessions=d26a93a7-42cc-4d48-8204-bf15dcb6d6e7

View all tests for Dataset Semi-Structured-Eval-v8 at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/8cff6883-909b-4014-86e2-7a5445ebdff5
[------------------------------------------------->] 25/25

Unnamed: 0,output,feedback.COT Contextual Accuracy,error,execution_time
count,25,25.0,0.0,25.0
unique,25,,0.0,
top,Datadog's Non-GAAP gross margin for the Nine M...,,,
freq,1,,,
mean,,0.88,,16.04947
std,,0.331662,,7.425928
min,,0.0,,7.389074
25%,,1.0,,9.99655
50%,,1.0,,14.489918
75%,,1.0,,21.660434


View the evaluation results for project 'f3b7-claude2_100k_v2' at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/8cff6883-909b-4014-86e2-7a5445ebdff5/compare?selectedSessions=98b7ad86-0c6b-4395-9c3e-b0cef4735a4f

View all tests for Dataset Semi-Structured-Eval-v8 at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/8cff6883-909b-4014-86e2-7a5445ebdff5
[------------------------------------------------->] 25/25

Unnamed: 0,output,feedback.COT Contextual Accuracy,error,execution_time
count,25,25.0,0.0,25.0
unique,25,,0.0,
top,Based on the information in the Datadog finan...,,,
freq,1,,,
mean,,0.52,,42.703964
std,,0.509902,,9.958002
min,,0.0,,34.072126
25%,,0.0,,37.258732
50%,,1.0,,39.457795
75%,,1.0,,43.017492
