# Semi-structured eval: Long-context

We will test retrieval of table information from the `Semi-structured Reports` dataset using various methods.

In [None]:
# Install or upgrade (-U) the LangChain packages and the Anthropic / OpenAI client SDKs into the kernel's environment
%pip install -U langchain langsmith langchain_benchmarks
%pip install -U anthropic openai

## Dataset

In [1]:
import os
from pathlib import Path

from langchain_benchmarks import clone_public_dataset, registry
from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names

# Task
task = registry["Semi-structured Reports"]

# Files used
paths = list(get_file_names())
files = [str(p) for p in paths]

### TODO: Replace when dataset is updated
# Temporary override: read the PDFs from a local folder instead of the registry paths above.
# The folder lives in `local_reports_dir` (not `dir`) so the builtin `dir()` is not shadowed
# for the rest of the kernel session.
local_reports_dir = "/Users/rlm/Desktop/Eval_Sets/semi_structured_reports/"
# `os.listdir` returns entries in arbitrary order; sorting keeps the order of `files`
# (and therefore the concatenated context built from it) identical on every run.
files = [
    os.path.join(local_reports_dir, f)
    for f in sorted(os.listdir(local_reports_dir))
    if f.endswith(".pdf")
]

## Load

In [8]:
from langchain.document_loaders import PyPDFLoader

# Load every PDF in `files` and flatten the loader results into a single list
loaded_docs = [doc for pdf_path in files for doc in PyPDFLoader(pdf_path).load()]

# Keep only the raw text of each loaded item
texts = [doc.page_content for doc in loaded_docs]

# Concatenate everything into one long context string.
# NOTE(review): the separator is inserted between every loaded item, not only
# between files — confirm that labelling each item "New Document" is intended.
text_string = " /// New Document /// ".join(texts)

## Run

In [22]:
from langchain.chat_models import ChatOpenAI
from langchain.chat_models import ChatAnthropic
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

def create_chain(model):
    """
    Build a question-answering chain that stuffs the full corpus into the prompt.

    Args:
        model: Chat model placed after the prompt in the chain.

    Returns:
        A chain that takes the question as input, fills the prompt with the
        module-level `text_string` as context, calls `model`, and parses the
        result with `StrOutputParser`.
    """
    # The indentation inside this literal is part of the prompt text sent to the model.
    prompt_text = """Answer the question based only on the following context, which can include text and tables:
    {context}
    Question: {question}
    """

    # The context ignores the chain input and reads `text_string` from module
    # scope when the chain is invoked; the question is passed through unchanged.
    prompt_inputs = {
        "context": lambda _: text_string,
        "question": RunnablePassthrough(),
    }

    return (
        prompt_inputs
        | ChatPromptTemplate.from_template(prompt_text)
        | model
        | StrOutputParser()
    )
    
# OAI 128k
model = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)
chain_oai_128k = create_chain(model)

# Anthropic 100k
# NOTE(review): the experiment map in this notebook labels this chain "claude2_100k",
# while the model id requested here is "claude-v1-100k" — confirm which is intended.
model = ChatAnthropic(model="claude-v1-100k", temperature=0)
chain_claude = create_chain(model)

## Eval

In [23]:
import uuid

from functools import partial

from langsmith.client import Client

from langchain_benchmarks.rag import get_eval_config


def run_eval(chain, eval_run_name, concurrency_level=None):
    """
    Run a chain over the "Semi-Structured-Eval" dataset and evaluate the answers.

    Args:
        chain: Chain to evaluate; it is fed the value of each example's
            "question" field.
        eval_run_name: Project name under which the test run is recorded.
        concurrency_level: Optional cap on how many examples are run in
            parallel. When None (the default) the argument is not forwarded,
            so the client's own default applies. Lowering it can help when
            large prompts hit provider rate limits.

    Returns:
        The value returned by `Client.run_on_dataset` for this test run.
    """
    client = Client()

    # Forward the concurrency cap only when the caller set one, so the default
    # call is exactly the same as before this parameter existed.
    extra_kwargs = {}
    if concurrency_level is not None:
        extra_kwargs["concurrency_level"] = concurrency_level

    test_run = client.run_on_dataset(
        ### TODO: Replace with public dataset
        dataset_name="Semi-Structured-Eval",
        # Each example input is a dict; pull out the question before calling the chain.
        llm_or_chain_factory=lambda: (lambda x: x["question"]) | chain,
        evaluation=get_eval_config(),
        verbose=True,
        project_name=eval_run_name,
        **extra_kwargs,
    )
    # Hand the result back instead of discarding it so callers can inspect it.
    return test_run


# Experiments: project-name prefix -> chain to evaluate
chain_map = dict(
    oai_128k=chain_oai_128k,
    claude2_100k=chain_claude,
)

# A single random suffix shared by all projects created in this execution of the cell
run_id = str(uuid.uuid4())
for project_name, chain in chain_map.items():
    run_eval(chain, f"{project_name}_{run_id}")

View the evaluation results for project 'oai_128k_e264e160-6b78-4e90-9d31-e1f8408c1d2f' at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/projects/p/3b461515-facc-4c48-910d-f3987c892976?eval=true

View all tests for Dataset Semi-Structured-Eval at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/95f61109-029d-43a9-ae7d-ec1d53c6f723
[------------------------------------->            ] 15/20

Chain failed for example 8270a2d1-76a7-4ba6-ba85-c9439fa8c9dc with inputs {'question': "What was the year-over-year change in Datadog's net income for the three months that ended September 30, 2023?"}
Error Type: RateLimitError, Message: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4-1106-preview in organization org-beE7dFiWHsRELDAwjnWvVrhA on tokens per min (TPM): Limit 300000, Used 293648, Requested 36132. Please try again in 5.956s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


[--------------------------------------->          ] 16/20

Chain failed for example 4737f962-738c-496b-b4c7-46abdfd36f82 with inputs {'question': 'How much capitalized software expense did Datadog report for the three months that ended September 30, 2023?'}
Error Type: RateLimitError, Message: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4-1106-preview in organization org-beE7dFiWHsRELDAwjnWvVrhA on tokens per min (TPM): Limit 300000, Used 280605, Requested 36131. Please try again in 3.347s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


[-------------------------------------------->     ] 18/20

Chain failed for example 7e0aa415-3f65-4078-afba-73c0d4237f0f with inputs {'question': 'How much did Datadog spend on R&D for the three months ended September 30,2023?'}
Error Type: RateLimitError, Message: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4-1106-preview in organization org-beE7dFiWHsRELDAwjnWvVrhA on tokens per min (TPM): Limit 300000, Used 266424, Requested 36124. Please try again in 509ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


[------------------------------------------------->] 20/20
 Eval quantiles:
                                0.25        0.5       0.75       mean  \
embedding_cosine_distance   0.032063   0.049692   0.066995   0.050126   
faithfulness                0.100000   0.100000   0.100000   0.136364   
score_string:accuracy       0.100000   0.700000   1.000000   0.652941   
execution_time             10.165935  10.165935  10.165935  10.165935   

                                mode  
embedding_cosine_distance   0.007891  
faithfulness                0.100000  
score_string:accuracy       1.000000  
execution_time             10.165935  
View the evaluation results for project 'claude2_100k_e264e160-6b78-4e90-9d31-e1f8408c1d2f' at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/projects/p/d4f62582-d832-4b46-b0cb-cd9d30d9ca62?eval=true

View all tests for Dataset Semi-Structured-Eval at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/95f61109-029d

Chain failed for example e0f62570-d458-4533-b81f-47fa12fe9525 with inputs {'question': 'How many bank failures occurred between 2021 and 2023?'}
Error Type: InternalServerError, Message: Error code: 529 - {'error': {'type': 'overloaded_error', 'message': 'Overloaded'}}


[------------------------------------------------->] 20/20
 Eval quantiles:
                                0.25        0.5       0.75       mean  \
embedding_cosine_distance   0.061142   0.081544   0.095727   0.083903   
faithfulness                0.100000   0.300000   0.750000   0.420000   
score_string:accuracy       0.100000   0.500000   0.800000   0.478947   
execution_time             31.836878  31.836878  31.836878  31.836878   

                                mode  
embedding_cosine_distance   0.034193  
faithfulness                0.100000  
score_string:accuracy       0.100000  
execution_time             31.836878  
