# Semi-structured eval: baseline retriever

We will test retrieval of table information from the `Semi-structured Reports` dataset using various methods.

## Pre-requisites

In [None]:
# Install/upgrade the LangChain, LangSmith and benchmark packages, then Chroma and the OpenAI client
%pip install -U langchain langsmith langchain_benchmarks
%pip install --quiet chromadb openai

## Dataset

In [1]:
import os

from langchain_benchmarks import registry
from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names

# Benchmark task looked up in the registry by name
task = registry["Semi-structured Reports"]

# File names reported by the task module, converted to plain strings
paths = list(get_file_names())
files = list(map(str, paths))

### TODO: Replace when dataset is updated
# Temporary override: `files` is replaced by the PDF names found in a local directory.
# NOTE: `dir` shadows the builtin; the name is kept because later cells reference it.
dir = "/Users/rlm/Desktop/Eval_Sets/semi_structured_reports/"
files = [name for name in os.listdir(dir) if name.endswith(".pdf")]

## Load

In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


def load_and_split(file, token_count, chunk_overlap=50):
    """
    Load a PDF and split its pages into token-bounded text chunks.

    Args:
        file: Path of the PDF, passed to PyPDFLoader.
        token_count: Chunk size handed to the tiktoken-based splitter.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 50,
            the value that was previously hard-coded.

    Returns:
        A list of strings: the page_content of every chunk.
    """

    pdf_pages = PyPDFLoader(file).load()

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=token_count, chunk_overlap=chunk_overlap
    )

    # Keep only the text of each chunk; document metadata is not returned
    texts = [d.page_content for d in text_splitter.split_documents(pdf_pages)]
    print(f"There are {len(texts)} text elements")
    return texts


# Split every PDF into chunks of `token_count` tokens and pool them into one list
texts = []
token_count = 1000
for fi in files:
    # os.path.join works whether or not `dir` ends with a path separator
    texts.extend(load_and_split(os.path.join(dir, fi), token_count))

There are 5 text elements
There are 14 text elements
There are 5 text elements
There are 23 text elements
There are 13 text elements
There are 13 text elements


## Index

In [3]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Embed every chunk with OpenAIEmbeddings and store the result in a named Chroma collection
vectorstore_baseline = Chroma.from_texts(
    collection_name="baseline-1000-token",
    embedding=OpenAIEmbeddings(),
    texts=texts,
)

# Retriever view of the vector store, consumed by the RAG chain
retriever_baseline = vectorstore_baseline.as_retriever()

## RAG

In [4]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough


def rag_chain(retriever):
    """
    Assemble a question-answering chain around the given retriever.

    The returned chain pipes its input through `retriever`, joins the
    page_content of the retrieved documents into the prompt context, passes
    the same input through unchanged as the question, sends the prompt to
    the chat model and parses the reply with StrOutputParser.
    """

    # Prompt: answer strictly from the retrieved context
    prompt = ChatPromptTemplate.from_template(
        """Answer the question based only on the following context, which can include text and tables:
    {context}
    Question: {question}
    """
    )

    # GPT-4 chat model at temperature 0
    llm = ChatOpenAI(temperature=0, model="gpt-4")

    # Both prompt variables are derived from the same chain input
    prompt_inputs = {
        "context": retriever | (lambda docs: "\n\n".join(d.page_content for d in docs)),
        "question": RunnablePassthrough(),
    }

    return prompt_inputs | prompt | llm | StrOutputParser()


# Create RAG chain
# Built on the baseline retriever; the evaluation cell registers `chain` in its experiment map
chain = rag_chain(retriever_baseline)

## Eval

In [5]:
import uuid
from langsmith.client import Client
from langchain.smith import RunEvalConfig

# Config
client = Client()
eval_config = RunEvalConfig(
    evaluators=["cot_qa"],
)

# Experiments: project-name suffix -> chain to evaluate
chain_map = {
    "baseline-1000-tok": chain,
    # "baseline-2000-tok": chain,
}


# Run evaluation
# Short random prefix keeps project names distinct across repeated runs
run_id = uuid.uuid4().hex[:4]
test_runs = {}
# The loop variable is deliberately not named `chain`: rebinding that notebook-level
# name would make a re-run of this cell build `chain_map` from whichever chain was
# iterated last.
for project_name, experiment_chain in chain_map.items():
    test_runs[project_name] = client.run_on_dataset(
        # dataset_name=task.name,
        dataset_name="Semi-Structured-Eval-v8",
        # Factory result: pick the "question" key from the input, then run the chain on it
        llm_or_chain_factory=lambda: (lambda x: x["question"]) | experiment_chain,
        evaluation=eval_config,
        verbose=True,
        project_name=f"{run_id}-{project_name}",
        project_metadata={"chain": project_name},
    )

View the evaluation results for project '6d3c-baseline-1000-tok' at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/8cff6883-909b-4014-86e2-7a5445ebdff5/compare?selectedSessions=6a5183de-6ae9-4cca-b2ee-8c9520416820

View all tests for Dataset Semi-Structured-Eval-v8 at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/8cff6883-909b-4014-86e2-7a5445ebdff5
[------------------------------------------------->] 25/25

Unnamed: 0,output,feedback.COT Contextual Accuracy,error,execution_time
count,25,25.0,0.0,25.0
unique,25,,0.0,
top,Datadog's Non-GAAP gross margin for the Nine M...,,,
freq,1,,,
mean,,0.76,,9.010377
std,,0.43589,,3.432551
min,,0.0,,5.954703
25%,,1.0,,6.80699
50%,,1.0,,8.011278
75%,,1.0,,9.132252


## TEMP: ITERATE ON THE DATASET

In [6]:
### TODO: Replace with public dataset

import uuid

import pandas as pd
from langsmith import Client

# Load the question/answer pairs from the local CSV
df = pd.read_csv(dir + "semi_structured_reports.csv")

# Create the LangSmith dataset that will hold the examples
client = Client()
dataset_name = "Semi-Structured-Eval-v9"
dataset = client.create_dataset(dataset_name=dataset_name)

# Upload one example per CSV row: "Question" becomes the input, "Answer" the expected output
for _, row in df.iterrows():
    client.create_example(
        dataset_id=dataset.id,
        inputs={"question": row["Question"]},
        outputs={"answer": row["Answer"]},
    )