In [1]:
### Env
import getpass
import os

os.environ["LANGCHAIN_TRACING_V2"] = "true"  # enables tracing
os.environ["LANGCHAIN_PROJECT"] = "RAG-feedback-and-few-shot"

# Never store a real API key in the notebook: reuse a key that is already
# exported in the environment, otherwise prompt for it without echoing.
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LANGCHAIN_API_KEY: ")

### Creating a RAG bot

In [2]:
### Index

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Blog posts that form the knowledge base for the RAG bot.
urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

# Each loader call yields a list of documents, so flatten the list of lists.
docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

# Split the pages into non-overlapping chunks using the tiktoken-based splitter.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250, chunk_overlap=0
)
doc_splits = text_splitter.split_documents(docs_list)

# Add to vectorDB
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma",
    embedding=OpenAIEmbeddings(),
)
# The number of documents to return must be passed through `search_kwargs`;
# a bare `k=...` keyword on `as_retriever` does not configure the search.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
### RAG bot

import openai
from langsmith import traceable
from langsmith.wrappers import wrap_openai

class RagBot:
    """Small retrieve-then-answer bot whose steps are traced individually."""

    def __init__(self, retriever, model: str = "gpt-4o"):
        """Keep the retriever and model name, and create a wrapped OpenAI client."""
        self._retriever = retriever
        # Wrapping the client instruments the LLM
        self._client = wrap_openai(openai.Client())
        self._model = model

    @traceable()
    def retrieve_docs(self, question):
        """Return the retriever's result for ``question``."""
        retrieved = self._retriever.invoke(question)
        return retrieved

    @traceable()
    def invoke_llm(self, question, docs):
        """Answer ``question`` with ``docs`` interpolated into the system prompt.

        Returns a dict with the model's reply under ``"answer"`` and the
        stringified ``page_content`` of every doc under ``"contexts"``.
        """
        system_prompt = (
            "You are a helpful AI assistant for question answering."
            " Use the following docs to answer the user question.\n\n"
            f"## Docs\n\n{docs}"
        )
        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
        ]
        completion = self._client.chat.completions.create(
            model=self._model,
            messages=chat_messages,
        )

        answer_text = completion.choices[0].message.content
        context_texts = []
        for doc in docs:
            context_texts.append(str(doc.page_content))

        # Evaluators will expect "answer" and "contexts"
        return {"answer": answer_text, "contexts": context_texts}

    @traceable()
    def get_answer(self, question: str):
        """Retrieve docs for ``question`` and pass both on to the LLM step."""
        retrieved_docs = self.retrieve_docs(question)
        return self.invoke_llm(question, retrieved_docs)

# Build the bot on top of the retriever and run a single question through it.
rag_bot = RagBot(retriever)
response = rag_bot.get_answer("How does ReAct agent work?")
# Display only the first 150 characters of the answer.
response["answer"][:150]

'The ReAct agent, developed by Yao et al. (2023), integrates reasoning and acting within a large language model (LLM) by extending the action space to '

### Setting up an online evaluator

Now, we see traces logged to our project.

We can add an evaluator to our project.

Let's do document grading, which is a very useful check! 

https://github.com/langchain-ai/langgraph/blob/main/examples/rag/langgraph_crag.ipynb

Grade documents:

https://docs.smith.langchain.com/tutorials/Developers/rag#evaluator

In [None]:
# Send a batch of questions through the bot, one call per entry.
# Note: the Generative Agents question is listed twice, so it is asked twice.
# `response` ends up holding the result of the last call.
trace_questions = [
    "How does ReAct agent work?",
    "What is the difference between ReAct and Reflexion approaches for self-reflection?",
    "What is the Memory and Retrieval model in Generative Agents simulation?",
    "What are the types of LLM memory?",
    "What is the Memory and Retrieval model in Generative Agents simulation?",
]
for trace_question in trace_questions:
    response = rag_bot.get_answer(trace_question)

### Attaching evaluators to a dataset

Now, let's build a dataset.

In [28]:
from langsmith import Client

client = Client()

# Create a dataset
# Every entry is a (question, reference answer) pair.
examples = [
    (
        "How does ReAct agent work? ",
        "ReAct integrates reasoning and acting, performing actions - such tools like Wikipedia search API - and then observing / reasoning about the tool outputs.",
    ),
    (
        "What are the types of biases that can arise with few-shot prompting?",
        "The biases that can arise with few-shot prompting include (1) Majority label bias, (2) Recency bias, and (3) Common token bias.",
    ),
    (
        "What are five types of adversarial attacks?",
        "Five types of adversarial attacks are (1) Token manipulation, (2) Gradient based attack, (3) Jailbreak prompting, (4) Human red-teaming, (5) Model red-teaming.",
    ),
    (
        "What is the difference between ReAct and Reflexion approaches for self-reflection?",
        "Reflexion extends ReAct with a standard RL setup: it computed a reward and passes this to a heuristic function, which determines when the trajectory is inefficient or contains hallucination and should be stopped. It reflects on this output and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.",
    ),
    (
        "What are the mappings of human memory into LLMs?",
        "Sensory memory is learning embedding representations for raw inputs. Short-term memory is in-context learning. It is short and finite, as it is restricted by the finite context window length of Transformer. Long-term memory is the external vector store that the agent can attend to at query time, accessible via fast retrieval.",
    ),
]

# Save it
# The dataset and its examples are only created when no dataset with this
# name exists yet, so re-running the cell does not add the examples again.
dataset_name = "Agent QA"
if not client.has_dataset(dataset_name=dataset_name):
    dataset = client.create_dataset(dataset_name=dataset_name)
    inputs = tuple({"question": text} for text, _ in examples)
    outputs = tuple({"answer": label} for _, label in examples)
    client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)

Here's a function to call our RAG bot.

In [29]:
def predict_rag_answer(example: dict):
    """Run the RAG bot on ``example["question"]`` for answer evaluation.

    The bot's output keys are renamed for the evaluators: the answer is
    returned under ``"response"`` and the contexts under ``"documents"``.
    """
    bot_output = rag_bot.get_answer(example["question"])
    answer_text = bot_output["answer"]
    context_texts = bot_output["contexts"]
    return {"response": answer_text, "documents": context_texts}

Here's an evaluator function for answers relative to our references.

In [30]:
from langchain import hub
from langchain_openai import ChatOpenAI

# Grade prompt
# Pulled from the LangChain Hub once at cell execution; answer_evaluator
# pipes it into the grader LLM.
grade_prompt_answer_accuracy = hub.pull("langchain-ai/rag-answer-vs-reference")

def answer_evaluator(run, example) -> dict:
    """
    Grade a RAG answer against the reference answer using an LLM grader.

    The question and reference answer are read from ``example``; the prediction
    is read from ``run.outputs["response"]``, the key set in predict_rag_answer.
    Returns ``{"key": "answer_v_reference_score", "score": <grader "Score">}``.
    """
    # Collect the question, the ground-truth reference and the RAG prediction
    grader_inputs = {
        "question": example.inputs["question"],
        "correct_answer": example.outputs["answer"],
        "student_answer": run.outputs["response"],
    }

    # Define an LLM grader
    grader_llm = ChatOpenAI(model="gpt-4o", temperature=0)
    grading_chain = grade_prompt_answer_accuracy | grader_llm

    # Run evaluator
    grade = grading_chain.invoke(grader_inputs)
    return {"key": "answer_v_reference_score", "score": grade["Score"]}

In [33]:
from langsmith.evaluation import evaluate
# Label used only to build the experiment name below.
model_tested = "gpt-4o"
experiment_prefix = f"RAG-agentQA-{model_tested}"
# Run predict_rag_answer over the dataset named by `dataset_name` and score
# each result with answer_evaluator.
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[answer_evaluator],
    experiment_prefix=experiment_prefix + "-answer",
    num_repetitions=1,
)

View the evaluation results for experiment: 'RAG-agentQA-gpt-4o-answer-45f862b3' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08941386-706a-4a9f-9567-b1f1d7242101/compare?selectedSessions=3b412956-df7c-461c-b2df-d0ed04c57ddc




0it [00:00, ?it/s]