## LangSmith - Trazas, Pruebas y Evaluaciones

### Crea trazas con LangSmith

In [1]:
import os
from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv())

# Settings the rest of the notebook reads: LangSmith credentials/tracing
# configuration and the OpenAI key.
REQUIRED_ENV_VARS = (
    "LANGCHAIN_API_KEY",
    "LANGCHAIN_TRACING_V2",
    "LANGCHAIN_ENDPOINT",
    "OPENAI_API_KEY",
)

# Copying os.getenv(name) back into os.environ[name] changes nothing when the
# variable exists and fails with "TypeError: str expected, not NoneType" when
# it does not. Check explicitly so the error names what is missing.
missing_env_vars = [name for name in REQUIRED_ENV_VARS if os.getenv(name) is None]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variables: "
        + ", ".join(missing_env_vars)
        + ". Define them in your .env file or shell before running this notebook."
    )


In [2]:
from langsmith import Client

# LangSmith API client built with no explicit arguments; presumably it reads
# the LANGCHAIN_* environment variables configured earlier — confirm in the
# langsmith Client docs.
client = Client()

In [3]:
import uuid

# A random UUID suffix gives every notebook session its own project name.
uid = uuid.uuid4()
PROJECT_NAME = f"Lil Demo-{uid}"

# Register the project through the LangSmith client.
session = client.create_project(
    description="Demo for starting to use the Langsmith API",
    project_name=PROJECT_NAME,
)


In [50]:
# Export the project name created above; presumably LangChain/LangSmith log
# subsequent traced runs under this project — confirm.
os.environ["LANGCHAIN_PROJECT"] = PROJECT_NAME

In [2]:
from langchain_openai import ChatOpenAI

# Smoke test: build a chat model with default settings and send one message.
# The returned message is displayed as the cell output.
llm = ChatOpenAI()
llm.invoke("Hey!")

AIMessage(content='Hello! How can I help you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 9, 'total_tokens': 18, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-0fef67ca-8cdf-40c9-8f67-8dffa7e7bd86-0', usage_metadata={'input_tokens': 9, 'output_tokens': 9, 'total_tokens': 18, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 0}})

### Prompt Engineering

In [4]:
# System prompt for a meeting-summarization agent: lists the fields the
# summary must contain (date, participants, summary, action_points,
# transcript_body).
# NOTE(review): the text asks for JSON but the field list is not wrapped in
# braces — possibly on purpose to avoid clashing with prompt-template
# placeholders; confirm before changing.
system_prompt = '''You are a meeting summarization agent. Your task is to analyze a meeting transcript and create a structured summary in JSON format with the following fields. Ensure that your response is always formatted as follows:

  "date": "Date of the meeting",
  "participants": ["Participant 1", "Participant 2", "Participant 3", ...],
  "summary": "A concise summary of the key discussion points and outcomes of the meeting.",
  "action_points": ["Action Point 1", "Action Point 2", "Action Point 3", ...],
  "transcript_body": "The full content of the meeting transcript."

Make sure to extract names, key points, and relevant actions clearly and consistently.
'''

In [5]:
# Sample meeting transcript (date, participants, dialogue) to pair with
# `system_prompt`. Every line after the first keeps its leading indentation
# because it is part of the string literal.
human_prompt = '''Date: November 4, 2024
        Participants: Alice, Bob, Charlie

        Transcript:

        Alice: Good morning, everyone. Let’s start with the project update. Bob, could you share the current status?

        Bob: Sure, Alice. The development team has completed the initial phase, and we’re now moving into testing. We anticipate some minor bugs, but overall, things are on track to meet the deadline.

        Charlie: That’s good news. How about the client presentation preparation?

        Alice: Yes, we should start preparing for that. Charlie, could you put together a draft of the slides by Thursday?

        Charlie: Absolutely. I’ll have it ready for review.

        Bob: Also, I wanted to mention that we need to allocate more time for user feedback analysis. Should I schedule a separate session for that next week?

        Alice: Yes, please do. It’s important to get thorough feedback to refine the final product.'''

In [6]:
from langchain import hub
# Pull the "rlm/rag-prompt" prompt from the LangChain Hub.
# NOTE(review): later cells rebind `prompt` through chained assignments, so
# this value does not survive a full run of the notebook.
prompt = hub.pull("rlm/rag-prompt")

### LangSmith - Evaluate RAG

In [13]:
# Name the evaluation cells below pass to `evaluate(data=...)`.
dataset_name = "Test"

# Clone the public dataset into this workspace under that same name.
# Previously the clone kept the public dataset's own name while the
# evaluations looked up "Test", so the two were not guaranteed to refer to
# the same dataset.
# NOTE(review): if a dataset called "Test" already exists, the client is
# expected to return it instead of cloning again — confirm against the
# installed langsmith version.
client = Client()
dataset = client.clone_public_dataset(
    "https://smith.langchain.com/public/d7e3a510-a07a-42a9-872a-e2cf189f2929/d",
    dataset_name=dataset_name,
)

In [27]:
from bs4 import BeautifulSoup as Soup
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader

# Read the office-procedures text file into LangChain documents.
loader = TextLoader("../data/office_procedures.txt")
docs = loader.load()

# Break the documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=4500,
)
splits = text_splitter.split_documents(docs)

# Embed each chunk with OpenAI embeddings and store the vectors in Chroma.
vectorstore = Chroma.from_documents(
    embedding=OpenAIEmbeddings(),
    documents=splits,
)

# Retriever view over the vector store, consumed by the RAG bot.
retriever = vectorstore.as_retriever()

In [29]:
import openai
from langsmith import traceable
from langsmith.wrappers import wrap_openai

class RagBot:
    """Minimal retrieval-augmented question-answering bot.

    Each public method is decorated with ``@traceable()``. The hallucination
    grader in this notebook looks runs up by the names ``get_answer`` and
    ``retrieve_docs``, so renaming those methods would break it.
    """

    def __init__(self, retriever, model: str = "gpt-4o"):
        """Store the retriever and build the OpenAI client.

        Args:
            retriever: Object exposing ``invoke(question)`` that returns the
                documents relevant to a question.
            model: Name of the OpenAI chat model used to generate answers.
        """
        self._retriever = retriever
        # The OpenAI client is passed through langsmith's wrap_openai;
        # presumably so completions are recorded as LangSmith runs — confirm.
        self._client = wrap_openai(openai.Client())
        self._model = model

    @traceable()
    def retrieve_docs(self, question):
        """Return the documents the retriever yields for ``question``."""
        return self._retriever.invoke(question)

    @traceable()
    def invoke_llm(self, question, docs):
        """Answer ``question`` using only ``docs`` as context.

        Args:
            question: The user question, sent as the user message.
            docs: Retrieved documents, interpolated into the system prompt.

        Returns:
            dict with ``"answer"`` (the model's message content) and
            ``"contexts"`` (each document rendered with ``str``).
        """
        response = self._client.chat.completions.create(
            model=self._model,
            messages=[
                {
                    "role": "system",
                    # The earlier wording asked for a "concise code solution",
                    # which does not fit questions about office procedures.
                    "content": "You are a helpful AI agent with expertise in this office's procedures."
                    " Use ONLY the following docs to produce a concise answer to the user question.\n\n"
                    f"## Docs\n\n{docs}",
                },
                {"role": "user", "content": question},
            ],
        )
        return {
            "answer": response.choices[0].message.content,
            "contexts": [str(doc) for doc in docs],
        }

    @traceable()
    def get_answer(self, question: str):
        """Run the full pipeline: retrieve documents, then ask the model.

        Returns:
            The dict produced by :meth:`invoke_llm`.
        """
        docs = self.retrieve_docs(question)
        return self.invoke_llm(question, docs)

# Instantiate the bot on top of the retriever built earlier.
rag_bot = RagBot(retriever)

In [32]:
# Ask one question about the indexed document and print only the answer text.
response = rag_bot.get_answer(
    "De acuerdo con el contexto, ¿cuáles son las prácticas para gestionar reuniones de equipo?"
)
print(response["answer"])

Para gestionar reuniones de equipo de manera efectiva, se recomienda:

1. Enviar una agenda clara con antelación.
2. Establecer objetivos específicos para la reunión.
3. Asignar un moderador para garantizar que la discusión fluya de manera ordenada.
4. Evitar interrupciones durante la reunión.
5. Limitar el uso de dispositivos electrónicos a lo necesario y evitar distracciones.


### Evaluación

In [None]:
def predict_rag_answer(example: dict):
    """Evaluation target: answer a dataset example's question with the RAG bot.

    Args:
        example: Dataset inputs; the question is read from "input_question".

    Returns:
        dict holding only the generated "answer".
    """
    question = example["input_question"]
    rag_output = rag_bot.get_answer(question)
    return dict(answer=rag_output["answer"])

In [None]:
def predict_rag_answer_with_context(example: dict):
    """Evaluation target that also returns the retrieved contexts.

    Intended for evaluators that inspect retrieved documents and
    hallucinations.

    Args:
        example: Dataset inputs; the question is read from "input_question".

    Returns:
        dict with the generated "answer" and the "contexts" it was based on.
    """
    rag_output = rag_bot.get_answer(example["input_question"])
    return {field: rag_output[field] for field in ("answer", "contexts")}

### Response vs Reference

In [None]:
from langchain import hub
from langchain_openai import ChatOpenAI

# Hub prompt used to grade the generated answer against the reference answer.
# NOTE(review): the chained assignment also rebinds the notebook-level
# `prompt` name.
grade_prompt_answer_accuracy = prompt = hub.pull("langchain-ai/rag-answer-vs-reference")

def answer_evaluator(run, example) -> dict:
    """Score the RAG answer against the dataset's reference answer.

    Args:
        run: Evaluated run; the prediction is read from ``run.outputs["answer"]``.
        example: Dataset example providing ``inputs["input_question"]`` and
            ``outputs["output_answer"]``.

    Returns:
        dict with the metric "key" and the "score" taken from the grader's
        "Score" field.
    """
    # Collect the question, the ground-truth answer and the model's answer.
    question = example.inputs["input_question"]
    expected_answer = example.outputs["output_answer"]
    generated_answer = run.outputs["answer"]

    # Judge model piped after the grading prompt.
    judge = ChatOpenAI(model="gpt-4-turbo", temperature=0)
    grading_chain = grade_prompt_answer_accuracy | judge

    # Run the grader and keep only its score.
    verdict = grading_chain.invoke(
        dict(
            question=question,
            correct_answer=expected_answer,
            student_answer=generated_answer,
        )
    )
    return dict(key="answer_v_reference_score", score=verdict["Score"])

In [34]:
from langsmith.evaluation import evaluate

# Run `predict_rag_answer` over the dataset named by `dataset_name` and score
# each result with `answer_evaluator` (answer vs. reference).
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[answer_evaluator],
    experiment_prefix="rag-answer-v-reference",
    metadata={"version": "TEST-questions, gpt-4o"},
)

View the evaluation results for experiment: 'rag-answer-v-reference-38cb025e' at:
https://smith.langchain.com/o/896f324f-7bc4-5a0a-8ffc-a3ab7bd5a36e/datasets/6914005e-d3d8-466c-9c22-5b6585ee1741/compare?selectedSessions=0f714644-c8ff-4b21-adcd-392c012afe54




0it [00:00, ?it/s]

### Response vs Input

In [36]:
# Hub prompt used to grade how helpful the answer is for the question.
# NOTE(review): the chained assignment also rebinds the notebook-level
# `prompt` name.
grade_prompt_answer_helpfulness = prompt = hub.pull("langchain-ai/rag-answer-helpfulness")

def answer_helpfulness_evaluator(run, example) -> dict:
    """Score how helpful the RAG answer is with respect to the question.

    Args:
        run: Evaluated run; the prediction is read from ``run.outputs["answer"]``.
        example: Dataset example providing ``inputs["input_question"]``.

    Returns:
        dict with the metric "key" and the "score" taken from the grader's
        "Score" field.
    """
    # Collect the question and the model's answer.
    question = example.inputs["input_question"]
    generated_answer = run.outputs["answer"]

    # Judge model piped after the helpfulness prompt.
    judge = ChatOpenAI(model="gpt-4-turbo", temperature=0)
    grading_chain = grade_prompt_answer_helpfulness | judge

    # Run the grader and keep only its score.
    verdict = grading_chain.invoke(
        dict(question=question, student_answer=generated_answer)
    )
    return dict(key="answer_helpfulness_score", score=verdict["Score"])

In [41]:
# Run `predict_rag_answer` over the dataset named by `dataset_name` and score
# each result with `answer_helpfulness_evaluator` (answer vs. question).
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[answer_helpfulness_evaluator],
    experiment_prefix="rag-answer-helpfulness",
    metadata={"version": "TEST-questions, gpt-4o"},
)

View the evaluation results for experiment: 'rag-answer-helpfulness-8fc67e2c' at:
https://smith.langchain.com/o/896f324f-7bc4-5a0a-8ffc-a3ab7bd5a36e/datasets/6914005e-d3d8-466c-9c22-5b6585ee1741/compare?selectedSessions=35d2ae10-8f27-4bb8-bf22-08dbbcf411b4




5it [00:27,  5.55s/it]


In [None]:
# Run and Example appear in the grader signatures below, so they must be
# importable before the functions are defined.
from langsmith.schemas import Example, Run

# Grading prompts from the LangChain Hub. Neither was defined anywhere in the
# notebook, which made this cell fail with NameError.
grade_prompt_hallucinations = hub.pull("langchain-ai/rag-answer-hallucination")
grade_prompt_doc_relevance = hub.pull("langchain-ai/rag-document-relevance")


def _find_child_run(parent_run: Run, name: str) -> Run:
    """Return the direct child of ``parent_run`` called ``name``.

    Raises:
        ValueError: if no such child exists (instead of a bare StopIteration).
    """
    for child in parent_run.child_runs or []:
        if child.name == name:
            return child
    raise ValueError(f"No child run named {name!r} under run {parent_run.name!r}")


def _rag_runs(root_run: Run):
    """Locate the ``get_answer`` run and its nested ``retrieve_docs`` run."""
    rag_pipeline_run = _find_child_run(root_run, "get_answer")
    retrieve_run = _find_child_run(rag_pipeline_run, "retrieve_docs")
    return rag_pipeline_run, retrieve_run


def _retrieved_contexts(retrieve_run: Run) -> str:
    """Join the page content of every retrieved document with blank lines."""
    return "\n\n".join(doc.page_content for doc in retrieve_run.outputs["output"])


def document_relevance_grader(root_run: Run, example: Example) -> dict:
    """
    A simple evaluator that checks whether the retrieved documents are
    relevant to the question.

    Args:
        root_run: Root run of the evaluated target; must contain a
            ``get_answer`` child with a ``retrieve_docs`` child.
        example: Dataset example providing ``inputs["input_question"]``.

    Returns:
        dict with the metric "key" and the "score" from the grader's "Score".
    """
    _, retrieve_run = _rag_runs(root_run)
    contexts = _retrieved_contexts(retrieve_run)
    input_question = example.inputs["input_question"]

    # LLM grader
    llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)

    # Structured prompt
    relevance_grader = grade_prompt_doc_relevance | llm

    # Get score
    score = relevance_grader.invoke({"question": input_question,
                                     "documents": contexts})
    score = score["Score"]

    return {"key": "document_relevance", "score": score}


def answer_hallucination_grader(root_run: Run, example: Example) -> dict:
    """
    A simple evaluator that checks to see the answer is grounded in the documents

    Args:
        root_run: Root run of the evaluated target; must contain a
            ``get_answer`` child with a ``retrieve_docs`` child.
        example: Dataset example (unused; part of the evaluator signature).

    Returns:
        dict with the metric "key" and the "score" from the grader's "Score".
    """

    # RAG input
    rag_pipeline_run, retrieve_run = _rag_runs(root_run)
    contexts = _retrieved_contexts(retrieve_run)

    # RAG output
    prediction = rag_pipeline_run.outputs["answer"]

    # LLM grader
    llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)

    # Structured prompt
    answer_grader = grade_prompt_hallucinations | llm

    # Get score
    score = answer_grader.invoke({"documents": contexts,
                                  "student_answer": prediction})
    score = score["Score"]

    return {"key": "answer_hallucination", "score": score}

# The metadata label now matches the other experiments in this notebook; the
# previous value named a model ("gpt-4-0125-preview") that is not used here.
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[document_relevance_grader, answer_hallucination_grader],
    metadata={"version": "TEST-questions, gpt-4o"},
)