### 1. Libraries

In [1]:
# Enable asyncio in Jupyter Notebook
import nest_asyncio
nest_asyncio.apply()

In [2]:
import numpy as np
from datasets import load_dataset
from langchain.evaluation import CotQAEvalChain, load_evaluator
from langchain.prompts import ChatPromptTemplate
from langchain.smith import RunEvalConfig, arun_on_dataset
from langchain_anthropic.chat_models import ChatAnthropic
from langchain_core.output_parsers import StrOutputParser
from langchain_openai.chat_models import ChatOpenAI
from langsmith import Client

from src.settings import settings

In [3]:
np.random.seed(1399)
langsmith_client = Client()

### 2. Load models to evaluate

In [4]:
# Define generation arguments for all the models
generation_kwargs = {
    "temperature": 0,
    "max_tokens": 500
}

In [5]:
# VLLM endpoint for the RAFT fine tuned model
endpoint = "https://jjovalle99--raft-starling7b-ft-serve-model.modal.run/v1"
raft_starling_7b = ChatOpenAI(
    model="jjovalle99/starling-7b-raft-ft",
    openai_api_key="EMPTY",
    openai_api_base=endpoint,
    **generation_kwargs
)

In [6]:
# Comparison models
openai_llm = ChatOpenAI(model="gpt-3.5-turbo-0125", api_key=settings.env.OPENAI_API_KEY, **generation_kwargs)
anthropic_llm = ChatAnthropic(model="claude-3-haiku-20240307", anthropic_api_key=settings.env.ANTHROPIC_API_KEY, **generation_kwargs)

### 3. Prepare dataset

In [7]:
# Loading created dataset
dataset = load_dataset("jjovalle99/raft-dataset-aws-wellarchitected", split="train")
# Sampling 30 examples
sampled_dataset = dataset.shuffle(seed=1399).select(range(30))

In [8]:
# Prepare context
sampled_dataset = sampled_dataset.map(lambda x: {"context": "\n".join([f"<DOCUMENT>{str(doc)}</DOCUMENT>" for doc in x["context"]["sentences"][0]])})

In [9]:
# Create dataset in LangSmith
dataset_name = "RAFT"
dataset = langsmith_client.create_dataset(
    dataset_name=dataset_name,
    description="This dataset is used for evaluating the RAFT model.",
)

In [10]:
# Populate dataset
instruction = "Carefully read and analyze the provided documents to answer the question that follows. " \
"Provide a detailed, step-by-step explanation of your reasoning, demonstrating how you arrived at your" \
" conclusion based on the information given in the documents.\n\n"

dataset_inputs = [
    {
        "context": f"{instruction}{example['context']}",
        "question": example["question"],
        "answer": example["cot_answer"]
    } 
    for example in sampled_dataset
]
dataset_outputs = [{"answer": example["cot_answer"]} for example in sampled_dataset]

langsmith_client.create_examples(
    inputs=dataset_inputs,
    outputs=dataset_outputs,
    dataset_id=dataset.id,
)

![img](assets/dataset-langsmith2.png)

## 4. Evaluation With Context

In [11]:
# Evaluator llm
eval_llm = ChatOpenAI(model="gpt-4-turbo-preview", api_key=settings.env.OPENAI_API_KEY)

In [12]:
# Define chain
prompt = ChatPromptTemplate.from_messages(
    messages=[("user", f"{instruction}{{context}}\n{{question}}")]
)

output_parser = StrOutputParser()

def create_chain_by_model(model):
    return (
        prompt 
        | model 
        | output_parser
    )

### 4.1 How are we evaluating?

In [13]:
_example = sampled_dataset[0]
_output = create_chain_by_model(model=raft_starling_7b).invoke({
    "context": _example["context"],
    "question": _example["question"]
})

#### 4.1.1 How is Coherence evaluated?

In [14]:
evaluator = load_evaluator("criteria", criteria="coherence")
print(type(evaluator))
print(evaluator.prompt.invoke({"input": "This is a mock input", "output": "This is a mock output"}).text)

<class 'langchain.evaluation.criteria.eval_chain.CriteriaEvalChain'>
You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
[BEGIN DATA]
***
[Input]: This is a mock input
***
[Submission]: This is a mock output
***
[Criteria]: coherence: Is the submission coherent, well-structured, and organized?
***
[END DATA]
Does the submission meet the Criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter again by itself on a new line.


In [15]:
evaluator.invoke({"input": _example["context"], "output": _output})

{'input': "<DOCUMENT>They allow communication between instances in your VPC and services without imposing availability risks or bandwidth constraints on your network traffic. ## Common anti-patterns:\n\nFoundations - 415\n---\n## AWS Well-Architected Framework\n\nFramework\n\n- Having only one connectivity provider between your on-site network and AWS. - Consuming the connectivity capabilities of your AWS Direct Connect connection, but only having one connection. - Having only one path for your VPN connectivity. Benefits of establishing this best practice: By implementing redundant connectivity between your cloud environment and your corporate or on-premises environment, you can ensure that the dependent services between the two environments can communicate reliably. Level of risk exposed if this best practice is not established: High\n\n### Implementation guidance\n\n- Ensure that you have highly available connectivity between AWS and on-premises environment. Use multiple AWS Direct C

#### 4.1.2 How is Helpfulness evaluated?

In [16]:
evaluator = load_evaluator("criteria", criteria="helpfulness")
print(type(evaluator))
print(evaluator.prompt.invoke({"input": "This is a mock input", "output": "This is a mock output"}).text)

<class 'langchain.evaluation.criteria.eval_chain.CriteriaEvalChain'>
You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
[BEGIN DATA]
***
[Input]: This is a mock input
***
[Submission]: This is a mock output
***
[Criteria]: helpfulness: Is the submission helpful, insightful, and appropriate? If so, respond Y. If not, respond N.
***
[END DATA]
Does the submission meet the Criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter again by itself on a new line.


In [17]:
evaluator.invoke({"input": _example["context"], "output": _output})

{'input': "<DOCUMENT>They allow communication between instances in your VPC and services without imposing availability risks or bandwidth constraints on your network traffic. ## Common anti-patterns:\n\nFoundations - 415\n---\n## AWS Well-Architected Framework\n\nFramework\n\n- Having only one connectivity provider between your on-site network and AWS. - Consuming the connectivity capabilities of your AWS Direct Connect connection, but only having one connection. - Having only one path for your VPN connectivity. Benefits of establishing this best practice: By implementing redundant connectivity between your cloud environment and your corporate or on-premises environment, you can ensure that the dependent services between the two environments can communicate reliably. Level of risk exposed if this best practice is not established: High\n\n### Implementation guidance\n\n- Ensure that you have highly available connectivity between AWS and on-premises environment. Use multiple AWS Direct C

#### 4.1.3 How is Correcteness/QA evaluated?

In [18]:
evaluator = CotQAEvalChain.from_llm(llm=eval_llm)
print(type(evaluator))
print(evaluator.prompt.invoke({"context": "This is a mock context", "query": "This is a mock question/query", "result": "This is a mock result"}).text)

<class 'langchain.evaluation.qa.eval_chain.CotQAEvalChain'>
You are a teacher grading a quiz.
You are given a question, the context the question is about, and the student's answer. You are asked to score the student's answer as either CORRECT or INCORRECT, based on the context.
Write out in a step by step manner your reasoning to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset.

Example Format:
QUESTION: question here
CONTEXT: context the question is about here
STUDENT ANSWER: student's answer here
EXPLANATION: step by step reasoning here
GRADE: CORRECT or INCORRECT here

Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin! 

QUESTION: This is a mock question/query
CONTEXT: This is a mock context


In [20]:
print(evaluator.invoke({"context": _example["context"], "query": _example["question"], "result": _output})["text"])

To evaluate the student's answer, we need to follow these steps:

1. **Identify the Question's Focus**: The question is asking for the level of risk associated with not following the best practice of establishing predefined processes for incident analysis.

2. **Locate Relevant Information in the Context**: The context provided in the second <DOCUMENT> mentions, "Level of risk exposed if this best practice is not established: High." This statement directly relates to the question's focus, which is about the risk level when predefined processes for incident analysis are not established.

3. **Analyze the Student's Answer**: The student's answer is "High." They have also provided a reasoning that references the exact quote from the context, indicating they have correctly identified and interpreted the relevant information.

4. **Compare Student's Answer with the Context**: The student's answer matches the information given in the context. They have accurately cited the portion of the tex

### 4.2 Defining the evaluation configuration

In [21]:
# Define metrics of eval
evaluation_config = RunEvalConfig(
    evaluators=[
        "cot_qa",
        RunEvalConfig.LabeledCriteria(
            criteria="helpfulness",
            input_key="question",
            prediction_key="output",
        ),
        RunEvalConfig.LabeledCriteria(
            criteria="coherence",
            input_key="question",
            prediction_key="output",
        )
    ],
    input_key="question",
    eval_llm=eval_llm,
)

### 4.3 Run evaluation

In [22]:
for each in [raft_starling_7b, openai_llm, anthropic_llm]:
    
    MODEL_NAME = each.model_name if hasattr(each, "model_name") else each.model 
    print(MODEL_NAME)

    await arun_on_dataset(
        dataset_name=dataset_name,
        llm_or_chain_factory=create_chain_by_model(model=each),
        client=langsmith_client,
        evaluation=evaluation_config,
        project_name=f"Evaluation - {MODEL_NAME}",
    )

jjovalle99/starling-7b-raft-ft
View the evaluation results for project 'Evaluation - jjovalle99/starling-7b-raft-ft' at:
https://smith.langchain.com/o/e1ff5e9a-fc1a-5ec0-91dc-86daf509e790/datasets/9dd76add-d39c-4897-8b54-d762412f1644/compare?selectedSessions=527c1618-5954-4ec1-9119-904284f16933

View all tests for Dataset RAFT at:
https://smith.langchain.com/o/e1ff5e9a-fc1a-5ec0-91dc-86daf509e790/datasets/9dd76add-d39c-4897-8b54-d762412f1644
[-------------------------------------------->     ] 27/30

Chain failed for example 539f726b-d146-4f19-9e43-09bb2b4bd978 with inputs {'answer': '##Reason: The document provides a specific best practice under the identifier SUS02-BP01, which is to ##begin_quote## Scale workload infrastructure dynamically ##end_quote##. This is further elaborated as using the elasticity of the cloud to scale infrastructure dynamically to match the supply of cloud resources to demand, thereby avoiding overprovisioned capacity in the workload. The recommendation aims to ensure that infrastructure scales with user load, addressing common anti-patterns such as not scaling infrastructure with user load. ##Answer: Scale workload infrastructure dynamically', 'context': "Carefully read and analyze the provided documents to answer the question that follows. Provide a detailed, step-by-step explanation of your reasoning, demonstrating how you arrived at your conclusion based on the information given in the documents.\n\n<DOCUMENT>Provide your team members with devices tha

[------------------------------------------------->] 30/30gpt-3.5-turbo-0125
View the evaluation results for project 'Evaluation - gpt-3.5-turbo-0125' at:
https://smith.langchain.com/o/e1ff5e9a-fc1a-5ec0-91dc-86daf509e790/datasets/9dd76add-d39c-4897-8b54-d762412f1644/compare?selectedSessions=b34e329c-ab5f-4409-8fad-aae3750d4db1

View all tests for Dataset RAFT at:
https://smith.langchain.com/o/e1ff5e9a-fc1a-5ec0-91dc-86daf509e790/datasets/9dd76add-d39c-4897-8b54-d762412f1644
[------------------------------------------------->] 30/30claude-3-haiku-20240307
View the evaluation results for project 'Evaluation - claude-3-haiku-20240307' at:
https://smith.langchain.com/o/e1ff5e9a-fc1a-5ec0-91dc-86daf509e790/datasets/9dd76add-d39c-4897-8b54-d762412f1644/compare?selectedSessions=2d555b66-90c2-47c7-877d-6785b79806d3

View all tests for Dataset RAFT at:
https://smith.langchain.com/o/e1ff5e9a-fc1a-5ec0-91dc-86daf509e790/datasets/9dd76add-d39c-4897-8b54-d762412f1644
[>                            

Chain failed for example 4d69f923-db86-4f23-bda7-01926c92f9ed with inputs {'answer': '##Reason: The document provides a clear recommendation for compiling source code and running unit tests within the AWS ecosystem. Specifically, it states ##begin_quote## Use CodeBuild to compile your source code, runs unit tests, and produces artifacts that are ready to deploy. ##end_quote## This indicates that AWS CodeBuild is the recommended service for these tasks. ##Answer: AWS CodeBuild', 'context': "Carefully read and analyze the provided documents to answer the question that follows. Provide a detailed, step-by-step explanation of your reasoning, demonstrating how you arrived at your conclusion based on the information given in the documents.\n\n<DOCUMENT>This reduces lead time, decreases cost, encourages increased frequency of change, reduces the level of effort, and increases collaboration. Prepare\n\n127\n---\n## AWS Well-Architected Framework\n\n### Implementation steps\n\n|Author|Source/Ar

[------------------------------------------------->] 30/30

### 5. Evaluating Without context

In [27]:
# Define chain
prompt = ChatPromptTemplate.from_messages(
    messages=[("system", "You are an expert QA assistant, answer the following question"), ("user", "{question}")]
)

output_parser = StrOutputParser()

def create_chain_by_model(model):
    return (
        prompt 
        | model 
        | output_parser
    )

In [29]:
# Define metrics of eval
evaluation_config = RunEvalConfig(
    evaluators=[
        "cot_qa",
        RunEvalConfig.LabeledCriteria(
            criteria="helpfulness",
            input_key="question",
            prediction_key="output",
        ),
        RunEvalConfig.LabeledCriteria(
            criteria="coherence",
            input_key="question",
            prediction_key="output",
        )
    ],
    input_key="question",
    eval_llm=eval_llm,
)

In [30]:
for each in [raft_starling_7b, openai_llm, anthropic_llm]:
    
    MODEL_NAME = each.model_name if hasattr(each, "model_name") else each.model 
    print(MODEL_NAME)

    await arun_on_dataset(
        dataset_name=dataset_name,
        llm_or_chain_factory=create_chain_by_model(model=each),
        client=langsmith_client,
        evaluation=evaluation_config,
        concurrency_level=10 if "haiku" not in MODEL_NAME else 2,
        project_name=f"WC - Evaluation - {MODEL_NAME}",
    )

jjovalle99/starling-7b-raft-ft
View the evaluation results for project 'WC - Evaluation - jjovalle99/starling-7b-raft-ft' at:
https://smith.langchain.com/o/e1ff5e9a-fc1a-5ec0-91dc-86daf509e790/datasets/9dd76add-d39c-4897-8b54-d762412f1644/compare?selectedSessions=56300fb5-7d29-4044-b650-ed5fbef6f099

View all tests for Dataset RAFT at:
https://smith.langchain.com/o/e1ff5e9a-fc1a-5ec0-91dc-86daf509e790/datasets/9dd76add-d39c-4897-8b54-d762412f1644
[------------------------------------------------->] 30/30gpt-3.5-turbo-0125
View the evaluation results for project 'WC - Evaluation - gpt-3.5-turbo-0125' at:
https://smith.langchain.com/o/e1ff5e9a-fc1a-5ec0-91dc-86daf509e790/datasets/9dd76add-d39c-4897-8b54-d762412f1644/compare?selectedSessions=55698c93-89ad-4c0a-9dd1-8575fc7487e6

View all tests for Dataset RAFT at:
https://smith.langchain.com/o/e1ff5e9a-fc1a-5ec0-91dc-86daf509e790/datasets/9dd76add-d39c-4897-8b54-d762412f1644
[------------------------------------------------->] 30/30claude