# Enviornment 

In [None]:
! pip install langsmith openai

In [94]:
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true' # enables tracing 
os.environ['LANGCHAIN_API_KEY'] = <your-api-key>

In [5]:
import os
os.environ['LANGCHAIN_PROJECT'] = 'Test'

# Tracing

https://docs.smith.langchain.com/tracing/quick_start

![Screenshot 2024-04-01 at 3.50.52 PM.png](attachment:1eef9305-6a29-4236-85c6-607946ba3a27.png)

In [6]:
import openai
from langsmith.wrappers import wrap_openai
openai_client = wrap_openai(openai.Client())

# System prompt 
system_msg = f"Give me a short historical summary of a topic."

# User topic
topic = "the Pantheon"

# Pass in website text
messages = [{"role": "system", "content": system_msg},
            {"role": "user", "content": f"topic: {topic}"}]

# Call OpenAI
openai_client.chat.completions.create(messages=messages, model="gpt-3.5-turbo")

ChatCompletion(id='chatcmpl-99M2IK2326wrf4PJONVPQsstRDp4i', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='The Pantheon is a famous ancient Roman temple located in the city of Rome. It was originally commissioned by Marcus Agrippa in 27 BC, but was later reconstructed by Emperor Hadrian around 126 AD. The Pantheon is renowned for its unique architecture, with a large dome that is still considered a masterpiece of engineering and design.\n\nOriginally dedicated to all the pagan gods of Rome, the Pantheon was converted into a Christian church in the 7th century and is still in use today as the Church of St. Mary and the Martyrs. The building has survived largely intact over the centuries, making it one of the best-preserved ancient Roman structures in the world.\n\nThe Pantheon has inspired countless architects and artists over the centuries, and its iconic dome continues to be a symbol of classical Roman architecture. Today, the Pant

# Dataset: SDK or UI

With LangSmith SDK: 

https://docs.smith.langchain.com/evaluation/quickstart#1-create-a-dataset

With UI:

https://docs.smith.langchain.com/evaluation/faq/datasets-webapp

![Screenshot 2024-04-01 at 3.59.07 PM.png](attachment:297eef34-4ec8-4df6-9b35-e4164e43cd8d.png)

New model release:

https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm

Build a `Manually Curated` dataset:

![Screenshot 2024-04-01 at 4.31.04 PM.png](attachment:1f72fe49-a720-43e9-a875-7c76e21879d7.png)

In [7]:
import pandas as pd

# QA
inputs = [
    "How many tokens was DBRX pre-trained on?",
    "Is DBRX a MOE model and how many parameters does it have?",
    "How many GPUs was DBRX trained on and what was the connectivity between GPUs?"
]

outputs = [
    "DBRX was pre-trained on 12 trillion tokens of text and code data.",
    "Yes, DBRX is a fine-grained mixture-of-experts (MoE) architecture with 132B total parameters.",
    "DBRX was trained on 3072 NVIDIA H100s connected by 3.2Tbps Infiniband"
]

# Dataset
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]
df = pd.DataFrame(qa_pairs)

# Write to csv
csv_path = "/Users/rlm/Desktop/DBRX_eval.csv"
df.to_csv(csv_path, index=False)

Create dataset

In [8]:
from langsmith import Client

client = Client()
dataset_name = "DBRX Dataset v2"

# Store
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="QA pairs about DBRX model.",
)
client.create_examples(
    inputs=[{"question": q} for q in questions],
    outputs=[{"answer": a} for a in answers],
    dataset_id=dataset.id,
)

NameError: name 'questions' is not defined

Update dataset

In [None]:
new_questions = [
    "What is the context window of DBRX Instruct?", 
]

new_answers = [
    "DBRX Instruct was trained with up to a 32K token context window.",
]

# See updated version in the UI
client.create_examples(
    inputs=[{"question": q} for q in new_questions],
    outputs=[{"answer": a} for a in new_answers],
    dataset_id=dataset.id,
)

# Dataset: Traces

`From user logs`

![Screenshot 2024-04-01 at 4.31.11 PM.png](attachment:116c49b0-ec73-4930-8d13-c4e0c9b4b889.png)

In [23]:
# Create a project
import os
os.environ['LANGCHAIN_PROJECT'] = 'DBRX'

In [31]:
# Load website

import requests
from bs4 import BeautifulSoup
url = 'https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
text = [p.text for p in soup.find_all('p')]
full_text = '\n'.join(text)

In [88]:
# OpenAI API

import openai
from langsmith.wrappers import wrap_openai
openai_client = wrap_openai(openai.Client())

def predict(inputs: dict) -> dict:
    """
    Generates answers to user questions based on a provided website text using OpenAI API.

    Parameters:
    inputs (dict): A dictionary with a single key 'question', representing the user's question as a string.

    Returns:
    dict: A dictionary with a single key 'output', containing the generated answer as a string.
    """

    # System prompt 
    system_msg = f"Answer user questions in 2-3 sentences about this context: \n\n\n {full_text}"
    
    # Pass in website text
    messages = [{"role": "system", "content": system_msg},
                {"role": "user", "content": inputs["question"]}]

    # Call OpenAI
    response = openai_client.chat.completions.create(messages=messages, model="gpt-3.5-turbo")

    # Response in output dict, as expected by dataset 
    return {"answer": response.dict()['choices'][0]['message']['content']} 
    
predict({"question":"What are the main differences in training efficiency between MPT-7B vs DBRX?"})

{'answer': "DBRX's training efficiency significantly improved compared to MPT-7B. A model from the DBRX family, DBRX MoE-A, reached a similar Gauntlet score to MPT-7B with 3.7x fewer FLOPs. This efficiency boost was achieved through various improvements, including an MoE architecture, better optimization strategies, enhanced pretraining data quality, and other architectural changes."}

# LLM-as-Judge: Built-in evaluator

![Screenshot 2024-04-01 at 4.17.29 PM.png](attachment:786ec55e-5801-4918-ac15-71b8c3162e9e.png)

Evaluate:

https://docs.smith.langchain.com/evaluation/faq/evaluator-implementations

`Built-in evaluators`

In [93]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Evaluators
qa_evalulator = [LangChainStringEvaluator("cot_qa")]

experiment_results = evaluate(
    predict,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="test-dbrx-qa",
    # Any experiment metadata can be specified here
    metadata={
      "version": "1.0.0",
      "variant": "stuff website context",
    },
)

  experiment_results = evaluate(


View the evaluation results for experiment: 'test-dbrx-qa:d4aefa4' at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/accb4d34-55ee-431e-8fb1-5b0a0a3bd033/compare?selectedSessions=d4a244d7-c0df-442a-aac3-5f3f84ada480




0it [00:00, ?it/s]

# Custom evaluator


In [92]:
def is_answered(run: Run, example: Example) -> dict: 

    # Get outputs
    student_answer = run.outputs.get("answer")
    
    # Check if the student_answer is an empty string
    # Key is optional 
    # Score is needed
    if not student_answer:
        return {"key": "is_answered" , "score": 0} 
    else:
        return {"key": "is_answered" , "score": 1} 

# Evaluators
qa_evalulator = [is_answered]

# Run
experiment_results = evaluate(
    predict,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="test-dbrx-qa-custom-eval-is-answered",
    # Any experiment metadata can be specified here
    metadata={
      "version": "1.0.0",
      "variant": "stuff website context",
    },
)

  experiment_results = evaluate(


View the evaluation results for experiment: 'test-dbrx-qa-custom-eval-is-answered:17d0152' at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/accb4d34-55ee-431e-8fb1-5b0a0a3bd033/compare?selectedSessions=2d771ff0-6059-4681-9b82-74318a5fdd17




0it [00:00, ?it/s]

# LLM-as-Judge with custom evaluator

https://docs.smith.langchain.com/evaluation/quickstart#3-evaluate

In [91]:
from langsmith.schemas import Run, Example
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

class CheckExplicit(BaseModel):
    """Check if output has explicit langugage"""

    grade: int = Field(description="Grade 1 if the output has explicit langugage, else 0.")

def grade_answer(run: Run, example: Example) -> dict: 

    # Get outputs
    student_answer = run.outputs.get("answer")
    
    # LLM with function call 
    llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
    grader_llm = llm.with_structured_output(CheckExplicit)
    
    # Prompt 
    system = """Check if the generation from the LLM has explicit langugage"""
    grade_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system),
            ("human", "STUDENT ANSWER: {student_answer}"),
        ]
    )

    # Invoke
    grader = grade_prompt | grader_llm
    score = grader.invoke({"student_answer":student_answer,})

    # Score is required
    return {"key": "grade" , "score": score.grade} 

# Evaluators
qa_evalulator = [grade_answer]

# Run
experiment_results = evaluate(
    predict,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="test-dbrx-qa-custom-eval-explicit",
    # Any experiment metadata can be specified here
    metadata={
      "version": "1.0.0",
      "variant": "stuff website context",
    },
)

  experiment_results = evaluate(


View the evaluation results for experiment: 'test-dbrx-qa-custom-eval-explicit:82e8e38' at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/accb4d34-55ee-431e-8fb1-5b0a0a3bd033/compare?selectedSessions=a232c150-9f94-474a-95d0-79323d3dfc0e




0it [00:00, ?it/s]

Let's assume we want this:

https://github.com/openai/evals/blob/main/evals/registry/modelgraded/closedqa.yaml

In [90]:
from langsmith.schemas import Run, Example
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

class Grade(BaseModel):
    """Grade LLM anser relative to reference."""

    grade: int = Field(description="Grade 1 if the student answer matches the reference, else 0.")

def grade_answer(run: Run, example: Example) -> dict:

    # Get 
    student = run.outputs.get("answer") # full trace (hidden from model)
    question = example.outputs.get("question") # datapoint 
    answer = example.outputs.get("answer")
    
    # LLM with function call 
    llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
    grader_llm = llm.with_structured_output(Grade)
    
    # Prompt 
    system = """You are assessing a submitted STUDENT ANSWER to a question relative to the TRUE ANSWER based on the provided criteria:\n
    conciseness:  Is the student answer concise and to the point? \n
    correct: Is the student answer correct relative to the reference answer? \n
    Does the student answer the criterion? \n
    First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. \n
    Return 1 if the student answer meets the criteria and 0 if it does not."""
    grade_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system),
            ("human", "QUESTION: {question} \n STUDENT ANSWER: {student} \n TRUE ANSWER: {answer}"),
        ]
    )

    # Invoke
    grader = grade_prompt | grader_llm
    score = grader.invoke({"student":student,"answer":answer,"question":question})

    # Store
    return {"key": "grade" , "score": score.grade} # score is required?

# Evaluators
qa_evalulator = [grade_answer]

# Run
experiment_results = evaluate(
    predict,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="test-dbrx-qa-custom-eval",
    # Any experiment metadata can be specified here
    metadata={
      "version": "1.0.0",
      "variant": "stuff website context",
    },
)

  experiment_results = evaluate(


View the evaluation results for experiment: 'test-dbrx-qa-custom-eval:91acf71' at:
https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/accb4d34-55ee-431e-8fb1-5b0a0a3bd033/compare?selectedSessions=e82b3329-0319-4f15-aec6-281b01f7ae33




0it [00:00, ?it/s]