# Create a dataset
 - Define example input and reference output pairs that you will use to evaluate your app

In [11]:
from langsmith import Client

client = Client()

# Name shared by the dataset lookup and creation below.
DATASET_NAME = "Evaluation Quick Start"

# Question / reference-answer pairs the experiment is scored against.
examples = [
    {
        "inputs": {"question": "What is the capital of Kenya?"},
        "outputs": {"answer": "The capital of Kenya is Nairobi"},
    },
    {
        "inputs": {"question": "What is the largest planet in the solar system?"},
        "outputs": {"answer": "The largest planet in the solar system is Jupiter"},
    },
    {
        "inputs": {"question": "What is the chemical symbol for water?"},
        "outputs": {"answer": "The chemical symbol for water is H2O"},
    },
]

# Dataset names must be unique, so creating the dataset unconditionally makes
# this cell fail on every run after the first. Reuse the dataset when it is
# already there and upload the examples only when it is freshly created, which
# keeps the cell safe under Restart & Run All and avoids duplicate examples.
if client.has_dataset(dataset_name=DATASET_NAME):
    dataset = client.read_dataset(dataset_name=DATASET_NAME)
    upload_result = None  # nothing uploaded on a re-run
else:
    dataset = client.create_dataset(
        dataset_name=DATASET_NAME,
        description="A dataset for evaluating a simple language model",
    )
    upload_result = client.create_examples(dataset_id=dataset.id, examples=examples)

# Last expression of the cell: shows the upload response on first creation.
upload_result

{'example_ids': ['639f8b98-6e63-4b92-8ffa-0b733494892b',
  '7fb45806-be7a-4530-8065-aeabf2b02494',
  'f279b217-73f8-472a-aeff-39e91bc6f442'],
 'count': 3}

# Define your target function
- Define a target function that defines what you are evaluating.
- Examples of what you can test:
    - You can test an LLM call that includes the new prompt you want to evaluate.
    - You can evaluate a part of your application.
    - You can test your end-to-end application.

In [15]:
from langsmith import wrappers
from openai import OpenAI

# Wrap the OpenAI client so that calls made through it are traced in LangSmith.
# NOTE(review): OpenAI() is constructed without arguments, so credentials are
# presumably read from the environment — confirm the API key is set before running.
openai_client = wrappers.wrap_openai(OpenAI())

# Define the application logic you want to evaluate inside the target function.
# The SDK automatically sends each example's inputs from the dataset to it.

def target(input: dict) -> dict:
    """Answer a single dataset question with the traced OpenAI client.

    Args:
        input: Inputs of one dataset example. Must contain a ``"question"``
            key whose value is sent as the user message. (The name shadows
            the ``input`` builtin; it is kept so existing callers still work.)

    Returns:
        ``{"answer": text}`` where ``text`` is the model reply with
        surrounding whitespace removed, or ``""`` when the reply carries
        no text content.

    Raises:
        KeyError: If ``input`` has no ``"question"`` key.
    """
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Answer the following question accurately."},
            {"role": "user", "content": input["question"]},
        ],
    )
    # `content` is optional in the API response (None when the model returns
    # no text), so fall back to an empty string instead of crashing on .strip().
    content = response.choices[0].message.content
    return {"answer": (content or "").strip()}
        

# Define Evaluator

- Import a prebuilt prompt from openevals and create an evaluator
    - `outputs` are the result of the target function
    - `reference_outputs` come from the example pairs defined in the dataset

In [18]:
from openevals.llm import create_llm_as_judge
from openevals.prompts import CORRECTNESS_PROMPT

def correctness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Score one target output for correctness with an LLM judge.

    Args:
        inputs: Inputs of the dataset example.
        outputs: What the target function returned for those inputs.
        reference_outputs: Reference outputs stored with the example.

    Returns:
        Whatever the judge built by ``create_llm_as_judge`` returns; the
        feedback is recorded under the ``"correctness"`` key.
    """
    # Build the judge from the prebuilt correctness prompt.
    judge = create_llm_as_judge(
        prompt=CORRECTNESS_PROMPT,
        model="openai:gpt-4o-mini",
        feedback_key="correctness",
    )
    # Hand the example through unchanged and return the judge's verdict.
    return judge(
        inputs=inputs,
        outputs=outputs,
        reference_outputs=reference_outputs,
    )
    

# Run and view results

- Run the experiment

In [19]:
# Run the experiment: call `target` on the dataset and score its outputs
# with the listed evaluators. The result object is kept in `experiment_results`.
experiment_results = client.evaluate(
    target,  # function under evaluation
    data="Evaluation Quick Start",  # same name the dataset was created with
    evaluators=[correctness_evaluator],  # LLM-as-judge correctness scorer defined above
    experiment_prefix="First-eval-in-langsmith",  # the experiment name starts with this prefix
    max_concurrency=2,  # presumably caps how many runs execute in parallel — confirm in the SDK docs
)

View the evaluation results for experiment: 'First-eval-in-langsmith-e40bf4d7' at:
https://smith.langchain.com/o/5e26199c-44b7-5d71-a174-0781dc496380/datasets/a7935129-510d-410a-8992-795a000b93f5/compare?selectedSessions=d548a87b-0c4a-4863-9608-f7bcc8606112




3it [00:04,  1.66s/it]
