In [1]:
import dotenv

dotenv.load_dotenv()

True

In [2]:
from langsmith import Client

client = Client()



In [3]:
# Programmatically create a dataset in LangSmith
# For other dataset creation methods, see:
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_programmatically
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_in_application
try:
    dataset = client.create_dataset(
        dataset_name="Sample dataset", description="A sample dataset in LangSmith.", 
    )
except Exception as e:
    print(e)

In [27]:
dataset = client.read_dataset(dataset_name="Sample dataset")

In [28]:
# Create examples
examples = [
    {
        "inputs": {"question": "Which country is Mount Kilimanjaro located in?"},
        "outputs": {"answer": "Mount Kilimanjaro is located in Tanzania."},
    },
    {
        "inputs": {"question": "What is Earth's lowest point?"},
        "outputs": {"answer": "Earth's lowest point is The Dead Sea."},
    },
]


In [6]:
# Add examples to the dataset
client.create_examples(dataset_id=dataset.id, examples=examples)

{'example_ids': ['6fa16cb3-e4ab-476d-883f-865638c23aca',
  '08d88c01-713c-469f-b057-394a3c624c73'],
 'count': 2}

In [7]:
from langsmith import wrappers
from openai import OpenAI
import os





In [8]:
# os.getenv("GEMINI_API_KEY")

In [9]:
# Wrap the OpenAI client for LangSmith tracing
gemini_client = wrappers.wrap_openai(OpenAI(
                                        api_key=os.getenv("GEMINI_API_KEY"),
                                        base_url="https://generativelanguage.googleapis.com/v1beta/",
                                        
                                    ))

In [10]:
test_client= OpenAI(
                                        api_key=os.getenv("GEMINI_API_KEY"),
                                        base_url="https://generativelanguage.googleapis.com/v1beta/"
                                    )

In [11]:
# response = test_client.chat.completions.create(
#     model="gemini-2.5-flash",
#     messages=[
#         {"role": "system", "content": "You are a helpful assistant."},
#         {
#             "role": "user",
#             "content": "Explain to me how AI works"
#         }
#     ]
# )

# print(response.choices[0].message)

In [12]:
# Define the application logic you want to evaluate inside a target function
# The SDK will automatically send the inputs from the dataset to your target function
def target(inputs: dict) -> dict:
    response = gemini_client.chat.completions.create(
        model="gemini-2.0-flash-lite",
        messages=[
            {"role": "system", "content": "Answer the following question accurately"},
            {"role": "user", "content": inputs["question"]},
        ],
    )
    return { "answer": response.choices[0].message.content.strip() }

In [13]:
# target(inputs={"question": "What is the capital of France?"})

In [14]:
from openevals.llm import create_llm_as_judge
from openevals.prompts import CORRECTNESS_PROMPT

def correctness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    evaluator = create_llm_as_judge(
        prompt=CORRECTNESS_PROMPT,
        model="google_genai:gemini-2.0-flash-lite",
        feedback_key="correctness",
    )
    eval_result = evaluator(
        inputs=inputs,
        outputs=outputs,
        reference_outputs=reference_outputs
    )
    return eval_result

In [15]:
print(CORRECTNESS_PROMPT)

You are an expert data labeler evaluating model outputs for correctness. Your task is to assign a score based on the following rubric:

<Rubric>
  A correct answer:
  - Provides accurate and complete information
  - Contains no factual errors
  - Addresses all parts of the question
  - Is logically consistent
  - Uses precise and accurate terminology

  When scoring, you should penalize:
  - Factual errors or inaccuracies
  - Incomplete or partial answers
  - Misleading or ambiguous statements
  - Incorrect terminology
  - Logical inconsistencies
  - Missing key information
</Rubric>

<Instructions>
  - Carefully read the input and output
  - Check for factual accuracy and completeness
  - Focus on correctness of information rather than style or verbosity
</Instructions>

<Reminder>
  The goal is to evaluate factual correctness and completeness of the response.
</Reminder>

<input>
{inputs}
</input>

<output>
{outputs}
</output>

Use the reference outputs below to help you evaluate the

In [16]:
# After running the evaluation, a link will be provided to view the results in langsmith
experiment_results = client.evaluate(
    target,
    data="Sample dataset",
    evaluators=[
        correctness_evaluator,
        # can add multiple evaluators here
    ],
    experiment_prefix="first-eval-in-langsmith",
    max_concurrency=0,
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'first-eval-in-langsmith-c3604d58' at:
https://smith.langchain.com/o/cb35aec6-2b3b-4044-bfca-43dbfad00211/datasets/c941b0f0-af2b-4f0b-9027-bd9a50339df7/compare?selectedSessions=db18717f-996b-4383-8b3e-848cf3ce599b




0it [00:00, ?it/s]Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
1it [00:02,  2.36s/it]Key 'additionalProperties' is not supported in schema, ignoring
Key 'parameters' is not supported in schema, ignoring
2it [00:04,  2.41s/it]


In [None]:
print(experiment_results)

<ExperimentResults first-eval-in-langsmith-c3604d58>


In [23]:
experiment_results

In [45]:
from langsmith import evaluate

def correct(outputs: dict, reference_outputs: dict) -> bool:
    """Check if the answer exactly matches the expected answer."""
    return outputs["answer"] == reference_outputs["answer"]

def dummy_app(inputs: dict) -> dict:
    return {"answer": "hmm i'm not sure", "reasoning": "i didn't understand the question"}

results = evaluate(
    dummy_app,
    data="Sample dataset",
    evaluators=[correct],
    max_concurrency=2,
)




View the evaluation results for experiment: 'large-boat-84' at:
https://smith.langchain.com/o/cb35aec6-2b3b-4044-bfca-43dbfad00211/datasets/c941b0f0-af2b-4f0b-9027-bd9a50339df7/compare?selectedSessions=11908f19-0dbc-4099-a700-6ac2291ed051




2it [00:00,  5.35it/s]


In [46]:
results.experiment_name

'large-boat-84'

In [47]:
print([k for k in dir(results) if not k.startswith('_')])

['experiment_name', 'to_pandas', 'wait']


In [48]:

results.to_pandas()

Unnamed: 0,inputs.question,outputs.answer,outputs.reasoning,error,reference.answer,feedback.correct,execution_time,example_id,id
0,What is Earth's lowest point?,hmm i'm not sure,i didn't understand the question,,Earth's lowest point is The Dead Sea.,False,0.001298,08d88c01-713c-469f-b057-394a3c624c73,a933f7cd-27e6-4b26-88ec-8e26e13ea1ab
1,Which country is Mount Kilimanjaro located in?,hmm i'm not sure,i didn't understand the question,,Mount Kilimanjaro is located in Tanzania.,False,0.008987,6fa16cb3-e4ab-476d-883f-865638c23aca,fc4cd304-23a3-4865-81e6-7a2c08f374bc


In [None]:

from langgraph.graph import StateGraph, START, END, MessagesState
from typing_extensions import TypedDict

from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
load_dotenv()

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

# Define input schema
class InputState(TypedDict):
    question: str

# Define output schema
class OutputState(TypedDict):
    answer: str

# Combine input and output
class OverallState(InputState, OutputState):
    pass


def chat_node(state: InputState):

    question =  state["question"]
    
    answer = llm.invoke(question)
    return {"answer": answer.content, "question": state["question"]}

graph_builder = StateGraph(OverallState, input_schema=InputState, output_schema=OutputState)

graph_builder.add_node("chat", chat_node)

graph_builder.set_entry_point("chat")
graph_builder.set_finish_point("chat")

relationship_graph = graph_builder.compile()
