# Evaluating Files declared in code


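This tutorial shows how to declare Humanloop Files (a Tool, a Prompt and a Flow) in code with the SDK decorators, combine them into a simple RAG pipeline, and evaluate the resulting Flow against a dataset.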


### 1. Setting up imports


In [1]:
import os

import chromadb
import pandas as pd
from dotenv import load_dotenv
from exact_match import exact_match
from humanloop import Humanloop
from levenshtein import compare_log_and_target
from openai import OpenAI

load_dotenv()

True

In [2]:
# Humanloop directory under which every File in this notebook is created.
DIRECTORY = "SDK/Decorators Evaluation"
# DIRECTORY_PREFIX optionally nests the directory under a parent folder.
# Trailing slashes are stripped so that "Team/" and "Team" both produce
# "Team/SDK/Decorators Evaluation" rather than a path containing "//".
if DIRECTORY_PREFIX := os.getenv("DIRECTORY_PREFIX"):
    DIRECTORY = f"{DIRECTORY_PREFIX.rstrip('/')}/{DIRECTORY}"
print(DIRECTORY)

Andrei QA/SDK/Decorators Evaluation


### 2. Instantiating the Humanloop client


In [3]:
# The API key is read from the HUMANLOOP_KEY environment variable
# (the import cell calls load_dotenv() before this runs).
humanloop = Humanloop(api_key=os.environ.get("HUMANLOOP_KEY"))

### 3. Instantiating the vector database


In [4]:
# Use the documented top-level entry point `chromadb.Client()`;
# `chromadb.chromadb` is not part of the public API.
chroma = chromadb.Client()
collection = chroma.get_or_create_collection(name="MedQA")

# Load the textbook passages and keep a small sample as the knowledge base.
# random_state is fixed so the same 5 rows are indexed on every run; reading
# and sampling in one expression keeps the cell safe to re-run.
knowledge_base = pd.read_parquet(
    "../../../assets/sources/textbooks.parquet"
).sample(5, random_state=42)

# Index the sampled passages, using the "id" column as the document ids.
collection.add(
    documents=knowledge_base["contents"].to_list(),
    ids=knowledge_base["id"].to_list(),
)

### 4. Loading the evaluation dataset


In [5]:
# Number of datapoints evaluated; kept small so the evaluation finishes quickly.
MAX_DATAPOINTS = 20

datapoints_df = pd.read_json("../../../assets/datapoints.jsonl", lines=True)
# Truncate before iterating so only the rows that are kept get converted
# to dicts (one dict per datapoint, keyed by column name).
datapoints = [
    row.to_dict() for _, row in datapoints_df.head(MAX_DATAPOINTS).iterrows()
]

### 5. Declare Humanloop Files via code


In [6]:
# Prompt template, filled with str.format(). Placeholders: {question},
# {option_A} .. {option_E} and {retrieved_data}. The model is asked to reply
# in three sections separated by "---": the chosen option, an explanation,
# and verbatim quotes from the retrieved data.
TEMPLATE = """Answer the following question factually.

Question: {question}

Options:
- {option_A}
- {option_B}
- {option_C}
- {option_D}
- {option_E}

---

Here is some retrieved information that might be helpful.
Retrieved data:
{retrieved_data}

---

Give your answer in 3 sections using the following format. Do not include the quotes or the brackets. Do include the "---" separators.
```
<chosen option verbatim>
---
<clear explanation of why the option is correct and why the other options are incorrect. keep it ELI5.>
---
<quote relevant information snippets from the retrieved data verbatim. every line here should be directly copied from the retrieved data>
```
"""

In [7]:
@humanloop.tool(
    path=f"{DIRECTORY}/Retrieval Tool",
)
def retrieval_tool(question: str) -> str:
    """Retrieve most relevant document from the vector db (Chroma) for the question."""
    # NOTE(review): docstring left untouched; `retrieval_tool.json_schema` is
    # consumed by the Prompt declaration and may embed it. Confirm before rewording.
    # One query text with n_results=1, so take the first document of the
    # first result list.
    query_result = collection.query(query_texts=[question], n_results=1)
    documents_for_question = query_result["documents"][0]
    return documents_for_question[0]


@humanloop.prompt(
    path=f"{DIRECTORY}/MedQA Answer",
    template=TEMPLATE,
    tools=[retrieval_tool.json_schema],
)
def ask_model(
    question: str,
    option_A: str,
    option_B: str,
    option_C: str,
    option_D: str,
    option_E: str,
) -> str:
    """Answer a multiple-choice question with a retrieve-then-generate call.

    The document returned by ``retrieval_tool(question)`` is substituted into
    ``TEMPLATE`` together with the question and its five options, and the
    filled template is sent to ``gpt-4o`` as a single user message with
    temperature 0.

    Args:
        question: The question text.
        option_A: First answer option.
        option_B: Second answer option.
        option_C: Third answer option.
        option_D: Fourth answer option.
        option_E: Fifth answer option.

    Returns:
        The ``content`` of the first choice of the chat completion.
    """
    # A client is created per call, keyed by the OPENAI_KEY environment variable.
    client = OpenAI(api_key=os.getenv("OPENAI_KEY"))

    # Fill every TEMPLATE placeholder; the retrieved document is the context.
    prompt_text = TEMPLATE.format(
        question=question,
        option_A=option_A,
        option_B=option_B,
        option_C=option_C,
        option_D=option_D,
        option_E=option_E,
        retrieved_data=retrieval_tool(question),
    )

    completion = client.chat.completions.create(
        model="gpt-4o",
        temperature=0,
        messages=[{"role": "user", "content": prompt_text}],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


@humanloop.flow(
    path=f"{DIRECTORY}/MedQA Answer Flow",
    attributes={
        "prompt": {
            "model": "gpt-4o",
            "environment": "evaluation",
        }
    },
)
def entrypoint(
    question: str,
    option_A: str,
    option_B: str,
    option_C: str,
    option_D: str,
    option_E: str,
):
    """Flow entry point: forward the question and its options to ``ask_model``.

    Takes the same six string arguments as ``ask_model`` and returns whatever
    ``ask_model`` returns for them.
    """
    answer_inputs = {
        "question": question,
        "option_A": option_A,
        "option_B": option_B,
        "option_C": option_C,
        "option_D": option_D,
        "option_E": option_E,
    }
    return ask_model(**answer_inputs)

### 6. Evaluate the Flow


In [8]:
# The Flow under evaluation: `entrypoint` is called for the datapoints.
evaluation_file = {
    "path": f"{DIRECTORY}/MedQA Answer Flow",
    "callable": entrypoint,
    "type": "flow",
}

# The datapoints loaded earlier, stored under this Dataset path.
evaluation_dataset = {
    "datapoints": datapoints,
    "path": f"{DIRECTORY}/Dataset",
}

# Two code evaluators, both declared as `target_required`:
# Levenshtein returns a number, Exact Match returns a boolean.
levenshtein_evaluator = {
    "path": f"{DIRECTORY}/Levenshtein",
    "args_type": "target_required",
    "return_type": "number",
    "callable": compare_log_and_target,
}
exact_match_evaluator = {
    "path": f"{DIRECTORY}/Exact Match",
    "args_type": "target_required",
    "return_type": "boolean",
    "callable": exact_match,
}

# Kept as the last expression so the cell still displays the call's result.
humanloop.evaluations.run(
    file=evaluation_file,
    name="MedQA Evaluation Decorators",
    dataset=evaluation_dataset,
    evaluators=[levenshtein_evaluator, exact_match_evaluator],
    workers=8,
)

[96mEvaluating your flow function corresponding to `Andrei QA/SDK/Decorators Evaluation/MedQA Answer Flow` on Humanloop[0m 



[96mNavigate to your Evaluation:[0m
https://app.humanloop.com/project/fl_TTxo2YKusqhfMHfQC9sGq/evaluations/evr_vv8Av5FQsHgOeXRtv0DHW/stats

[96mFlow Version ID: flv_K5LsdlvCdscZh29TMFcDd[0m
[96mRun ID: rn_mJ0pbOjJTGWby0zzjpFgP[0m
[96m
Running 'MedQA Answer Flow' over the Dataset 'Dataset' using 8 workers[0m 

[96m⏳ Evaluation Progress[0m
Total Logs: 18
Total Judgments: 36



[96m⏳ Evaluation Progress[0m
Total Logs: 20
Total Judgments: 38



[96m⏳ Evaluation Progress[0m
Total Logs: 20
Total Judgments: 40



[96m📊 Evaluation Results for Andrei QA/SDK/Decorators Evaluation/MedQA Answer Flow [0m
+-------------------------------------------------+---------------------+
|                                                 |        Latest       |
+-------------------------------------------------+---------------------+
|                                 

[########################################] 20/20 (100.00%) | DONE1s


[]