# Evaluating Files declared in code


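This tutorial shows how to declare Humanloop Files (a Tool, a Prompt and a Flow) in code using the SDK decorators, combine them into a simple retrieval-augmented question-answering pipeline, and evaluate the resulting Flow against a dataset.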


### 1. Setting up imports


In [1]:
import os

from dotenv import load_dotenv
from humanloop import Humanloop
import chromadb
from openai import OpenAI
import pandas as pd

from levenshtein import compare_log_and_target
from exact_match import exact_match


# Load settings from a local .env file (if any) before the cells below read
# HL_API_KEY and OPENAI_API_KEY through os.getenv.
load_dotenv()

True

### 2. Instantiating the Humanloop client


In [2]:
# Build the Humanloop client from environment configuration.
# HL_API_KEY supplies the credentials. HL_BASE_URL is optional: when it is not
# set, the client keeps pointing at the local development endpoint this
# notebook was originally written against.
humanloop = Humanloop(
    api_key=os.getenv("HL_API_KEY"),
    base_url=os.getenv("HL_BASE_URL", "http://0.0.0.0:80/v5"),
)

### 3. Instantiating the vector database


In [3]:
# Create the Chroma client through the package-level `chromadb.Client` entry
# point. The former `chromadb.chromadb.Client` spelling only resolved because
# the package happens to expose itself as an attribute of its own namespace.
chroma = chromadb.Client()
collection = chroma.get_or_create_collection(name="MedQA")

# Read the textbook passages and keep a 5-row sample; the fixed random_state
# makes the same rows get selected on every run.
knowledge_base = pd.read_parquet("../../../assets/sources/textbooks.parquet")
knowledge_base = knowledge_base.sample(5, random_state=42)

# Index the sampled passages: "contents" holds the document text and "id" the
# identifier stored alongside it.
collection.add(
    documents=knowledge_base["contents"].to_list(),
    ids=knowledge_base["id"].to_list(),
)

### 4. Loading the evaluation dataset


In [4]:
# Read the JSON Lines dataset: every line of the file becomes one DataFrame row.
datapoints_df = pd.read_json("../../../assets/datapoints.jsonl", lines=True)

# Keep the first 20 rows and turn each of them into a plain dict.
datapoints = [
    record.to_dict() for _index, record in datapoints_df.head(20).iterrows()
]

### 5. Declaring Humanloop Files via code


In [5]:
# Prompt template for the MedQA question-answering Prompt.
# It is filled in with str.format, so every {placeholder} below (question,
# option_A..option_E, retrieved_data) must be supplied by the caller.
TEMPLATE = """Answer the following question factually.

Question: {question}

Options:
- {option_A}
- {option_B}
- {option_C}
- {option_D}
- {option_E}

---

Here is some retrieved information that might be helpful.
Retrieved data:
{retrieved_data}

---

Give your answer in 3 sections using the following format. Do not include the quotes or the brackets. Do include the "---" separators.
```
<chosen option verbatim>
---
<clear explanation of why the option is correct and why the other options are incorrect. keep it ELI5.>
---
<quote relevant information snippets from the retrieved data verbatim. every line here should be directly copied from the retrieved data>
```
"""

In [6]:
@humanloop.tool(path="Evaluations SDK Demo/Retrieval Tool")
def retrieval_tool(question: str) -> str:
    """Retrieve most relevant document from the vector db (Chroma) for the question."""
    # Ask for a single result for the single query text.
    query_result = collection.query(query_texts=[question], n_results=1)
    # "documents" is indexed first by query text and then by result rank, so
    # [0][0] is the top hit for the only query that was sent.
    documents_per_query = query_result["documents"]
    return documents_per_query[0][0]


@humanloop.prompt(
    path="Evaluations SDK Demo/MedQA Answer",
    model="gpt-4o",
    template=TEMPLATE,
    tools=[retrieval_tool.json_schema],
)
def ask_model(
    question: str,
    option_A: str,
    option_B: str,
    option_C: str,
    option_D: str,
    option_E: str,
) -> str:
    """Ask a question and get an answer using a simple RAG pipeline

    The question is first used to fetch one supporting document through
    ``retrieval_tool``; the question, the five answer options and that document
    are then substituted into ``TEMPLATE`` and sent to the chat model as a
    single user message.

    Args:
        question: The question text.
        option_A: First answer option.
        option_B: Second answer option.
        option_C: Third answer option.
        option_D: Fourth answer option.
        option_E: Fifth answer option.

    Returns:
        The message content of the first completion choice.
    """
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Fetch supporting context for the question.
    context = retrieval_tool(question)

    # Fill every placeholder of the prompt template.
    prompt_text = TEMPLATE.format(
        question=question,
        option_A=option_A,
        option_B=option_B,
        option_C=option_C,
        option_D=option_D,
        option_E=option_E,
        retrieved_data=context,
    )

    # Single-turn request with temperature 0.
    completion = client.chat.completions.create(
        model="gpt-4o",
        temperature=0,
        messages=[{"role": "user", "content": prompt_text}],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


@humanloop.flow(
    path="Evaluations SDK Demo/MedQA Answer Flow",
    attributes={"prompt": {"model": "gpt-4o", "environment": "evaluation"}},
)
def entrypoint(
    question: str,
    option_A: str,
    option_B: str,
    option_C: str,
    option_D: str,
    option_E: str,
):
    """Flow entry point: forward the question and its options to ``ask_model``.

    Args:
        question: The question text.
        option_A: First answer option.
        option_B: Second answer option.
        option_C: Third answer option.
        option_D: Fourth answer option.
        option_E: Fifth answer option.

    Returns:
        Whatever ``ask_model`` returns for these inputs.
    """
    prompt_inputs = {
        "question": question,
        "option_A": option_A,
        "option_B": option_B,
        "option_C": option_C,
        "option_D": option_D,
        "option_E": option_E,
    }
    return ask_model(**prompt_inputs)

### 6. Evaluating the Flow


In [7]:
# The Flow being evaluated: its Humanloop path and the local callable to run.
flow_under_test = {
    "path": "Evaluations SDK Demo/MedQA Answer Flow",
    "callable": entrypoint,
    "type": "flow",
}

# The datapoints loaded earlier, registered under a Dataset path.
evaluation_dataset = {
    "datapoints": datapoints,
    "path": "Evaluations SDK Demo/Dataset",
}

# Two locally defined evaluators; both are declared as requiring a target.
code_evaluators = [
    {
        "path": "Evaluations SDK Demo/Levenshtein",
        "args_type": "target_required",
        "return_type": "number",
        "callable": compare_log_and_target,
    },
    {
        "path": "Evaluations SDK Demo/Exact Match",
        "args_type": "target_required",
        "return_type": "boolean",
        "callable": exact_match,
    },
]

humanloop.evaluations.run(
    file=flow_under_test,
    name="MedQA Evaluation Decorators",
    dataset=evaluation_dataset,
    evaluators=code_evaluators,
    workers=8,
)

[96mEvaluating your flow function corresponding to `Evaluations SDK Demo/MedQA Answer Flow` on Humanloop[0m 



[96mNavigate to your Evaluation:[0m
http://localhost:3000/project/fl_f1LKeioSdpBr6vUXfFMgQ/evaluations/evr_TdSSqdd2e6SH48YI6bsT4/stats

[96mFlow Version ID: flv_GnVlk3RpLWCcFWCJN1flF[0m
[96mRun ID: rn_0Pg7ctzQY42KOrFw1yB4Q[0m
[96m
Running 'MedQA Answer Flow' over the Dataset 'Dataset' using 8 workers[0m 


HEY {'source_datapoint_id': 'dp_3tm7OsWbudYyOb9507EEq11FcUk', 'upload_callback': <function run_eval.<locals>.process_datapoint.<locals>.upload_callback at 0x1755e0280>, 'file_id': 'fl_f1LKeioSdpBr6vUXfFMgQ', 'run_id': 'rn_0Pg7ctzQY42KOrFw1yB4Q', 'path': 'Evaluations SDK Demo/MedQA Answer Flow'}
HEY {'source_datapoint_id': 'dp_Z98R2axKHmCtQl0WMVdMm5WTpN2', 'upload_callback': <function run_eval.<locals>.process_datapoint.<locals>.upload_callback at 0x315488c10>, 'file_id': 'fl_f1LKeioSdpBr6vUXfFMgQ', 'run_id': 'rn_0Pg7ctzQY42KOrFw1yB4Q', 'path': 'Evaluations SDK Demo/MedQA Answer Flow'}
HEY {'source_datapoint_id': 'dp_UoUeDhkka2HBg2y0qAeF4hcJiWs', 'upload_callback': <function run_eval.<locals>.process_datapoint.<locals>.upload_callback at 0x1755e00d0>, 'file_id': 'fl_f1LKeioSdpBr6vUXfFMgQ', 'run_id': 'rn_0Pg7ctzQY42KOrFw1yB4Q', 'path': 'Evaluations SDK Demo/MedQA Answer Flow'}
HEY {'source_datapoint_id': 'dp_LQNwHabGSHaCuv2ttPguQhBNd5O', 'upload_callback': <function run_eval.<locals>.pro

[########################################] 20/20 (100.00%) | DONE1ss



[96m⏳ Evaluation Progress[0m
Total Logs: 71
Total Judgments: 139



[96m⏳ Evaluation Progress[0m
Total Logs: 75
Total Judgments: 148



[96m⏳ Evaluation Progress[0m
Total Logs: 79
Total Judgments: 156



[96m⏳ Evaluation Progress[0m
Total Logs: 80
Total Judgments: 160



[96m📊 Evaluation Results for Evaluations SDK Demo/MedQA Answer Flow [0m
+----------------------------------+---------------------+---------------------+
|                                  |       Control       |        Latest       |
+----------------------------------+---------------------+---------------------+
|                           Run ID |        ZS3HR        |        0Pg7c        |
+----------------------------------+---------------------+---------------------+
|                       Version ID |         None        |         None        |
+----------------------------------+---------------------+---------------------+
|                            Added | 2024-11-11 14:11:30 | 2024-11-11 14:46:33

ValueError: Unsupported Evaluator Stat type: <class 'humanloop.types.text_evaluator_stats_response.TextEvaluatorStatsResponse'>