# Retrieve

In [1]:
!uv add requests python-frontmatter

[2mResolved [1m120 packages[0m [2min 0.46ms[0m[0m
[2mAudited [1m116 packages[0m [2min 1ms[0m[0m


In [2]:
import requests
import io
import zipfile
import frontmatter

In [3]:
# Repository coordinates for the documentation we want to index.
repo_owner = "evidentlyai"
repo_name = "docs"
branch_name = "main"

# Zip snapshot of the chosen branch; the next cell opens it in memory.
zip_url = f"https://github.com/{repo_owner}/{repo_name}/archive/refs/heads/{branch_name}.zip"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors (bad branch, rate limit) here instead
# of as a confusing zip parsing failure in the next cell.
zip_response = requests.get(zip_url, timeout=60)
zip_response.raise_for_status()

In [6]:
# Read every Markdown/MDX page straight from the in-memory archive and split
# its front matter (title, description) from the body text.
documents = []
with zipfile.ZipFile(io.BytesIO(zip_response.content)) as zip_archive:
    for file_path in zip_archive.namelist():
        if file_path.endswith(('.md', '.mdx')):
            with zip_archive.open(file_path) as file:
                content = file.read().decode('utf-8')
                post = frontmatter.loads(content)
                doc = {
                    'content': post.content,
                    'title': post.metadata.get('title'),
                    'description': post.metadata.get('description'),
                    # Drop the archive's top-level folder from the stored path.
                    'filename': file_path.split('/', 1)[-1],
                }
                documents.append(doc)

In [8]:
# Sanity check: how many Markdown/MDX pages were parsed from the archive.
print(len(documents))

95


In [9]:
from gitsource import GithubRepositoryDataReader

# Same repository and file extensions as the manual zip download above,
# this time through the gitsource helper.
reader = GithubRepositoryDataReader(
    repo_name="docs",
    repo_owner="evidentlyai",
    allowed_extensions=("md", "mdx")
)

# NOTE(review): read() presumably fetches the repository contents; the returned
# items are parsed with .parse() in the next cell — confirm against gitsource.
files = reader.read()

print(f"Loaded {len(files)} documents")

Loaded 95 documents


In [10]:
# Rebinds `documents`: the list built from the zip archive is replaced by the
# records parsed from the gitsource files.
documents = [f.parse() for f in files]

In [12]:
# Peek at one parsed record to see its fields and the shape of the content.
print(documents[10])

{'title': 'Output formats', 'description': 'How to export the evaluation results.', 'content': 'You can view or export Reports in multiple formats.\n\n**Pre-requisites**:\n\n* You know how to [generate Reports](/docs/library/report).\n\n## Log to Workspace\n\nYou can save the computed Report in Evidently Cloud or your local workspace.\n\n```python\nws.add_run(project.id, my_eval, include_data=False)\n```\n\n<Info>\n  **Uploading evals**. Check Quickstart examples [for ML](/quickstart_ml) or [for LLM](/quickstart_llm) for a full workflow.\n</Info>\n\n## View in Jupyter notebook\n\nYou can directly render the visual summary of evaluation results in interactive Python environments like Jupyter notebook or Colab.\n\nAfter running the Report, simply call the resulting Python object:\n\n```python\nmy_report\n```\n\nThis will render the HTML object directly in the notebook cell.\n\n## HTML\n\nYou can also save this interactive visual Report as an HTML file to open in a browser:\n\n```python\n

# Search

In [29]:
!uv add minsearch

[2mResolved [1m120 packages[0m [2min 0.54ms[0m[0m
[2mAudited [1m116 packages[0m [2min 1ms[0m[0m


In [30]:
from minsearch import Index

In [31]:
# Index over whole documents (one entry per page).
# NOTE(review): presumably text_fields are searched as free text and
# keyword_fields are meant for exact-match filtering — confirm in minsearch docs.
index = Index(
    text_fields=["title", "description", "content"],
    keyword_fields=["filename"]
)

# fit() is the last expression, so the cell displays the returned Index repr.
index.fit(documents)

<minsearch.minsearch.Index at 0x7842f82334d0>

In [32]:
query = "LLM as a judge"

# No num_results is passed, so the index's default result count applies.
results = index.search(query=query)

print(len(results))

10


# Chunking

In [33]:
# Toy "document": the integers 0..99, so window boundaries are easy to read off.
document = list(range(0, 100))

In [35]:
window_size = 10
start = 0
step = 5

# step < window_size, so consecutive windows overlap by window_size - step items.
while start < len(document):
    end = start + window_size
    # Slicing past the end of the list is safe; the last window is just shorter.
    chunk = document[start:end]
    print(chunk)

    start += step

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
[15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
[25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
[35, 36, 37, 38, 39, 40, 41, 42, 43, 44]
[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
[45, 46, 47, 48, 49, 50, 51, 52, 53, 54]
[50, 51, 52, 53, 54, 55, 56, 57, 58, 59]
[55, 56, 57, 58, 59, 60, 61, 62, 63, 64]
[60, 61, 62, 63, 64, 65, 66, 67, 68, 69]
[65, 66, 67, 68, 69, 70, 71, 72, 73, 74]
[70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
[75, 76, 77, 78, 79, 80, 81, 82, 83, 84]
[80, 81, 82, 83, 84, 85, 86, 87, 88, 89]
[85, 86, 87, 88, 89, 90, 91, 92, 93, 94]
[90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
[95, 96, 97, 98, 99]


In [36]:
def sliding_window(text, size=1000, step=500):
    """Split a sequence into fixed-size, possibly overlapping chunks.

    Windows start at offsets 0, step, 2*step, ... and each covers up to
    ``size`` items, so consecutive windows overlap by ``size - step`` items
    when ``step < size``. Iteration stops as soon as a window reaches the end
    of ``text``; the final chunk may be shorter than ``size``.

    Args:
        text: Any sliceable sequence with a length (typically a string).
        size: Maximum number of items per chunk. Must be positive.
        step: Distance between the starts of consecutive chunks. Must be
            positive.

    Returns:
        A list of dicts ``{'start': offset, 'content': chunk}`` in order of
        increasing offset. Empty input yields an empty list.

    Raises:
        ValueError: If ``size`` or ``step`` is not positive.
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    chunks = []
    text_length = len(text)

    # Advance by `step` (a stride). The previous `start = end - step` treated
    # `step` as an overlap, which only matched a stride when size == 2 * step
    # and never advanced when step == size.
    for start in range(0, text_length, step):
        end = start + size
        chunks.append({'start': start, 'content': text[start:end]})

        # This window already reaches the end; further windows would only
        # repeat a suffix of it.
        if end >= text_length:
            break

    return chunks

In [37]:
# Chunk every document body and carry the document's metadata on each chunk.
document_chunks = []

for doc in documents:
    # Documents with a missing or empty body contribute no chunks.
    if doc.get('content'):
        copy = doc.copy()
        content = copy.pop('content')

        chunks = sliding_window(content, size=3000, step=1500)

        for i, chunk in enumerate(chunks):
            # Attach the remaining document fields, then the chunk's position.
            chunk.update(copy)
            chunk['chunk_id'] = i

        document_chunks.extend(chunks)

In [38]:
# Inspect one chunk: its start offset, its text and the copied document fields.
print(document_chunks[10])

{'start': 9000, 'content': 'cation=[BinaryClassification(\n        target="target",\n        prediction_labels="prediction")],\n    categorical_columns=["target", "prediction"])\n```\n\nAvailable options and defaults:\n\n```python\n    target: str = "target"\n    prediction_labels: Optional[str] = None\n    prediction_probas: Optional[str] = "prediction" #if probabilistic classification\n    pos_label: Label = 1 #name of the positive label\n    labels: Optional[Dict[Label, str]] = None\n```\n\n### Ranking\n\n#### RecSys\n\nTo evaluate recommender systems performance, you must map the columns with:\n\n- Prediction: this could be predicted score or rank.\n- Target: relevance labels (e.g., this could be an interaction result like user click or upvote, or a true relevance label)\n\nThe **target** column can contain either:\n\n- a binary label (where `1` is a positive outcome)\n- any scores (positive values, where a higher value corresponds to a better match or a more valuable user action).

In [39]:
# Same field configuration as the document-level index, but fit on chunks so
# that search returns passages rather than whole pages.
chunk_index = Index(
    text_fields=["title", "description", "content"],
    keyword_fields=["filename"]
)

chunk_index.fit(document_chunks)

<minsearch.minsearch.Index at 0x7842d1bc8f50>

In [40]:
# Same query as the document-level search, now against the chunk index.
result = chunk_index.search("LLM as a judge")
print(len(result))

10


In [41]:
# Bare last expression: the notebook displays the matched chunks.
result

[{'start': 0,
  'content': 'import CloudSignup from \'/snippets/cloud_signup.mdx\';\nimport CreateProject from \'/snippets/create_project.mdx\';\n\nIn this tutorial, we\'ll show how to evaluate text for custom criteria using LLM as the judge, and evaluate the LLM judge itself.\n\n<Info>\n  **This is a local example.** You will run and explore results using the open-source Python library. At the end, we’ll optionally show how to upload results to the Evidently Platform for easy exploration.\n</Info>\n\nWe\'ll explore two ways to use an LLM as a judge:\n\n- **Reference-based**. Compare new responses against a reference. This is useful for regression testing or whenever you have a "ground truth" (approved responses) to compare against.\n- **Open-ended**. Evaluate responses based on custom criteria, which helps evaluate new outputs when there\'s no reference available.\n\nWe will focus on demonstrating **how to create and tune the LLM evaluator**, which you can then apply in different cont

In [42]:
from gitsource import chunk_documents
# Rebinds `document_chunks` with gitsource's own chunking of the same documents.
# NOTE(review): `chunk_index` was fit on the previous list and is not rebuilt in
# this cell, so later searches still use the manually built chunks.
document_chunks = chunk_documents(documents)
document_chunks[10]

{'start': 4000,
 'content': 'apping**                               |\n| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------- |\n| `numerical_columns`   | <ul>      <li>      Columns with numeric values.</li>            </ul>                                                                | All columns with numeric types (`np.number`).       |\n| `datetime_columns`    | <ul>      <li>      Columns with datetime values.</li>            <li>      Ignored in data drift calculations.</li>            </ul> | All columns with DateTime format (`np.datetime64`). |\n| `categorical_columns` | <ul>      <li>      Columns with categorical values.</li>            </ul>                                                            | All non-numeric/non-datetime columns.               |\n| `text_columns`        | <ul>      <li>      Text columns.</

# Augmenting

In [52]:
query = "How do I implement an LLM as a judge?"
# Only the top 5 chunks are requested; they become the prompt context below.
search_result = chunk_index.search(query, num_results=5)

In [53]:
import json
# Serialize the hits so they can be embedded as text inside the prompt.
search_result_json = json.dumps(search_result)

In [54]:
# System prompt: sets the assistant's role and tells it to rely on the CONTEXT.
# (Spelling of "answer" corrected; this text is sent to the model as-is.)
instructions = """
You're a course assistant, your task is to answer the QUESTION from
course students using the provided CONTEXT
""".strip()

# User prompt: the question and the JSON-serialized search hits, each wrapped
# in its own tag so the two parts are clearly separated.
user_prompt = f"""
<QUESTION>
{query}
</QUESTION>

<CONTEXT>
{search_result_json}
</CONTEXT>
""".strip()

In [None]:
def llm(client, user_prompt, instructions=None, model="gpt-4o-mini"):
    """Send one prompt to the model and return the text of its reply.

    Args:
        client: Object exposing ``responses.create(model=..., input=...)``.
        user_prompt: Content of the user message.
        instructions: Optional system message placed before the user message;
            omitted entirely when ``None``.
        model: Model identifier passed through to the client.

    Returns:
        The ``output_text`` attribute of the response returned by the client.
    """
    user_message = {"role": "user", "content": user_prompt}

    if instructions is None:
        conversation = [user_message]
    else:
        system_message = {"role": "system", "content": instructions}
        conversation = [system_message, user_message]

    reply = client.responses.create(model=model, input=conversation)
    return reply.output_text

In [56]:
from openai import OpenAI
from dotenv import load_dotenv

# NOTE(review): load_dotenv() presumably loads settings such as the OpenAI API
# key from a local .env file so OpenAI() can be built without arguments — confirm.
load_dotenv()
openai_client = OpenAI()
# One augmented call: the question plus JSON context, with the system instructions.
response = llm(openai_client, user_prompt, instructions)
print(response)

To implement an LLM (Large Language Model) as a judge, you can follow these steps based on the tutorial provided:

1. **Install Required Libraries**:
   - Use the following command to install the Evidently library:
     ```bash
     pip install evidently
     ```

2. **Import Necessary Modules**:
   - In your Python environment, import the necessary libraries:
     ```python
     import pandas as pd
     import numpy as np
     from evidently import Dataset, DataDefinition, Report, BinaryClassification
     from evidently.llm.templates import BinaryClassificationPromptTemplate
     ```

3. **Set Up OpenAI API Key**: 
   - Assign your OpenAI API key as an environment variable:
     ```python
     import os
     os.environ["OPENAI_API_KEY"] = "YOUR_KEY"
     ```

4. **Create an Evaluation Dataset**:
   - Generate a toy Q&A dataset with questions, target responses, new responses, and manual labels indicating correctness.

5. **Define the LLM Judge Prompt**:
   - Design a prompt that the L

# RAG

In [57]:
def search(query, num_results=10):
    """Return the chunks from ``chunk_index`` that best match ``query``.

    Args:
        query: Free-text search query.
        num_results: Maximum number of chunks to return. Defaults to 10,
            matching the result count the index returned when no count was
            given, so existing ``search(query)`` calls behave as before.

    Returns:
        The list of matching chunk records produced by ``chunk_index.search``.
    """
    return chunk_index.search(query, num_results=num_results)

In [58]:
def augment(query, search_result):
    """Build the user prompt for the LLM from a question and its search hits.

    Args:
        query: The user's question, placed in the <QUESTION> section.
        search_result: JSON-serializable search hits (e.g. a list of dicts);
            serialized with ``json.dumps`` into the <CONTEXT> section.

    Returns:
        The prompt string with leading and trailing whitespace stripped.
    """
    json_search_results = json.dumps(search_result)

    # Interpolate the hits passed to THIS call. The template used to read the
    # notebook-level `search_result_json`, so every prompt carried the context
    # of an earlier, unrelated query (or failed on a fresh kernel).
    user_prompt = f"""
    <QUESTION>
    {query}
    </QUESTION>

    <CONTEXT>
    {json_search_results}
    </CONTEXT>
    """.strip()

    return user_prompt

In [59]:
def rag(client, query, instructions=None, model="gpt-4o-mini"):
    """Answer ``query`` by retrieving context and passing it to the LLM.

    Runs the three pipeline stages in order: ``search`` for relevant chunks,
    ``augment`` to build the prompt from the query and those chunks, and
    ``llm`` to generate the reply.

    Args:
        client: LLM client forwarded to ``llm``.
        query: The user's question.
        instructions: Optional system prompt forwarded to ``llm``.
        model: Model identifier forwarded to ``llm``.

    Returns:
        Whatever ``llm`` returns for the augmented prompt.
    """
    hits = search(query)
    augmented_prompt = augment(query, hits)
    return llm(client, augmented_prompt, instructions, model)

In [60]:
question = "How do I implement an LLM as a judge using EvidentlyAI?"
# System prompt for the end-to-end RAG call. (Spelling of "answer" corrected;
# this text is sent to the model as-is.)
system_prompt = """
You're a course assistant, your task is to answer the QUESTION from
course students using the provided CONTEXT
""".strip()
# Full pipeline: retrieve chunks, build the prompt, ask the model.
answer = rag(openai_client, question, system_prompt)
print(answer)

To implement an LLM as a judge using EvidentlyAI, follow these steps:

1. **Installation and Setup**:
   - Install the Evidently library by running:
     ```bash
     pip install evidently
     ```
   - Import the necessary modules in your Python environment:
     ```python
     import pandas as pd
     import numpy as np
     from evidently import Dataset
     from evidently import DataDefinition
     from evidently import Report
     from evidently import BinaryClassification
     from evidently.llm.templates import BinaryClassificationPromptTemplate
     ```

2. **Set Up API Key**:
   - Provide your OpenAI API key as an environment variable:
     ```python
     import os
     os.environ["OPENAI_API_KEY"] = "YOUR_KEY"
     ```

3. **Create Dataset**:
   - Create a Q&A dataset including:
     - Questions
     - Target responses (approved)
     - New responses (system-generated)
     - Manual labels indicating correctness.
   - Use code like this to generate the dataset:
     ```python