# Evaluating Our Data

Let's walk through an example of evaluating retrieval quality on the MS MARCO dataset, using real user queries.

## Using a real Dataset

In [3]:
from itertools import product

# Cut-offs k for the "@k" metrics: score() evaluates every metric on the
# first k predictions for each k listed here.
SIZES = [3, 5, 10, 15, 25]

def calculate_recall(predictions, labels):
    """Return the share of ``labels`` that occur in ``predictions``.

    Each entry of ``labels`` is checked for membership in ``predictions``;
    the number of hits is divided by ``len(labels)``. An empty ``labels``
    yields ``0``.
    """
    if not labels:
        return 0

    hits = 0
    for expected in labels:
        if expected in predictions:
            hits += 1
    return hits / len(labels)


def calculate_reciprocal_rank(predictions, labels):
    """Return ``1 / rank`` of the first prediction found in ``labels``.

    Ranks are 1-based, so a hit in the first position scores ``1.0``.
    Returns ``0`` when no prediction is in ``labels``.
    """
    first_hit_rank = next(
        (
            rank
            for rank, candidate in enumerate(predictions, start=1)
            if candidate in labels
        ),
        None,
    )
    if first_hit_rank is None:
        return 0
    return 1 / first_hit_rank


# Registry of metric name -> scoring function; the keys become the prefix of
# the "<name>@<k>" entries produced by score().
metrics = {"mrr": calculate_reciprocal_rank, "recall": calculate_recall}


def score(preds, label):
    """Compute every registered metric at every cut-off for a single query.

    ``preds`` is truncated to its first ``size`` entries for each ``size`` in
    ``SIZES`` and compared against the single expected ``label``. The result
    maps ``"<metric>@<size>"`` to the metric value rounded to 3 decimals.
    """
    results = {}
    for fn_name, size in product(metrics.keys(), SIZES):
        metric_fn = metrics[fn_name]
        top_k = preds[:size]
        results[f"{fn_name}@{size}"] = round(metric_fn(top_k, [label]), 3)
    return results

In [17]:
from lib.data import get_labels
from tqdm import tqdm
from lib.query import full_text_search
from lib.models import EmbeddedPassage
import lancedb
from lib.db import get_table
import pandas as pd

# Connect to the LanceDB database at ../lance and read the labelled queries
# (each entry is later indexed with 'selected_chunk_id').
db = lancedb.connect("../lance")
data = get_labels("../queries_single_label.json")
table = get_table(db,"ms_marco",EmbeddedPassage)
# NOTE(review): 25 is presumably the number of results fetched per query; it
# equals the largest cut-off in SIZES, so keep the two in sync — confirm
# against lib.query.full_text_search.
search_results = full_text_search(table,data,25)
# Pair each query with its search results and score them against the query's
# 'selected_chunk_id'.
evaluation_metrics = [
    score(retrieved_chunk_ids,query['selected_chunk_id']) 
    for retrieved_chunk_ids,query in zip(search_results,data)
]
# Average each "<metric>@<k>" column over all queries.
pd.DataFrame(evaluation_metrics).mean()

100%|█████████████████████████████████████████████████████████████████████████████████████████| 111/111 [00:02<00:00, 45.71it/s]


mrr@3        0.345306
mrr@5        0.391703
mrr@10       0.416468
mrr@15       0.419622
mrr@25       0.420027
recall@3     0.522523
recall@5     0.729730
recall@10    0.909910
recall@15    0.945946
recall@25    0.954955
dtype: float64

## Cold Starting with Instructor

What can we do if we have no user queries and we're just starting out? The easiest way is to use synthetic queries: we automatically generate questions from our own text chunks and use them as evaluation data.

In [18]:
import instructor
import openai
from pydantic import BaseModel,Field
from tqdm.asyncio import tqdm_asyncio as asyncio

# Async OpenAI client wrapped by instructor; generate_question_batch awaits
# its chat.completions.create calls.
client = instructor.from_openai(openai.AsyncOpenAI())

# Response schema passed as response_model when generating synthetic questions.
# The class docstring and the Field descriptions are part of the model's JSON
# schema, so they act as prompt text rather than plain documentation.
class QuestionAnswerPair(BaseModel):
    """
    This model represents a pair of a question generated from a text chunk, its corresponding answer,
    and the chain of thought leading to the answer. The chain of thought provides insight into how the answer
    was derived from the question.
    """

    # Declared before question/answer — presumably so the model writes its
    # reasoning first; TODO confirm this ordering is intentional.
    chain_of_thought: str = Field(
        ..., description="The reasoning process leading to the answer."
    )
    # The only field read by generate_question_batch (as the "input" value).
    question: str = Field(
        ..., description="The generated question from the text chunk."
    )
    answer: str = Field(..., description="The answer to the generated question.")

async def generate_question_batch(text_chunk_batch):
    """Generate one synthetic search query for every text chunk in the batch.

    A request is issued per chunk through the module-level ``client`` and all
    requests are awaited together with ``asyncio.gather`` (here ``asyncio`` is
    the ``tqdm_asyncio`` alias imported in this cell, not the stdlib module).

    Returns a list of ``{"input": <generated question>, "source": <chunk>}``
    dicts, one per chunk.
    """
    system_prompt = "You are a world class AI that excels at generating hypothethical search queries. You're about to be given a text snippet and asked to generate a search query which is specific to the specific text chunk that you'll be given. Make sure to use information from the text chunk."

    async def generate_question(text: str):
        # Keep the source text next to the response so the two can be paired
        # once every request has completed.
        conversation = [
            {
                "role": "system",
                "content": system_prompt,
            },
            {"role": "user", "content": f"Here is the text chunk : {text}"},
        ]
        pair = await client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=conversation,
            response_model=QuestionAnswerPair,
            max_retries=3,
        )
        return (pair, text)

    pending = [generate_question(chunk) for chunk in text_chunk_batch]
    completed = await asyncio.gather(*pending)

    rows = []
    for pair, chunk in completed:
        rows.append({"input": pair.question, "source": chunk})
    return rows

# Three sample passages used as input for synthetic question generation.
chunks = [
    "Conversion disorder is a type of somatoform disorder where physical symptoms or signs are present that cannot be explained by a medical condition. Very importantly, unlike factitious disorders and malingering, the symptoms of somatoform disorders are not intentional or under conscious control of the patient",
    "A conifer is a tree or shrub which produces distinctive cones as part of its sexual reproduction. These woody plants are classified among the gymnosperms, and they have a wide variety of uses, from trapping carbon in the environment to providing resins which can be used in the production of solvents. Several features beyond the cones set conifers apart from other types of woody plants. A conifer is typically evergreen, although some individuals are deciduous, and almost all conifers have needle or scale-like leaves",
    "Known by multiple common names, such as humbug damselfish, three-striped damselfish and white-tailed damselfish, Dascyllus aruanus is a feisty little fish that adapts well to aquarium life. Three-striped damselfish can be pugnacious and are better introduced at the latter stages of setting up a marine fish community. Remove as many of the three-striped damselfish fry as you want to try and raise to a rearing aquarium, with an absence of adult fish and invertebrates that might look upon the young fish as tasty morsels for the taking. Dascyllus aruanus is a worthy first-time breeding project for up-and-coming marine aquarists"
]

# Top-level await: valid in a notebook cell, but not in a plain Python script.
questions = await generate_question_batch(chunks)
# Bare expression so the notebook displays the generated question/source dicts.
questions

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.73s/it]


[{'input': 'What distinguishes conversion disorder from factitious disorders and malingering?',
  'source': 'Conversion disorder is a type of somatoform disorder where physical symptoms or signs are present that cannot be explained by a medical condition. Very importantly, unlike factitious disorders and malingering, the symptoms of somatoform disorders are not intentional or under conscious control of the patient'},
 {'input': 'What are some characteristics that set conifers apart from other types of woody plants?',
  'source': 'A conifer is a tree or shrub which produces distinctive cones as part of its sexual reproduction. These woody plants are classified among the gymnosperms, and they have a wide variety of uses, from trapping carbon in the environment to providing resins which can be used in the production of solvents. Several features beyond the cones set conifers apart from other types of woody plants. A conifer is typically evergreen, although some individuals are deciduous, 

### What is Instructor?

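Instructor is a library for getting structured, validated output from LLMs: you describe the shape you want as a Pydantic model, and Instructor returns the response as a validated instance of that model.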

In [19]:
import instructor
from pydantic import BaseModel
from openai import OpenAI


# Define your desired output structure
# (plain annotated fields only: no class docstring or Field descriptions yet)
class UserInfo(BaseModel):
    name: str
    age: int


# Patch the OpenAI client
# NOTE: this rebinds the notebook-global `client` from the async client created
# in the question-generation cell to a synchronous one. generate_question_batch
# awaits `client` calls, so re-create the async client before re-running it.
client = instructor.from_openai(OpenAI())

# Extract structured data from natural language
# response_model makes the call return a UserInfo instance, whose attributes
# are printed below.
user_info = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=UserInfo,
    messages=[{"role": "user", "content": "John Doe is 30 years old."}],
)

print(user_info.name)
print(user_info.age)

John Doe
30


### Exercises

Now that we've seen what Instructor can do, let's work through a few different exercises to get a better understanding of the library

#### Adding some docstrings

Let's try creating a Pydantic Model that has docstrings and descriptions using the `Field` object.

Modify the original `UserInfo` object to include a docstring and a description of each field

In [20]:
from pydantic import Field

# Same two fields as the earlier UserInfo, now with a class docstring and a
# description on each field.
class UserInfo(BaseModel):
    """
    This is a model which represents a single user's information
    """
    name: str = Field(...,description="This is the user's name which we have extracted")
    age: int = Field(...,description="This is the user's age which we have extracted")

# Dump the JSON schema: the docstring and the Field descriptions show up as
# 'description' entries in it.
UserInfo.model_json_schema()

{'description': "This is a model which represents a single user's information",
 'properties': {'name': {'description': "This is the user's name which we have extracted",
   'title': 'Name',
   'type': 'string'},
  'age': {'description': "This is the user's age which we have extracted",
   'title': 'Age',
   'type': 'integer'}},
 'required': ['name', 'age'],
 'title': 'UserInfo',
 'type': 'object'}

#### Using simple validation

Now that we've seen how to work with simple User Fields, let's start implementing validators. 

Validators are simple functions that run on the response returned from OpenAI. Using validators, we can ensure that the output is valid. To show how a simple validator might work, let's implement a model that generates between three and five categories (plus some search keywords) for a given article title.

In [21]:
from pydantic import field_validator

# Response schema for the article-classification example: categories (length
# validated below) plus keywords (no validator).
class Metadata(BaseModel):
    """
    This is a model which represents a list of categories that we can classify the given article into
    """
    categories: list[str] = Field(..., description="This is the list of categories that we can classify the given article into")
    keywords: list[str] = Field(...,description="These are some keywords that users might search for when looking for similar articles as the given article.")

    # Runs on the 'categories' field only; raising ValueError makes validation
    # of the response fail.
    @field_validator('categories')
    def check_categories_length(cls, v):
        # Accept only lists with 3 to 5 entries (both bounds inclusive).
        if not (3 <= len(v) <= 5):
            raise ValueError('categories must have at least 3 elements and at most 5 elements')
        return v


In [22]:
# Ask the model to classify an article title into a Metadata instance.
# NOTE(review): unlike the other create() calls in this notebook, no
# max_retries is passed here — presumably the library default applies; confirm.
metadata = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=Metadata,
    messages=[{"role": "system", "content": "You are a World Class classification Algorithm. You are about to be given an article title to classify. Make sure to return your response in the model provided"},
             {"role": "user", "content": "Give me a sample article title for classification: The Future of Artificial Intelligence in Healthcare"}
             ],
)
# Bare expression so the notebook displays the validated Metadata instance.
metadata

Metadata(categories=['Technology', 'Healthcare', 'Artificial Intelligence'], keywords=['Future', 'Artificial Intelligence', 'Healthcare'])

#### More Complex Types

We've now seen how to use Pydantic to validate our returned types with instructor. Now let's try a more complex example

Imagine you're trying to do some query parsing and you have a set of given tools

1. Internet Search
2. Database Queries
3. Meeting Scheduler

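How might we represent this in a Pydantic Model? The example below models two of these tools, internet search and calendar queries, as separate action types that are combined in a single list.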

In [31]:
from datetime import datetime
from typing import List,Literal,Union
from pydantic import field_validator
from openai import OpenAI
import instructor

# Synchronous instructor-wrapped OpenAI client; generate_actions below calls it
# without awaiting.
client = instructor.from_openai(OpenAI())

# One of the two action types that QueryModel.actions can contain.
class InternetSearch(BaseModel):
    """
    Model for representing an internet search query.
    
    """
    id: int = Field(..., description="Unique id of the query")
    search_query: str = Field(..., description="This is an internet search query that we will execute to identify relevant information.")
    # Typed as a list of ints — presumably the `id` values of other actions
    # that must run first; TODO confirm. Defaults to an empty list.
    dependencies: List[int] = Field(
        default_factory=list,
        description="List of sub questions that need to be answered before asking this question",
    )

class CalendarQuery(BaseModel):
    """
    A model that represents a query for the events on one of the user's calendars within a date range.
    """
    # The docstring above and the Field descriptions below are part of the
    # model's JSON schema, so they double as prompt text: the date descriptions
    # state the dd-mm format that validate_date_format enforces.
    id: int = Field(..., description="Unique id of the query")
    calendar: Literal['Personal', 'Work'] = Field(..., description="The type of calendar (Personal or Work).")
    start_date: str = Field(..., description="The earliest date for events that we'd like to fetch for this calendar, in dd-mm format (e.g. 03-06)")
    end_date: str = Field(..., description="The latest date for events that we'd like to fetch for this calendar, in dd-mm format (e.g. 10-06)")
    # Typed as a list of ints — presumably the `id` values of other actions
    # that must run first; TODO confirm. Defaults to an empty list.
    dependencies: List[int] = Field(
        default_factory=list,
        description="List of sub questions that need to be answered before asking this question",
    )

    @field_validator("start_date", "end_date")
    def validate_date_format(cls, value):
        """Check that a date string is a valid day-month pair in dd-mm form.

        Returns the value unchanged when it parses; raises ValueError with a
        format hint otherwise.
        """
        try:
            # A year-less "%d-%m" parse falls back to the year 1900, which is
            # not a leap year, so the valid date "29-02" would be rejected.
            # Parsing against a fixed leap year accepts it and avoids relying
            # on the implicit default year.
            datetime.strptime(f"{value}-2000", "%d-%m-%Y")
        except ValueError as err:
            raise ValueError("Date must be in the format dd-mm") from err
        return value
    

# Top-level response schema returned by generate_actions.
class QueryModel(BaseModel):
    """
    A list of actions to execute in order to complete the user's request
    """
    # Each entry is validated as either an InternetSearch or a CalendarQuery.
    actions: List[Union[InternetSearch, CalendarQuery]] = Field(..., description="A list of actions.")


# Background facts given to the model ahead of the user's request. Kept as the
# default so existing calls behave exactly as before.
_DEFAULT_CONTEXT = "The date today is 27 May 2024, Monday. The user lives in Downtown Toronto and generally likes Japanese Food"


def generate_actions(request: str, context: str = _DEFAULT_CONTEXT) -> QueryModel:
    """
    Break a user's request down into a list of actions to execute.

    Args:
        request: The user's request in natural language.
        context: Background information (current date, location, preferences)
            sent to the model before the request. Defaults to the fixed
            context used in this notebook; pass your own to use the real
            current date or a different user profile.

    Returns:
        A QueryModel whose ``actions`` are InternetSearch / CalendarQuery
        entries.
    """
    return client.chat.completions.create(
        model="gpt-4o",
        response_model=QueryModel,
        messages=[
            {"role": "system", "content": "You are a scheduling assistant capable of breaking down complex user queries into actions to be executed. Do not answer the question but instead return a list of steps in order to get enough information to answer the user's query. Always err on the side of caution."},
            # The context is sent as an assistant turn, as in the original call.
            {"role": "assistant", "content": context},
            {"role": "user", "content": request}
        ],
        max_retries=3
    )

# Example request that needs both a calendar lookup and an internet search.
request = "I'd like to grab dinner with Daniel sometime next week. Can you help me find some time in my calendar and some potential dinner spots?"
actions = generate_actions(request)
# Prints the QueryModel, i.e. its list of validated action objects.
print(actions)

actions=[CalendarQuery(id=1, calendar='Personal', start_date='03-06', end_date='10-06', dependencies=[]), InternetSearch(id=2, search_query='best Japanese restaurants in Downtown Toronto', dependencies=[])]
