## Setting up the notebook

High-level configs

In [1]:
# Reload edited modules automatically so changes to the local package are picked up live
%reload_ext autoreload
%autoreload 2

from dotenv import load_dotenv

# Load environment variables from .env file. Adjust the path to the .env file as needed.
load_dotenv(dotenv_path='../.env')

# Enable asyncio in Jupyter (later cells call asyncio.run(...) directly from the notebook)
import asyncio
import nest_asyncio

nest_asyncio.apply()

# Add the package to the path (required if you are running this notebook from the examples folder)
import sys
sys.path.append('../../')


Import required packages

In [9]:
import json
import numpy as np
from openai import AsyncOpenAI
import pandas as pd
from pydantic import BaseModel
from tqdm.auto import tqdm

from lattereview.providers import OpenAIProvider
from lattereview.providers import LiteLLMProvider
from lattereview.agents import ScoringReviewer
from lattereview.workflows import ReviewWorkflow

## Data

Build five example stories and, for each story, three dummy TRUE/FALSE question–answer pairs:

In [4]:
# Response format handed to OpenAIProvider (as `response_format_class`) when a story is generated.
class BuildStoryOutput(BaseModel):
    # The generated story text
    story: str
    # TRUE/FALSE questions about the story (the prompt asks for three)
    questions: list[str]
    # Boolean answers to the questions (the prompt asks for three); presumably aligned by position with `questions` — confirm
    answers: list[bool]

async def build_story():
    """Ask gpt-4o for a one-paragraph story plus three TRUE/FALSE questions and their answers.

    Returns:
        Whatever ``OpenAIProvider.get_json_response`` returns for the prompt,
        with the response requested in the ``BuildStoryOutput`` format.
    """
    story_provider = OpenAIProvider(model="gpt-4o", response_format_class=BuildStoryOutput)
    # The instruction text is sent exactly as written, indentation included.
    story_prompt = """
    Write a one-paragraph story with whatever realistic or imaginary theme you like,  
    then create three TRUE/FALSE questions based on your story. 
    Ensure that only readers of your story can determine whether the statements are true or false. 
    Do not reveal the answers to your questions.
    Return your story, a Python list of three questions, and another Python list of three boolean responses to the questions as your output.
    """
    # A high temperature is used so repeated calls give different stories.
    provider_result = await story_provider.get_json_response(story_prompt, temperature=0.9)
    return provider_result

def run_build_story():
    """Run ``build_story`` to completion and return the first element of its result."""
    provider_result = asyncio.run(build_story())
    # Only the first element is kept; the caller parses it with json.loads.
    first_item = provider_result[0]
    return first_item

# Number of stories to generate; each story contributes one row per question.
N_STORIES = 5
DATA_COLUMNS = ["question", "answer", "story"]

# Collect one record per (question, answer) pair; the owning story is repeated on each of its rows.
records = []
for _ in tqdm(range(N_STORIES)):
    out = json.loads(run_build_story())
    # Pair questions with answers instead of indexing 0..2, so a reply with
    # fewer than three items does not raise IndexError.
    for question, answer in zip(out["questions"], out["answers"]):
        records.append({"question": question, "answer": answer, "story": out["story"]})

# Build the frame once under a single name (no dict -> DataFrame rebinding),
# with an explicit column order so the CSV layout stays fixed.
data = pd.DataFrame(records, columns=DATA_COLUMNS)
data.to_csv("data.csv", index=False)
data

100%|██████████| 5/5 [00:18<00:00,  3.64s/it]


Unnamed: 0,question,answer,story
0,Did Orville ask Felix to gather the creatures ...,False,In the heart of an enchanted forest lived a wi...
1,Was the riddle about what sings with a voice a...,False,In the heart of an enchanted forest lived a wi...
2,Did Felix thank Orville for his wisdom at the ...,True,In the heart of an enchanted forest lived a wi...
3,The Whispering Oak was located in a bustling c...,False,"In the heart of the enchanted forest, there la..."
4,Liri found a hidden door inside the Whispering...,True,"In the heart of the enchanted forest, there la..."
5,The pool inside the Whispering Oak showed Liri...,True,"In the heart of the enchanted forest, there la..."
6,The villagers of Harmonia cultivated vegetable...,False,In a hidden valley surrounded by towering peak...
7,The Festival of Notes celebrated the unique me...,True,In a hidden valley surrounded by towering peak...
8,"Lila discovered the Whisper Bloom, a flower wi...",True,In a hidden valley surrounded by towering peak...
9,The story takes place in a bustling city with ...,False,"In the heart of the enchanted forest, there st..."


Embedding the stories to build a vector base:

In [5]:
# Reload the question/answer/story table that the previous cell wrote to data.csv
data = pd.read_csv("data.csv")

async def get_embedding(text):
    """Embed one string or a list of strings with OpenAI's ``text-embedding-ada-002``.

    Args:
        text: A single string, or a list of strings to embed in one request.

    Returns:
        A single ``np.ndarray`` when ``text`` is a string; otherwise a list of
        ``np.ndarray``, one per item in the API response.
    """
    single_input = isinstance(text, str)
    texts = [text] if single_input else list(text)
    # Newlines are flattened to spaces before the text is sent for embedding.
    texts = [x.replace("\n", " ") for x in texts]
    client = AsyncOpenAI()
    out = await client.embeddings.create(
        model="text-embedding-ada-002",
        input=texts,
        encoding_format="float"
    )
    vectors = [np.array(x.embedding) for x in out.data]
    # Choose the return shape from the input type, not from the number of
    # results, so a one-element list still yields a list.
    return vectors[0] if single_input else vectors

# De-duplicate the stories while keeping first-appearance order, so the row
# order of the vector base is the same on every run (iterating a set of
# strings is not order-stable across kernel restarts).
stories = dict.fromkeys(data["story"].tolist())

# Create async tasks for all embeddings
async def process_embeddings():
    """Embed every story in ``stories`` concurrently.

    Returns:
        A list of ``(embedding, story)`` tuples, one per key of ``stories``.
    """
    pending = []
    for story_text in stories.keys():
        pending.append(get_embedding(story_text))
    story_vectors = await asyncio.gather(*pending)
    # Pair each result with the story text it was computed from.
    return [(vector, story_text) for vector, story_text in zip(story_vectors, stories.keys())]

# Run the async code and get results
vector_story_pairs = await process_embeddings()
vector_base = np.array([x[0] for x in vector_story_pairs])
vector_base

array([[ 0.00841486, -0.00538075, -0.00787282, ..., -0.00288703,
        -0.00163191, -0.03064515],
       [-0.00365802, -0.00146723,  0.00292441, ...,  0.00866937,
        -0.01723155, -0.02383743],
       [ 0.02797584, -0.00581644, -0.00768624, ..., -0.00786093,
         0.00175011, -0.04288249],
       [ 0.01053426,  0.00031505, -0.02427604, ..., -0.0193117 ,
         0.00797888, -0.03425796],
       [ 0.02183011, -0.01315995, -0.02121128, ..., -0.00819617,
        -0.01381169, -0.01560234]])

## Retrieval

In [6]:
async def find_relevant_story(statement):
    """Return the story whose embedding has the highest cosine similarity to ``statement``.

    Args:
        statement: Text to embed and compare against every row of ``vector_base``.

    Returns:
        The story text paired with the best-matching row in ``vector_story_pairs``.
    """
    query_vector = await get_embedding(statement)
    # Cosine similarity = dot product divided by the product of the two norms.
    numerators = np.dot(vector_base, query_vector)
    denominators = np.linalg.norm(vector_base, axis=1) * np.linalg.norm(query_vector)
    best_row = np.argmax(numerators / denominators)
    _embedding, best_story = vector_story_pairs[best_row]
    return best_story

input_index = 11
statement = data.iloc[input_index]["question"]
retrieved_story = await find_relevant_story(statement)

print(f"=== The question was chosen from row {input_index} ===\n{statement}")
print(f"=== The related story to the question ===\n{data.iloc[input_index]['story']}")
print(f"=== The retrieved Story ===\n{retrieved_story}")

=== The question was chosen from row 11 ===
Dara used a high-tech gadget to find Eldergrove.
=== The related story to the question ===
In the heart of the enchanted forest, there stood an ancient oak tree known as Eldergrove. It was said that Eldergrove was as old as time itself, with branches stretching out like the fingers of an old, wise giant. The villagers from the nearby town often whispered about the magical fruit it bore once every hundred years. Young Dara, who was both curious and bold, decided to seek out this legendary tree. Guided by a map she found in a dusty old book at the library, she embarked on her journey. When Dara finally reached Eldergrove, she was in awe of its majesty. Hanging from one of the branches was a single, glowing fruit, pulsating with a soft golden light. As she reached out to touch it, she heard a faint whisper that seemed to come from the tree itself, "Only those pure of heart may taste my fruit."
=== The retrieved Story ===
In the heart of the ench

## Scoring with Retrieval Augmented Generation

In [7]:
# Settings for the scoring reviewer: it must answer with 1 (TRUE) or 2 (FALSE).
# `find_relevant_story` is passed as `additional_context`; presumably the reviewer
# calls it to fetch the story for each input — confirm against the lattereview docs.
reviewer = ScoringReviewer(
    provider=LiteLLMProvider(model="gpt-4o-mini"),
    name="reviewer",
    backstory="A frequent book reader",
    input_description="TRUE/FALSE questions about stories",
    scoring_task="Decide if the input statement is True or False given the provided story in the provided context",
    scoring_set=[1, 2],
    scoring_rules="Score 1 if the statement is TRUE and 2 if the statement is FALSE.",
    reasoning="brief",
    additional_context=find_relevant_story,
    model_args={"max_tokens": 200, "temperature": 0.1},
    max_concurrent_requests=20,
)

# A single review round ("A") in which the reviewer reads only the `question` column.
review = ReviewWorkflow(
    workflow_schema=[
        {"round": "A", "reviewers": [reviewer], "text_inputs": ["question"]},
    ]
)

# Run the workflow over the whole table and display the result.
updated_data = asyncio.run(review(data))
updated_data



Processing 15 eligible rows


['round: A', 'reviewer_name: reviewer'] -                     2024-12-26 23:27:56: 100%|██████████| 15/15 [00:02<00:00,  6.52it/s]

The following columns are present in the dataframe at the end of reviewer's reivew in round A: ['question', 'answer', 'story', 'round-A_reviewer_output', 'round-A_reviewer_reasoning', 'round-A_reviewer_score', 'round-A_reviewer_certainty']





Unnamed: 0,question,answer,story,round-A_reviewer_output,round-A_reviewer_reasoning,round-A_reviewer_score,round-A_reviewer_certainty
0,Did Orville ask Felix to gather the creatures ...,False,In the heart of an enchanted forest lived a wi...,{'reasoning': 'The statement is FALSE because ...,The statement is FALSE because Orville asked F...,2,90
1,Was the riddle about what sings with a voice a...,False,In the heart of an enchanted forest lived a wi...,{'reasoning': 'The riddle presented in the sto...,The riddle presented in the story is about wha...,2,90
2,Did Felix thank Orville for his wisdom at the ...,True,In the heart of an enchanted forest lived a wi...,{'reasoning': 'Felix did thank Orville for his...,Felix did thank Orville for his wisdom at the ...,1,90
3,The Whispering Oak was located in a bustling c...,False,"In the heart of the enchanted forest, there la...",{'reasoning': 'The statement claims that the W...,The statement claims that the Whispering Oak w...,2,90
4,Liri found a hidden door inside the Whispering...,True,"In the heart of the enchanted forest, there la...",{'reasoning': 'The statement is TRUE because t...,The statement is TRUE because the context clea...,1,100
5,The pool inside the Whispering Oak showed Liri...,True,"In the heart of the enchanted forest, there la...",{'reasoning': 'The statement is TRUE because t...,The statement is TRUE because the context expl...,1,100
6,The villagers of Harmonia cultivated vegetable...,False,In a hidden valley surrounded by towering peak...,{'reasoning': 'The statement claims that the v...,The statement claims that the villagers of Har...,2,90
7,The Festival of Notes celebrated the unique me...,True,In a hidden valley surrounded by towering peak...,{'reasoning': 'The statement accurately reflec...,The statement accurately reflects the context ...,1,90
8,"Lila discovered the Whisper Bloom, a flower wi...",True,In a hidden valley surrounded by towering peak...,{'reasoning': 'The statement is TRUE because i...,The statement is TRUE because it accurately de...,1,90
9,The story takes place in a bustling city with ...,False,"In the heart of the enchanted forest, there st...",{'reasoning': 'The statement claims that the s...,The statement claims that the story takes plac...,2,95


In [8]:
# Inspect the record stored at position 11 of the reviewer's memory; in the captured
# output it is task A-11, the same question used in the retrieval example above.
reviewer.memory[11]

{'system_prompt': "Your name is: <<reviewer>> Your backstory is: <<A frequent book reader>>. Your task is to review input itmes with the following description: <<TRUE/FALSE questions about stories>>. Your final output should have the following keys: reasoning (<class 'str'>), score (<class 'int'>), certainty (<class 'int'>).",
 'model_args': {'max_tokens': 200, 'temperature': 0.1},
 'input_prompt': '**Review the input item below and complete the scoring task as instructed:** --- **Input item:** <<Review Task ID: A-11 === question === Dara used a high-tech gadget to find Eldergrove.>> **Scoring task:** <<Decide if the input statement is True or False given the provided story in the provided context>> --- **Instructions:** 1. **Score** the input item using only the values in this set: [1, 2]. 2. Follow these rules when determining your score: <<Score 1 if the statement is TRUE and 2 if the statement is FALSE.>>. 3. After assigning a score, report your certainty level as a value between *