In [None]:
!python -m pip install pandas openpyxl openai anthropic

# Evaluate multiple OpenAI models

In [3]:
import asyncio
import json
import re
import pandas as pd
from openai import AsyncOpenAI
from anthropic import AsyncAnthropic
import os
from datetime import datetime

# Shared async OpenAI client used by the answer and grading helpers below.
# No arguments are passed, so credentials are resolved by the SDK itself
# (presumably from the OPENAI_API_KEY environment variable — confirm).
client = AsyncOpenAI()
# Async Anthropic client; os.environ.get yields None when ANTHROPIC_API_KEY is
# unset. It is not referenced by any function in this cell.
anthropic_client = AsyncAnthropic(
    api_key=os.environ.get("ANTHROPIC_API_KEY"),
)

async def get_model_answer(instruction, model_name):
    """Send ``instruction`` to the chat model ``model_name`` and return its reply.

    The request goes through the module-level ``client`` with a fixed system
    prompt and ``temperature=0.0``; the text of the first choice is returned.
    """
    system_turn = {"role": "system", "content": "You are the most intelligent entity in the universe. Reasoning step by step to make sure you get the correct answer."}
    user_turn = {"role": "user", "content": instruction}
    completion = await client.chat.completions.create(
        model=model_name,
        messages=[system_turn, user_turn],
        temperature=0.0,
        tool_choice=None,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

async def evaluate_answer(model_answer, expected_output):
    """Ask the grader model ("gpt-4o") to score an answer against the reference.

    Args:
        model_answer: Text produced by the model under evaluation.
        expected_output: Reference answer the grader must compare against.

    Returns:
        The grader's raw reply text. The prompt asks for the shape
        ``Reason: <one sentence>. Score: <0-100>``.
    """
    # The final instruction line used to ask for a "(0-10)" score, which
    # contradicted the 0 to 100 scale defined earlier in the same prompt.
    messages = [
        {"role": "system", "content": "You are a world-class AI model evaluator. "},
        {"role": "user", "content": f"""
       Model answer: {model_answer}\n\n
       Expected output: {expected_output}\n\n
       Your task is to compare the model's answer WITH THE EXPECTED OUTPUT and provide a super concise reason in one short sentence for the score, and then a score from 0 to 100. 
       Example: Reason: [super concise reason here]. Score: [score here]. 
       Use the following scale: 0 is completely wrong, 50 is missing half of the solution, 100 is completely correct, 80-90 if correct but missing some detail or not a complete answer. 
       Don't grade on formatting, as long as the answer is correct compare to the expected output. 
       If the logic is correct but the final answer is wrong, it's still wrong.
       If the answer is correct but it has extra information, it's still correct. As long as the extra info is not completely wrong or hallucinated.
       Do not grade by your knowledge, but grade based on the expected output. 
       Always include the numeric score (0-100) in your response.
       """}
    ]
    response = await client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        temperature=0.0,
        tool_choice=None
    )
    return response.choices[0].message.content

def extract_score_and_reason(evaluation):
    """Parse the grader reply into ``(score, reason)``.

    The preferred shape is ``Reason: <text> Score: <int>``. If the grader put
    the score first (``Score: <int> Reason: <text>``) or omitted the reason,
    the score is still recovered instead of being silently counted as 0.

    Args:
        evaluation: Raw grader reply; anything that is not a string is treated
            as an unparseable reply.

    Returns:
        ``(score, reason)``. When no score can be found a warning is printed
        and ``(0, "Unable to extract reason")`` is returned.
    """
    text = evaluation if isinstance(evaluation, str) else ""

    # Primary shape. The former "|10" alternative could never match because
    # "\d+" already consumes every digit run.
    match = re.search(r'Reason:\s*(.*?)\s*Score:\s*(\d+)', text, re.IGNORECASE | re.DOTALL)
    if match:
        return int(match.group(2)), match.group(1).strip()

    # Fallback: score first, reason (if any) somewhere after it.
    score_match = re.search(r'Score:\s*(\d+)', text, re.IGNORECASE)
    if score_match:
        reason_match = re.search(r'Reason:\s*(.*)', text[score_match.end():], re.IGNORECASE | re.DOTALL)
        reason = reason_match.group(1).strip() if reason_match else "Unable to extract reason"
        return int(score_match.group(1)), reason

    print(f"Warning: Could not extract score and reason from evaluation: {evaluation}")
    return 0, "Unable to extract reason"  # Default values if extraction fails

async def process_item(item, model_name):
    """Answer, grade and score a single dataset record.

    Returns the tuple ``(instruction, expected output, model answer, score,
    reason)`` for ``item``, which must provide 'instruction' and 'output'.
    """
    question = item['instruction']
    answer = await get_model_answer(question, model_name)
    reference = item['output']
    verdict = await evaluate_answer(answer, reference)
    score, reason = extract_score_and_reason(verdict)
    return question, reference, answer, score, reason

async def evaluate_model(model_name):
    """Run every record of ``eval_data`` through ``model_name`` and save the results.

    All records are processed concurrently. The per-item rows are written to an
    Excel file inside ``output_folder`` and the mean score is printed.

    Returns:
        ``(df, avg_score)``: the per-item DataFrame and its mean 'Score'.
    """
    pending = [process_item(record, model_name) for record in eval_data]
    rows = await asyncio.gather(*pending)

    column_names = ['Instruction', 'Expected Output', 'Model Answer', 'Score', 'Reason']
    df = pd.DataFrame(rows, columns=column_names)
    avg_score = df['Score'].mean()

    print(f"\nModel: {model_name}")
    print(f"Average Evaluation Score: {avg_score:.2f}")

    # One workbook per model, named after dataset, run timestamp and model.
    excel_path = f'{output_folder}/{dataset_name}_{current_time}_{model_name}.xlsx'
    df.to_excel(excel_path, index=False)
    print(f"Results saved to {excel_path}")

    return df, avg_score

async def main():
    """Evaluate each configured model in turn and write a ranked summary.

    Models are evaluated one after another. The summary table (model name and
    average score, best first) is shown and saved as
    ``model_comparison_summary.xlsx`` inside ``output_folder``.

    Returns:
        ``(results, summary_df)`` where ``results`` maps each model name to
        ``{"df": per-item DataFrame, "avg_score": mean score}``.
    """
    models_to_evaluate = ["gpt-4o-mini", "gpt-4o-2024-08-06", "gpt-4o-2024-05-13", "gpt-4-0125-preview"]  # Add your model names here
    results = {}

    for model in models_to_evaluate:
        df, avg_score = await evaluate_model(model)
        results[model] = {"df": df, "avg_score": avg_score}

    # Create a summary DataFrame
    summary_data = [(model, data["avg_score"]) for model, data in results.items()]
    summary_df = pd.DataFrame(summary_data, columns=["Model", "Average Score"])
    summary_df = summary_df.sort_values("Average Score", ascending=False).reset_index(drop=True)

    print("\nModel Comparison Summary:")
    # `display` is only injected by IPython/Jupyter. Fall back to print so the
    # asyncio.run(main()) path does not die with NameError outside a notebook.
    try:
        show = display
    except NameError:
        show = print
    show(summary_df)

    # Save summary to Excel in the output folder
    summary_excel_path = f'{output_folder}/model_comparison_summary.xlsx'
    summary_df.to_excel(summary_excel_path, index=False)
    print(f"\nSummary saved to {summary_excel_path}")

    return results, summary_df



# Load the evaluation dataset. Each record is read by process_item through the
# keys 'instruction' and 'output'.
input_file_path = './input/EvalDataset-20.json'
# Alternative datasets kept for quick switching:
# './input/huy_dataset/huy_test.json'
# 'input/math-EvalDataset-10.json'
#'./input/huy_dataset/huy_test2.json'
# './input/EvalDataset-20.json'


with open(input_file_path, 'r') as f:
  eval_data = json.load(f)

# Dataset name = input file name without directory or extension.
dataset_name = os.path.splitext(os.path.basename(input_file_path))[0]

# Run timestamp (e.g. 081224_1229PM), shared by the output folder name and the
# per-model workbook names built in evaluate_model.
current_time = datetime.now().strftime("%m%d%y_%I%M%p")
output_folder = f'./output/{dataset_name}_{current_time}'

# Create the output folder (no error if it already exists).
os.makedirs(output_folder, exist_ok=True)


# Detect IPython/Jupyter: get_ipython is only defined there.
try:
    get_ipython()
    is_notebook = True
except NameError:
    is_notebook = False

if is_notebook:
    # In a notebook an event loop is already running, so await directly.
    # NOTE(review): a top-level `await` is rejected by the plain Python
    # compiler, so this file as written appears to run only under
    # IPython/Jupyter — confirm before relying on the branch below.
    results, summary_df = await main()
else:
    # In a regular Python script, start a fresh event loop.
    results, summary_df = asyncio.run(main())


Model: gpt-4o-mini
Average Evaluation Score: 85.00
Results saved to ./output/EvalDataset-20_081224_1229PM/EvalDataset-20_081224_1229PM_gpt-4o-mini.xlsx

Model: gpt-4o-2024-08-06
Average Evaluation Score: 92.00
Results saved to ./output/EvalDataset-20_081224_1229PM/EvalDataset-20_081224_1229PM_gpt-4o-2024-08-06.xlsx

Model: gpt-4o-2024-05-13
Average Evaluation Score: 91.00
Results saved to ./output/EvalDataset-20_081224_1229PM/EvalDataset-20_081224_1229PM_gpt-4o-2024-05-13.xlsx

Model: gpt-4-0125-preview
Average Evaluation Score: 86.50
Results saved to ./output/EvalDataset-20_081224_1229PM/EvalDataset-20_081224_1229PM_gpt-4-0125-preview.xlsx

Model Comparison Summary:


Unnamed: 0,Model,Average Score
0,gpt-4o-2024-08-06,92.0
1,gpt-4o-2024-05-13,91.0
2,gpt-4-0125-preview,86.5
3,gpt-4o-mini,85.0



Summary saved to ./output/EvalDataset-20_081224_1229PM/model_comparison_summary.xlsx


# Evaluate structured output

In [18]:
import asyncio
import json
import re
import pandas as pd
from openai import AsyncOpenAI
import os
from datetime import datetime
from pydantic import BaseModel
from typing import List

# One entry of MathResponse.steps. get_model_answer renders each entry as
# "Step N: <explanation> Output: <output>".
class Step(BaseModel):
  explanation: str  # presumably the reasoning text for this step — confirm
  output: str  # presumably the intermediate result of this step — confirm

# Schema passed as `response_format` to the structured-output request in
# get_model_answer; the parsed result is flattened back into plain text there.
class MathResponse(BaseModel):
  steps: List[Step]  # rendered in list order, numbered from 1
  final_answer: str  # rendered last as "Final Answer: <final_answer>"

async def get_model_answer(instruction, model_name):
    """Return the model's answer to ``instruction`` as plain text.

    For "gpt-4o-2024-08-06" the request uses the structured-output parse call
    with ``MathResponse``; the parsed steps and final answer are flattened into
    text, or the refusal is returned when nothing was parsed. Every other model
    gets an ordinary chat completion at ``temperature=0.0``.
    """
    messages = [
        {"role": "system", "content": "You are the most intelligent entity in the universe. Reasoning step by step and consider multi angles to make sure you get the correct and complete answer."},
        {"role": "user", "content": instruction},
    ]

    # Plain-text path first, so the structured path reads without nesting.
    if model_name != "gpt-4o-2024-08-06":
        plain = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            temperature=0.0,
        )
        return plain.choices[0].message.content

    structured = await client.beta.chat.completions.parse(
        model=model_name,
        messages=messages,
        response_format=MathResponse,
    )
    message = structured.choices[0].message
    if not message.parsed:
        return message.refusal

    step_lines = []
    for i, step in enumerate(message.parsed.steps):
        step_lines.append(f"Step {i+1}: {step.explanation} Output: {step.output}")
    steps_text = "\n".join(step_lines)
    final_answer_text = f"Final Answer: {message.parsed.final_answer}"
    return f"{steps_text}\n{final_answer_text}"

async def evaluate_answer(model_answer, expected_output):
    """Ask the grader model ("gpt-4-0125-preview") to score an answer.

    The grader is told to compare ``model_answer`` with ``expected_output`` and
    reply with a one-sentence reason followed by a 0-100 score. The raw reply
    text is returned.
    """
    grading_request = f"""
      Model answer: {model_answer}\n\n
      Expected output: {expected_output}\n\n
      Your task is to compare the model's answer WITH THE EXPECTED OUTPUT and provide a super concise reason in one short sentence for the score, and then a score from 0 to 100. 
      Example: Reason: [super concise reason here]. Score: [score here]. 
      Use the following scale: 0 is completely wrong, 100 is completely correct, 80-90 if correct but missing detail or not a complete answer. 
      Don't grade on formatting, as long as the answer is correct compare to the expected output. 
      If the logic is correct but the final answer is wrong, it's still wrong.
      If the answer is correct but it has extra information, it's still correct. As long as the extra info is not completely wrong or hallucinated.
      Do not grade by your knowledge, but grade based on the expected output. 
      Always include the numeric score (0-100) in your response.
      """
    conversation = [
        {"role": "system", "content": "You are a world-class AI model evaluator. "},
        {"role": "user", "content": grading_request},
    ]
    reply = await client.chat.completions.create(
        model="gpt-4-0125-preview",
        messages=conversation,
        temperature=0.0,
        tool_choice=None,
    )
    return reply.choices[0].message.content

def extract_score_and_reason(evaluation):
    """Parse the grader reply into ``(score, reason)``.

    The preferred shape is ``Reason: <text> Score: <int>``. If the grader put
    the score first (``Score: <int> Reason: <text>``) or omitted the reason,
    the score is still recovered instead of being silently counted as 0.

    Args:
        evaluation: Raw grader reply; anything that is not a string is treated
            as an unparseable reply.

    Returns:
        ``(score, reason)``. When no score can be found a warning is printed
        and ``(0, "Unable to extract reason")`` is returned.
    """
    text = evaluation if isinstance(evaluation, str) else ""

    # Primary shape. The former "|100" alternative could never match because
    # "\d+" already consumes every digit run.
    match = re.search(r'Reason:\s*(.*?)\s*Score:\s*(\d+)', text, re.IGNORECASE | re.DOTALL)
    if match:
        return int(match.group(2)), match.group(1).strip()

    # Fallback: score first, reason (if any) somewhere after it.
    score_match = re.search(r'Score:\s*(\d+)', text, re.IGNORECASE)
    if score_match:
        reason_match = re.search(r'Reason:\s*(.*)', text[score_match.end():], re.IGNORECASE | re.DOTALL)
        reason = reason_match.group(1).strip() if reason_match else "Unable to extract reason"
        return int(score_match.group(1)), reason

    print(f"Warning: Could not extract score and reason from evaluation: {evaluation}")
    return 0, "Unable to extract reason"  # Default values if extraction fails

async def process_item(item, model_name):
    """Answer, grade and score a single dataset record.

    Returns the tuple ``(instruction, expected output, model answer, score,
    reason)`` for ``item``, which must provide 'instruction' and 'output'.
    """
    question = item['instruction']
    answer = await get_model_answer(question, model_name)
    reference = item['output']
    verdict = await evaluate_answer(answer, reference)
    score, reason = extract_score_and_reason(verdict)
    return question, reference, answer, score, reason

async def evaluate_model(model_name):
    """Run every record of ``eval_data`` through ``model_name`` and save the results.

    All records are processed concurrently. The per-item rows are written to an
    Excel file inside ``output_folder`` and the mean score is printed.

    Returns:
        ``(df, avg_score)``: the per-item DataFrame and its mean 'Score'.
    """
    pending = [process_item(record, model_name) for record in eval_data]
    rows = await asyncio.gather(*pending)

    column_names = ['Instruction', 'Expected Output', 'Model Answer', 'Score', 'Reason']
    df = pd.DataFrame(rows, columns=column_names)
    avg_score = df['Score'].mean()

    print(f"\nModel: {model_name}")
    print(f"Average Evaluation Score: {avg_score:.2f}")

    # One workbook per model, named after dataset, run timestamp and model.
    excel_path = f'{output_folder}/{dataset_name}_{current_time}_{model_name}.xlsx'
    df.to_excel(excel_path, index=False)
    print(f"Results saved to {excel_path}")

    return df, avg_score

async def main():
    """Evaluate each configured model in turn and write a ranked summary.

    Models are evaluated one after another. The summary table (model name and
    average score, best first) is shown and saved as
    ``model_comparison_summary.xlsx`` inside ``output_folder``.

    Returns:
        ``(results, summary_df)`` where ``results`` maps each model name to
        ``{"df": per-item DataFrame, "avg_score": mean score}``.
    """
    models_to_evaluate = ["gpt-4o-mini", "gpt-4o", "gpt-4o-2024-08-06", "gpt-4-0125-preview"]  # Add your model names here
    results = {}

    for model in models_to_evaluate:
        df, avg_score = await evaluate_model(model)
        results[model] = {"df": df, "avg_score": avg_score}

    # Create a summary DataFrame
    summary_data = [(model, data["avg_score"]) for model, data in results.items()]
    summary_df = pd.DataFrame(summary_data, columns=["Model", "Average Score"])
    summary_df = summary_df.sort_values("Average Score", ascending=False).reset_index(drop=True)

    print("\nModel Comparison Summary:")
    # `display` is only injected by IPython/Jupyter. Fall back to print so the
    # asyncio.run(main()) path does not die with NameError outside a notebook.
    try:
        show = display
    except NameError:
        show = print
    show(summary_df)

    # Save summary to Excel in the output folder
    summary_excel_path = f'{output_folder}/model_comparison_summary.xlsx'
    summary_df.to_excel(summary_excel_path, index=False)
    print(f"\nSummary saved to {summary_excel_path}")

    return results, summary_df

# Shared async OpenAI client used by get_model_answer and evaluate_answer.
client = AsyncOpenAI()

# Load the evaluation dataset. Each record is read by process_item through the
# keys 'instruction' and 'output'.
input_file_path = './input/huy_dataset/huy_test.json'

with open(input_file_path, 'r') as f:
  eval_data = json.load(f)

# Dataset name = input file name without directory or extension.
dataset_name = os.path.splitext(os.path.basename(input_file_path))[0]

# Run timestamp (e.g. 081024_1208PM), shared by the output folder name and the
# per-model workbook names built in evaluate_model.
current_time = datetime.now().strftime("%m%d%y_%I%M%p")
output_folder = f'./output/{dataset_name}_{current_time}'

# Create the output folder (no error if it already exists).
os.makedirs(output_folder, exist_ok=True)

# Detect IPython/Jupyter: get_ipython is only defined there.
try:
  get_ipython()
  is_notebook = True
except NameError:
  is_notebook = False

if is_notebook:
  # In a notebook an event loop is already running, so await directly.
  # NOTE(review): a top-level `await` is rejected by the plain Python compiler,
  # so this file as written appears to run only under IPython/Jupyter —
  # confirm before relying on the branch below.
  results, summary_df = await main()
else:
  # In a regular Python script, start a fresh event loop.
  results, summary_df = asyncio.run(main())


Model: gpt-4o-mini
Average Evaluation Score: 78.12
Results saved to ./output/huy_test_081024_1208PM/huy_test_081024_1208PM_gpt-4o-mini.xlsx

Model: gpt-4o
Average Evaluation Score: 98.44
Results saved to ./output/huy_test_081024_1208PM/huy_test_081024_1208PM_gpt-4o.xlsx

Model: gpt-4o-2024-08-06
Average Evaluation Score: 94.06
Results saved to ./output/huy_test_081024_1208PM/huy_test_081024_1208PM_gpt-4o-2024-08-06.xlsx

Model: gpt-4-0125-preview
Average Evaluation Score: 86.25
Results saved to ./output/huy_test_081024_1208PM/huy_test_081024_1208PM_gpt-4-0125-preview.xlsx

Model Comparison Summary:


Unnamed: 0,Model,Average Score
0,gpt-4o,98.4375
1,gpt-4o-2024-08-06,94.0625
2,gpt-4-0125-preview,86.25
3,gpt-4o-mini,78.125



Summary saved to ./output/huy_test_081024_1208PM/model_comparison_summary.xlsx


# Evaluate different models
- Evaluation pipeline for models from other providers: Google, Anthropic, and open-source models

In [None]:
import asyncio
import json
import re
import pandas as pd
from openai import AsyncOpenAI
from anthropic import AsyncAnthropic
import os
from datetime import datetime

# Async OpenAI client: used for OpenAI model answers and for the grader.
# No arguments are passed, so credentials are resolved by the SDK itself
# (presumably from the OPENAI_API_KEY environment variable — confirm).
client = AsyncOpenAI()
# Async Anthropic client for model names starting with "claude";
# os.environ.get yields None when ANTHROPIC_API_KEY is unset.
anthropic_client = AsyncAnthropic(
    api_key=os.environ.get("ANTHROPIC_API_KEY"),
)

async def get_openai_answer(instruction, model_name):
    """Send ``instruction`` to the OpenAI chat model ``model_name``.

    Uses the module-level ``client`` with a fixed system prompt and
    ``temperature=0.0``; returns the text of the first choice.
    """
    system_turn = {"role": "system", "content": "You are the most intelligent entity in the universe. Reasoning step by step and consider multiple angles to make sure you get the correct answer(s)."}
    user_turn = {"role": "user", "content": instruction}
    completion = await client.chat.completions.create(
        model=model_name,
        messages=[system_turn, user_turn],
        temperature=0.0,
        tool_choice=None,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

async def get_anthropic_answer(instruction, model_name):
    """Send ``instruction`` to the Anthropic model ``model_name``.

    The guidance text and the task are sent together as a single user turn
    (``max_tokens=1024``, ``temperature=0``); the text of the first content
    block of the reply is returned.
    """
    user_turn = {
        "role": "user",
        "content": f"You are the most intelligent entity in the universe. Reasoning step by step and consider multiple angles to make sure you get the correct answer(s). Here's the task: {instruction}",
    }
    reply = await anthropic_client.messages.create(
        model=model_name,
        max_tokens=1024,
        temperature=0,
        messages=[user_turn],
    )
    first_block = reply.content[0]
    return first_block.text
async def evaluate_answer(model_answer, expected_output):
    """Ask the grader model ("gpt-4o") to score an answer against the reference.

    The grader is told to compare ``model_answer`` with ``expected_output`` and
    reply with a one-sentence reason followed by a 0-100 score. The raw reply
    text is returned.
    """
    grading_request = f"""
       Model answer: {model_answer}\n\n
       Expected output: {expected_output}\n\n
       Your task is to compare the model's answer WITH THE EXPECTED OUTPUT and provide a super concise reason in one short sentence for the score, and then a score from 0 to 100. 
       Example: Reason: [super concise reason here]. Score: [score here]. 
       Use the following scale: 0 is completely wrong, 50 is missing half of the solution, 100 is completely correct, 80-90 if correct but missing some detail or not a complete answer. 
       Don't grade on formatting, as long as the answer is correct compare to the expected output. 
       If the logic is correct but the final answer is wrong, it's still wrong.
       If the answer is correct but it has extra information, it's still correct. As long as the extra info is not completely wrong or hallucinated.
       Do not grade by your knowledge, but grade based on the expected output. 
       Always include the numeric score (0-100) in your response.
       """
    conversation = [
        {"role": "system", "content": "You are a world-class AI model evaluator. "},
        {"role": "user", "content": grading_request},
    ]
    reply = await client.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        temperature=0.0,
        tool_choice=None,
    )
    return reply.choices[0].message.content

def extract_score_and_reason(evaluation):
    """Parse the grader reply into ``(score, reason)``.

    The preferred shape is ``Reason: <text> Score: <int>``. If the grader put
    the score first (``Score: <int> Reason: <text>``) or omitted the reason,
    the score is still recovered instead of being silently counted as 0.

    Args:
        evaluation: Raw grader reply; anything that is not a string is treated
            as an unparseable reply.

    Returns:
        ``(score, reason)``. When no score can be found a warning is printed
        and ``(0, "Unable to extract reason")`` is returned.
    """
    text = evaluation if isinstance(evaluation, str) else ""

    # Primary shape. The former "|10" alternative could never match because
    # "\d+" already consumes every digit run.
    match = re.search(r'Reason:\s*(.*?)\s*Score:\s*(\d+)', text, re.IGNORECASE | re.DOTALL)
    if match:
        return int(match.group(2)), match.group(1).strip()

    # Fallback: score first, reason (if any) somewhere after it.
    score_match = re.search(r'Score:\s*(\d+)', text, re.IGNORECASE)
    if score_match:
        reason_match = re.search(r'Reason:\s*(.*)', text[score_match.end():], re.IGNORECASE | re.DOTALL)
        reason = reason_match.group(1).strip() if reason_match else "Unable to extract reason"
        return int(score_match.group(1)), reason

    print(f"Warning: Could not extract score and reason from evaluation: {evaluation}")
    return 0, "Unable to extract reason"  # Default values if extraction fails

async def process_item(item, model_name):
    """Answer, grade and score a single dataset record.

    Model names starting with "claude" are answered through Anthropic, all
    others through OpenAI. Returns ``(instruction, expected output, model
    answer, score, reason)``.
    """
    # Pick the provider-specific coroutine once, then call it uniformly.
    ask = get_anthropic_answer if model_name.startswith("claude") else get_openai_answer
    model_answer = await ask(item['instruction'], model_name)
    verdict = await evaluate_answer(model_answer, item['output'])
    score, reason = extract_score_and_reason(verdict)
    return item['instruction'], item['output'], model_answer, score, reason

async def evaluate_model(model_name):
    """Run every record of ``eval_data`` through ``model_name`` and save the results.

    All records are processed concurrently. The per-item rows are written to an
    Excel file inside ``output_folder`` and the mean score is printed.

    Returns:
        ``(df, avg_score)``: the per-item DataFrame and its mean 'Score'.
    """
    pending = [process_item(record, model_name) for record in eval_data]
    rows = await asyncio.gather(*pending)

    column_names = ['Instruction', 'Expected Output', 'Model Answer', 'Score', 'Reason']
    df = pd.DataFrame(rows, columns=column_names)
    avg_score = df['Score'].mean()

    print(f"\nModel: {model_name}")
    print(f"Average Evaluation Score: {avg_score:.2f}")

    # One workbook per model, named after dataset, run timestamp and model.
    excel_path = f'{output_folder}/{dataset_name}_{current_time}_{model_name}.xlsx'
    df.to_excel(excel_path, index=False)
    print(f"Results saved to {excel_path}")

    return df, avg_score

async def main():
    """Evaluate each configured model in turn and write a ranked summary.

    Models are evaluated one after another. The summary table (model name and
    average score, best first) is printed and saved as
    ``model_comparison_summary.xlsx`` inside ``output_folder``.

    Returns:
        ``(results, summary_df)`` where ``results`` maps each model name to
        ``{"df": per-item DataFrame, "avg_score": mean score}``.
    """
    models_to_evaluate = [
        "claude-3-5-sonnet-20240620",
        "gpt-4o-mini",
        "gpt-4o-2024-08-06",
        "gpt-4o-2024-05-13",
        "gpt-4-0125-preview",
    ]

    results = {}
    for model in models_to_evaluate:
        frame, mean_score = await evaluate_model(model)
        results[model] = {"df": frame, "avg_score": mean_score}

    # Build the ranking in one chained expression.
    summary_df = (
        pd.DataFrame(
            [(name, entry["avg_score"]) for name, entry in results.items()],
            columns=["Model", "Average Score"],
        )
        .sort_values("Average Score", ascending=False)
        .reset_index(drop=True)
    )

    print("\nModel Comparison Summary:")
    print(summary_df)

    # Persist the ranking next to the per-model workbooks.
    summary_excel_path = f'{output_folder}/model_comparison_summary.xlsx'
    summary_df.to_excel(summary_excel_path, index=False)
    print(f"\nSummary saved to {summary_excel_path}")

    return results, summary_df

# Load the evaluation dataset. Each record is read by process_item through the
# keys 'instruction' and 'output'.
input_file_path = './input/huy_dataset/huy_test.json'
# Alternative datasets kept for quick switching:
# './input/huy_dataset/huy_test.json'
# 'input/math-EvalDataset-10.json'
#'./input/huy_dataset/huy_test2.json'
# './input/EvalDataset-20.json'


with open(input_file_path, 'r') as f:
  eval_data = json.load(f)

# Dataset name = input file name without directory or extension.
dataset_name = os.path.splitext(os.path.basename(input_file_path))[0]

# Run timestamp in MMDDYY_HHMM(AM|PM) form, shared by the output folder name
# and the per-model workbook names built in evaluate_model.
current_time = datetime.now().strftime("%m%d%y_%I%M%p")
output_folder = f'./output/{dataset_name}_{current_time}'

# Create the output folder (no error if it already exists).
os.makedirs(output_folder, exist_ok=True)


# Detect IPython/Jupyter: get_ipython is only defined there.
try:
    get_ipython()
    is_notebook = True
except NameError:
    is_notebook = False

if is_notebook:
    # In a notebook an event loop is already running, so await directly.
    # NOTE(review): a top-level `await` is rejected by the plain Python
    # compiler, so this file as written appears to run only under
    # IPython/Jupyter — confirm before relying on the branch below.
    results, summary_df = await main()
else:
    # In a regular Python script, start a fresh event loop.
    results, summary_df = asyncio.run(main())

# Prompt optimizer:

In [28]:
import asyncio
import json
from openai import AsyncOpenAI
import os
import re
import nest_asyncio
from collections import deque
import json

# Patch asyncio via nest_asyncio so run_main() can call run_until_complete
# inside Jupyter, where an event loop is already running.
nest_asyncio.apply()

# Raises KeyError at import time if OPENAI_API_KEY is not set.
client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Model used both to answer dataset items and to rewrite the prompt; grading
# uses a separate model named inside evaluate_answer.
model_name = "chatgpt-4o-latest"

async def get_model_answer(instruction, prompt):
    """Answer ``instruction`` with ``prompt`` as the system message.

    Calls the module-level ``model_name`` through ``client`` at
    ``temperature=0`` and returns the text of the first choice.
    """
    conversation = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": instruction},
    ]
    completion = await client.chat.completions.create(
        model=model_name,
        messages=conversation,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

async def evaluate_answer(model_answer, expected_output):
    """Grade ``model_answer`` against ``expected_output``.

    The grading instructions come from ``prompts/eval_prompt.txt``, filled in
    with both texts via ``str.format``. Returns the raw reply of the grader
    model ('gpt-4-0125-preview').
    """
    template = load_prompt('eval_prompt.txt')
    grading_request = template.format(
        model_answer=model_answer,
        expected_output=expected_output,
    )
    reply = await client.chat.completions.create(
        model='gpt-4-0125-preview',
        messages=[
            {"role": "system", "content": "You are an AI model evaluator."},
            {"role": "user", "content": grading_request},
        ],
        temperature=0,
    )
    return reply.choices[0].message.content

def extract_score(evaluation):
    """Return the integer after the first ``Score:`` in ``evaluation``, or 0.

    The label match is case-sensitive; whitespace between the colon and the
    digits is allowed.
    """
    found = re.search(r'Score:\s*(\d+)', evaluation)
    if found is None:
        return 0
    return int(found.group(1))

def save_prompt(prompt, filename):
    """Write ``prompt`` to ``prompts/<filename>``, creating the folder if needed.

    The path is relative to the current working directory; an existing file is
    overwritten.
    """
    prompt_dir = 'prompts'
    os.makedirs(prompt_dir, exist_ok=True)
    target = os.path.join(prompt_dir, filename)
    with open(target, 'w') as handle:
        handle.write(prompt)

def load_prompt(filename):
    """Return the full text of ``prompts/<filename>``.

    The path is relative to the current working directory; a missing file
    raises FileNotFoundError from ``open``.
    """
    source = os.path.join('prompts', filename)
    with open(source, 'r') as handle:
        contents = handle.read()
    return contents

def remove_analysis(prompt):
    """Strip every ``<analysis>...</analysis>`` section from ``prompt``.

    Matching is non-greedy and spans newlines, so each tagged section is
    removed separately and the text around it is kept untouched.
    """
    analysis_section = re.compile(r'<analysis>.*?</analysis>', flags=re.DOTALL)
    return analysis_section.sub('', prompt)

async def optimize_prompt(initial_prompt, dataset, max_iterations=5):
    """Iteratively score a system prompt on ``dataset`` and ask the model to improve it.

    Each iteration answers and grades every dataset item sequentially with the
    current prompt, averages the scores, tracks the best-scoring prompt, then
    asks the model for a rewritten prompt based on the five lowest-scoring
    items. Every prompt is saved as ``prompts/prompt_iteration_<n>.txt``.

    Args:
        initial_prompt: System prompt evaluated in the first iteration.
        dataset: Sequence of records read through the keys 'instruction' and
            'output'. An empty dataset raises ZeroDivisionError when the
            average is computed.
        max_iterations: Upper bound on scoring rounds.

    Returns:
        ``(best_prompt, best_score, initial_score)``. ``initial_score`` stays
        ``None`` if the loop body never runs.
    """
    current_prompt = initial_prompt
    best_prompt = initial_prompt
    best_score = 0
    initial_score = None
    no_improvement_count = 0
    # Rolling window of recent average scores passed to the improvement prompt.
    last_scores = deque(maxlen=5)

    # Save the initial prompt
    save_prompt(initial_prompt, f'prompt_iteration_0.txt')

    for iteration in range(max_iterations):
        total_score = 0
        item_results = []
        print(f"\n{'='*50}\nIteration {iteration + 1}\n{'='*50}")

        print("Getting model answers and evaluations...")
        # Items are answered and graded one at a time (no concurrency here).
        for item in dataset:
            model_answer = await get_model_answer(item['instruction'], current_prompt)
            evaluation = await evaluate_answer(model_answer, item['output'])
            score = extract_score(evaluation)
            total_score += score
            item_results.append({
                'instruction': item['instruction'],
                'model_answer': model_answer,
                'expected_output': item['output'],
                'evaluation': evaluation,
                'score': score
            })
            print(f"\nInstruction: {item['instruction']}")
            print(f"Model answer: {model_answer}")
            print(f"Expected output: {item['output']}")
            print(f"Evaluation: {evaluation}")

        avg_score = total_score / len(dataset)
        print(f"\nAverage score for iteration {iteration + 1}: {avg_score:.2f}")

        if iteration == 0:
            initial_score = avg_score

        # Only a strictly higher average replaces the best prompt; best_score
        # starts at 0, so an all-zero round never counts as an improvement.
        if avg_score > best_score:
            best_score = avg_score
            best_prompt = current_prompt
            no_improvement_count = 0
        else:
            no_improvement_count += 1

        # Stop conditions are checked before this round's score is recorded in
        # last_scores and before another rewrite is requested.
        if no_improvement_count >= 3:
            print("\nNo improvement for 3 consecutive iterations. Stopping optimization.")
            break
        if best_score == 100:
            print("\nMaximum score reached. Stopping optimization.")
            break

        last_scores.append(avg_score)

        print(f"Current best score: {best_score:.2f}")
        print("\nGenerating improved prompt...")

        # The five lowest-scoring items are handed to the model as failure examples.
        worst_items = sorted(item_results, key=lambda x: x['score'])[:5]

        # The template is re-read from disk on every iteration.
        with open('./prompts/improvement_prompt_template.txt', 'r') as f:
            improvement_prompt_template = f.read()

        improvement_prompt = improvement_prompt_template.format(
            current_prompt=current_prompt,
            avg_score=avg_score,
            last_scores=[f"{score:.2f}" for score in last_scores],
            worst_items=json.dumps(worst_items, indent=2)
        )

        print(f"\nFULL PROMPT:\n{improvement_prompt}")

        improved_prompt_response = await client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": improvement_prompt}],
            temperature=0
        )

        new_prompt = improved_prompt_response.choices[0].message.content

        # Remove content inside <analysis> tags
        new_prompt_cleaned = remove_analysis(new_prompt)

        print("\nModel's reasoning for the improved prompt:")
        print(new_prompt)
        print("\nCleaned prompt (with <analysis> content removed):")
        print(new_prompt_cleaned)

        # Save the new prompt for the next iteration
        save_prompt(new_prompt_cleaned, f'prompt_iteration_{iteration + 1}.txt')
        print(f"Improved prompt saved to prompt_iteration_{iteration + 1}.txt")

        # Use the cleaned prompt in the next iteration
        # NOTE(review): on the last iteration this rewritten prompt is saved
        # but never scored, so it cannot become best_prompt — confirm intended.
        current_prompt = new_prompt_cleaned

    return best_prompt, best_score, initial_score

async def main():
    """Run the prompt optimizer on the bundled dataset and report the outcome.

    Loads ``input/huy_dataset/optimizer_test.json``, takes the starting prompt
    from ``prompts/initial_prompt.txt`` (writing a default one when the file is
    missing), runs ``optimize_prompt``, prints the score summary and saves the
    winning prompt to ``prompts/best_prompt.txt``.
    """
    # Load dataset
    with open('input/huy_dataset/optimizer_test.json', 'r') as f:
        dataset = json.load(f)

    # Load initial prompt from file or use default
    try:
        initial_prompt = load_prompt('initial_prompt.txt')
    except FileNotFoundError:
        initial_prompt = "You are a helpful AI assistant. Provide accurate and concise answers."
        save_prompt(initial_prompt, 'initial_prompt.txt')
    else:
        print(f"Loaded initial prompt: {initial_prompt}")

    best_prompt, best_score, initial_score = await optimize_prompt(initial_prompt, dataset)

    improvement = best_score - initial_score

    print(f"\n{'='*50}\nOptimization Results\n{'='*50}")
    print(f"Initial score: {initial_score:.2f}")
    print(f"Final score: {best_score:.2f}")
    print(f"Improvement: {improvement:.2f} points")
    # A first round that averages 0 used to crash here with ZeroDivisionError,
    # before the best prompt below was ever saved.
    if initial_score:
        print(f"Percentage improvement: {(improvement / initial_score) * 100:.2f}%")
    else:
        print("Percentage improvement: n/a (initial score is 0)")

    # Save the best prompt
    save_prompt(best_prompt, 'best_prompt.txt')
    print("Best prompt saved to best_prompt.txt")

def run_main():
    """Run ``main()`` to completion from synchronous code and return its result.

    When an event loop is already running (e.g. inside Jupyter, where the
    nest_asyncio patch applied at import time permits re-entrant
    ``run_until_complete``), that loop is reused. Otherwise a fresh loop is
    created with ``asyncio.run``. Calling ``asyncio.get_event_loop()`` with no
    current loop is deprecated and raises RuntimeError on newer Python versions.
    """
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running: plain script execution.
        return asyncio.run(main())
    return loop.run_until_complete(main())

# Run the optimization immediately when this code executes as the main module.
if __name__ == "__main__":
    run_main()
else:
    # When imported instead, do nothing except tell the caller how to start
    # the optimization themselves.
    print("To run the optimization, use: await main()")

Loaded initial prompt: You are a helpful AI assistant. Provide accurate and concise answers.

Iteration 1
Getting model answers and evaluations...

Instruction: How many 'r' in the word 'strawberry'?
Model answer: There are two 'r's in the word "strawberry."
Expected output: There are 3 letter 'r' in the word strawberry.
Evaluation: Score: 0 Reason: The model's answer is completely wrong compared to the expected output, which states there are 3 letter 'r' in the word "strawberry," while the model states there are only two.

Instruction: There is a three-digit number. The second digit is four times as big as the third digit, while the first digit is three less than the second digit. What is the number(s)?
Model answer: Let's denote the three-digit number as \( ABC \), where \( A \), \( B \), and \( C \) are the digits.

Given:
1. The second digit \( B \) is four times the third digit \( C \): \( B = 4C \).
2. The first digit \( A \) is three less than the second digit \( B \): \( A = B 