In [None]:
!python -m pip install pandas openpyxl openai anthropic

# Goal:
  - Test GPT-4o mini fine-tuning.

# Plan:
- Create an eval pipeline
- Fine-tune GPT-4o mini
- Evaluate the model

# Evaluate multiple OpenAI models

In [None]:
import asyncio
import json
import re
import pandas as pd
from openai import AsyncOpenAI
import os
from datetime import datetime

async def get_model_answer(instruction, model_name):
  """Ask `model_name` to answer `instruction` and return the reply text.

  A fixed system prompt plus the instruction (as the user turn) is sent through
  the module-level async `client` at temperature 0.0; the content of the first
  choice is returned.
  """
  system_turn = {"role": "system", "content": "You are the most intelligent entity in the universe. Reasoning step by step to make sure you get the correct answer."}
  user_turn = {"role": "user", "content": instruction}
  completion = await client.chat.completions.create(
      model=model_name,
      messages=[system_turn, user_turn],
      temperature=0.0,
      tool_choice=None,
  )
  first_choice = completion.choices[0]
  return first_choice.message.content

async def evaluate_answer(model_answer, expected_output):
  """Have the judge model grade `model_answer` against `expected_output`.

  The judge ("gpt-4o", temperature 0.0) is asked to reply in the form
  "Reason: <one short sentence>. Score: <0-100>." and its raw reply text is
  returned unparsed.

  Args:
      model_answer: Text produced by the model under evaluation.
      expected_output: Reference answer taken from the dataset.

  Returns:
      The content of the judge's first choice.
  """
  # The closing instruction must quote the same 0-100 range as the scale above it;
  # it previously said "(0-10)", contradicting the rest of the prompt.
  messages = [
      {"role": "system", "content": "You are a world-class AI model evaluator. "},
      {"role": "user", "content": f"""
       Model answer: {model_answer}\n\n
       Expected output: {expected_output}\n\n
       Your task is to compare the model's answer WITH THE EXPECTED OUTPUT and provide a super concise reason in one short sentence for the score, and then a score from 0 to 100. 
       Example: Reason: [super concise reason here]. Score: [score here]. 
       Use the following scale: 0 is completely wrong, 50 is missing half of the solution, 100 is completely correct, 80-90 if correct but missing some detail or not a complete answer. 
       Don't grade on formatting, as long as the answer is correct compare to the expected output. 
       If the logic is correct but the final answer is wrong, it's still wrong.
       If the answer is correct but it has extra information, it's still correct. As long as the extra info is not completely wrong or hallucinated.
       Do not grade by your knowledge, but grade based on the expected output. 
       Always include the numeric score (0-100) in your response.
       """}
  ]
  response = await client.chat.completions.create(
      model="gpt-4o",
      messages=messages,
      temperature=0.0,
      tool_choice=None
  )
  return response.choices[0].message.content

def extract_score_and_reason(evaluation):
  """Pull the numeric score and the reason out of the judge's reply.

  Looks for text shaped like "Reason: <text> Score: <integer>"
  (case-insensitive; the reason may span several lines).

  Args:
      evaluation: Raw judge reply. May be None when the API returned no content.

  Returns:
      (score, reason): the score as an int and the reason stripped of
      surrounding whitespace, or (0, "Unable to extract reason") when the
      reply does not match the expected shape.
  """
  # A missing reply must not crash the whole batch: treat it as unparseable.
  text = evaluation if isinstance(evaluation, str) else ""
  # \d+ already covers every integer, so no extra alternative is needed.
  match = re.search(r'Reason:\s*(.*?)\s*Score:\s*(\d+)', text, re.IGNORECASE | re.DOTALL)
  if match:
      reason = match.group(1).strip()
      score = int(match.group(2))
      return score, reason
  else:
      print(f"Warning: Could not extract score and reason from evaluation: {evaluation}")
      return 0, "Unable to extract reason"  # Default values if extraction fails

async def process_item(item, model_name):
  """Run one dataset item through the model and then through the judge.

  Returns a 5-tuple: (instruction, expected output, model answer, score, reason).
  """
  answer = await get_model_answer(item['instruction'], model_name)
  verdict = await evaluate_answer(answer, item['output'])
  points, why = extract_score_and_reason(verdict)
  row = (item['instruction'], item['output'], answer, points, why)
  return row

async def evaluate_model(model_name):
    """Evaluate `model_name` on every item of `eval_data` and save the results.

    All items are processed concurrently with `asyncio.gather`. The per-item
    rows are collected into a DataFrame, the mean score is printed, and the
    frame is written to an Excel file inside `output_folder`.

    Reads the module-level `eval_data`, `output_folder`, `dataset_name` and
    `current_time`.

    Args:
        model_name: Id of the model to evaluate.

    Returns:
        (df, avg_score): the per-item results and their mean score.
    """
    tasks = [process_item(item, model_name) for item in eval_data]
    results = await asyncio.gather(*tasks)

    df = pd.DataFrame(results, columns=['Instruction', 'Expected Output', 'Model Answer', 'Score', 'Reason'])
    avg_score = df['Score'].mean()

    print(f"\nModel: {model_name}")
    print(f"Average Evaluation Score: {avg_score:.2f}")

    # Model ids can contain characters that are unsafe in file names (fine-tuned
    # ids use ':', hosted open-source ids use '/'); replace each such run with '_'.
    # Ids made only of letters, digits, '.', '-' and '_' are left unchanged.
    safe_model_name = re.sub(r'[^\w.\-]+', '_', model_name)
    excel_path = f'{output_folder}/{dataset_name}_{current_time}_{safe_model_name}.xlsx'
    df.to_excel(excel_path, index=False)
    print(f"Results saved to {excel_path}")

    return df, avg_score

async def main():
    """Evaluate each configured model in turn and build a comparison table.

    Models are evaluated one after another. The summary (model, average score),
    sorted best-first, is shown and written to `model_comparison_summary.xlsx`
    inside `output_folder`.

    Returns:
        (results, summary_df): `results` maps each model name to
        {"df": per-item DataFrame, "avg_score": mean score}.
    """
    models_to_evaluate = ["gpt-4o-mini", "gpt-4o-2024-08-06", "gpt-4o-2024-05-13", "gpt-4-0125-preview"]  # Add your model names here
    results = {}

    for model in models_to_evaluate:
        df, avg_score = await evaluate_model(model)
        results[model] = {"df": df, "avg_score": avg_score}

    # Create a summary DataFrame
    summary_data = [(model, data["avg_score"]) for model, data in results.items()]
    summary_df = pd.DataFrame(summary_data, columns=["Model", "Average Score"])
    summary_df = summary_df.sort_values("Average Score", ascending=False).reset_index(drop=True)

    print("\nModel Comparison Summary:")
    # `display` is only injected as a builtin by IPython; import it explicitly and
    # fall back to plain printing so the script entry point does not hit NameError.
    try:
        from IPython.display import display as show_table
    except ImportError:
        show_table = print
    show_table(summary_df)

    # Save summary to Excel in the output folder
    summary_excel_path = f'{output_folder}/model_comparison_summary.xlsx'
    summary_df.to_excel(summary_excel_path, index=False)
    print(f"\nSummary saved to {summary_excel_path}")

    return results, summary_df


# Shared async client used by the request helpers defined above.
client = AsyncOpenAI()

# Load the evaluation dataset.
# Alternative dataset paths:
#   'input/math-EvalDataset-10.json'
#   './input/huy_dataset/huy_test2.json'
input_file_path = './input/huy_dataset/huy_test.json'

# Read explicitly as UTF-8 so non-ASCII text in the dataset loads the same way on
# every platform instead of depending on the locale's default encoding.
with open(input_file_path, 'r', encoding='utf-8') as f:
  eval_data = json.load(f)

# Extract dataset name from input file path (file name without its extension)
dataset_name = os.path.splitext(os.path.basename(input_file_path))[0]

# Create output folder name; the timestamp looks like 081024_1208PM
current_time = datetime.now().strftime("%m%d%y_%I%M%p")
output_folder = f'./output/{dataset_name}_{current_time}'

# Create the output folder (no error if it already exists)
os.makedirs(output_folder, exist_ok=True)


# Check if we're in a Jupyter notebook
try:
    get_ipython()
    is_notebook = True
except NameError:
    is_notebook = False

if is_notebook:
    # If in a Jupyter notebook, use this:
    results, summary_df = await main()
else:
    # If in a regular Python script, use this:
    results, summary_df = asyncio.run(main())

In [18]:
import asyncio
import json
import re
import pandas as pd
from openai import AsyncOpenAI
import os
from datetime import datetime
from pydantic import BaseModel
from typing import List

# One reasoning step of the structured reply requested from the model.
# get_model_answer renders each step as "Step N: <explanation> Output: <output>".
# NOTE(review): documented with comments on purpose — pydantic is believed to copy a
# class docstring into the JSON schema sent as response_format; confirm before converting.
class Step(BaseModel):
  # Text of the step, placed right after "Step N:" when flattened.
  explanation: str
  # Result of the step, placed after "Output:" when flattened.
  output: str

# Schema passed as `response_format` to the structured-output `parse` call:
# an ordered list of steps followed by the final answer.
# NOTE(review): documented with comments on purpose — pydantic is believed to copy a
# class docstring into the JSON schema sent to the API; confirm before converting.
class MathResponse(BaseModel):
  # Steps in the order returned; they are numbered from 1 when rendered.
  steps: List[Step]
  # Rendered last, after the "Final Answer:" label.
  final_answer: str

async def get_model_answer(instruction, model_name):
  """Ask `model_name` to answer `instruction` and return the answer as text.

  "gpt-4o-2024-08-06" is queried through the structured-output `parse` helper
  with the `MathResponse` schema, and its steps are flattened into plain text.
  Every other model gets an ordinary chat completion. Both paths use
  temperature 0.0 so that all models are compared under the same sampling
  setting.

  Args:
      instruction: The user prompt taken from the dataset item.
      model_name: Id of the model to query.

  Returns:
      The answer text. For the structured path this is one "Step N: ..." line
      per step followed by a "Final Answer: ..." line, or the message's
      `refusal` value when nothing was parsed.
  """
  messages = [
      {"role": "system", "content": "You are the most intelligent entity in the universe. Reasoning step by step and consider multi angles to make sure you get the correct and complete answer."},
      {"role": "user", "content": instruction}
  ]

  if model_name == "gpt-4o-2024-08-06":
      response = await client.beta.chat.completions.parse(
          model=model_name,
          messages=messages,
          # Match the plain-completion branch; this was previously left at the API default.
          temperature=0.0,
          response_format=MathResponse,
      )
      message = response.choices[0].message
      if message.parsed:
          steps_text = "\n".join(
              f"Step {i+1}: {step.explanation} Output: {step.output}"
              for i, step in enumerate(message.parsed.steps)
          )
          final_answer_text = f"Final Answer: {message.parsed.final_answer}"
          combined_answer = f"{steps_text}\n{final_answer_text}"
          return combined_answer
      else:
          return message.refusal
  else:
      response = await client.chat.completions.create(
          model=model_name,
          messages=messages,
          temperature=0.0,
      )
      return response.choices[0].message.content

async def evaluate_answer(model_answer, expected_output):
  """Have the judge model grade `model_answer` against `expected_output`.

  The grading instructions are sent to "gpt-4-0125-preview" at temperature 0.0
  through the module-level async `client`; the judge's raw reply text (expected
  to contain "Reason: ... Score: ...") is returned unparsed.
  """
  grading_request = f"""
      Model answer: {model_answer}\n\n
      Expected output: {expected_output}\n\n
      Your task is to compare the model's answer WITH THE EXPECTED OUTPUT and provide a super concise reason in one short sentence for the score, and then a score from 0 to 100. 
      Example: Reason: [super concise reason here]. Score: [score here]. 
      Use the following scale: 0 is completely wrong, 100 is completely correct, 80-90 if correct but missing detail or not a complete answer. 
      Don't grade on formatting, as long as the answer is correct compare to the expected output. 
      If the logic is correct but the final answer is wrong, it's still wrong.
      If the answer is correct but it has extra information, it's still correct. As long as the extra info is not completely wrong or hallucinated.
      Do not grade by your knowledge, but grade based on the expected output. 
      Always include the numeric score (0-100) in your response.
      """
  judge_turns = [
      {"role": "system", "content": "You are a world-class AI model evaluator. "},
      {"role": "user", "content": grading_request},
  ]
  verdict = await client.chat.completions.create(
      model="gpt-4-0125-preview",
      messages=judge_turns,
      temperature=0.0,
      tool_choice=None,
  )
  return verdict.choices[0].message.content

def extract_score_and_reason(evaluation):
  """Pull the numeric score and the reason out of the judge's reply.

  Looks for text shaped like "Reason: <text> Score: <integer>"
  (case-insensitive; the reason may span several lines).

  Args:
      evaluation: Raw judge reply. May be None when the API returned no content.

  Returns:
      (score, reason): the score as an int and the reason stripped of
      surrounding whitespace, or (0, "Unable to extract reason") when the
      reply does not match the expected shape.
  """
  # A missing reply must not crash the whole batch: treat it as unparseable.
  text = evaluation if isinstance(evaluation, str) else ""
  # \d+ already covers every integer, so no extra alternative is needed.
  match = re.search(r'Reason:\s*(.*?)\s*Score:\s*(\d+)', text, re.IGNORECASE | re.DOTALL)
  if match:
      reason = match.group(1).strip()
      score = int(match.group(2))
      return score, reason
  else:
      print(f"Warning: Could not extract score and reason from evaluation: {evaluation}")
      return 0, "Unable to extract reason"  # Default values if extraction fails

async def process_item(item, model_name):
  """Run one dataset item through the model and then through the judge.

  Returns a 5-tuple: (instruction, expected output, model answer, score, reason).
  """
  answer = await get_model_answer(item['instruction'], model_name)
  verdict = await evaluate_answer(answer, item['output'])
  points, why = extract_score_and_reason(verdict)
  row = (item['instruction'], item['output'], answer, points, why)
  return row

async def evaluate_model(model_name):
  """Evaluate `model_name` on every item of `eval_data` and save the results.

  All items are processed concurrently with `asyncio.gather`. The per-item rows
  are collected into a DataFrame, the mean score is printed, and the frame is
  written to an Excel file inside `output_folder`.

  Reads the module-level `eval_data`, `output_folder`, `dataset_name` and
  `current_time`.

  Args:
      model_name: Id of the model to evaluate.

  Returns:
      (df, avg_score): the per-item results and their mean score.
  """
  tasks = [process_item(item, model_name) for item in eval_data]
  results = await asyncio.gather(*tasks)

  df = pd.DataFrame(results, columns=['Instruction', 'Expected Output', 'Model Answer', 'Score', 'Reason'])
  avg_score = df['Score'].mean()

  print(f"\nModel: {model_name}")
  print(f"Average Evaluation Score: {avg_score:.2f}")

  # Model ids can contain characters that are unsafe in file names (fine-tuned
  # ids use ':', hosted open-source ids use '/'); replace each such run with '_'.
  # Ids made only of letters, digits, '.', '-' and '_' are left unchanged.
  safe_model_name = re.sub(r'[^\w.\-]+', '_', model_name)
  excel_path = f'{output_folder}/{dataset_name}_{current_time}_{safe_model_name}.xlsx'
  df.to_excel(excel_path, index=False)
  print(f"Results saved to {excel_path}")

  return df, avg_score

async def main():
  """Evaluate each configured model in turn and build a comparison table.

  Models are evaluated one after another. The summary (model, average score),
  sorted best-first, is shown and written to `model_comparison_summary.xlsx`
  inside `output_folder`.

  Returns:
      (results, summary_df): `results` maps each model name to
      {"df": per-item DataFrame, "avg_score": mean score}.
  """
  models_to_evaluate = ["gpt-4o-mini", "gpt-4o", "gpt-4o-2024-08-06", "gpt-4-0125-preview"]  # Add your model names here
  results = {}

  for model in models_to_evaluate:
      df, avg_score = await evaluate_model(model)
      results[model] = {"df": df, "avg_score": avg_score}

  # Create a summary DataFrame
  summary_data = [(model, data["avg_score"]) for model, data in results.items()]
  summary_df = pd.DataFrame(summary_data, columns=["Model", "Average Score"])
  summary_df = summary_df.sort_values("Average Score", ascending=False).reset_index(drop=True)

  print("\nModel Comparison Summary:")
  # `display` is only injected as a builtin by IPython; import it explicitly and
  # fall back to plain printing so the script entry point does not hit NameError.
  try:
      from IPython.display import display as show_table
  except ImportError:
      show_table = print
  show_table(summary_df)

  # Save summary to Excel in the output folder
  summary_excel_path = f'{output_folder}/model_comparison_summary.xlsx'
  summary_df.to_excel(summary_excel_path, index=False)
  print(f"\nSummary saved to {summary_excel_path}")

  return results, summary_df

# Shared async client used by the request helpers defined above.
client = AsyncOpenAI()

# Load the evaluation dataset
input_file_path = './input/huy_dataset/huy_test.json'

# Read explicitly as UTF-8 so non-ASCII text in the dataset loads the same way on
# every platform instead of depending on the locale's default encoding.
with open(input_file_path, 'r', encoding='utf-8') as f:
  eval_data = json.load(f)

# Extract dataset name from input file path (file name without its extension)
dataset_name = os.path.splitext(os.path.basename(input_file_path))[0]

# Create output folder name; the timestamp looks like 081024_1208PM
current_time = datetime.now().strftime("%m%d%y_%I%M%p")
output_folder = f'./output/{dataset_name}_{current_time}'

# Create the output folder (no error if it already exists)
os.makedirs(output_folder, exist_ok=True)

# Check if we're in a Jupyter notebook
try:
  get_ipython()
  is_notebook = True
except NameError:
  is_notebook = False

if is_notebook:
  # If in a Jupyter notebook, use this:
  results, summary_df = await main()
else:
  # If in a regular Python script, use this:
  results, summary_df = asyncio.run(main())


Model: gpt-4o-mini
Average Evaluation Score: 78.12
Results saved to ./output/huy_test_081024_1208PM/huy_test_081024_1208PM_gpt-4o-mini.xlsx

Model: gpt-4o
Average Evaluation Score: 98.44
Results saved to ./output/huy_test_081024_1208PM/huy_test_081024_1208PM_gpt-4o.xlsx

Model: gpt-4o-2024-08-06
Average Evaluation Score: 94.06
Results saved to ./output/huy_test_081024_1208PM/huy_test_081024_1208PM_gpt-4o-2024-08-06.xlsx

Model: gpt-4-0125-preview
Average Evaluation Score: 86.25
Results saved to ./output/huy_test_081024_1208PM/huy_test_081024_1208PM_gpt-4-0125-preview.xlsx

Model Comparison Summary:


Unnamed: 0,Model,Average Score
0,gpt-4o,98.4375
1,gpt-4o-2024-08-06,94.0625
2,gpt-4-0125-preview,86.25
3,gpt-4o-mini,78.125



Summary saved to ./output/huy_test_081024_1208PM/model_comparison_summary.xlsx


# Eval different models:
- Eval pipeline for other models: Google, Anthropic, and open-source models

# Testing for fun


In [None]:
from pydantic import BaseModel
from openai import OpenAI
from typing import List
from IPython.display import display, Markdown

# One reasoning step of the structured reply requested from the model.
# The cell below renders each step as a bold "Step N:" heading, the explanation,
# then an italic "Output:" line.
# NOTE(review): documented with comments on purpose — pydantic is believed to copy a
# class docstring into the JSON schema sent as response_format; confirm before converting.
class Step(BaseModel):
  # Text of the step, shown under the "Step N:" heading.
  explanation: str
  # Result of the step, shown after "Output:".
  output: str

# Schema passed as `response_format` to the structured-output `parse` call:
# an ordered list of steps followed by the final answer.
# NOTE(review): documented with comments on purpose — pydantic is believed to copy a
# class docstring into the JSON schema sent to the API; confirm before converting.
class MathResponse(BaseModel):
  # Steps in the order returned; they are numbered from 1 when rendered.
  steps: List[Step]
  # Rendered last, after the "Final Answer:" label.
  final_answer: str

# NOTE: this rebinds the notebook-wide name `client` to a synchronous OpenAI
# client. The evaluation helpers above `await` calls on `client`, so re-run the
# cell that creates the AsyncOpenAI client before calling them again.
client = OpenAI()

# Ask for a structured (MathResponse-shaped) solution to a small word problem.
tutor_messages = [
    {"role": "system", "content": "You are a helpful math tutor."},
    {"role": "user", "content": "* There is a three-digit number. The second digit is four times as big as the third digit, while the first digit is three less than the second digit. What is the number?"},
]
completion = client.beta.chat.completions.parse(
  model="gpt-4o-2024-08-06",
  messages=tutor_messages,
  response_format=MathResponse,
)

message = completion.choices[0].message
if not message.parsed:
  # Nothing was parsed: show the refusal value instead.
  display(message.refusal)
else:
  # Render every step, then the final answer, as Markdown.
  steps_markdown = "\n".join(
      f"**Step {i+1}:**\n\n{step.explanation}\n\n*Output:* {step.output}\n"
      for i, step in enumerate(message.parsed.steps)
  )
  final_answer_markdown = f"**Final Answer:** {message.parsed.final_answer}"

  display(Markdown(steps_markdown))
  display(Markdown(final_answer_markdown))