In [48]:
!python -m pip install pandas openpyxl openai anthropic

Collecting anthropic
  Downloading anthropic-0.32.0-py3-none-any.whl.metadata (18 kB)
Collecting tokenizers>=0.13.0 (from anthropic)
  Downloading tokenizers-0.19.1-cp38-cp38-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from tokenizers>=0.13.0->anthropic)
  Downloading huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)
Collecting filelock (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.0->anthropic)
  Using cached filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.0->anthropic)
  Using cached fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Downloading anthropic-0.32.0-py3-none-any.whl (866 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m866.6/866.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hDownloading tokenizers-0.19.1-cp38-cp38-macosx_11_0_arm64.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Goal:
  - Test GPT-4o mini fine-tuning.

# Plan:
- Create an eval pipeline
- Fine-tune GPT-4o mini
- Evaluate the model

# Evaluate multiple OpenAI models

In [None]:
import json

# Convert the evaluation dataset into OpenAI chat-format JSONL: one
# {"messages": [...]} object per line, built from each record's
# "instruction" and "output" fields.
input_file = '/Users/huyknguyen/Desktop/redhorse/code_projects/eval/eval/EvalDataset-20.json'
output_file = '/Users/huyknguyen/Desktop/redhorse/code_projects/eval/eval/EvalDataset-20-openaiFormat.jsonl'

# Read the input file.
# The encoding is explicit so the read does not depend on the platform's
# locale default (JSON text is UTF-8).
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Transform the data into openai format and write to JSONL file
with open(output_file, 'w', encoding='utf-8') as f:
    for item in data:
        transformed_item = {
            "messages": [
                {
                    "role": "system",
                    "content": "You are the smartest entity in the universe. Reasoning step by step to get the best answer."
                },
                {
                    "role": "user",
                    "content": item["instruction"]
                },
                {
                    "role": "assistant",
                    "content": item["output"]
                }
            ]
        }
        # Write each transformed item as a single line in the output file
        f.write(json.dumps(transformed_item) + '\n')

print(f"Transformation complete. Output written to {output_file}")

In [None]:
import asyncio
import json
import re
import pandas as pd
from openai import AsyncOpenAI
import os

# Async OpenAI client shared by every request in this cell.
client = AsyncOpenAI()

# Load the evaluation dataset.
# The encoding is explicit so the read does not depend on the platform's
# locale default (JSON text is UTF-8).
input_file_path = './input/huy_dataset/huy_test.json'
with open(input_file_path, 'r', encoding='utf-8') as f:
    eval_data = json.load(f)

async def get_model_answer(instruction, model_name):
    """Ask `model_name` to answer `instruction` and return the reply text.

    The request goes through the module-level `client` with a fixed system
    prompt and temperature 0.0; the content of the first returned choice is
    handed back to the caller.
    """
    system_prompt = "You are a world-class mathematician. First briefly repeat the answer in 1 sentence, then reasoning step by step to make sure you get the correct answer."
    completion = await client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": instruction},
        ],
        temperature=0.0,
        tool_choice=None,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

async def evaluate_answer(model_answer, expected_output):
    """Have the "gpt-4o" judge grade `model_answer` against `expected_output`.

    Returns the judge's raw reply text (the content of the first choice);
    turning that text into a number is left to the caller.
    """
    grader_instructions = "You are a world-class AI model evaluator. Your task is to compare the model's answer with the expected output and provide a score from 0 to 10. Use the following scale: 0 is completely wrong, 10 is completely correct, 8-9 if correct but missing detail or not a complete answer. Dont grade on formatting, as long as the answer is correct. Always include the numeric score (0-10) in your response."
    grading_request = f"Model answer: {model_answer}\n\nExpected output: {expected_output}\n\nPlease evaluate and provide a score from 0 to 10, no text or explanations needed."
    completion = await client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": grader_instructions},
            {"role": "user", "content": grading_request},
        ],
        temperature=0.0,
        tool_choice=None,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def extract_score(evaluation):
    """Pull the 0-10 score out of the evaluator's free-text reply.

    Patterns are tried from most to least specific so that incidental
    numbers in the text (e.g. "2 of 3 steps") are not mistaken for the score:

    1. a number shortly after the word "score" ("Score: 7", "score of 7");
    2. a number written as a fraction of ten ("7/10", "7 out of 10");
    3. the first standalone integer from 0 to 10 anywhere in the text.

    Args:
        evaluation: The evaluator's reply. A non-string value (such as None)
            is treated as a reply with no score.

    Returns:
        The extracted score as an int in 0..10, or 0 (after printing a
        warning) when no score can be found.
    """
    text = evaluation if isinstance(evaluation, str) else ""
    patterns = (
        r'\bscore\b[^0-9\n]{0,20}?\b(10|[0-9])\b',
        r'\b(10|[0-9])\s*(?:/|out of)\s*10\b',
        r'\b(10|[0-9])\b',
    )
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return int(match.group(1))

    print(f"Warning: Could not extract score from evaluation: {evaluation}")
    return 0  # Default to 0 if we can't extract a valid score

async def process_item(item, model_name):
    """Answer and grade a single dataset item.

    Returns a 5-tuple: (instruction, expected output, model answer,
    raw evaluation text, numeric score).
    """
    prompt = item['instruction']
    answer = await get_model_answer(prompt, model_name)
    reference = item['output']
    verdict = await evaluate_answer(answer, reference)
    return prompt, reference, answer, verdict, extract_score(verdict)

async def evaluate_model(model_name):
    """Evaluate `model_name` on every item of `eval_data` and save the results.

    All items are submitted at once through `asyncio.gather`. The per-item
    results are collected into a DataFrame, the mean score is printed, and
    the table (without the raw 'Evaluation' text) is written to an Excel
    file under ./output.

    Args:
        model_name: Model identifier forwarded to `process_item`.

    Returns:
        Tuple `(df, avg_score)`: the full results DataFrame (including the
        'Evaluation' column) and the mean of its 'Score' column.
    """
    tasks = [process_item(item, model_name) for item in eval_data]
    results = await asyncio.gather(*tasks)

    df = pd.DataFrame(results, columns=['Instruction', 'Expected Output', 'Model Answer', 'Evaluation', 'Score'])
    avg_score = df['Score'].mean()

    print(f"\nModel: {model_name}")
    print(f"Average Evaluation Score: {avg_score:.2f}")

    # Create output directory if it doesn't exist
    os.makedirs('./output', exist_ok=True)

    # Remove the 'Evaluation' column before saving to Excel
    df_to_save = df.drop(columns=['Evaluation'])

    # Model identifiers can contain characters that are unsafe in file names
    # ('/' would point into a missing sub-directory, ':' is rejected on some
    # platforms), so anything outside [A-Za-z0-9._-] becomes '_'. Plain names
    # such as "gpt-4o-mini" are left unchanged.
    safe_name = re.sub(r'[^A-Za-z0-9._-]+', '_', model_name)
    excel_path = f'./output/evaluation_results_{safe_name}.xlsx'
    df_to_save.to_excel(excel_path, index=False)
    print(f"Results saved to {excel_path}")

    return df, avg_score
async def main():
    """Evaluate each configured model and build a comparison summary.

    Models are evaluated one after another via `evaluate_model`. The average
    scores are gathered into a summary DataFrame sorted from best to worst,
    which is shown and also written to ./output/model_comparison_summary.xlsx.

    Returns:
        Tuple `(results, summary_df)` where `results` maps each model name to
        `{"df": <results DataFrame>, "avg_score": <mean score>}`.
    """
    models_to_evaluate = ["gpt-4o-mini", "gpt-4o"]  # Add your model names here
    results = {}

    for model in models_to_evaluate:
        df, avg_score = await evaluate_model(model)
        results[model] = {"df": df, "avg_score": avg_score}

    # Create a summary DataFrame
    summary_data = [(model, data["avg_score"]) for model, data in results.items()]
    summary_df = pd.DataFrame(summary_data, columns=["Model", "Average Score"])
    summary_df = summary_df.sort_values("Average Score", ascending=False).reset_index(drop=True)

    print("\nModel Comparison Summary:")
    # `display` is only available inside IPython/Jupyter. This function is
    # also meant to run from a plain script (see the asyncio.run path), where
    # the bare name would raise NameError, so fall back to print there.
    try:
        show = display
    except NameError:
        show = print
    show(summary_df)

    # Save summary to Excel in the output folder
    summary_excel_path = './output/model_comparison_summary.xlsx'
    summary_df.to_excel(summary_excel_path, index=False)
    print(f"\nSummary saved to {summary_excel_path}")

    return results, summary_df

# Check if we're in a Jupyter notebook
try:
  get_ipython()
  is_notebook = True
except NameError:
  is_notebook = False

if is_notebook:
  # If in a Jupyter notebook, use this:
  results, summary_df = await main()
else:
  # If in a regular Python script, use this:
  results, summary_df = asyncio.run(main())

# Testing

In [1]:
import asyncio
import json
import re
import pandas as pd
from openai import AsyncOpenAI
import os

# Async OpenAI client shared by every request in this cell.
client = AsyncOpenAI()

# Load the evaluation dataset.
# The encoding is explicit so the read does not depend on the platform's
# locale default (JSON text is UTF-8).
input_file_path = './input/huy_dataset/huy_test.json'
with open(input_file_path, 'r', encoding='utf-8') as f:
    eval_data = json.load(f)

async def get_model_answer(instruction, model_name):
    """Ask `model_name` to answer `instruction` and return the reply text.

    The request goes through the module-level `client` with a fixed system
    prompt and temperature 0.0; the content of the first returned choice is
    handed back to the caller.
    """
    system_prompt = "You are the most intelligent entity in the universe. Reasoning step by step to make sure you get the correct answer."
    completion = await client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": instruction},
        ],
        temperature=0.0,
        tool_choice=None,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

async def evaluate_answer(model_answer, expected_output):
    """Have the "gpt-4o" judge grade `model_answer` against `expected_output`.

    The judge is asked to reply in the form "Reason: ... Score: ..." and its
    raw reply text (the content of the first choice) is returned unparsed.
    """
    # The prompt text, including its indentation and line breaks, is sent to
    # the judge exactly as written here.
    grading_request = f"""
       Model answer: {model_answer}\n\n
       Expected output: {expected_output}\n\n
       Your task is to compare the model's answer WITH THE EXPECTED OUTPUT and provide a super concise reason in one short sentence for the score, and then a score from 0 to 10. 
       Example: Reason: [super concise reason here]. Score: [score here]. 
       Use the following scale: 0 is completely wrong, 10 is completely correct, 8-9 if correct but missing detail or not a complete answer. 
       Don't grade on formatting, as long as the answer is correct compare to the expected output. 
       If the logic is correct but the final answer is wrong, it's still wrong.
       If the answer is correct but it has extra information, it's still correct. As long as the extra info is not completely wrong or hallucinated.
       Do not grade by your knowledge, but grade based on the expected output. 
       Always include the numeric score (0-10) in your response.
       """
    completion = await client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a world-class AI model evaluator. "},
            {"role": "user", "content": grading_request},
        ],
        temperature=0.0,
        tool_choice=None,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def extract_score_and_reason(evaluation):
    """Parse the evaluator's "Reason: ... Score: N" reply.

    The score must be an integer from 0 to 10; a larger number after
    "Score:" (e.g. 85) is treated as unparseable instead of being averaged
    in. Markdown emphasis around the labels ("**Score:** 8") is tolerated.
    If the reply carries a valid "Score: N" but no "Reason:" part, the score
    is still returned so a good answer is not silently counted as 0.

    Args:
        evaluation: The evaluator's reply. A non-string value (such as None)
            is treated as a reply with nothing to extract.

    Returns:
        Tuple `(score, reason)`. `reason` is "Unable to extract reason" when
        no reason text is found; `score` is 0 (after printing a warning)
        when no valid score is found.
    """
    text = evaluation if isinstance(evaluation, str) else ""
    score_pattern = r'Score:[\s*]*(10|[0-9])(?![0-9])'

    # Preferred form: a reason followed by a score.
    match = re.search(r'Reason:[\s*]*(.*?)[\s*]*' + score_pattern, text, re.IGNORECASE | re.DOTALL)
    if match:
        return int(match.group(2)), match.group(1).strip()

    # Fallback: a score without a recognisable reason.
    score_only = re.search(score_pattern, text, re.IGNORECASE)
    if score_only:
        return int(score_only.group(1)), "Unable to extract reason"

    print(f"Warning: Could not extract score and reason from evaluation: {evaluation}")
    return 0, "Unable to extract reason"  # Default values if extraction fails

async def process_item(item, model_name):
    """Answer and grade a single dataset item.

    Returns a 5-tuple: (instruction, expected output, model answer,
    numeric score, evaluator's reason).
    """
    prompt = item['instruction']
    answer = await get_model_answer(prompt, model_name)
    reference = item['output']
    verdict = await evaluate_answer(answer, reference)
    score, reason = extract_score_and_reason(verdict)
    return prompt, reference, answer, score, reason

async def evaluate_model(model_name):
    """Evaluate `model_name` on every item of `eval_data` and save the results.

    All items are submitted at once through `asyncio.gather`. The per-item
    results are collected into a DataFrame, the mean score is printed, and
    the table is written to an Excel file under ./output.

    Args:
        model_name: Model identifier forwarded to `process_item`.

    Returns:
        Tuple `(df, avg_score)`: the results DataFrame and the mean of its
        'Score' column.
    """
    tasks = [process_item(item, model_name) for item in eval_data]
    results = await asyncio.gather(*tasks)

    df = pd.DataFrame(results, columns=['Instruction', 'Expected Output', 'Model Answer', 'Score', 'Reason'])
    avg_score = df['Score'].mean()

    print(f"\nModel: {model_name}")
    print(f"Average Evaluation Score: {avg_score:.2f}")

    # Create output directory if it doesn't exist
    os.makedirs('./output', exist_ok=True)

    # Model identifiers can contain characters that are unsafe in file names
    # ('/' would point into a missing sub-directory, ':' is rejected on some
    # platforms), so anything outside [A-Za-z0-9._-] becomes '_'. Plain names
    # such as "gpt-4o-mini" are left unchanged.
    safe_name = re.sub(r'[^A-Za-z0-9._-]+', '_', model_name)
    excel_path = f'./output/evaluation_results_{safe_name}.xlsx'
    df.to_excel(excel_path, index=False)
    print(f"Results saved to {excel_path}")

    return df, avg_score

async def main():
    """Evaluate each configured model and build a comparison summary.

    Models are evaluated one after another via `evaluate_model`. The average
    scores are gathered into a summary DataFrame sorted from best to worst,
    which is shown and also written to ./output/model_comparison_summary.xlsx.

    Returns:
        Tuple `(results, summary_df)` where `results` maps each model name to
        `{"df": <results DataFrame>, "avg_score": <mean score>}`.
    """
    models_to_evaluate = ["gpt-4o-mini", "gpt-4o", "gpt-4-0125-preview"]  # Add your model names here
    results = {}

    for model in models_to_evaluate:
        df, avg_score = await evaluate_model(model)
        results[model] = {"df": df, "avg_score": avg_score}

    # Create a summary DataFrame
    summary_data = [(model, data["avg_score"]) for model, data in results.items()]
    summary_df = pd.DataFrame(summary_data, columns=["Model", "Average Score"])
    summary_df = summary_df.sort_values("Average Score", ascending=False).reset_index(drop=True)

    print("\nModel Comparison Summary:")
    # `display` is only available inside IPython/Jupyter. This function is
    # also meant to run from a plain script (see the asyncio.run path), where
    # the bare name would raise NameError, so fall back to print there.
    try:
        show = display
    except NameError:
        show = print
    show(summary_df)

    # Save summary to Excel in the output folder
    summary_excel_path = './output/model_comparison_summary.xlsx'
    summary_df.to_excel(summary_excel_path, index=False)
    print(f"\nSummary saved to {summary_excel_path}")

    return results, summary_df

# Check if we're in a Jupyter notebook
try:
  get_ipython()
  is_notebook = True
except NameError:
  is_notebook = False

if is_notebook:
  # If in a Jupyter notebook, use this:
  results, summary_df = await main()
else:
  # If in a regular Python script, use this:
  results, summary_df = asyncio.run(main())


Model: gpt-4o-mini
Average Evaluation Score: 7.69
Results saved to ./output/evaluation_results_gpt-4o-mini.xlsx

Model: gpt-4o
Average Evaluation Score: 8.94
Results saved to ./output/evaluation_results_gpt-4o.xlsx

Model: gpt-4-0125-preview
Average Evaluation Score: 9.31
Results saved to ./output/evaluation_results_gpt-4-0125-preview.xlsx

Model Comparison Summary:


Unnamed: 0,Model,Average Score
0,gpt-4-0125-preview,9.3125
1,gpt-4o,8.9375
2,gpt-4o-mini,7.6875



Summary saved to ./output/model_comparison_summary.xlsx


# Evaluate different models:
- Eval pipeline for other models: Google, Anthropic, open-source models