# Goal:
  - Test GPT-4o mini fine-tuning.

# Plan:
- Create an eval pipeline
- Fine-tune GPT-4o mini
- Evaluate the model

# Evaluate multiple OpenAI models

In [8]:
import asyncio
import json
import re
import pandas as pd
from openai import AsyncOpenAI

client = AsyncOpenAI()

# Load the evaluation dataset.
# The file is decoded explicitly as UTF-8 (the JSON interchange encoding) so
# the load does not depend on the platform's default text encoding.
# Each entry is later read through its 'instruction' and 'output' keys.
with open('/Users/huyknguyen/Desktop/redhorse/code_projects/finetuning/EvalDataset-100.json', 'r', encoding='utf-8') as f:
  eval_data = json.load(f)

async def get_model_answer(instruction, model_name):
  """Ask `model_name` to solve one problem and return the reply text.

  Args:
    instruction: Problem statement, sent as the user message.
    model_name: Model identifier passed to the chat completions endpoint.

  Returns:
    The `content` of the first choice in the completion response.
  """
  system_prompt = "You are a the most intelligence entity in the universe. Your task is to carefully reasoning and provide a step-by-step  correct solution to the following math problem."
  completion = await client.chat.completions.create(
    model=model_name,
    messages=[
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": instruction},
    ],
    temperature=0.0,
    tool_choice=None,
  )
  first_choice = completion.choices[0]
  return first_choice.message.content

async def evaluate_answer(model_answer, expected_output):
  """Have gpt-4o grade a model answer against the expected output.

  Args:
    model_answer: Text produced by the model under evaluation.
    expected_output: Reference answer from the dataset.

  Returns:
    The grader's raw reply text (the `content` of the first choice); the
    prompt asks it to contain a 0 (incorrect) or 1 (correct).
  """
  grader_instructions = "You are an world-class AI model evaluator. Your task is to compare the model's answer with the expected output and provide a score of 0 for incorrect and 1 for correct. Focus primarily on whether the model got the answer correct or not. The format doesn't affect the score. Always include the numeric score (0 or 1) in your response."
  grading_request = f"Model answer: {model_answer}\n\nExpected output: {expected_output}\n\nPlease evaluate and provide a score of 0 (incorrect) or 1 (correct), no text or explanations needed."
  completion = await client.chat.completions.create(
    model="gpt-4o",
    messages=[
      {"role": "system", "content": grader_instructions},
      {"role": "user", "content": grading_request},
    ],
    temperature=0.0,
    tool_choice=None,
  )
  first_choice = completion.choices[0]
  return first_choice.message.content

def extract_score(evaluation):
  """Pull the 0/1 verdict out of the evaluator's reply.

  Args:
    evaluation: Text returned by the evaluator model. May be None or any
      other non-string value (a chat message's content is optional); such
      input is treated as "no score found" instead of raising.

  Returns:
    int: The first standalone `0` or `1` in the text, or 0 when no such
    digit is present (a warning is printed in that case).
  """
  # Only search real strings: re.search raises TypeError on None, which would
  # otherwise abort every evaluation gathered alongside this one.
  found = re.search(r'\b[01]\b', evaluation) if isinstance(evaluation, str) else None
  if found is None:
    print(f"Warning: Could not extract score from evaluation: {evaluation}")
    return 0  # Default to 0 if we can't extract a valid score
  return int(found.group())

async def process_item(item, model_name):
  """Run one dataset entry through the answer-then-grade pipeline.

  Args:
    item: Mapping with 'instruction' (the problem) and 'output' (the
      expected answer) keys.
    model_name: Model identifier used to produce the answer.

  Returns:
    Tuple of (instruction, expected output, model answer, raw evaluation
    text, integer score).
  """
  question = item['instruction']
  answer = await get_model_answer(question, model_name)
  expected = item['output']
  verdict = await evaluate_answer(answer, expected)
  return question, expected, answer, verdict, extract_score(verdict)

async def evaluate_model(model_name):
  """Evaluate one model on every item in `eval_data` and save the results.

  All items are processed concurrently with `asyncio.gather`; the per-item
  rows are collected into a DataFrame, the mean score is printed, and the
  table is written to an Excel file in the current directory.

  Args:
    model_name: Model identifier to evaluate.

  Returns:
    Tuple of (DataFrame with one row per item, mean of the 'Score' column).
  """
  rows = await asyncio.gather(
    *(process_item(item, model_name) for item in eval_data)
  )

  df = pd.DataFrame(
    rows,
    columns=['Instruction', 'Expected Output', 'Model Answer', 'Evaluation', 'Score'],
  )
  avg_score = df['Score'].mean()

  print(f"\nModel: {model_name}")
  print(f"Average Evaluation Score: {avg_score:.2f}")

  # Model ids can contain characters that are unsafe in file names (for
  # example ':' in fine-tuned ids such as 'ft:...', or '/'). Replace them so
  # the save cannot fail after all the API calls have already been made.
  # Names without such characters produce the same path as before.
  safe_name = re.sub(r'[\\/:*?"<>|]', '_', model_name)
  excel_path = f'./evaluation_results_{safe_name}.xlsx'
  df.to_excel(excel_path, index=False)
  print(f"Results saved to {excel_path}")

  return df, avg_score

async def main():
  """Evaluate each configured model in turn and build a ranked summary.

  Models are evaluated one after another. The summary table is shown,
  written to './model_comparison_summary.xlsx', and returned.

  Returns:
    Tuple of (results, summary_df). `results` maps each model name to
    {"df": per-item DataFrame, "avg_score": mean score}; `summary_df` has
    columns "Model" and "Average Score", sorted by score descending.
  """
  models_to_evaluate = ["gpt-4o-mini", "gpt-4o", "gpt-4-0125-preview"]  # Add your model names here, e.g. "gpt-3.5-turbo"
  results = {}

  for model in models_to_evaluate:
    df, avg_score = await evaluate_model(model)
    results[model] = {"df": df, "avg_score": avg_score}

  # Create a summary DataFrame
  summary_df = pd.DataFrame(
    [(model, data["avg_score"]) for model, data in results.items()],
    columns=["Model", "Average Score"],
  )
  summary_df = summary_df.sort_values("Average Score", ascending=False).reset_index(drop=True)

  print("\nModel Comparison Summary:")
  # `display` is only defined inside IPython/Jupyter. Fall back to `print`
  # so a plain-Python run does not die with NameError after all models have
  # already been evaluated.
  try:
    show = display
  except NameError:
    show = print
  show(summary_df)

  # Save summary to Excel
  summary_excel_path = './model_comparison_summary.xlsx'
  summary_df.to_excel(summary_excel_path, index=False)
  print(f"\nSummary saved to {summary_excel_path}")

  return results, summary_df

# Check if we're in a Jupyter notebook
try:
  get_ipython()
  is_notebook = True
except NameError:
  is_notebook = False

if is_notebook:
  # If in a Jupyter notebook, use this:
  results, summary_df = await main()
else:
  # If in a regular Python script, use this:
  results, summary_df = asyncio.run(main())


Model: gpt-4o-mini
Average Evaluation Score: 0.68
Results saved to ./evaluation_results_gpt-4o-mini.xlsx

Model: gpt-4o
Average Evaluation Score: 0.71
Results saved to ./evaluation_results_gpt-4o.xlsx

Model: gpt-4-0125-preview
Average Evaluation Score: 0.70
Results saved to ./evaluation_results_gpt-4-0125-preview.xlsx

Model Comparison Summary:


Unnamed: 0,Model,Average Score
0,gpt-4o,0.71
1,gpt-4-0125-preview,0.7
2,gpt-4o-mini,0.68



Summary saved to ./model_comparison_summary.xlsx
