In [7]:
import os
from dotenv import load_dotenv
from openai import OpenAI


### Check available models

In [24]:
# load_dotenv() runs before the environment lookups below so that values
# kept in a .env file are available to os.getenv.
load_dotenv()

# API key and endpoint URL are both taken from the environment, never hardcoded.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_BASE_URL"))

# Ask the endpoint for its model listing and print one model id per line.
models = client.models.list()
for model_entry in models.data:
    print(model_entry.id)
    

gpt-4-0613
gpt-4
gpt-3.5-turbo
gpt-5.2-codex
gpt-4o-mini-tts-2025-12-15
gpt-realtime-mini-2025-12-15
gpt-audio-mini-2025-12-15
chatgpt-image-latest
davinci-002
babbage-002
gpt-3.5-turbo-instruct
gpt-3.5-turbo-instruct-0914
dall-e-3
dall-e-2
gpt-4-1106-preview
gpt-3.5-turbo-1106
tts-1-hd
tts-1-1106
tts-1-hd-1106
text-embedding-3-small
text-embedding-3-large
gpt-4-0125-preview
gpt-4-turbo-preview
gpt-3.5-turbo-0125
gpt-4-turbo
gpt-4-turbo-2024-04-09
gpt-4o
gpt-4o-2024-05-13
gpt-4o-mini-2024-07-18
gpt-4o-mini
gpt-4o-2024-08-06
chatgpt-4o-latest
gpt-4o-audio-preview
gpt-4o-realtime-preview
omni-moderation-latest
omni-moderation-2024-09-26
gpt-4o-realtime-preview-2024-12-17
gpt-4o-audio-preview-2024-12-17
gpt-4o-mini-realtime-preview-2024-12-17
gpt-4o-mini-audio-preview-2024-12-17
o1-2024-12-17
o1
gpt-4o-mini-realtime-preview
gpt-4o-mini-audio-preview
o3-mini
o3-mini-2025-01-31
gpt-4o-2024-11-20
gpt-4o-search-preview-2025-03-11
gpt-4o-search-preview
gpt-4o-mini-search-preview-2025-03-11
gpt

### Structured JSON sanity check

In [31]:
load_dotenv()

# Client settings are read from the environment.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_BASE_URL"))

# The model is pinned for this check; swap in os.getenv("OPENAI_MODEL")
# to take it from the environment instead.
model = "gpt-5.2"

# User-turn instruction: ask for exactly one question, returned as a bare JSON object.
prompt = """
Generate ONE reasoning question and output ONLY valid JSON in this format:

{
  "category": "reasoning",
  "difficulty": 1-5,
  "prompt": "question text"
}
"""

# Single chat completion at temperature 0.0: one system turn, one user turn.
response = client.chat.completions.create(
    model=model,
    temperature=0.0,
    messages=[
        dict(role="system", content="You generate benchmark questions."),
        dict(role="user", content=prompt),
    ],
)

# Show the raw text of the first choice so the JSON can be inspected by eye.
print(response.choices[0].message.content)


{
  "category": "reasoning",
  "difficulty": 3,
  "prompt": "A store sells notebooks for $3 each and pens for $2 each. Maya buys 4 items total and spends $10. How many notebooks and how many pens did she buy?"
}


### JSON scoring reliability

In [12]:
import os, json
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_BASE_URL"))


def _parse_judge_reply(reply_text):
    """Turn the grader's raw reply into a Python object.

    Args:
        reply_text: The message content returned by the chat completion.
            May be None or empty when the reply carries no text.

    Returns:
        The value produced by ``json.loads`` on the cleaned reply.

    Raises:
        ValueError: If ``reply_text`` is None, empty, or whitespace only.
        json.JSONDecodeError: If the cleaned reply is still not valid JSON.
    """
    if not reply_text or not reply_text.strip():
        raise ValueError("Grader returned no text content; nothing to parse.")

    cleaned = reply_text.strip()
    if cleaned.startswith("```"):
        # The reply is wrapped in a Markdown code fence: drop the opening
        # fence line (``` or ```json) and everything from the closing fence on.
        _, _, cleaned = cleaned.partition("\n")
        cleaned = cleaned.rsplit("```", 1)[0]

    return json.loads(cleaned)


# Alternative test pair (a reasoning puzzle with a wrong-looking answer):
# question = "If five machines take five minutes to make five widgets, how long would it take one machine to make a hundred widgets?"
# answer = "100 minutes."

question = "Explain why the Earth has seasons."
answer = "The Earth has seasons because it moves around the Sun."

# Grading instruction. Doubled braces are literal braces inside the f-string;
# only {question} and {answer} are interpolated.
judge_prompt = f"""
You are a strict evaluator for benchmark answers.

Grade the answer to the question using this rubric:
- correctness (0 to 1)
- completeness (0 to 1)
- clarity (0 to 1)

Return ONLY valid JSON exactly in this schema:
{{
  "score": 0.0,
  "pass": false,
  "reasons": ["..."],
  "rubric_breakdown": {{
    "correctness": 0.0,
    "completeness": 0.0,
    "clarity": 0.0
  }},
  "confidence": 0.0
}}

Question: {question}
Answer: {answer}
"""

resp = client.chat.completions.create(
    model=os.getenv("OPENAI_MODEL"),
    messages=[
        {"role": "system", "content": "You are a strict JSON-only grader."},
        {"role": "user", "content": judge_prompt},
    ],
)

txt = resp.choices[0].message.content
print(txt)

# Validate that the reply is parseable JSON. The helper rejects an empty
# reply with a clear error and tolerates a surrounding Markdown code fence.
data = _parse_judge_reply(txt)
print("\nParsed score:", data["score"])


{
  "score": 0.0,
  "pass": false,
  "reasons": [
    "The explanation is incomplete and misleading: seasons are primarily caused by the tilt of the Earth's axis relative to its orbital plane around the Sun, which changes the angle and duration of sunlight; simply stating that the Earth 'moves around the Sun' does not explain the seasonal cycle."
  ],
  "rubric_breakdown": {
    "correctness": 0.0,
    "completeness": 0.0,
    "clarity": 0.5
  },
  "confidence": 0.6
}

Parsed score: 0.0


In [23]:
# %mkdir src scripts
# %mkdir data
from pathlib import Path

# Create scripts/bench.py as an empty placeholder if it does not exist yet.
# Unlike the shell redirect `type nul > file` (Windows cmd only), this works
# on any OS, creates the scripts/ directory when missing, and leaves an
# existing bench.py untouched when the cell is re-run.
bench_path = Path("scripts") / "bench.py"
bench_path.parent.mkdir(parents=True, exist_ok=True)
bench_path.touch(exist_ok=True)