In [10]:
import pandas as pd
import yaml

import glob
import json
import os


# Root of the evals registry, expressed relative to the current working directory.
registry_pth = os.path.join("..", "evals", "registry")
# Make sure the calculus data directory exists before later cells write into it;
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(
    os.path.join(registry_pth, "data", "calculus"),
    exist_ok=True,
)



def create_chat_prompt(prompt, ideal, rubric):
  """Build one eval sample whose input is a single system chat message.

  Args:
    prompt: Text placed in the 'content' field of the system message.
    ideal: Value stored unchanged under the 'ideal' key.
    rubric: Value stored unchanged under the 'rubric' key.

  Returns:
    A dict with the keys 'input' (a one-element list holding the system
    message), 'ideal' and 'rubric'.
  """
  system_message = {'role': 'system', 'content': prompt}
  return {
    'input': [system_message],
    'ideal': ideal,
    'rubric': rubric,
  }

def create_chat_prompt_labeled(prompt, ideal, rubric, completion, choice):
  """Build one labeled eval sample whose input is a single system chat message.

  Args:
    prompt: Text placed in the 'content' field of the system message.
    ideal: Value stored unchanged under the 'ideal' key.
    rubric: Value stored unchanged under the 'rubric' key.
    completion: Value stored unchanged under the 'completion' key.
    choice: Value stored unchanged under the 'choice' key.

  Returns:
    A dict with the keys 'input' (a one-element list holding the system
    message), 'ideal', 'rubric', 'completion' and 'choice'.
  """
  system_message = {'role': 'system', 'content': prompt}
  return {
    'input': [system_message],
    'ideal': ideal,
    'rubric': rubric,
    'completion': completion,
    'choice': choice,
  }



def print_evallog(path_match, idx=-1):
    logfiles = glob.glob(path_match)
    logfiles.sort()
    log_name = logfiles[idx]
    print(log_name)
    with open(log_name, "r") as f:
        events_df = pd.read_json(f, lines=True)
    for entry in events_df['data']:
        if entry == entry:
            if 'prompt' in entry:
                print('*' * 40, 'prompt')
                for x in entry['prompt']:
                    print(x['content'])
            if 'sampled' in entry:
                print('-' * 40, 'sampled')
                for x in entry['sampled']:
                    print(x)
            if 'choice' in entry:
                print('-' * 40, 'choice:', entry['choice'])
            if 'score' in entry:
                print('-' * 40, 'score:', entry['score'])
            if 'metascore' in entry:
                print('-' * 40, 'metascore:', entry['metascore'])
    return events_df

In [2]:
# Load the hand-written demo problems. Later cells read the 'input', 'ideal',
# 'rubric' and 'example_evals' fields of each entry.
with open('calculus-demo.yaml', 'r', encoding='utf-8') as f:
    # safe_load only builds plain Python types and, unlike yaml.CLoader, does
    # not require PyYAML to have been built with the optional libyaml extension.
    data0 = yaml.safe_load(f)
df0 = pd.DataFrame(data0)

In [3]:
# Export the unlabeled problems as JSON Lines, one record per row, keeping only
# the three columns written below.
demo_pth = os.path.join(registry_pth, 'data', 'calculus', 'demo.jsonl')
df0.loc[:, ['input', 'ideal', 'rubric']].to_json(
    demo_pth,
    orient="records",
    lines=True,
)

In [4]:
# Flatten the problems into labeled records: every entry of a problem's
# 'example_evals' becomes its own record that repeats the problem's fields.
data1 = [
    {
        'input': problem['input'],
        'ideal': problem['ideal'],
        'rubric': problem['rubric'],
        'completion': graded['completion'],
        'choice': graded['choice'],
    }
    for problem in data0
    for graded in problem['example_evals']
]

In [5]:
# Export the labeled records as JSON Lines, one record per row.
df1 = pd.DataFrame(data1)
demo_labeled_pth = os.path.join(registry_pth, 'data', 'calculus', 'demo_labeled.jsonl')
df1.loc[:, ['input', 'ideal', 'rubric', 'completion', 'choice']].to_json(
    demo_labeled_pth,
    orient="records",
    lines=True,
)

In [6]:
# List the paths under /tmp/evallogs whose names end in _calculus.jsonl.
glob.glob('/tmp/evallogs/*_calculus.jsonl')

['/tmp/evallogs\\230521063016GCLA67AX_gpt-3.5-turbo_calculus.jsonl',
 '/tmp/evallogs\\230521064003ZX6CMR3C_gpt-3.5-turbo_calculus.jsonl',
 '/tmp/evallogs\\230521065918QNDMUMW3_gpt-3.5-turbo_calculus.jsonl',
 '/tmp/evallogs\\230521070112BCUEO4NZ_gpt-3.5-turbo_calculus.jsonl',
 '/tmp/evallogs\\230521070826AALQ3GPC_gpt-3.5-turbo_calculus.jsonl',
 '/tmp/evallogs\\230521071107CFD45PFO_gpt-3.5-turbo_calculus.jsonl',
 '/tmp/evallogs\\2305210712373AURJBGC_gpt-3.5-turbo_calculus.jsonl',
 '/tmp/evallogs\\230521234127DQBEUDEL_gpt-3.5-turbo_calculus.jsonl',
 '/tmp/evallogs\\230521235346KBXJC2YU_gpt-3.5-turbo_calculus.jsonl',
 '/tmp/evallogs\\230521235705QVVVZTWJ_gpt-3.5-turbo_calculus.jsonl',
 '/tmp/evallogs\\230522012817TBPARCPR_gpt-3.5-turbo_calculus.jsonl',
 '/tmp/evallogs\\230522015226V2WZGJ7B_gpt-3.5-turbo_calculus.jsonl',
 '/tmp/evallogs\\230522025627W2CWE67U_gpt-3.5-turbo_calculus.jsonl',
 '/tmp/evallogs\\230522033738ZAHPBMDI_gpt-3.5-turbo_calculus.jsonl',
 '/tmp/evallogs\\230522035632QB7HS

In [None]:
# Shell out to the oaieval command, passing gpt-3.5-turbo and the calculus eval name.
!oaieval gpt-3.5-turbo calculus

In [11]:
# Print the transcript of the last calculus log (by sorted file name) and keep
# its events table for the inspection cells below.
events_df = print_evallog('/tmp/evallogs/*_calculus.jsonl')

/tmp/evallogs\230522063909VGWZESA5_gpt-3.5-turbo_calculus.jsonl
**************************************** prompt
Evaluate the following integral:
$$
\int \cos x dx
$$
When you are done, repeat your answer by itself on a new line.
---------------------------------------- sampled
$$
\int \cos x dx = \sin x + C
$$
**************************************** prompt
Evaluate the following integral:
$$
\int \cosh x dx
$$
When you are done, repeat your answer by itself on a new line.
---------------------------------------- sampled
We have:
\begin{align*}
\int \cosh x dx &= \sinh x + C \\
&= \frac{e^x - e^{-x}}{2} + C
\end{align*}
where $C$ is the constant of integration.
**************************************** prompt
You are comparing a submitted answer to an expert answer for a given mathematics problem:
---
[Mathematics Problem]: Evaluate the following integral:
$$
\int \cos x dx
$$
When you are done, repeat your answer by itself on a new line.
---
[Expert]: $|int \cos x dx = \sin x + C$.
---

In [19]:
# Print the content of every message in the second-to-last event's prompt.
prompt_messages = events_df['data'].iloc[-2]['prompt']
for message in prompt_messages:
    print(message['content'])

You are comparing a submitted answer to an expert answer for a given mathematics problem:
---
[Mathematics Problem]: Evaluate the following integral:
$$
\int \frac{{dx}}{{x^2+a^2}}
$$
When you are done, repeat your answer by itself on a new line.
---
[Expert]: $\int \frac{dx}{x^2+a^2} = \frac{1}{a} \tan^{-1}\left(\frac{x}{a}\right) + C$.
---
[Submission]: We can start by using the substitution $x=a\tan\theta$, which gives us $dx=a\sec^2\theta d\theta$ and $x^2+a^2=a^2\sec^2\theta$. Substituting these into the integral, we get:
\begin{align*}
\int \frac{dx}{x^2+a^2} &= \int \frac{a\sec^2\theta d\theta}{a^2\sec^2\theta} \\
&= \frac{1}{a}\int \frac{d\theta}{\sec^2\theta} \\
&= \frac{1}{a}\int \cos^2\theta d\theta \\
&= \frac{1}{a}\int \frac{1+\cos 2\theta}{2} d\theta \\
&= \frac{1}{2a}\left(\theta + \frac{1}{2}\sin 2\theta\right) + C \\
&= \frac{1}{2a}\left(\tan^{-1}\frac{x}{a} + \frac{x}{a}\cdot\frac{1}{2}\cdot\frac{a^2}{a^2+x^2}\right) + C \\
&= \frac{1}{2a}\left(\tan^{-1}\frac{x}{a} + 

In [20]:
# Print every sampled completion recorded on the second-to-last event.
sampled_texts = events_df['data'].iloc[-2]['sampled']
for text in sampled_texts:
    print(text)

The submission correctly uses the substitution $x=a\tan\theta$ to simplify the integral. The subsequent steps are also correct, leading to the final answer of $\frac{1}{2a}\left(\tan^{-1}\frac{x}{a} + \frac{x}{2(a^2+x^2)}\right) + C$. This expression is equivalent to the expert answer of $\frac{1}{a} \tan^{-1}\left(\frac{x}{a}\right) + C$, but not identical. Therefore, the Final Answer Correctness score is 0. However, the explanation is not relevant to the grading rubric, so the Explanation Correctness score is not applicable. 

0


In [16]:
# Show the raw 'data' payload of the second-to-last event.
events_df['data'].iloc[-2]

{'prompt': [{'role': 'user',
   'content': 'You are comparing a submitted answer to an expert answer for a given mathematics problem:\n---\n[Mathematics Problem]: Evaluate the following integral:\n$$\n\\int \\frac{{dx}}{{x^2+a^2}}\n$$\nWhen you are done, repeat your answer by itself on a new line.\n---\n[Expert]: $\\int \\frac{dx}{x^2+a^2} = \\frac{1}{a} \\tan^{-1}\\left(\\frac{x}{a}\\right) + C$.\n---\n[Submission]: We can start by using the substitution $x=a\\tan\\theta$, which gives us $dx=a\\sec^2\\theta d\\theta$ and $x^2+a^2=a^2\\sec^2\\theta$. Substituting these into the integral, we get:\n\\begin{align*}\n\\int \\frac{dx}{x^2+a^2} &= \\int \\frac{a\\sec^2\\theta d\\theta}{a^2\\sec^2\\theta} \\\\\n&= \\frac{1}{a}\\int \\frac{d\\theta}{\\sec^2\\theta} \\\\\n&= \\frac{1}{a}\\int \\cos^2\\theta d\\theta \\\\\n&= \\frac{1}{a}\\int \\frac{1+\\cos 2\\theta}{2} d\\theta \\\\\n&= \\frac{1}{2a}\\left(\\theta + \\frac{1}{2}\\sin 2\\theta\\right) + C \\\\\n&= \\frac{1}{2a}\\left(\\tan^{-1}

In [12]:
# Display the events table returned by print_evallog.
events_df

Unnamed: 0,spec,final_report,run_id,event_id,sample_id,type,data,created_by,created_at
0,"{'completion_fns': ['gpt-3.5-turbo'], 'eval_na...",,,,,,,,NaT
1,,"{'counts/100': 16, 'counts/95': 1, 'counts/0':...",,,,,,,NaT
2,,,230522063909VGWZESA5,0.0,calculus.dev.5,sampling,"{'prompt': [{'role': 'system', 'content': 'Eva...",,2023-05-22 06:39:11.684845+00:00
3,,,230522063909VGWZESA5,1.0,calculus.dev.15,sampling,"{'prompt': [{'role': 'system', 'content': 'Eva...",,2023-05-22 06:39:16.063785+00:00
4,,,230522063909VGWZESA5,2.0,calculus.dev.5,sampling,"{'prompt': [{'role': 'user', 'content': 'You a...",,2023-05-22 06:39:17.396380+00:00
5,,,230522063909VGWZESA5,3.0,calculus.dev.5,metrics,"{'choice': '100', 'score': 1.0}",,2023-05-22 06:39:17.396380+00:00
6,,,230522063909VGWZESA5,4.0,calculus.dev.7,sampling,"{'prompt': [{'role': 'system', 'content': 'Eva...",,2023-05-22 06:39:20.066030+00:00
7,,,230522063909VGWZESA5,5.0,calculus.dev.3,sampling,"{'prompt': [{'role': 'system', 'content': 'Eva...",,2023-05-22 06:39:20.694882+00:00
8,,,230522063909VGWZESA5,6.0,calculus.dev.0,sampling,"{'prompt': [{'role': 'system', 'content': 'Eva...",,2023-05-22 06:39:21.236356+00:00
9,,,230522063909VGWZESA5,7.0,calculus.dev.13,sampling,"{'prompt': [{'role': 'system', 'content': 'Eva...",,2023-05-22 06:39:21.999777+00:00


In [None]:
# Shell out to the oaieval command, passing gpt-3.5-turbo and the calculus-meta eval name.
!oaieval gpt-3.5-turbo calculus-meta

In [9]:
# Print the transcript of the last calculus-meta log (by sorted file name); the
# returned events table is the value of this cell.
print_evallog('/tmp/evallogs/*_calculus-meta.jsonl')

/tmp/evallogs\230526021623NKRKJZ24_gpt-3.5-turbo_calculus-meta.jsonl
**************************************** prompt
You are comparing a submitted answer to an expert answer for a given mathematics problem:
---
[Mathematics Problem]: Evaluate the following integral:
$$
\int \csc^2 x dx
$$
When you are done, repeat your answer by itself on a new line.
---
[Expert]: $\int \csc^2 x dx = -\cot x + C$.
---
[Submission]: The integral of $\csc^2 x dx$ is $-\cot x + C$.
---
You should specifically compare the exact final expressions from the student and the expert. In general, it is acceptable for the expressions to be equivalent but not identical. Use the following rubric to score the submission. Round the score to the nearest integer in the range 0-100 inclusive.

[Rubric]: - Final Answer Correctness (50 points): The final answer provided by the model matches the expert's answer.
  - 50: The final answer is $-\cot x + C$.
  - 0: The final answer is not $-\cot x + C$.
- Explanation Correctnes

Prompt for the language model:

We've been working on creating a dataset for assessing a model's ability to solve calculus problems and provide explanations. We're using a set of standard integration formulas as the basis for the problems, creating a YAML file for each. Each YAML file entry includes the problem statement, the correct answer, a grading rubric, and examples of model-generated answers with their respective scores. The rubric divides the grading into two parts: the correctness of the final answer and the correctness of the explanation. Could you please continue this work? Please follow the structure we've been using, as shown in this example:

```yaml
- input: |-
    Evaluate the following integral:
    $$
    \int e^{x} dx
    $$
  ideal: |-
    Because the derivative of $e^x$ is $e^x$, it follows that $\int e^x dx$ is simply $e^x + C$.
  rubric: |-
    - Final Answer Correctness (50 points): The final answer provided by the model matches the expert's answer.
      - 50: The final answer is $e^x + C$.
      - 0: The final answer is not $e^x + C$.
    - Explanation Correctness (50 points): The explanation provided by the model to arrive at the answer is correct and logically sound.
      - 50: The explanation correctly states that the derivative of $e^x$ is $e^x$, so the integral of $e^x dx$ is $e^x + C$.
      - 25: The explanation is partially correct but either does not correctly identify the derivative of $e^x$ as $e^x$ or does not correctly state that the integral of $e^x dx$ is $e^x + C$.
      - 0: The explanation is incorrect or missing.
    There is little nuance in the grading of this problem, as the formula is fundamental and usually memorized as part of a standard integration table.
  example_evals:
    - completion: |-
        We know that the derivative of $e^x$ is itself, so we can use substitution to evaluate the integral. Let $u=e^x$, then $du=e^x dx$. Substituting these into the integral, we get:
        $$
        \int e^{x} dx = \int u du = \frac{u^2}{2} + C = \frac{e^{2x}}{2} + C
        $$
      choice: '0'
    - completion: |-
        We know that the derivative of $e^x$ is itself, so we can use integration by substitution. Let $u = x$ and $du = dx$. Then we have:
        $$
        \int e^{x} dx = \int e^{u} du = e^u + C = e^x + C
        $$
        where $C$ is the constant of integration. Therefore, the solution to the integral is $\boxed{e^x + C}$.
      choice: '100'
```

I would like to contribute to the OpenAI evals project and am working on model-graded calculus evals.

So far I have four files to set up the evals:

`evals/registry/modelgraded/mathematics.yaml`. I don't know the name for this type of file, but it specifies a template for a model-graded eval for use with mathematics evals. Here are the contents:

```yaml
mathematics:
  prompt: |-
    You are comparing a submitted answer to an expert answer for a given mathematics problem:
    ---
    [Mathematics Problem]: {input}
    ---
    [Expert]: {ideal}
    ---
    [Submission]: {completion}
    ---
    Use the following rubric to score the submission. Round the score to the nearest integer in the range 0-100 inclusive.

    [Rubric]: {rubric}
  eval_type: cot_classify
  choice_scores: {"100": 1.0, "99": 0.99, "98": 0.98, "97": 0.97, "96": 0.96, "95": 0.95, "94": 0.94, "93": 0.93, "92": 0.92, "91": 0.91, "90": 0.9, "89": 0.89, "88": 0.88, "87": 0.87, "86": 0.86, "85": 0.85, "84": 0.84, "83": 0.83, "82": 0.82, "81": 0.81, "80": 0.8, "79": 0.79, "78": 0.78, "77": 0.77, "76": 0.76, "75": 0.75, "74": 0.74, "73": 0.73, "72": 0.72, "71": 0.71, "70": 0.7, "69": 0.69, "68": 0.68, "67": 0.67, "66": 0.66, "65": 0.65, "64": 0.64, "63": 0.63, "62": 0.62, "61": 0.61, "60": 0.6, "59": 0.59, "58": 0.58, "57": 0.57, "56": 0.56, "55": 0.55, "54": 0.54, "53": 0.53, "52": 0.52, "51": 0.51, "50": 0.5, "49": 0.49, "48": 0.48, "47": 0.47, "46": 0.46, "45": 0.45, "44": 0.44, "43": 0.43, "42": 0.42, "41": 0.41, "40": 0.4, "39": 0.39, "38": 0.38, "37": 0.37, "36": 0.36, "35": 0.35, "34": 0.34, "33": 0.33, "32": 0.32, "31": 0.31, "30": 0.3, "29": 0.29, "28": 0.28, "27": 0.27, "26": 0.26, "25": 0.25, "24": 0.24, "23": 0.23, "22": 0.22, "21": 0.21, "20": 0.2, "19": 0.19, "18": 0.18, "17": 0.17, "16": 0.16, "15": 0.15, "14": 0.14, "13": 0.13, "12": 0.12, "11": 0.11, "10": 0.1, "9": 0.09, "8": 0.08, "7": 0.07, "6": 0.06, "5": 0.05, "4": 0.04, "3": 0.03, "2": 0.02, "1": 0.01, "0": 0.0}
  choice_strings: ["100", "99", "98", "97", "96", "95", "94", "93", "92", "91", "90", "89", "88", "87", "86", "85", "84", "83", "82", "81", "80", "79", "78", "77", "76", "75", "74", "73", "72", "71", "70", "69", "68", "67", "66", "65", "64", "63", "62", "61", "60", "59", "58", "57", "56", "55", "54", "53", "52", "51", "50", "49", "48", "47", "46", "45", "44", "43", "42", "41", "40", "39", "38", "37", "36", "35", "34", "33", "32", "31", "30", "29", "28", "27", "26", "25", "24", "23", "22", "21", "20", "19", "18", "17", "16", "15", "14", "13", "12", "11", "10", "9", "8", "7", "6", "5", "4", "3", "2", "1", "0"]
  input_outputs:
    input: "completion"
```

`evals/registry/evals/calculus.yaml`: This registers the eval. Its contents:

```yaml
calculus:
  id: calculus.dev.v0
  metrics: [accuracy]
calculus.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    samples_jsonl: calculus/demo.jsonl
    eval_type: cot_classify
    modelgraded_spec: mathematics

calculus-meta:
  id: calculus-meta.dev.v0
  metrics: [accuracy]
calculus-meta.dev.v0:
  class: evals.elsuite.modelgraded.classify:ModelBasedClassify
  args:
    samples_jsonl: calculus/demo_labeled.jsonl
    eval_type: cot_classify
    modelgraded_spec: mathematics
    metaeval: true
```

`evals/registry/data/calculus/demo.jsonl`: This is the dataset for the eval. For brevity I'll only list one line of the file:

```json
{"input":"Evaluate the following integral:\n$$\n\\int x^n dx\n$$","ideal":"$\\int x^n dx = \\frac{x^{n+1}}{n+1} + C$, for $n \\neq -1$.","rubric":"- Final Answer Correctness (50 points): The final answer provided by the model matches the expert's answer.\n  - 50: The final answer is $\\frac{x^{n+1}}{n+1} + C$.\n  - 0: The final answer is not $\\frac{x^{n+1}}{n+1} + C$.\n- Explanation Correctness (50 points): The explanation provided by the model to arrive at the answer is correct and logically sound.\n  - 50: The explanation correctly states the power rule for integration, and applies it correctly to arrive at the final answer.\n  - 25: The explanation is partially correct but either does not correctly state the power rule for integration or does not correctly apply it.\n  - 0: The explanation is incorrect or missing.\n\nThere is little nuance in the grading of this problem, as the formula is fundamental and usually memorized as part of a standard integration table."}
```

`evals/registry/data/calculus/demo_labeled.jsonl`: This is the dataset for the meta eval, to evaluate the grading model. For brevity I'll only list two lines of the file:

```json
{"input":"Evaluate the following integral:\n$$\n\\int x^n dx\n$$","ideal":"$\\int x^n dx = \\frac{x^{n+1}}{n+1} + C$, for $n \\neq -1$.","rubric":"- Final Answer Correctness (50 points): The final answer provided by the model matches the expert's answer.\n  - 50: The final answer is $\\frac{x^{n+1}}{n+1} + C$.\n  - 0: The final answer is not $\\frac{x^{n+1}}{n+1} + C$.\n- Explanation Correctness (50 points): The explanation provided by the model to arrive at the answer is correct and logically sound.\n  - 50: The explanation correctly states the power rule for integration, and applies it correctly to arrive at the final answer.\n  - 25: The explanation is partially correct but either does not correctly state the power rule for integration or does not correctly apply it.\n  - 0: The explanation is incorrect or missing.\n\nThere is little nuance in the grading of this problem, as the formula is fundamental and usually memorized as part of a standard integration table.","completion":"The power rule for integration states that the integral of $x^n$ is $\\frac{x^{n+2}}{n+2} + C$. So the integral of $x^n dx$ is $\\frac{x^{n+2}}{n+2} + C$.","choice":"0"}
{"input":"Evaluate the following integral:\n$$\n\\int x^n dx\n$$","ideal":"$\\int x^n dx = \\frac{x^{n+1}}{n+1} + C$, for $n \\neq -1$.","rubric":"- Final Answer Correctness (50 points): The final answer provided by the model matches the expert's answer.\n  - 50: The final answer is $\\frac{x^{n+1}}{n+1} + C$.\n  - 0: The final answer is not $\\frac{x^{n+1}}{n+1} + C$.\n- Explanation Correctness (50 points): The explanation provided by the model to arrive at the answer is correct and logically sound.\n  - 50: The explanation correctly states the power rule for integration, and applies it correctly to arrive at the final answer.\n  - 25: The explanation is partially correct but either does not correctly state the power rule for integration or does not correctly apply it.\n  - 0: The explanation is incorrect or missing.\n\nThere is little nuance in the grading of this problem, as the formula is fundamental and usually memorized as part of a standard integration table.","completion":"The power rule for integration states that the integral of $x^n$ is $\\frac{x^{n+1}}{n+1} + C$, for $n \\neq -1$. So the integral of $x^n dx$ is $\\frac{x^{n+1}}{n+1} + C$.","choice":"100"}
```

---

I run the calculus evals using the command `oaieval gpt-3.5-turbo calculus`. Here are some lines from the log file:

```json
{"spec": {"completion_fns": ["gpt-3.5-turbo"], "eval_name": "calculus.dev.v0", "base_eval": "calculus", "split": "dev", "run_config": {"completion_fns": ["gpt-3.5-turbo"], "eval_spec": {"cls": "evals.elsuite.modelgraded.classify:ModelBasedClassify", "args": {"samples_jsonl": "calculus/demo.jsonl", "eval_type": "cot_classify", "modelgraded_spec": "mathematics"}, "key": "calculus.dev.v0", "group": "calculus"}, "seed": 20220722, "max_samples": null, "command": "'c:\\Users\\kaben\\Documents\\GitHub\\irrealis\\openai-evals\\venv\\Scripts\\oaieval' gpt-3.5-turbo calculus", "initial_settings": {"visible": true}}, "created_by": "", "run_id": "230522044906UV2WDKNC", "created_at": "2023-05-22 04:49:06.561537"}}
{"final_report": {"counts/100": 14, "counts/25": 1, "counts/95": 1, "counts/50": 1, "counts/13": 1, "score": 0.8794444444444445}}
{"run_id": "230522044906UV2WDKNC", "event_id": 7, "sample_id": "calculus.dev.14", "type": "sampling", "data": {"prompt": [{"role": "user", "content": "You are comparing a submitted answer to an expert answer for a given mathematics problem:\n---\n[Mathematics Problem]: Evaluate the following integral:\n$$\n\\int \\sinh x dx\n$$\n---\n[Expert]: $\\int \\sinh x dx = \\cosh x + C$.\n---\n[Submission]: We have:\n$$\n\\int \\sinh x dx = \\cosh x + C,\n$$\nwhere $C$ is the constant of integration. This can be verified by differentiating $\\cosh x$:\n$$\n\\frac{d}{dx}(\\cosh x) = \\sinh x,\n$$\nusing the chain rule. Therefore, the integral of $\\sinh x$ is $\\cosh x$ plus a constant.\n---\nUse the following rubric to score the submission. Round the score to the nearest integer in the range 0-100 inclusive.\n\n[Rubric]: - Final Answer Correctness (50 points):\n  - 50: The final answer is $\\cosh x + C$.\n  - 0: The final answer is not $\\cosh x + C$.\n- Explanation Correctness (50 points):\n  - 50: The explanation correctly states that the derivative of $\\cosh x$ is $\\sinh x$, so the integral of $\\sinh x dx$ is $\\cosh x + C$.\n  - 25: The explanation is partially correct but either does not correctly identify the derivative of $\\cosh x$ as $\\sinh x$ or does not correctly state that the integral of $\\sinh x dx$ is $\\cosh x + C$.\n  - 0: The explanation is incorrect or missing.\n\nThere is little nuance in the grading of this problem, as the formula is fundamental and usually memorized as part of a standard integration table.\n\nFirst, write out in a step by step manner your reasoning to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset. 
Then print only a single choice from \"100\" or \"99\" or \"98\" or \"97\" or \"96\" or \"95\" or \"94\" or \"93\" or \"92\" or \"91\" or \"90\" or \"89\" or \"88\" or \"87\" or \"86\" or \"85\" or \"84\" or \"83\" or \"82\" or \"81\" or \"80\" or \"79\" or \"78\" or \"77\" or \"76\" or \"75\" or \"74\" or \"73\" or \"72\" or \"71\" or \"70\" or \"69\" or \"68\" or \"67\" or \"66\" or \"65\" or \"64\" or \"63\" or \"62\" or \"61\" or \"60\" or \"59\" or \"58\" or \"57\" or \"56\" or \"55\" or \"54\" or \"53\" or \"52\" or \"51\" or \"50\" or \"49\" or \"48\" or \"47\" or \"46\" or \"45\" or \"44\" or \"43\" or \"42\" or \"41\" or \"40\" or \"39\" or \"38\" or \"37\" or \"36\" or \"35\" or \"34\" or \"33\" or \"32\" or \"31\" or \"30\" or \"29\" or \"28\" or \"27\" or \"26\" or \"25\" or \"24\" or \"23\" or \"22\" or \"21\" or \"20\" or \"19\" or \"18\" or \"17\" or \"16\" or \"15\" or \"14\" or \"13\" or \"12\" or \"11\" or \"10\" or \"9\" or \"8\" or \"7\" or \"6\" or \"5\" or \"4\" or \"3\" or \"2\" or \"1\" or \"0\" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the answer by itself on a new line.\n\nReasoning:"}], "sampled": ["The submission correctly identifies the integral of $\\sinh x$ as $\\cosh x + C$ and provides a valid explanation by showing that the derivative of $\\cosh x$ is $\\sinh x$. Therefore, the submission receives a score of 100.\n\n100\n\nAnswer: 100"]}, "created_by": "", "created_at": "2023-05-22 04:49:20.887594+00:00"}
```

I run the evals of the grading model using the command `oaieval gpt-3.5-turbo calculus-meta`. Here are some lines from the log file:

```json
{"spec": {"completion_fns": ["gpt-3.5-turbo"], "eval_name": "calculus-meta.dev.v0", "base_eval": "calculus-meta", "split": "dev", "run_config": {"completion_fns": ["gpt-3.5-turbo"], "eval_spec": {"cls": "evals.elsuite.modelgraded.classify:ModelBasedClassify", "args": {"samples_jsonl": "calculus/demo_labeled.jsonl", "eval_type": "cot_classify", "modelgraded_spec": "mathematics", "metaeval": true}, "key": "calculus-meta.dev.v0", "group": "calculus"}, "seed": 20220722, "max_samples": null, "command": "'C:\\Users\\kaben\\Documents\\GitHub\\irrealis\\openai-evals\\venv\\Scripts\\oaieval' gpt-3.5-turbo calculus-meta", "initial_settings": {"visible": true}}, "created_by": "", "run_id": "230522045050IMCI7PPN", "created_at": "2023-05-22 04:50:50.230921"}}
{"final_report": {"counts/100": 22, "counts/0": 6, "counts/50": 3, "counts/75": 1, "counts/25": 3, "counts/13": 1, "score": 0.6980555555555557, "metascore": 0.6666666666666666}}
{"run_id": "230522045050IMCI7PPN", "event_id": 36, "sample_id": "calculus-meta.dev.29", "type": "sampling", "data": {"prompt": [{"role": "user", "content": "You are comparing a submitted answer to an expert answer for a given mathematics problem:\n---\n[Mathematics Problem]: Evaluate the following integral:\n$$\n\\int \\sinh x dx\n$$\n---\n[Expert]: $\\int \\sinh x dx = \\cosh x + C$.\n---\n[Submission]: The integral $\\int \\sinh x dx$ can be solved by recognizing that the derivative of $\\cosh x$ is $\\sinh x$. Therefore, we have:\n$$\n\\int \\sinh x dx = \\cosh x + C\n$$\n---\nUse the following rubric to score the submission. Round the score to the nearest integer in the range 0-100 inclusive.\n\n[Rubric]: - Final Answer Correctness (50 points):\n  - 50: The final answer is $\\cosh x + C$.\n  - 0: The final answer is not $\\cosh x + C$.\n- Explanation Correctness (50 points):\n  - 50: The explanation correctly states that the derivative of $\\cosh x$ is $\\sinh x$, so the integral of $\\sinh x dx$ is $\\cosh x + C$.\n  - 25: The explanation is partially correct but either does not correctly identify the derivative of $\\cosh x$ as $\\sinh x$ or does not correctly state that the integral of $\\sinh x dx$ is $\\cosh x + C$.\n  - 0: The explanation is incorrect or missing.\n\nThere is little nuance in the grading of this problem, as the formula is fundamental and usually memorized as part of a standard integration table.\n\nFirst, write out in a step by step manner your reasoning to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset. 
Then print only a single choice from \"100\" or \"99\" or \"98\" or \"97\" or \"96\" or \"95\" or \"94\" or \"93\" or \"92\" or \"91\" or \"90\" or \"89\" or \"88\" or \"87\" or \"86\" or \"85\" or \"84\" or \"83\" or \"82\" or \"81\" or \"80\" or \"79\" or \"78\" or \"77\" or \"76\" or \"75\" or \"74\" or \"73\" or \"72\" or \"71\" or \"70\" or \"69\" or \"68\" or \"67\" or \"66\" or \"65\" or \"64\" or \"63\" or \"62\" or \"61\" or \"60\" or \"59\" or \"58\" or \"57\" or \"56\" or \"55\" or \"54\" or \"53\" or \"52\" or \"51\" or \"50\" or \"49\" or \"48\" or \"47\" or \"46\" or \"45\" or \"44\" or \"43\" or \"42\" or \"41\" or \"40\" or \"39\" or \"38\" or \"37\" or \"36\" or \"35\" or \"34\" or \"33\" or \"32\" or \"31\" or \"30\" or \"29\" or \"28\" or \"27\" or \"26\" or \"25\" or \"24\" or \"23\" or \"22\" or \"21\" or \"20\" or \"19\" or \"18\" or \"17\" or \"16\" or \"15\" or \"14\" or \"13\" or \"12\" or \"11\" or \"10\" or \"9\" or \"8\" or \"7\" or \"6\" or \"5\" or \"4\" or \"3\" or \"2\" or \"1\" or \"0\" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the answer by itself on a new line.\n\nReasoning:"}], "sampled": ["The submission correctly identifies that the derivative of $\\cosh x$ is $\\sinh x$, and uses this fact to evaluate the integral of $\\sinh x dx$. The final answer is also correct, as it matches the expert answer of $\\cosh x + C$. Therefore, the submission receives full credit for both Final Answer Correctness and Explanation Correctness.\n\nFinal Score: 100\n\n100"]}, "created_by": "", "created_at": "2023-05-22 04:51:15.823015+00:00"}
{"run_id": "230522045050IMCI7PPN", "event_id": 37, "sample_id": "calculus-meta.dev.29", "type": "metrics", "data": {"choice": "100", "score": 1.0, "metascore": true}, "created_by": "", "created_at": "2023-05-22 04:51:15.823015+00:00"}
```


Model-graded evals are an interesting way to evaluate the performance of an AI model. The basic idea is to have a second model (the grader model) assess the output of the first model (the candidate model).

Here are the steps you would follow for your calculus problem, according to the OpenAI guide:

1. Defining the task: You've done this already. The task is to evaluate a certain integral.

2. Collecting comparison data: You would need a dataset of correct and incorrect responses to your integral problem. You would present the problem to a variety of models and save their responses along with the correct answer.

3. Training a comparison model: Using the data collected in step 2, you would train a model to predict which of two responses is better. This model would take as input a problem (your integral) and two solutions, and it would output a score indicating which solution it thinks is better.

4. Grading model responses: You would then use the comparison model to grade the responses of the candidate model. You would present the problem to the candidate model, collect its response, and then present the problem and the response (along with the correct answer) to the comparison model. The comparison model's output would be the grade for the candidate model's response.

5. Calculating scores: Finally, you would calculate scores for the candidate model based on the grades from step 4. The exact method of calculating scores can vary depending on your specific needs and the nature of your task.

This approach has the advantage of being able to handle a wide variety of responses, as the comparison model is trained to evaluate responses based on their content rather than just checking for an exact match. However, it also requires a significant amount of data and computational resources to implement, as you need to train a separate comparison model for each task.

---

Sure, we could construct a rubric based on the steps of the calculation you provided. A rubric is a grading tool that explicitly describes the performance expectations for a task or piece of work. Here's a possible rubric:

1. Problem Identification and Setup (20 points): The model correctly identifies the problem as a surface area of revolution calculation and sets up the integral correctly.

2. Appropriate Substitution (20 points): The model correctly suggests the substitution $u = -e^{-x}$ and rewrites the integral in terms of $u$.

3. Integral Evaluation (20 points): The model correctly evaluates the integral $\int_{0}^{-1} \sqrt{1 + u^2}\,du$.

4. Applying the Bounds (20 points): The model correctly applies the bounds to the evaluated integral and simplifies the result.

5. Final Simplification (20 points): The model correctly simplifies the result to obtain $\pi (\sqrt{2} + \ln(1 + \sqrt{2}))$.

Each step of the process is worth 20 points, leading to a total score out of 100. A correct and well-explained solution would earn full points.

This rubric would allow you to grade a model's response based on its understanding of each step in the calculation. However, this is just one way to structure a rubric. The exact design of a rubric may vary based on the specifics of your eval, your grading model, and what aspects of performance you're most interested in assessing.

---

Yes, a model-graded eval consists of several main components:

Prompt: This is the task or question that is presented to the language model. In your case, the prompt is a calculus problem that the language model is asked to solve.

Target: The target is the expected response to the prompt. For your calculus problem, this would be the correct solution to the problem.

Rubric: The rubric is a set of criteria used to evaluate the language model's response. It specifies what a good response should look like and how points are awarded for each component of the response.

Grading Model: The grading model is a separate model that grades the responses of the language model. It is trained on a dataset of prompt-response pairs, along with the grades for each pair. The grading model should be able to grade new responses to the same or similar prompts.

Grades: The grades are the scores given by the grading model to the responses of the language model. They indicate how well the language model performed on the task.

To improve your model-graded eval, you could consider the following steps:

Improve the prompt: Make sure your prompt clearly and unambiguously specifies the task that the language model is supposed to perform. If the prompt is unclear or ambiguous, the language model may not provide the type of response you're looking for.

Improve the target: Make sure your target accurately represents the correct response to the prompt. You should also ensure that it captures all the different ways that the correct response could be expressed.

Improve the rubric: Your rubric should cover all the important aspects of a good response. It should be specific enough to guide the grading model, but flexible enough to accommodate different ways of expressing the correct answer.

Improve the grading model: If your grading model isn't accurately grading responses, you might need to train it on more data, use a different model architecture, or tweak its training process.

Collect more data: If your grading model is having trouble generalizing to new responses, you might need to collect more data to train it on. The new data should ideally cover a wider range of responses than your current dataset.




