In [1]:
import openai
from IPython.display import Markdown, Math, Latex
from ruamel.yaml import YAML
from ruamel.yaml.scalarstring import LiteralScalarString

import json
import os
import time

In [2]:
openai.api_key = os.environ['OPENAI_API_KEY']

yaml=YAML()
yaml.default_flow_style = False
yaml.allow_unicode = True
yaml.encoding = 'utf-8'

In [3]:
def fix_latex(text):
    text = text.replace('\\(', '$')
    text = text.replace('\\)', '$')
    text = text.replace('\\[', '$$')
    text = text.replace('\\]', '$$')
    return text

In [4]:
with open('calculus-responses.yaml', 'r') as f:
    problems = yaml.load(f)

Verify each response has a `response` field, a `correct` field (evaluation), a `comment` field (clarify evaluations), and nothing else.

In [10]:
for i, problem in enumerate(problems):
    for j, response in enumerate(problem['completions']):
        keys = list(response.keys())
        assert keys == ['completion', 'correct', 'comment'], f'problem {i} response {j} keys: {keys}'

In [11]:
for i, problem in enumerate(problems):
    correct_responses = 0
    for j, response in enumerate(problem['completions']):
        if response['correct']:
            correct_responses += 1
        if response['comment']:
            problems[i]['completions'][j]['comment'] = LiteralScalarString(response['comment'])
    correct_response_rate = correct_responses/len(problem['completions'])
    print(f"*** problem {i}:")
    display(Markdown(fix_latex(problem['input'])))
    print(f"--- correct rate {correct_response_rate}")
    problems[i]['correct_response_rate'] = correct_response_rate

*** problem 0:


$$
\int x^n dx
$$
where $n != -1$.

--- correct rate 1.0
*** problem 1:


$$
\int \frac{1}{x} dx
$$

--- correct rate 0.7
*** problem 2:


$$
\int e^{x} dx
$$

--- correct rate 0.8
*** problem 3:


$$
\int b^x dx
$$

--- correct rate 0.2
*** problem 4:


$$
\int \sin x dx
$$

--- correct rate 0.7
*** problem 5:


$$
\int \cos x dx
$$

--- correct rate 0.7
*** problem 6:


$$
\int \sec^2 x dx
$$

--- correct rate 0.7
*** problem 7:


$$
\int \csc^2 x dx
$$

--- correct rate 0.2
*** problem 8:


$$
\int \sec x \tan x dx
$$

--- correct rate 0.1
*** problem 9:


$$
\int \csc x \cot x dx
$$

--- correct rate 0.3
*** problem 10:


$$
\int \sec x dx
$$

--- correct rate 0.1
*** problem 11:


$$
\int \csc x dx
$$

--- correct rate 0.0
*** problem 12:


$$
\int \tan x dx
$$

--- correct rate 0.7
*** problem 13:


$$
\int \cot x dx
$$

--- correct rate 0.8
*** problem 14:


$$
\int \sinh x dx
$$

--- correct rate 0.3
*** problem 15:


$$
\int \cosh x dx
$$

--- correct rate 0.4
*** problem 16:


$$
\int \frac{dx}{x^2+a^2}
$$

--- correct rate 0.2
*** problem 17:


$$
\int \frac{dx}{\sqrt{a^2-x^2}}, \quad a>0
$$

--- correct rate 0.7


In [13]:
general_rubric = '''
- Final Answer Correctness (40 points): The final answer provided by the model matches the expert's answer in its simplest form.
  - 40: The final answer is mathematically equivalent to the expert's answer and is in its simplest form.
  - 0: The final answer is not mathematically equivalent to the expert's answer or is not in its simplest form.
- Explanation Correctness (30 points): The explanation provided by the model to arrive at the answer is correct and logically sound.
  - 30: The explanation correctly justifies each step of the problem-solving process.
  - 15: The explanation is partially correct but does not correctly justify each step of the problem-solving process.
  - 0: The explanation is incorrect or missing.
- Mathematical Rigor (20 points): The submission correctly uses mathematical identities and applies integral and derivative rules.
  - 20: The submission demonstrates full mathematical rigor.
  - 10: The submission demonstrates some mathematical rigor but makes minor mistakes.
  - 0: The submission does not demonstrate mathematical rigor.
- Correctness of Intermediate Steps (10 points): The submission correctly executes each step of the problem-solving process.
  - 10: All intermediate steps are correct.
  - 5: Some intermediate steps are correct.
  - 0: No intermediate steps are correct.
'''.strip()

In [14]:
for i, problem in enumerate(problems):
    problems[i]['rubric'] = LiteralScalarString(general_rubric)

In [15]:
for i, problem in enumerate(problems):
    d = dict()
    d['input'] = problem['input']
    d['ideal'] = problem['ideal']
    d['rubric'] = problem['rubric']
    d['correct_response_rate'] = problem['correct_response_rate']
    d['completions'] = problem['completions']
    #for j, completion in enumerate(d['completions']):
    #    if 'evaluations' in completion:
    #        del d['completions'][j]['evaluations']
    problems[i] = d

In [164]:
print(json.dumps(problems))

[{"input": "\\[\n\\int x^n dx\n\\]\nwhere \\(n != -1\\).", "ideal": "\\(\\int x^n dx = \\frac{x^{n+1}}{n+1} + C\\), for \\(n \\neq -1\\).", "rubric": "- Final Answer Correctness (40 points): The final answer provided by the model matches the expert's answer in its simplest form.\n  - 40: The final answer is mathematically equivalent to the expert's answer and is in its simplest form.\n  - 0: The final answer is not mathematically equivalent to the expert's answer or is not in its simplest form.\n- Explanation Correctness (30 points): The explanation provided by the model to arrive at the answer is correct and logically sound.\n  - 30: The explanation correctly justifies each step of the problem-solving process.\n  - 15: The explanation is partially correct but does not correctly justify each step of the problem-solving process.\n  - 0: The explanation is incorrect or missing.\n- Mathematical Rigor (20 points): The submission correctly uses mathematical identities and applies integral a

In [16]:
with open('calculus-responses.yaml', 'w') as f:
    yaml.dump(problems, f)

In [5]:
problem_prompt_template = '''
Evaluate the following integral:
{input}

Explain your reasoning. Use LaTeX with delimiters '\(' and '\)' to make mathematical statements. When you are done, repeat your answer by itself on a new line.
'''.strip()

In [6]:
grader_prompt_template = grader_prompt_template_v1 = '''
You are comparing a submitted answer to an expert answer for a given mathematics problem:
---
[Mathematics Problem]: {input}
---
[Expert]: {ideal}
---
[Submission]: {completion}
---
Use the following rubric to score the submission. Round the score to the nearest integer.

[Rubric]:

{rubric}
---
First, write out in a step by step manner your reasoning for the score you gave to be sure that your conclusion is correct. Avoid simply stating the score at the outset. When you are done giving your reasoning, print the score rounded to the nearest integer (without quotes or punctuation) on its own line.

Reasoning:
'''.strip()

grader_prompt_template_v0 = '''
You are comparing a submitted answer to an expert answer for a given mathematics problem:
---
[Mathematics Problem]: {input}
---
[Expert]: {ideal}
---
[Submission]: {completion}
---
Use the following rubric to score the submission. Round the score to the nearest integer.

[Rubric]:

{rubric}
---
First, write out in a step by step manner your reasoning to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset. Then print only a single choice from "100" or "99" or "98" or "97" or "96" or "95" or "94" or "93" or "92" or "91" or "90" or "89" or "88" or "87" or "86" or "85" or "84" or "83" or "82" or "81" or "80" or "79" or "78" or "77" or "76" or "75" or "74" or "73" or "72" or "71" or "70" or "69" or "68" or "67" or "66" or "65" or "64" or "63" or "62" or "61" or "60" or "59" or "58" or "57" or "56" or "55" or "54" or "53" or "52" or "51" or "50" or "49" or "48" or "47" or "46" or "45" or "44" or "43" or "42" or "41" or "40" or "39" or "38" or "37" or "36" or "35" or "34" or "33" or "32" or "31" or "30" or "29" or "28" or "27" or "26" or "25" or "24" or "23" or "22" or "21" or "20" or "19" or "18" or "17" or "16" or "15" or "14" or "13" or "12" or "11" or "10" or "9" or "8" or "7" or "6" or "5" or "4" or "3" or "2" or "1" or "0" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the answer by itself on a new line.

Reasoning:
'''.strip()

In [7]:
def get_problem_prompt(problem):
    return problem_prompt_template.format(
        **problem
    )

def get_grader_prompt(problem, completion = None, completion_num = None):
    if completion is None:
        assert completion_num is not None, 'must specify either completion or completion_num'
        completion = problem['completions'][completion_num]
    return grader_prompt_template.format(
        input = get_problem_prompt(problem),
        ideal = problem['ideal'],
        rubric = problem['rubric'],
        completion = completion,
    )

In [8]:
print(get_problem_prompt(problems[0]))

Evaluate the following integral:
\[
\int x^n dx
\]
where \(n != -1\).

Explain your reasoning. Use LaTeX with delimiters '\(' and '\)' to make mathematical statements. When you are done, repeat your answer by itself on a new line.


In [9]:
print(get_grader_prompt(problems[0], completion_num=0))

You are comparing a submitted answer to an expert answer for a given mathematics problem:
---
[Mathematics Problem]: Evaluate the following integral:
\[
\int x^n dx
\]
where \(n != -1\).

Explain your reasoning. Use LaTeX with delimiters '\(' and '\)' to make mathematical statements. When you are done, repeat your answer by itself on a new line.
---
[Expert]: \(\int x^n dx = \frac{x^{n+1}}{n+1} + C\), for \(n \neq -1\).
---
[Submission]: {'completion': 'Using the power rule of integration, we have:\n\n\\[\n\\int x^n dx = \\frac{x^{n+1}}{n+1} + C\n\\]\n\nwhere \\(C\\) is the constant of integration. \n\nAnswer: \\[\\int x^n dx = \\frac{x^{n+1}}{n+1} + C\\]', 'correct': True, 'comment': ''}
---
Use the following rubric to score the submission. Round the score to the nearest integer.

[Rubric]:

- Final Answer Correctness (40 points): The final answer provided by the model matches the expert's answer in its simplest form.
  - 40: The final answer is mathematically equivalent to the expert

In [146]:
grader_prompt = get_grader_prompt(problems[-2:-1], completion_num=0)
print(f'*** Grader prompt: {grader_prompt}')
completion = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "user", "content": grader_prompt}
  ],
  n = 1,
)
evaluations = [LiteralScalarString(choice.message.content) for choice in completion.choices]
for evaluation in evaluations:
    print(evaluation)
#problems[i]['completions'] = responses
#for j, response in enumerate(responses):
#  print(f'--- Response {j+1}\n')
#  print(response)

*** Grader prompt: You are comparing a submitted answer to an expert answer for a given mathematics problem:
---
[Mathematics Problem]: Evaluate the following integral:
\[
\int x^n dx
\]
where \(n != -1\).

Explain your reasoning. Use LaTeX with delimiters '\(' and '\)' to make mathematical statements. When you are done, repeat your answer by itself on a new line.
---
[Expert]: \(\int x^n dx = \frac{x^{n+1}}{n+1} + C\), for \(n \neq -1\).
---
[Submission]: {'completion': 'Using the power rule of integration, we have:\n\n\\[\n\\int x^n dx = \\frac{x^{n+1}}{n+1} + C\n\\]\n\nwhere \\(C\\) is the constant of integration. \n\nAnswer: \\[\\int x^n dx = \\frac{x^{n+1}}{n+1} + C\\]', 'correct': True, 'comment': ''}
---
Enumerate and evaluate the reasoning steps in the submission. Use the following rubric to score the submission. Round the score to the nearest integer.

[Rubric]:

- Final Answer Correctness (40 points): The final answer provided by the model matches the expert's answer in its s

In [10]:
for evaluation in evaluations:
    print(evaluation)

NameError: name 'evaluations' is not defined

In [166]:
len(problems)

18

In [8]:
for i, problem in enumerate(problems):
  #if i not in [0, 16]:
  if i == 0:
    continue
  for j, completion in enumerate(problem['completions']):
    if j not in [0, 1]:
      continue
    grader_prompt = get_grader_prompt(problem, completion)
    print(f'*** Grader prompt: {grader_prompt}')
    grader_completion = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=[
        {"role": "user", "content": grader_prompt}
      ],
      n = 3,
    )
    evaluations = [
      #dict(evaluation = LiteralScalarString(choice.message.content.encode('utf8')))
      dict(evaluation = LiteralScalarString(choice.message.content))
      for choice in grader_completion.choices
    ]
    problems[i]['completions'][j]['evaluations'] = evaluations
    for k, evaluation in enumerate(evaluations):
      print(f'--- Evaluation {k+1}\n')
      print(evaluation['evaluation'])

    with open('calculus-responses.yaml', 'w') as f:
        yaml.dump(problems, f)

*** Grader prompt: You are comparing a submitted answer to an expert answer for a given mathematics problem:
---
[Mathematics Problem]: Evaluate the following integral:
\[
\int \frac{1}{x} dx
\]

Explain your reasoning. Use LaTeX with delimiters '\(' and '\)' to make mathematical statements. When you are done, repeat your answer by itself on a new line.
---
[Expert]: \(\int \frac{1}{x} dx = \ln|x| + C\).
---
[Submission]: {'completion': 'We have:\n\\[\n\\int \\frac{1}{x} dx = \\ln|x| + C\n\\]\nwhere \\(C\\) is the constant of integration. This is because we can write \\(\\frac{1}{x} = x^{-1}\\) and then use the power rule of integration, which says that\n\\[\n\\int x^n dx = \\frac{x^{n+1}}{n+1} + C\n\\]\nfor any real number \\(n\\neq -1\\) (and where \\(C\\) is the constant of integration). Applying this rule with \\(n=-1\\) gives us the result above. \n\n\\[\\int \\frac{1}{x} dx = \\ln|x| + C\\]', 'correct': False, 'comment': 'The final answer is correct, but citing the power rule of 

In [13]:
_t = 'fubar \u207b'
_t

'fubar ⁻'

In [14]:
_t.encode('utf8')

b'fubar \xe2\x81\xbb'