In [1]:
from math_eval import *

import pandas as pd
from IPython.display import Markdown, Math, Latex
import openai
import ruamel.yaml
from ruamel.yaml import YAML
from ruamel.yaml.scalarstring import LiteralScalarString
from ruamel.yaml.representer import RoundTripRepresenter

import datetime
import decimal
import itertools
import json
import os
import re
import sys
import time

In [2]:
engine = sa.create_engine("sqlite:///230602-1045_math-evals.db")
session = Session(engine)
Base.metadata.create_all(engine)

In [18]:
Base.metadata.drop_all(engine)

In [4]:
session.rollback()

In [3]:
session.commit()

In [3]:
openai.api_key = os.environ['OPENAI_API_KEY']

yaml=YAML()
yaml.default_flow_style = False
yaml.allow_unicode = True
yaml.encoding = 'utf-8'

def represent_decimal(self, data):
  value = '0.' if data.is_zero() else str(data.normalize())
  return self.represent_scalar(u'tag:yaml.org,2002:float', value)

RoundTripRepresenter.add_representer(Decimal, represent_decimal)

In [12]:
gpt4_chatgpt_plus = Model.get_or_create(session, dict(name='GPT-4 via ChatGPT-Plus'))
example_model = Model.get_or_create(session, dict(name='human'))
session.commit()
gpt4_chatgpt_plus, example_model

(<Model: {'id': 1, 'name': 'GPT-4 via ChatGPT-Plus'}>,
 <Model: {'id': 2, 'name': 'human'}>)

In [13]:
model_ct = session.scalar(sa.select().with_only_columns(sa.func.count(Model.id)))
print(f'{model_ct=}')

model_ct=2


In [15]:
gpt4_ps = ProblemSet.get_or_create(session,
  dict(name='GPT-4 calculus problem set 230531-1326'),
)
gpt4_ss = SubmissionSet.get_or_create(session,
  dict(name='GPT-4 calculus submission set 230531-1326'),
)
example_ss = SubmissionSet.get_or_create(session,
  dict(name='GPT-4 calculus example-submission set 230531-1326'),
)
gpt4_es = EvaluationSet.get_or_create(session,
  dict(name='GPT-4 calculus evaluation set 230531-1326'),
)
example_es = EvaluationSet.get_or_create(session,
  dict(name='GPT-4 calculus example-evaluation set 230531-1326'),
)
session.commit()
print(f'{gpt4_ps=}')
print(f'{gpt4_ss=}')
print(f'{example_ss=}')
print(f'{gpt4_es=}')
print(f'{example_es=}')

gpt4_ps=<ProblemSet: {'id': 1, 'name': 'GPT-4 calculus problem set 230531-1326'}>
gpt4_ss=<SubmissionSet: {'id': 1, 'name': 'GPT-4 calculus submission set 230531-1326'}>
example_ss=<SubmissionSet: {'id': 2, 'name': 'GPT-4 calculus example-submission set 230531-1326'}>
gpt4_es=<EvaluationSet: {'id': 1, 'name': 'GPT-4 calculus evaluation set 230531-1326'}>
example_es=<EvaluationSet: {'id': 2, 'name': 'GPT-4 calculus example-evaluation set 230531-1326'}>


In [50]:
ps_ct = session.scalars(sa.select().with_only_columns(sa.func.count(ProblemSet.id)))
ss_ct = session.scalars(sa.select().with_only_columns(sa.func.count(SubmissionSet.id)))
es_ct = session.scalars(sa.select().with_only_columns(sa.func.count(EvaluationSet.id)))
print(f'{ps_ct=}, {ss_ct=}, {es_ct=}')

ps_ct=1, ss_ct=<sqlalchemy.engine.result.ScalarResult object at 0x7f2a92ac75b0>, es_ct=<sqlalchemy.engine.result.ScalarResult object at 0x7f2a92322610>


In [None]:
session.rollback()

def load_to_db(
  session,
  data,
  submission_model,
  example_submission_model,
  evaluation_model,
  example_evaluation_model,
  problem_set,
  submission_set,
  example_submission_set,
  evaluation_set,
  example_evaluation_set,
):
  for p_data in data.get('problems', []):
    p_id = p_data.get('id')
    p_input = p_data.get('input')
    p_ideal = p_data.get('ideal')
    p_rubric = p_data.get('rubric')
    if p_id is not None:
      p = Problem.get_or_create(session, {'id': p_id})
      p.input = p_input
    else:
      p = Problem.get_or_create(session, {'input': p_input})
    if p_ideal is not None: p.ideal = p_ideal
    if p_rubric is not None: p.rubric = p_rubric
    p.problem_sets.append(problem_set)
    
    for s_data in p_data.get('submissions', []):
      s_id = s_data.get('id')
      s_completion = s_data.get('completion')
      s_score = s_data.get('score')
      s_is_example = s_data.get('is_example')
      if s_id is not None:
        s = Submission.get_or_create(session, {'id': s_id})
        s.completion = s_completion
      else:
        s = Submission.get_or_create(session, {'completion': s_completion})
      if s_score is not None: s.score = s_score
      if s_is_example is not None: s.is_example = s_is_example
      if s.is_example:
        s.model = example_submission_model
        s.submission_sets.append(example_submission_set)
      else:
        s.model = submission_model
        s.submission_sets.append(submission_set)
      s.problem = p
    
      for e_data in s_data.get('evaluations', []):
        e_id = e_data.get('id')
        e_completion = e_data.get('completion')
        e_score = e_data.get('score')
        if e_id is not None:
          e = Evaluation.get_or_create(session, {'id': s_id})
          e.completion = e_completion
        else:
          e = Evaluation.get_or_create(session, {'completion': e_completion})
        if e_score is not None: e.score = e_score
        if s.is_example:
          e.model = example_evaluation_model
          e.evaluation_sets.append(example_evaluation_set)
        else:
          e.model = evaluation_model
          e.evaluation_sets.append(evaluation_set)
        e.submission = s


In [42]:
def update_or_create(cls, session, data, *l):
  for search_args, update_args in l:
    search_dict = {key:data[key] for key in search_args if key in data}
    if len(search_dict) < len(search_args):
      continue
    update_dict = {key:data[key] for key in update_args if key in data}
    item = cls.update_or_create(session, search_dict, update_dict)
    if item:
      return item
  print(f'warning: no data found for {cls=}, {l=}')

def load_to_db(sess, data):
  example_submission_model = None
  evaluation_model = None
  example_evaluation_model = None
  problem_set = None
  submission_set = None
  example_submission_set = None
  evaluation_set = None
  example_evaluation_set = None

  general_lookups = (['id'], ['name']), (['name'], []),
  #general_lookup = (['name'], ['notes'])
  if 'example_submission_model' in data:
    example_submission_model = update_or_create(Model, sess, data['example_submission_model'], *general_lookups)
  if 'evaluation_model' in data:
    evaluation_model = update_or_create(Model, sess, data['evaluation_model'], *general_lookups)
  if 'example_evaluation_model' in data:
    example_evaluation_model = update_or_create(Model, sess, data['example_evaluation_model'], *general_lookups)
  if 'problem_set' in data:
    problem_set = update_or_create(ProblemSet, sess, data['problem_set'], *general_lookups)
  if 'submission_set' in data:
    submission_set = update_or_create(SubmissionSet, sess, data['submission_set'], *general_lookups)
  if 'example_submission_set' in data:
    example_submission_set = update_or_create(SubmissionSet, sess, data['example_submission_set'], *general_lookups)
  if 'evaluation_set' in data:
    evaluation_set = update_or_create(EvaluationSet, sess, data['evaluation_set'], *general_lookups)
  if 'example_evaluation_set' in data:
    example_evaluation_set = update_or_create(EvaluationSet, sess, data['example_evaluation_set'], *general_lookups)

  for pd in data.get('problems', []):
    p = update_or_create(
      Problem, sess, pd,
      (['id'], ['input', 'ideal', 'rubric']),
      (['input'], ['ideal', 'rubric']),
    )
    print(f'{p=}')
    if problem_set: p.problem_sets.append(problem_set)
    for sd in pd.get('submissions', []):
      s = update_or_create(
        Submission, sess, sd,
        (['id'], ['completion', 'score', 'is_example', 'model_id']),
        (['completion'], ['score', 'is_example', 'model_id']),
      )
      print(f'{s=}')
      if s.is_example:
        if example_submission_model: s.model = example_submission_model
        if example_submission_set: s.submission_sets.append(example_submission_set)
      else:
        if submission_model: s.model = submission_model
        if submission_set: s.submission_sets.append(submission_set)
      s.problem = p
      for ed in sd.get('evaluations', []):
        e = update_or_create(
          Evaluation, sess, ed,
          (['id'], ['completion', 'score', 'is_example', 'model_id']),
          (['completion'], ['score', 'is_example', 'model_id']),
        )
        print(f'{e=}')
        if e.is_example:
          if example_evaluation_model: e.model = example_evaluation_model
          if example_evaluation_set: e.evaluation_sets.append(example_evaluation_set)
        else:
          if evaluation_model: e.model = evaluation_model
          if evaluation_set: e.evaluation_sets.append(evaluation_set)
        e.submission = s


In [14]:
data=dict()
data['submission_model'] = gpt4_chatgpt_plus.as_dict()
data['example_submission_model'] = example_model.as_dict()
data['evaluation_model'] = gpt4_chatgpt_plus.as_dict()
data['example_evaluation_model'] = example_model.as_dict()
data['problem_set'] = gpt4_ps.as_dict()
data['submission_set'] = gpt4_ss.as_dict()
data['example_submission_set'] = example_ss.as_dict()
data['evaluation_set'] = gpt4_es.as_dict()
data['example_evaluation_set'] = example_es.as_dict()
with open('calculus-responses-gpt4.yaml', 'r') as f:
  data.update(yaml.load(f))

```python
load_to_db(
  sess = session,
  data = data,
  submission_model = gpt4_chatgpt_plus,
  example_submission_model = example_model,
  evaluation_model = gpt4_chatgpt_plus,
  example_evaluation_model = example_model,
  problem_set = gpt4_ps,
  submission_set = gpt4_ss,
  example_submission_set = example_ss,
  evaluation_set = gpt4_es,
  example_evaluation_set = example_es,
)
session.commit()
```

To evaluate the integral \(\int \sec^3 x \, dx\), we can use the method of integration by parts. Let's proceed step by step:

We start by applying the integration by parts formula:
\[\int u \, dv = uv - \int v \, du\]

In this case, we can choose:
\(u = \sec^2 x\) (to differentiate)
\(dv = \sec x \, dx\) (to integrate)

Let's find the differentials \(du\) and \(v\):
\(du = \frac{d}{dx}(\sec^2 x) \, dx = 2\sec x \tan x \, dx\)
\(v = \int \sec x \, dx = \ln|\sec x + \tan x|\)

Now, we can apply the integration by parts formula:
\[
\begin{aligned}
\int \sec^3 x \, dx &= \int u \, dv \\
&= uv - \int v \, du \\
&= \sec^2 x \ln|\sec x + \tan x| - \int \ln|\sec x + \tan x| \cdot 2\sec x \tan x \, dx
\end{aligned}
\]

Simplifying the last integral on the right-hand side, we get:
\[
\int \sec^3 x \, dx = \sec^2 x \ln|\sec x + \tan x| - 2\int \ln|\sec x + \tan x| \sec x \tan x \, dx
\]

At this point, we have a new integral to evaluate. We can proceed further by applying integration by parts again, or we can use a trigonometric identity to simplify the expression.

Using the trigonometric identity \(\sec x = \frac{1}{\cos x}\), we can rewrite the integral as:
\[
\int \sec^3 x \, dx = \sec^2 x \ln|\sec x + \tan x| - 2\int \ln|\sec x + \tan x| \cdot \frac{1}{\cos x} \cdot \frac{\sin x}{\cos x} \, dx
\]

Simplifying the expression, we have:
\[
\int \sec^3 x \, dx = \sec^2 x \ln|\sec x + \tan x| - 2\int \frac{\sin x}{\cos^2 x} \ln|\sec x + \tan x| \, dx
\]

At this point, we can substitute \(u = \sec x + \tan x\) to simplify the integral further. Then, \(du = (\sec x \tan x + \sec^2 x) \, dx\), which can be rearranged to \(du = \sec x (\tan x + \sec x) \, dx\).

Substituting these values, we have:
\[
\int \sec^3 x \, dx = \sec^2 x \ln|\sec x + \tan x| - 2\int \frac{\sin x}{\cos^2 x} \ln(u) \, du
\]

Next, we need to simplify the remaining integral. We can rewrite \(\frac{\sin x}{\cos^2 x}\) as \(\frac{1}{\cos x} \cdot \frac{\sin x}{\cos x}\), and using the identity \(\tan x = \frac{\sin x}{\cos x}\), we have \(\frac{1}{\cos x} \cdot \tan x =

In [21]:
session.rollback()

In [5]:
data = dict()
#with open('calculus-responses-gpt4.yaml', 'r') as f:
#with open('gpt4-test-dump-2.yaml', 'r') as f:
  data.update(yaml.load(f))

In [6]:
load_to_db(sess = session, data = data)
session.commit()

In [7]:
problem_ct = session.scalar(sa.select().with_only_columns(sa.func.count(Problem.id)))
submission_ct = session.scalar(sa.select().with_only_columns(sa.func.count(Submission.id)))
evaluation_ct = session.scalar(sa.select().with_only_columns(sa.func.count(Evaluation.id)))
print(f'{problem_ct=}, {submission_ct=}, {evaluation_ct=}')

problem_ct=1, submission_ct=1, evaluation_ct=1


In [8]:
print(f'{len(gpt4_ps.problems)=}')
print(f'{len(gpt4_ss.submissions)=},  {len(example_ss.submissions)=}')
print(f'{len(gpt4_es.evaluations)=}, {len(example_es.evaluations)=}')

NameError: name 'gpt4_ps' is not defined

In [16]:
session.scalars(sa.select(Submission)).all()

[<Submission: {'id': 1, 'completion': "The area of a surface of revolution, when you're rotating the curve \\(y = f(x)\\), \\(a \\leq x \\leq b\\), around the x-axis is given by the formula:\n\n\\[A = 2\\pi \\int_{a}^{b} y \\sqrt{1 + \\left(\\frac{dy}{dx}\\right)^2} dx\\]\n\nThis equation comes from summing up the surface areas of infinitesimal frustums (which are approximately cylindrical for small enough changes in \\(x\\)) that make up the surface.\n\nIn our case, \\(f(x) = e^{-x}\\), and we're revolving about the x-axis for \\(0 \\leq x < \\infty\\). We need to compute the derivative of \\(f(x)\\), which is \\(f'(x) = -e^{-x}\\).\n\nPlugging these into the formula gives:\n\n\\[A = 2\\pi \\int_{0}^{\\infty} e^{-x} \\sqrt{1 + \\left(-e^{-x}\\right)^2} dx\\]\n\nThis integral simplifies to:\n\n\\[A = 2\\pi \\int_{0}^{\\infty} e^{-x} \\sqrt{1 + e^{-2x}} dx\\]\n\nWe can compute this using a trigonometric substitution. Let \\(e^{-x} = \\sinh{t}\\). Therefore, \\(e^{-2x} = \\sinh^2{t}\\), 

In [33]:
gpt4_chatgpt_plus.as_dict()

{'id': 1, 'name': 'GPT-4 via ChatGPT-Plus'}

In [18]:
def problem_ydict(problem):
  d = problem.as_dict(keys=('id', 'input', 'ideal', 'rubric'))
  for key in ('input', 'ideal', 'rubric'):
    d[key] = LiteralScalarString(d[key])
  return d

def submission_ydict(submission):
  return dict(
    id=submission.id,
    problem_id=submission.problem_id,
    model_id=submission.model_id,
    model_name=submission.model.name,
    is_example=submission.is_example,
    score=submission.score,
    completion=LiteralScalarString(submission.completion),
  )

def evaluation_ydict(evaluation):
  return dict(
    id=evaluation.id,
    submission_id=evaluation.submission.id,
    problem_id=evaluation.submission.problem_id,
    model_id=evaluation.model_id,
    model_name=evaluation.model.name,
    is_example=evaluation.is_example,
    score=evaluation.score,
    completion=LiteralScalarString(evaluation.completion),
  )


def problem_sets_ydict(problem_sets):
  problem_sets_dict = {ps.id:ps.as_dict() for ps in problem_sets}
  problems_dict = {
    p.id:dict(itertools.chain([('problem_set_id', ps.id)], problem_ydict(p).items()))
    for ps in problem_sets
    for p in ps.problems
  }
  return dict(
    problem_sets=list(problem_sets_dict.values()),
    problems=list(problems_dict.values()),
  )

def submission_sets_ydict(submission_sets):
  models_dict = {}
  submission_sets_dict = {}
  problems_dict = {}
  for submission_set in submission_sets:
    for submission in submission_set.submissions:
      problem = submission.problem
      
      if submission_set in submission.submission_sets:
        submission_sets_dict[submission_set.id] = submission_set.as_dict()

      if submission.model_id not in models_dict:
        models_dict[submission.model_id] = submission.model.as_dict()
      
      if problem.id in problems_dict:
        problem_dict = problems_dict[problem_id]
      else:
        problem_dict = problem_ydict(problem)

      problem_submissions = problem_dict.get('submissions', {})
      problem_submissions[submission.id] = submission_ydict(submission)
      problem_dict['submissions'] = problem_submissions

      problems_dict[problem.id] = problem_dict

  for k in problems_dict.keys():
    problems_dict[k]['submissions'] = list(problems_dict[k]['submissions'].values())
  
  return dict(
    models=list(models_dict.values()),
    submission_sets=list(submission_sets_dict.values()),
    problems=list(problems_dict.values()),
  )

def evaluation_sets_ydict(evaluation_sets):
  models_dict = {}
  evaluation_sets_dict = {}
  problems_dict = {}
  submissions_dict = {}
  for evaluation_set in evaluation_sets:
    for evaluation in evaluation_set.evaluations:
      submission = evaluation.submission
      problem = submission.problem

      if evaluation_set not in evaluation_sets_dict:
        evaluation_sets_dict[evaluation_set.id] = evaluation_set.as_dict()

      if evaluation.model_id not in models_dict:
        models_dict[evaluation.model_id] = evaluation.model.as_dict()
      if submission.model_id not in models_dict:
        models_dict[submission.model_id] = submission.model.as_dict()

      if problem.id in problems_dict:
        problem_dict = problems_dict[problem_id]
      else:
        problem_dict = problem_ydict(problem)

      problem_submissions = problem_dict.get('submissions', {})
      if submission.id in problem_submissions:
        submission_dict = submissions_dict[submission.id]
      else:
        submission_dict = submission_ydict(submission)

      submission_evaluations = submission_dict.get('evaluations', {})
      if evaluation.id in submission_evaluations:
        evaluation_dict = submission_evaluations[evaluation.id]
      else:
        evaluation_dict = evaluation_ydict(evaluation)

      submission_evaluations[evaluation.id] = evaluation_dict
      submission_dict['evaluations'] = submission_evaluations
      
      problem_submissions[submission.id] = submission_dict
      submissions_dict[submission.id] = submission_dict
      problem_dict['submissions'] = problem_submissions

      problems_dict[problem.id] = problem_dict

  for pk in problems_dict.keys():
    for sk in problems_dict[pk]['submissions'].keys():
      problems_dict[pk]['submissions'][sk]['evaluations'] = list(
        problems_dict[pk]['submissions'][sk]['evaluations'].values()
      )
    problems_dict[pk]['submissions'] = list(
      problems_dict[pk]['submissions'].values()
    )
  
  return dict(
    models=list(models_dict.values()),
    evaluation_sets=list(evaluation_sets_dict.values()),
    problems=list(problems_dict.values()),
  )

In [10]:
with open('gpt4-test-dump.yaml', 'w') as f:
  yaml.dump(data, f)

In [51]:
#d = problem_ydict(session.scalars(sa.select(Problem)).first())
#d = problem_sets_ydict(session.scalars(sa.select(ProblemSet)).all())
#d = submission_sets_ydict(session.scalars(sa.select(SubmissionSet)).all())
#d = evaluation_sets_ydict(session.scalars(sa.select(EvaluationSet)).all())

d = dict()
d['submission_model'] = gpt4_chatgpt_plus.as_dict()
d['example_submission_model'] = example_model.as_dict()
d['evaluation_model'] = gpt4_chatgpt_plus.as_dict()
d['example_evaluation_model'] = example_model.as_dict()
d['problem_set'] = gpt4_ps.as_dict()
d['submission_set'] = gpt4_ss.as_dict()
d['example_submission_set'] = example_ss.as_dict()
d['evaluation_set'] = gpt4_es.as_dict()
d['example_evaluation_set'] = example_es.as_dict()
d.update(evaluation_sets_ydict(session.scalars(sa.select(EvaluationSet)).all()))


In [52]:
with open('gpt4-test-dump-2.yaml', 'w') as f:
  yaml.dump(d, f)

In [27]:
load_to_db(sess = session, data = d)
#session.commit()

In [43]:
data = dict()
with open('gpt4-test-dump-2.yaml', 'r') as f:
  data.update(yaml.load(f))

In [44]:
load_to_db(sess = session, data = data)

p=<Problem: {'id': 2, 'input': "Evaluate the following integral:\n\\[\\int \\sec^3 x dx\\]\nExplain your reasoning. Use LaTeX delimiters '\\(' and '\\)' to write your answer.", 'ideal': "To evaluate the integral \\(\\int \\sec^3 x \\, dx\\), we can use the method of integration by parts. Let's proceed step by step:\n\nWe start by applying the integration by parts formula:\n\\[\\int u \\, dv = uv - \\int v \\, du\\]\n\nIn this case, we can choose \\(u = \\sec x\\) and \\(dv = \\sec^2 x \\, dx\\).\n\nDifferentiating \\(u\\) with respect to \\(x\\), we obtain \\(du = \\sec x \\tan x \\, dx\\).\nIntegrating \\(dv\\), we have \\(\\int \\sec^2 x \\, dx = \\tan x\\).\n\nUsing the integration by parts formula, \\(\\int u \\, dv = uv - \\int v \\, du\\), we can rewrite the integral as follows:\n\n\\[\n\\begin{aligned}\n\\int \\sec^3 x \\, dx &= \\int u \\, dv \\\\\n&= \\sec x \\tan x - \\int \\tan x \\sec x \\tan x \\, dx \\\\\n&= \\sec x \\tan x - \\int \\sec x \\tan^2 x \\, dx.\n\\end{aligned

In [45]:
session.commit()

In [55]:
session.scalars(sa.select(Problem)).all()[-1].problem_sets

[<ProblemSet: {'id': 1, 'name': 'GPT-4 calculus problem set 230531-1326'}>]

In [57]:
#d = problem_ydict(session.scalars(sa.select(Problem)).first())
#d = problem_sets_ydict(session.scalars(sa.select(ProblemSet)).all())
#d = submission_sets_ydict(session.scalars(sa.select(SubmissionSet)).all())
#d = evaluation_sets_ydict(session.scalars(sa.select(EvaluationSet)).all())

d = dict()
d['submission_model'] = gpt4_chatgpt_plus.as_dict()
d['example_submission_model'] = example_model.as_dict()
d['evaluation_model'] = gpt4_chatgpt_plus.as_dict()
d['example_evaluation_model'] = example_model.as_dict()
d['problem_set'] = gpt4_ps.as_dict()
d['submission_set'] = gpt4_ss.as_dict()
d['example_submission_set'] = example_ss.as_dict()
d['evaluation_set'] = gpt4_es.as_dict()
d['example_evaluation_set'] = example_es.as_dict()
d.update(submission_sets_ydict(session.scalars(sa.select(SubmissionSet)).all()))


In [58]:
with open('gpt4-test-dump-3.yaml', 'w') as f:
  yaml.dump(d, f)