In [51]:
import re
import openai
import toolmode.data.funcqa as funcqa_dataset
import importlib
import tiktoken
from openai import OpenAI

# Shared API client used by the request cell further down.
# No API key is passed here, so credentials must come from the client's default
# configuration (presumably the OPENAI_API_KEY environment variable — confirm).
client = OpenAI()

In [114]:
# Reload the module so that edits made to toolmode.data.funcqa since the last
# import take effect without restarting the kernel.
importlib.reload(funcqa_dataset)

# NOTE(review): the progress output of this cell includes "Uploading the dataset
# shards", so load() appears to push data somewhere in addition to reading it —
# confirm before re-running this cell.
dataset = funcqa_dataset.load()

Map:   0%|          | 0/68 [00:00<?, ? examples/s]

Map: 100%|██████████| 68/68 [00:00<00:00, 9934.26 examples/s]
Map: 100%|██████████| 650/650 [00:00<00:00, 16476.68 examples/s]
Map: 100%|██████████| 650/650 [00:00<00:00, 13742.74 examples/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 307.32ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.24s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 442.48ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.09it/s]


In [115]:
def get_operations(row):
    """Extract the names of the tools called in a row's answer.

    A tool call is written as ``<T>name(args)=result``. Every ``<T>`` marker is
    first prefixed with an ``<X>`` sentinel so the answer can be cut right
    before each call; the tool name is then the text between ``<T>`` and the
    first ``(`` of each resulting piece.

    Args:
        row: Mapping with an ``'answer'`` string.

    Returns:
        ``{'operations': [...]}`` holding one tool name per ``<T>`` marker, in
        order of appearance (an empty list when the answer has no marker).
    """
    marked = row['answer'].replace('<T>', '<X><T>')
    names = []
    for piece in marked.split('<X>'):
        # Pieces without a marker are plain text (e.g. the lead-in before the
        # first tool call) and carry no tool name.
        if '<T>' not in piece:
            continue
        call_text = piece.replace('<T>', '')
        names.append(call_text.split('(')[0])
    return {'operations': names}

# Attach the parsed tool names to every training row as an `operations` column.
# Only the train split is mapped; the test split is left as loaded.
dataset['train'] = dataset['train'].map(get_operations)

Map: 100%|██████████| 650/650 [00:00<00:00, 11112.51 examples/s]


In [116]:
# Show the splits, their columns and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'answer_number', 'operations'],
        num_rows: 650
    })
    test: Dataset({
        features: ['question', 'answer_number', 'answer'],
        num_rows: 68
    })
})

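Our training set contains only single-hop questions: every example has exactly one tool call (no example has an empty `operations` list, and no answer contains more than one `<T>` marker).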

In [118]:
# Sanity check: select training rows whose `operations` list is empty,
# i.e. answers in which no <T> tool call was found.
dataset['train'].filter(lambda x: len(x['operations']) == 0)

Filter: 100%|██████████| 650/650 [00:00<00:00, 23482.52 examples/s]


Dataset({
    features: ['question', 'answer', 'answer_number', 'operations'],
    num_rows: 0
})

In [119]:
from collections import Counter

# Tally how often each tool name occurs across all training rows.
op_counts = Counter(
    op
    for example in dataset['train']
    for op in example['operations']
)
op_counts

Counter({'power': 50,
         'log': 50,
         'subtract': 50,
         'ln': 50,
         'multiply': 50,
         'sqrt': 50,
         'gcd': 50,
         'lcm': 50,
         'choose': 50,
         'add': 50,
         'divide': 50,
         'permutate': 50,
         'remainder': 50})

In [5]:
# Largest number of <T> tool-call markers found in any single training answer.
max(len(re.findall("<T>", answer)) for answer in dataset["train"]["answer"])

1

In [109]:
# Look at one training row; its question/answer text is the seed example
# written into the user prompt below.
dataset['train'][50]

{'question': ' 10^x=8162.85, what is the value of x?',
 'answer': ' 10^x=8162.85, so x=log(8162.85)=<T>log(8162.85)=3.91.',
 'answer_number': 3.91,
 'operations': ['log']}

In [91]:
system_prompt = """
You a data annotator. You follow the instructions below to generate a dataset for a math problem generator.

Generate different math problems similar to this math problem.
 - Make sure each question uses diverse language and includes multiple logical steps.
 - After each generated problem, write down a **detailed and complete solution** to solve the question **step by step**. Do NOT give the result directly.
 - The following tools are available: add, subtract, multiply, divide, power, sqrt, log, lcm, gcd, ln, choose, remainder, and permutate. 
 - The log function works as follows: if only one argument is passed, it is 10th log; if two arguments are passed, it is log with base as the second argument
 - Every tool call should be in the form of <T>tool_name(arg1, arg2, ...)=result. For example, <T>power(0.82, 8)=0.2 means you should call the power function with 0.82 and 8 as the arguments, and the result should be 0.2.
 - DO NOT mix tools in one tool call. DO NOT write <T>add(1, multiply(2, 3))=7. Instead, write <T>multiply(2, 3))=7, <T>add(1, 7)=8.
 - Always use the least amount of steps to solve the problem. For example, <T>add(1, 2)=3 is better than <T>add(1, 1)=2, <T>add(2, 1)=3.
 - NEVER provide calculations in normal human format. The following is an example of a bad solution: "1+2=3". Instead, you should write "<T>add(1, 2)=3".
""".strip()

# User message for the data-generation request. The seed example is hard-coded
# from training row 50 (the `log` example displayed earlier in this notebook).
# The Question line holds the problem statement and the Answer line holds the
# worked solution with its <T> tool call, which is the layout the model is
# asked to reproduce in its own output.
user_prompt = """
Consider the following math problem:

Question: 10^x=8162.85, what is the value of x?
Answer: 10^x=8162.85, so x=log(8162.85)=<T>log(8162.85)=3.91.

Generate 5 different math problems similar to this math problem. Use the same logic as the question above but use a different theme. 

Answer by first providing the question, then grading instructions and then the correct answer. Each one of them should be a sentence with no more than 30 words. The format shall be as shown below. Do not use any other format. The question or grading instruction shall never contain a toolcall <T>.
Question: 
Grading instructions:
Answer: 
""".strip()

In [92]:
# Single chat-completion request: the system message carries the annotation
# rules and the user message carries the seed problem.
completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ],
    model="gpt-4-1106-preview",
)

In [93]:
# Print the text of the first returned choice so its line breaks render as written.
print(completion.choices[0].message.content)

Question: If a bacterium population doubles every hour, how many hours does it take to reach a population of 4096, starting with one?
Grading instructions: Use logarithmic functions to determine the time in hours based on population growth.
Answer: <T>log(4096, 2)=12.

Question: How many days will it take for an investment to quadruple if the daily interest rate leads to a multiplier of 1.1? Assume the initial amount is $100.
Grading instructions: Calculate the period needed for the investment to reach four times its initial value using logarithms.
Answer: <T>log(4, 1.1)=15.04.

Question: A certain radioactive isotope decays to half its mass every 8 days. After how many days will only 1/256 of the original mass remain?
Grading instructions: Determine the number of days for the isotope to decay by using logarithms.
Answer: <T>log(1/256, 0.5)=8.

Question: If a car depreciates at a rate of 20% per year, how many years will it take for the car's value to depreciate to 1/32 of its original

In [94]:
def cost(completion, prompt_price_per_1k=0.01, completion_price_per_1k=0.03, verbose=True):
    """Estimate the dollar cost of one chat completion from its token usage.

    Args:
        completion: Response object exposing ``usage.prompt_tokens`` and
            ``usage.completion_tokens``.
        prompt_price_per_1k: Price in dollars per 1000 prompt tokens.
        completion_price_per_1k: Price in dollars per 1000 completion tokens.
        verbose: When true, print the prompt/completion cost breakdown.

    Returns:
        The sum of prompt cost and completion cost, as a float.

    The default prices are presumably the rates listed for the model used in
    this notebook at the time of writing — verify against
    https://openai.com/pricing before relying on them, and pass explicit
    prices when estimating for a different model.
    """
    usage = completion.usage
    prompt_cost = usage.prompt_tokens / 1000 * prompt_price_per_1k
    completion_cost = usage.completion_tokens / 1000 * completion_price_per_1k
    if verbose:
        print(f"prompt cost: {prompt_cost}. completion cost: {completion_cost}")
    return prompt_cost + completion_cost

In [95]:
# Estimated cost of the single request stored in `completion`.
cost(completion)

prompt cost: 0.005490000000000001. completion cost: 0.01134


0.01683

In [90]:
# Rough total for issuing one such request per training example:
# 650 is the row count of the train split shown earlier in this notebook.
# NOTE(review): this cell's execution count is lower than the cell above and
# its printed per-request costs differ, so the displayed total was presumably
# computed from an earlier completion — re-run to refresh.
cost(completion) * 650

prompt cost: 0.005350000000000001. completion cost: 0.0138


12.4475