In [1]:
!pip install datasets

Collecting datasets
  Using cached datasets-2.18.0-py3-none-any.whl (510 kB)
Collecting aiohttp
  Using cached aiohttp-3.9.3-cp310-cp310-win_amd64.whl (365 kB)
Collecting multiprocess
  Using cached multiprocess-0.70.16-py310-none-any.whl (134 kB)
Collecting aiosignal>=1.1.2
  Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting attrs>=17.3.0
  Using cached attrs-23.2.0-py3-none-any.whl (60 kB)
Collecting async-timeout<5.0,>=4.0
  Using cached async_timeout-4.0.3-py3-none-any.whl (5.7 kB)
Collecting yarl<2.0,>=1.0
  Using cached yarl-1.9.4-cp310-cp310-win_amd64.whl (76 kB)
Installing collected packages: yarl, multiprocess, attrs, async-timeout, aiosignal, aiohttp, datasets
Successfully installed aiohttp-3.9.3 aiosignal-1.3.1 async-timeout-4.0.3 attrs-23.2.0 datasets-2.18.0 multiprocess-0.70.16 yarl-1.9.4



[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import logging
import os
import pandas as pd
import glob 
from autogen import config_list_from_dotenv
from autogen.agentchat.contrib.gpt_assistant_agent import GPTAssistantAgent
from autogen import UserProxyAgent, AssistantAgent, GroupChat, GroupChatManager
import json

In [2]:
# Notebook-wide logger for the math-solver experiments.
logger = logging.getLogger('math_solver')
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Set to True to additionally write DEBUG-and-above records to a log file.
LOG_TO_FILE = False

# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise attach one more handler each time and every message
# would be emitted repeatedly. Drop handlers left over from a previous run.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

if LOG_TO_FILE:
    fh = logging.FileHandler('knowledge_elicitation.log')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

# Console output is limited to INFO and above.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)


In [4]:
from datasets import load_dataset

# Both GSM8K configurations are loaded: "main" holds plain worked solutions,
# "socratic" prefixes each solution step with a guiding sub-question.
GSM8K_REPO = "gsm8k"
dataset_main = load_dataset(GSM8K_REPO, "main")
dataset_socratic = load_dataset(GSM8K_REPO, "socratic")

Downloading data: 100%|██████████| 2.31M/2.31M [00:02<00:00, 905kB/s]
Downloading data: 100%|██████████| 419k/419k [00:00<00:00, 502kB/s]
Generating train split: 100%|██████████| 7473/7473 [00:00<00:00, 287486.09 examples/s]
Generating test split: 100%|██████████| 1319/1319 [00:00<00:00, 15039.93 examples/s]
Downloading data: 100%|██████████| 2.68M/2.68M [00:01<00:00, 1.97MB/s]
Downloading data: 100%|██████████| 487k/487k [00:00<00:00, 798kB/s]
Generating train split: 100%|██████████| 7473/7473 [00:00<00:00, 404157.54 examples/s]
Generating test split: 100%|██████████| 1319/1319 [00:00<00:00, 165572.89 examples/s]


In [6]:
# Peek at the first training record of the "main" configuration
# (a dict with 'question' and 'answer'; the answer ends with "#### <result>").
dataset_main['train'][0]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}

In [7]:
# Same record from the "socratic" configuration: each solution step is
# preceded by a sub-question, separated from it by " ** ".
dataset_socratic['train'][0]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': 'How many clips did Natalia sell in May? ** Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nHow many clips did Natalia sell altogether in April and May? ** Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}

In [46]:

# Iterations of the choice-free analysis prompt, oldest first.
# Previously each iteration was assigned to the same name, so every literal but
# the last was silently shadowed. They are kept here as an explicit history.
# The wording is reproduced verbatim (typos and trailing spaces included) so
# that results stay comparable with responses already collected.
PROMPT_TEMPLATE_VERSIONS = [
    # v1: classify the question, pick a method, solve, report the answer.
    """
Here is a math question: "{text}"
Let's analyze the question from the following angles:
1. According to math education syllabus, what category does the question belong to?
2. What methods and techniques are commonly used to solve questions of the category?
3. What methods and techniques are likely to suitable to solve the given question?
4. Select the most promising method to solve the question.
5. Solve the question step by step. 
6. Output final result in format "Answer: xxxx"
""",
    # v2: adds a concept-review step before solving.
    """
Here is a math question: "{text}"
Let's analyze the question from the following angles:
1. According to math education syllabus, what category does the question belong to?
2. What methods and techniques are commonly used to solve questions of the category?
3. What methods and techniques are likely to suitable to solve the given question?
4. Select the most promising method to solve the question.
5. Review concepts relevant to the question and the solving process.
6. Solve the question step by step. 
7. Output final result in format "Answer: xxxx"
""",
    # v3: adds a verification step after solving.
    """
Here is a math question: "{text}"
Let's analyze the question from the following angles:
1. According to math education syllabus, what category does the question belong to?
2. What methods and techniques are commonly used to solve questions of the category?
3. What methods and techniques are likely to suitable to solve the given question?
4. Select the most promising method to solve the question.
5. Review concepts relevant to the question and the solving process.
6. Solve the question step by step. 
7. Check the solving process step by step.
8. Output final result in format "Answer: xxxx"
""",
]

# The newest iteration is the active template; it takes one field, {text}.
prompt_template = PROMPT_TEMPLATE_VERSIONS[-1]

# Iterations of the multiple-choice prompt, oldest first.
# Previously each iteration was assigned to the same name, so every literal but
# the last was silently shadowed. They are kept here as an explicit history.
# The wording is reproduced verbatim (typos, numbering gaps and trailing spaces
# included) so that results stay comparable with responses already collected.
# Every version takes {text}, {choice_A}, {choice_B}, {choice_C}, {choice_D}.
PROMPT_TEMPLATE_WITH_CHOICES_VERSIONS = [
    # v1: lists the choices, then classify / select method / solve / check.
    """
Here is a math question: "{text}"
Choices: choice_A: {choice_A}, choice_B: {choice_B}, choice_C: {choice_C}, choice_D: {choice_D}.
Let's analyze the question from the following angles:
1. According to math education syllabus, what category does the question belong to?
2. What domain specific problem solving skills and knowledge are commonly used to solve questions of the category?
3. Select the most suitable method to solve the question.
4. Solve the question step by step. 
6. Check the solving process step by step, make sure you apply the proper concepts and skills in each step.
7. Output final result in format "Answer: xxxx" in the last line, without any other text.
""",
    # v2: the check step becomes "compare against the choices, re-solve if absent".
    """
Here is a math question: "{text}"
Choices: choice_A: {choice_A}, choice_B: {choice_B}, choice_C: {choice_C}, choice_D: {choice_D}.
Let's analyze the question from the following angles:
1. According to math education syllabus, what category does the question belong to?
2. What domain specific problem solving skills and knowledge are commonly used to solve questions of the category?
3. Select the most suitable method to solve the question.
4. Solve the question step by step. 
6. Compare answer against the choices. If the answer is not in the choices, review the concepts relevant to the question and the solving process, and try solving the question again.
7. Output final result in format "Answer: xxxx" in the last line, without any other text.
""",
    # v3: adds a "read carefully" step; step 6 repeats the choices and loops back to step 1.
    """
Here is a math question: "{text}"
Choices: choice_A: {choice_A}, choice_B: {choice_B}, choice_C: {choice_C}, choice_D: {choice_D}.
Let's analyze the question from the following angles:
1. Read question and choices carefully.
2. According to math education syllabus, what category does the question belong to?
3. What domain specific problem solving skills and knowledge are commonly used to solve questions of the category?
4. Select the most suitable method to solve the question.
5. Solve the question step by step, pay attention to make use of information in both question and choices. 
6. Compare answer against the choices choice_A: {choice_A}, choice_B: {choice_B}, choice_C: {choice_C}, choice_D: {choice_D}, and decide which choice is selected. If answer doesn't match any choice, the answer is not correct, go back to step 1. Otherwise go to step 7.
7. Output final result in format "Answer: xxxx" in the last line, without any other text.
""",
    # v4: rewords the match / no-match branches of step 6.
    """
Here is a math question: "{text}"
Choices: choice_A: {choice_A}, choice_B: {choice_B}, choice_C: {choice_C}, choice_D: {choice_D}.
Let's analyze the question from the following angles:
1. Read question and choices carefully.
2. According to math education syllabus, what category does the question belong to?
3. What domain specific problem solving skills and knowledge are commonly used to solve questions of the category?
4. Select the most suitable method to solve the question.
5. Solve the question step by step, pay attention to make use of information in both question and choices. 
6. Compare answer against the choices choice_A: {choice_A}, choice_B: {choice_B}, choice_C: {choice_C}, choice_D: {choice_D}, and decide which choice is selected. If answer matches a choice, select the choice and go to step 7. If answer doesn't match any choice, the answer is not correct, tackle the question again from step 1. 
7. Output final result in format "Answer: xxxx" in the last line, without any other text.
""",
    # v5: the header now states that the correct answer is among the choices.
    """
Here is a math question: "{text}"
Correct answer is among: choice_A: {choice_A}, choice_B: {choice_B}, choice_C: {choice_C}, choice_D: {choice_D}.
Let's analyze the question from the following angles:
1. Read question and choices carefully.
2. According to math education syllabus, what category does the question belong to?
3. What domain specific problem solving skills and knowledge are commonly used to solve questions of the category?
4. Select the most suitable method to solve the question.
5. Solve the question step by step, pay attention to make use of information in both question and choices. 
6. Compare answer against the choices choice_A: {choice_A}, choice_B: {choice_B}, choice_C: {choice_C}, choice_D: {choice_D}, and decide which choice is selected. If answer matches a choice, select the choice and go to step 7. If answer doesn't match any choice, the answer is not correct, tackle the question again from step 1. 
7. Output final result in format "Answer: xxxx" in the last line, without any other text.
""",
    # v6: asks for the rationale of each step; a non-matching answer now yields "None"
    # instead of looping back to step 1.
    """
Here is a math question: "{text}"
Correct answer is among: choice_A: {choice_A}, choice_B: {choice_B}, choice_C: {choice_C}, choice_D: {choice_D}.
Let's analyze the question from the following angles, print out each rationals in each step:
1. Read question and choices carefully.
2. According to math education syllabus, what category does the question belong to?
3. What domain specific problem solving skills and knowledge are commonly used to solve questions of the category?
4. Select the most suitable method to solve the question.
5. Solve the question step by step, pay attention to make use of information in both question and choices. 
6. Compare answer against the choices choice_A: {choice_A}, choice_B: {choice_B}, choice_C: {choice_C}, choice_D: {choice_D}, and decide which choice is selected. If answer matches a choice, select the choice as final result; if answer doesn't match any choice, the answer is not correct, and final result is "None".
7. print out final result in format "Answer: the final result" in the last line, without any other text. 
""",
]

# The newest iteration is the active multiple-choice template.
prompt_template_with_choices = PROMPT_TEMPLATE_WITH_CHOICES_VERSIONS[-1]


# Zero-shot chain-of-thought baseline for multiple-choice questions: solve step
# by step, map the result onto a choice (or "None"), print the final answer.
# Built line by line so the significant trailing space on the last instruction
# is visible and cannot be stripped by an editor.
_CHOICE_FIELDS = "choice_A: {choice_A}, choice_B: {choice_B}, choice_C: {choice_C}, choice_D: {choice_D}"

prompt_template_with_choices_zero_shot_cot = "\n".join([
    "",
    "Here is a math question: \"{text}\"",
    "Correct answer is among: " + _CHOICE_FIELDS + ".",
    "1. Let's solve the question step by step, pay attention to make use of information in both question and choices.",
    "2. Compare answer against the choices " + _CHOICE_FIELDS
    + ", and decide which choice is selected. If answer matches a choice, select the choice as final result;"
    + " if answer doesn't match any choice, the answer is not correct, and final result is \"None\".",
    "3. print out final result in format \"Answer: the final result\" in the last line, without any other text. ",
    "",
])

MMLU abstract algebra

In [4]:
# Load the MMLU abstract-algebra test split as a list of per-question dicts.
# Column names are supplied here, so the first CSV row is read as data.
data_file = "data/MMLU/abstract_algebra_test.csv"
mmlu_columns = ['text', 'choice_A', 'choice_B', 'choice_C', 'choice_D', 'answer']
abstract_algebra_frame = pd.read_csv(data_file, names=mmlu_columns)
abstract_algebra_test = abstract_algebra_frame.to_dict(orient="records")

# Show the first two records as a sanity check.
abstract_algebra_test[:2]

[{'text': 'Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.',
  'choice_A': '0',
  'choice_B': '4',
  'choice_C': '2',
  'choice_D': '6',
  'answer': 'B'},
 {'text': 'Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the index of <p> in S_5.',
  'choice_A': '8',
  'choice_B': '2',
  'choice_C': '24',
  'choice_D': '120',
  'answer': 'C'}]

In [40]:
# Alternative sample question, kept for reference:
#   "Find all zeros in the indicated finite field of the given polynomial with
#    coefficients in that field. x^5 + 3x^3 + x^2 + 2x in Z_5"

# gpt3.5 failed to notice that the two cycles in this question are non-disjoint.
text = "Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the index of <p> in S_5."

# Preview the choice-free prompt for the sample question.
print(prompt_template.format(text=text))

# Preview the multiple-choice prompt for the same question (record 1 of the
# test set); `prompt` is left holding this version for the request that follows.
q = abstract_algebra_test[1]
prompt = prompt_template_with_choices.format(**q)
print(prompt)


Here is a math question: "Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the index of <p> in S_5."
Let's analyze the question from the following angles:
1. According to math education syllabus, what category does the question belong to?
2. What methods and techniques are commonly used to solve questions of the category?
3. What methods and techniques are likely to suitable to solve the given question?
4. Select the most promising method to solve the question.
5. Review concepts relevant to the question and the solving process.
6. Solve the question step by step. 
7. Check the solving process step by step.
8. Output final result in format "Answer: xxxx"


Here is a math question: "Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the index of <p> in S_5."
Correct answer is among: choice_A: 8, choice_B: 2, choice_C: 24, choice_D: 120.
Let's analyze the question from the following angles:
1. Read question and choices carefully.
2. According to math education syllabus, what category does the question be

In [7]:
import os

from dotenv import load_dotenv
from openai import OpenAI

# Read variables from a local .env file into the process environment, then pass
# the OpenAI key to the client explicitly (the key is never written in the notebook).
load_dotenv()
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

In [41]:



# Sampling settings for the one-off request below.
sampling_options = {
    "temperature": 0.2,
    # "max_tokens": 60,
    "top_p": 0.1,
    "frequency_penalty": 0.0,
    "presence_penalty": 0.0,
}

# Send the current `prompt` as a single user message and show the reply text.
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt}],
    **sampling_options,
)

print(response.choices[0].message.content)

1. The question asks for the index of <p> in S_5.
2. This question belongs to the category of permutation groups in abstract algebra.
3. Domain specific problem solving skills and knowledge commonly used include understanding permutations, cycles, and group theory.
4. The index of a subgroup <p> in a group S_5 can be found using Lagrange's theorem.
5. The permutation p = (1, 2, 5, 4)(2, 3) can be written as (1, 5, 4)(2, 3) in cycle notation. The order of p is the least common multiple of the lengths of the cycles, which is lcm(3, 2) = 6. Therefore, the index of <p> in S_5 is 5!/6 = 20.
6. Answer: None


In [29]:
# Print the message text of the first choice in whatever `response` currently holds.
print(response.choices[0].message.content)

1. The question belongs to the category of group theory in abstract algebra.

2. Common methods and techniques used to solve questions in group theory include understanding group operations, group subgroups, cycles, permutations, and group orders.

3. To solve the given question, we need to understand the concept of group index, which is the number of left cosets of a subgroup in a group.

4. The most promising method to solve the question is to calculate the order of the subgroup generated by p and then use Lagrange's theorem to find the index of <p> in S_5.

5. First, let's calculate the order of the subgroup generated by p. The order of p is the least common multiple of the orders of the disjoint cycles in its cycle decomposition. In this case, p = (1, 2, 5, 4)(2, 3) has disjoint cycles of length 4 and 2, so the order of p is lcm(4, 2) = 4.

Now, according to Lagrange's theorem, the index of <p> in S_5 is the order of S_5 divided by the order of <p>. Since |S_5| = 5! = 120, and |<p>

In [58]:
MAX_RETRIES = 5            # extra attempts when the model reports no answer
FIRST_TEMPERATURE = 0.2    # temperature of the first attempt
RETRY_TEMPERATURE = 0.8    # temperature of every retry
RESPONSE_LOG_PATH = './ke_result/abstract_algebra_100_response_log.json'


def _ask_model(prompt, temperature):
    """Send `prompt` as a single user message to gpt-3.5-turbo and return the reply text.

    Only `temperature` varies between the first attempt and retries; the other
    sampling settings are fixed.

    NOTE(review): top_p stays at 0.1 on retries, which may limit how much the
    higher retry temperature actually changes the output -- confirm intended.
    """
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        # max_tokens=60,
        top_p=0.1,
        frequency_penalty=0.0,
        presence_penalty=0.0,
    )
    return completion.choices[0].message.content


def _lacks_answer(reply):
    """Return True when `reply` carries no usable final answer.

    That is the case when the reply is None or blank, or when its last
    non-blank line contains "none" (case-insensitive).
    """
    non_blank = [line.strip() for line in (reply or "").split("\n") if line.strip()]
    # An empty reply used to raise IndexError on lines[-1]; treat it as "no answer".
    return not non_blank or "none" in non_blank[-1].lower()


# Every attempt (first try and retries alike) is appended, so one question can
# contribute several entries that share the same 'id'.
response_log = []
for idx, q in enumerate(abstract_algebra_test[:100]):
    prompt = prompt_template_with_choices.format(**q)

    # Attempt 0 is the regular request; attempts 1..MAX_RETRIES are retries.
    for attempt in range(1 + MAX_RETRIES):
        if attempt == 1:
            print("retrying")
        temperature = FIRST_TEMPERATURE if attempt == 0 else RETRY_TEMPERATURE
        reply = _ask_model(prompt, temperature)

        result = q.copy()
        result['id'] = idx
        result['response'] = reply
        response_log.append(result)

        print("question", idx, q)
        print("response:", reply)
        print("")

        if not _lacks_answer(reply):
            break

# Create the output directory first so a missing folder cannot discard the
# results after all requests have already been made.
os.makedirs(os.path.dirname(RESPONSE_LOG_PATH), exist_ok=True)
with open(RESPONSE_LOG_PATH, 'w') as f:
    json.dump(response_log, f)

    

question 0 {'text': 'Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.', 'choice_A': '0', 'choice_B': '4', 'choice_C': '2', 'choice_D': '6', 'answer': 'B'}
response: 1. The question is asking for the degree of the field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.
2. This question belongs to the field of abstract algebra, specifically field extensions.
3. Knowledge of field extensions, degree of field extensions, and properties of square roots are commonly used to solve questions of this category.
4. The most suitable method to solve this question is to determine the degree of the field extension by examining the linearly independent elements in the extension.
5. Since sqrt(18) = sqrt(2*3*3) = sqrt(2)*sqrt(3)*sqrt(3), we can see that sqrt(18) is already in the field extension generated by sqrt(2) and sqrt(3). Therefore, the degree of the field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q is 2.
6. Answer: choice_C: 2

question 1 {'text': 'Let p = (

7. Answer: None


In [71]:
# for idx, q in enumerate(abstract_algebra_test[:2]):
#     print("question", idx, q)

question 0 {'text': 'Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.', 'choice_A': '0', 'choice_B': '4', 'choice_C': '2', 'choice_D': '6', 'answer': 'B'}
question 1 {'text': 'Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the index of <p> in S_5.', 'choice_A': '8', 'choice_B': '2', 'choice_C': '24', 'choice_D': '120', 'answer': 'C'}


In [60]:
# Show only the first few entries: each one embeds a full model reply, so
# echoing the whole list floods the notebook output.
response_log[:3]

[{'text': 'Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.',
  'choice_A': '0',
  'choice_B': '4',
  'choice_C': '2',
  'choice_D': '6',
  'answer': 'B',
  'response': "1. The question belongs to the field of abstract algebra, specifically field extensions and degrees of field extensions.\n\n2. To solve this question, knowledge of field extensions, degree of field extensions, and properties of square roots is commonly used. Understanding how to combine field extensions and calculate the degree of the resulting field extension is crucial.\n\n3. The most suitable method to solve this question is to first determine the intermediate field extensions and then calculate the degree of the composite field extension.\n\n4. Let's solve the question step by step:\n- First, we know that Q(sqrt(2)) is a degree 2 extension over Q, as the minimal polynomial of sqrt(2) over Q is x^2 - 2.\n- Similarly, Q(sqrt(3)) is a degree 2 extension over Q, as the minimal polynom

In [None]:
# Answer-conditioned prompt: the model is given the correct answer and asked to
# elaborate the solution process.
# NOTE: this rebinds `prompt_template`. This variant takes {question} and
# {answer}, whereas the analysis template defined earlier takes {text}; cells
# that call prompt_template.format(text=...) will fail after this one runs
# unless the earlier template cell is re-run.
# The steps are numbered consecutively (the draft jumped from 4 to 6).
prompt_template = """
Here is a math question: "{question}"
The correct answer is {answer}.  
Your task is to elaborate on the solution process, and provide a detailed explanation of the steps to solve the question.  
Let's analyze the question from the following angles:
1. According to math education syllabus, what category does the question belong to?
2. What domain specific problem solving skills and knowledge are commonly used to solve questions of the category?
3. Select the most suitable method to solve the question.
4. Solve the question step by step. 
5. Check the solving process step by step, make sure you apply the proper concepts and skills in each step.
6. Finalize the solution process. 
"""

# gpt3.5 failed to notice that the two cycles in this question are non-disjoint.
question = "Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the index of <p> in S_5."
# 24 is the value of the correct choice (choice_C) for this question in the MMLU file.
answer = 24
prompt = prompt_template.format(question=question, answer=answer)
print(prompt)