# GPT-3.5-Turbo Performance on MMLU - Abstract Algebra

In [5]:
import openai
import os
import re
import time
import json

import numpy as np

from tqdm import tqdm
from datasets import load_dataset
from tenacity import retry, stop_after_attempt, wait_chain, wait_fixed

In [19]:
from openai import OpenAI 

# get api_key from .env file
import os
from dotenv import load_dotenv
# Load the variables from the local .env file so OPENAI_API_KEY can be looked up below.
load_dotenv()
# Shared client used by the requests in this notebook; the key is read from the
# environment rather than being written into the notebook source.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

In [25]:
# Retry schedule: wait 3s after each of the first three failures, 5s after the next two,
# then 10s. The retry is bounded (1 initial attempt + one retry per wait in the chain) and
# re-raises the last exception, so a permanent failure such as a bad API key or an invalid
# request surfaces in the notebook instead of looping forever.
@retry(wait=wait_chain(*[wait_fixed(3) for i in range(3)] +
                       [wait_fixed(5) for i in range(2)] +
                       [wait_fixed(10)]),
       stop=stop_after_attempt(7),
       reraise=True)
def completion_with_backoff(**kwargs):
    """Create a chat completion, retrying on failure with increasing waits.

    Uses the v1 client call ``client.chat.completions.create`` (the legacy
    equivalent was ``openai.ChatCompletion.create``).

    Args:
        **kwargs: Forwarded unchanged to ``client.chat.completions.create``
            (e.g. ``model``, ``messages``, ``temperature``).

    Returns:
        Whatever ``client.chat.completions.create`` returns.

    Raises:
        The exception raised by the final attempt, once all attempts are used up.
    """
    return client.chat.completions.create(**kwargs)

In [8]:
# Load the per-task prompts. The context manager closes the file handle deterministically
# (the bare json.load(open(...)) left it open), and the encoding is pinned so the result
# does not depend on the platform default.
with open('lib_prompt/mmlu-cot.json', encoding='utf-8') as prompt_file:
    mmlu_prompt = json.load(prompt_file)

In [9]:
mmlu_prompt.keys()

dict_keys(['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy

In [10]:
print(mmlu_prompt['abstract_algebra'])

The following are multiple choice questions (with answers) about abstract algebra.

Q: Statement 1 | Every element of a group generates a cyclic subgroup of the group. Statement 2 | The symmetric group S_10 has 10 elements.
(A) True, True (B) False, False (C) True, False (D) False, True
A: Let's think step by step. A cyclic group is a group that is generated by a single element. Hence a subgroup generated by a single element of a group is cyclic and Statement 1 is True. The answer is (C).

Q: The symmetric group $S_n$ has $
actorial{n}$ elements, hence it is not true that $S_{10}$ has 10 elements.
Find the characteristic of the ring 2Z.
(A) 0 (B) 3 (C) 12 (D) 30
A: Let's think step by step. A characteristic of a ring is R is $n$ if the statement $ka = 0$ for all $a\in 2Z$ implies that $k$ is a multiple of $n$. Assume that $ka = 0$ for all $a\in 2Z$ for some $k$. In particular $2k = 0$. Hence $k=0$ and $n=0$. The answer is (A).

Q: Statement 1| Every function from a finite set onto itse

In [11]:
# This dataset is produced by a builder script downloaded with it, and `datasets` warns
# that running such a script will require explicit opt-in; pass the flag so the cell keeps
# loading without the prompt/warning.
abstract_algebra = load_dataset("lukaemon/mmlu", "abstract_algebra", trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading builder script: 100%|██████████| 5.01k/5.01k [00:00<?, ?B/s]
Downloading readme: 100%|██████████| 28.7k/28.7k [00:00<00:00, 4.77MB/s]
Downloading data: 100%|██████████| 166M/166M [01:39<00:00, 1.67MB/s] 
Generating test split: 100%|██████████| 100/100 [00:00<00:00, 934.55 examples/s]
Generating validation split: 100%|██████████| 11/11 [00:00<00:00, 250.02 examples/s]
Generating train split: 100%|██████████| 5/5 [00:00<00:00, 106.38 examples/s]


In [67]:
abstract_algebra['test'][0]

{'input': 'Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.',
 'A': '0',
 'B': '4',
 'C': '2',
 'D': '6',
 'target': 'B'}

In [14]:
# Build one few-shot query by hand: task prompt, blank line, the first test question,
# its four lettered options on a single line, then the step-by-step cue.
prompt_q = ''.join([mmlu_prompt['abstract_algebra'], "\n\n", abstract_algebra['test'][0]['input'], '\n'])
for letter in 'ABCD':
    prompt_q = prompt_q + ''.join(['(', letter, ') ', abstract_algebra['test'][0][letter], ' '])
prompt_q = prompt_q + "\nA: Let's think step by step."

In [15]:
print(prompt_q)

The following are multiple choice questions (with answers) about abstract algebra.

Q: Statement 1 | Every element of a group generates a cyclic subgroup of the group. Statement 2 | The symmetric group S_10 has 10 elements.
(A) True, True (B) False, False (C) True, False (D) False, True
A: Let's think step by step. A cyclic group is a group that is generated by a single element. Hence a subgroup generated by a single element of a group is cyclic and Statement 1 is True. The answer is (C).

Q: The symmetric group $S_n$ has $
actorial{n}$ elements, hence it is not true that $S_{10}$ has 10 elements.
Find the characteristic of the ring 2Z.
(A) 0 (B) 3 (C) 12 (D) 30
A: Let's think step by step. A characteristic of a ring is R is $n$ if the statement $ka = 0$ for all $a\in 2Z$ implies that $k$ is a multiple of $n$. Assume that $ka = 0$ for all $a\in 2Z$ for some $k$. In particular $2k = 0$. Hence $k=0$ and $n=0$. The answer is (A).

Q: Statement 1| Every function from a finite set onto itse

In [20]:
# One-off sanity check: send the hand-built few-shot prompt with temperature 0.
# This is the v1-client form of the legacy `openai.ChatCompletion.create` call,
# which took the same model / messages / temperature arguments.
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    temperature=0,
    messages=[
        dict(role="system", content="Follow the given examples and answer the question."),
        dict(role="user", content=prompt_q),
    ],
)



In [22]:

# response['choices'][0]['message']['content']
print(response.choices[0].message.content)

The degree of the field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q can be found by considering the degrees of the intermediate field extensions.

First, we have Q(sqrt(2)) over Q, which has degree 2 since the minimal polynomial of sqrt(2) over Q is x^2 - 2.

Next, we consider the extension Q(sqrt(2), sqrt(3)) over Q(sqrt(2)). The minimal polynomial of sqrt(3) over Q(sqrt(2)) is x^2 - 3, so this extension also has degree 2.

Finally, we look at the extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q(sqrt(2), sqrt(3)). Since sqrt(18) = sqrt(2) * sqrt(3), it is already contained in the field Q(sqrt(2), sqrt(3)). Therefore, the degree of this extension is 1.

Adding up the degrees of the intermediate extensions, we get 2 + 2 + 1 = 5. Therefore, the correct answer is not among the options provided.


In [55]:
def test_answer_mmlu(pred_str, ans_str):
    # pattern = 'the answer is ('
    pattern = 'answer is ('
    pred = pred_str.lower().split(pattern)
    
    if(len(pred) > 1):
        # print(pred)
        pred = pred[1][0]
        gold = ans_str.split('A:\n')[1][0].lower()
        # print('debug 1, pred %s, gold %s' % (pred, gold))
        return pred == gold
    else: 
        pred = 'C'
        gold = ans_str.split('A:\n')[1][0].lower()
        # print('debug 2, pred %s, gold %s' % (pred, gold))
        return pred == gold
    # return False

def parse_pred_ans(filename):
    """Parse a plain-text answer log, score every record and print a summary.

    The log is read line by line. A line starting with ``'Q: '`` opens a new
    record, ``'A_model:'`` starts the model answer and ``'A:'`` starts the gold
    answer; any other line is appended to whichever section is currently open.

    Args:
        filename: Path of the utf-8 log file to read.

    Returns:
        A 4-tuple ``(questions, ans_pred, ans_gold, marks)`` of parallel lists;
        ``marks`` holds 1 for a correct record and 0 otherwise.

    Raises:
        ValueError: If a continuation line appears before any section marker.
    """
    with open(filename, encoding='utf-8') as fd:
        lines = fd.readlines()

    questions, ans_pred, ans_gold, marks = [], [], [], []

    def _record(question, model_answer, gold_answer):
        # Store one finished record together with its 0/1 mark.
        questions.append(question)
        ans_pred.append(model_answer)
        ans_gold.append(gold_answer)
        marks.append(1 if test_answer_mmlu(model_answer, gold_answer) else 0)

    q, am, a = None, None, None
    num_q = 0
    current_mode = 'none'
    for l in lines:
        if l.startswith('Q: '):
            # A new question closes the previous record, if one is complete.
            if am is not None and a is not None:
                _record(q, am, a)
            current_mode = 'q'
            q = l
            num_q += 1
        elif l.startswith('A_model:'):
            current_mode = 'am'
            am = l
        elif l.startswith('A:'):
            current_mode = 'a'
            a = l
        elif current_mode == 'q':
            q += l
        elif current_mode == 'am':
            am += l
        elif current_mode == 'a':
            a += l
        else:
            raise ValueError(current_mode)

    # Close the last record. Guarded the same way as inside the loop so that an empty
    # log, or one cut off before its answer sections, no longer crashes here.
    if num_q > 0 and am is not None and a is not None:
        _record(q, am, a)

    acc = sum(marks)
    # An empty log reports a ratio of 0 instead of dividing by zero.
    ratio = acc / num_q if num_q else 0.0
    print('num_q %d correct %d ratio %.4f' % (num_q, acc, ratio))
    return questions, ans_pred, ans_gold, marks

def test_finished(ans_model):
    if('answer is' in ans_model): return True
    else: return False

def extract_ans(ans_model):
    """Split a model response at the first line that states the answer.

    Returns a pair ``(ans, residual)``: ``ans`` is the text up to and including
    the first line containing 'answer is' (or the whole text when no line
    contains it), and ``residual`` is whatever follows, possibly empty.
    """
    rows = ans_model.split('\n')
    # Index just past the first answer line; falls back to "keep everything".
    cut = next((pos + 1 for pos, row in enumerate(rows) if 'answer is' in row), len(rows))
    return '\n'.join(rows[:cut]), '\n'.join(rows[cut:])

In [32]:
task = 'abstract_algebra'

# Few-shot run with no temperature argument: one request per test question, each
# record written to a plain-text log in the Q / A_model / A layout.
i = 0
with open('outputs/test_gpt_3.5_turbo_%s.txt' % task, 'w', encoding='utf-8') as fd:
    for q_ in tqdm(abstract_algebra['test'], total=len(abstract_algebra['test'])):
        # Question text, the four lettered options on one line, then the step-by-step cue.
        q = ''.join([q_['input'], '\n'])
        for letter in 'ABCD':
            q = q + ''.join(['(', letter, ') ', q_[letter], ' '])
        q = q + "\nA: Let's think step by step."

        prompt_q = ''.join([mmlu_prompt[task], "\n\n", q])

        response = completion_with_backoff(
            model="gpt-3.5-turbo",
            messages=[
                dict(role="system", content="Follow the given examples and answer the question."),
                dict(role="user", content=prompt_q),
            ],
        )
        ans_model = response.choices[0].message.content
        # Keep only the text up to the first line that states the answer.
        ans_, residual = extract_ans(ans_model)

        a = q_['target']
        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))
        i = i + 1

100%|██████████| 100/100 [06:57<00:00,  4.18s/it]


In [57]:
questions, ans_pred, ans_gold, marks  = parse_pred_ans('outputs/test_gpt_3.5_turbo_%s.txt' % task)

num_q 100 correct 43 ratio 0.4300


In [53]:
# # questions
# for idx, score in enumerate(marks):
#     if score == 0:
#         print(ans_pred[idx], ans_gold[idx], test_answer_mmlu(ans_pred[idx], ans_gold[idx]))

A_model:
To find the index of the subgroup generated by $p = (1, 2, 5, 4)(2, 3)$ in $S_5$, we need to find the number of elements in $S_5$ that are not in the subgroup generated by $p$. 

First, let's analyze the cycle structure of $p$:
- $p = (1, 2, 5, 4)(2, 3)$ means that $1 \to 2 \to 5 \to 4 \to 1$ and $2 \to 3 \to 2$. The other elements not mentioned in the cycles remain fixed.

For $S_5$, there are $5! = 120$ total permutations. Now, let's find the number of permutations not generated by $p$. The permutations not generated by $p$ are those that break the cycles of $p$ or are not affected by $p$ at all.

Since the cycle $(1, 2, 5, 4)$ and the cycle $(2, 3)$ do not share any elements in common aside from $2$ (which remains fixed in $(2, 3)$), the permutations not generated by $p$ are those where $1$ does not go to $2$ (4 choices), $2$ does not go to $3$ and still $1$ doesn't go to $5$ nor $4$ (2 choices), $3$ does not go to $2$ (4 choices), and finally $4$ and $5$ can go wherever th

In [50]:
marks  

[0]

In [80]:
task = 'abstract_algebra'

# Few-shot run with temperature=0: one request per test question, each record written
# to a plain-text log in the Q / A_model / A layout.
i = 0
# The encoding is pinned because the log is read back as utf-8 when it is scored, and
# model output may contain non-ASCII characters.
with open('outputs/test_gpt_3.5_turbo_%s_temp_0.txt' % task, 'w', encoding='utf-8') as fd:
    for q_ in tqdm(abstract_algebra['test'], total=len(abstract_algebra['test'])):
        # Question text, the four lettered options on one line, then the step-by-step cue.
        q = q_['input'] + '\n'
        for letter in ['A', 'B', 'C', 'D']:
            q += '(' + letter + ') ' + q_[letter] + ' '
        q += "\nA: Let's think step by step."

        prompt_q = mmlu_prompt[task] + "\n\n" + q

        response = completion_with_backoff(
              model="gpt-3.5-turbo",
              messages=[
                    {"role": "system", "content": "Follow the given examples and answer the question."},
                    {"role": "user", "content": prompt_q},
                ],
                temperature=0,
            )
        # The client used in this notebook returns a response object, not a dict, so the
        # text is reached through attributes; the old response['choices'][0][...] form
        # fails on it.
        ans_model = response.choices[0].message.content
        ans_, residual = extract_ans(ans_model)

        a = q_['target']
        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))
        i += 1

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 99/99 [13:43<00:00,  8.32s/it]


In [81]:
# parse_pred_ans returns four lists (questions, predictions, gold answers, marks);
# unpacking only three raised ValueError. Only the printed summary is needed here.
_, _, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_%s_temp_0.txt' % task)

num_q 99 correct 46 ratio 0.4646


# Zero-shot chain-of-thought (CoT)

In [65]:
# Zero-shot prompt: no worked examples, only instructions to solve step by step, map the
# result onto one of the choices and finish with a line "the answer is (X)".
# The placeholders {input}, {A}, {B}, {C} and {D} are filled by str.format below.
prompt_template_with_choices_zero_shot_cot = """
Here is a math question: "{input}"
Correct answer is among: (A): {A}, (B): {B}, (C): {C}, (D): {D}.
1. Let's solve the question step by step, print out each step. Pay attention to make use of information in both question and choices.
2. Compare answer against the choices (A): {A}, (B): {B}, (C): {C}, (D): {D}, and decide which choice is selected. If answer matches a choice, select the choice i.e. one of "(A)", "(B)", "(C)" and "(D)" as final result; if answer doesn't match any choice, the answer is not correct, and final result is "(None)".
3. print out final result, must in format "the answer is _final_result_" in the last line where _final_result_ is one of "(A)", "(B)", "(C)", "(D)" and "(None)", without any other text. 
"""
# Render the template for the first test question and display it as a sanity check.
q_ = abstract_algebra['test'][0]
prompt_q = prompt_template_with_choices_zero_shot_cot.format(**q_)
prompt_q  

'\nHere is a math question: "Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q."\nCorrect answer is among: (A): 0, (B): 4, (C): 2, (D): 6.\n1. Let\'s solve the question step by step, print out each step. Pay attention to make use of information in both question and choices.\n2. Compare answer against the choices (A): 0, (B): 4, (C): 2, (D): 6, and decide which choice is selected. If answer matches a choice, select the choice i.e. one of "(A)", "(B)", "(C)" and "(D)" as final result; if answer doesn\'t match any choice, the answer is not correct, and final result is "(None)".\n3. print out final result, must in format "the answer is _final_result_" in the last line where _final_result_ is one of "(A)", "(B)", "(C)", "(D)" and "(None)", without any other text. \n'

In [86]:
# Task name; it is interpolated into output file names elsewhere in the notebook.
task = 'abstract_algebra'

# Zero-shot prompt: no worked examples, only instructions to solve step by step, map the
# result onto one of the choices and finish with a line "the answer is (X)".
# {input}, {A}, {B}, {C} and {D} are str.format placeholders.
prompt_template_with_choices_zero_shot_cot = """
Here is a math question: "{input}"
Correct answer is among: (A): {A}, (B): {B}, (C): {C}, (D): {D}.
1. Let's solve the question step by step, print out each step. Pay attention to make use of information in both question and choices.
2. Compare answer against the choices (A): {A}, (B): {B}, (C): {C}, (D): {D}, and decide which choice is selected. If answer matches a choice, select the choice i.e. one of "(A)", "(B)", "(C)" and "(D)" as final result; if answer doesn't match any choice, the answer is not correct, and final result is "(None)".
3. print out final result, must in format "the answer is _final_result_" in the last line where _final_result_ is one of "(A)", "(B)", "(C)", "(D)" and "(None)", without any other text. 
"""

# Zero-shot run: each test row is rendered through the zero-shot template and sent as a
# single user message (no system message, no temperature argument). The rendered prompt
# itself is stored as the "Q:" section of the log record.
i = 0
with open('zero_shot_cot_outputs/test_gpt_3.5_turbo_%s.txt' % task, 'w', encoding='utf-8') as fd:
    for q_ in tqdm(abstract_algebra['test'], total=len(abstract_algebra['test'])):
        prompt_q = prompt_template_with_choices_zero_shot_cot.format(**q_)
        response = completion_with_backoff(
            model="gpt-3.5-turbo",
            messages=[dict(role="user", content=prompt_q)],
        )
        ans_model = response.choices[0].message.content
        # Keep only the text up to the first line that states the answer.
        ans_, residual = extract_ans(ans_model)

        a = q_['target']
        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (prompt_q, ans_, a))
        i = i + 1

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [06:30<00:00,  3.91s/it]


In [87]:
questions, ans_pred, ans_gold, marks  = parse_pred_ans('zero_shot_cot_outputs/test_gpt_3.5_turbo_%s.txt' % task)

num_q 100 correct 47 ratio 0.4700


In [81]:
questions

['Q: \nHere is a math question: "Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q."\nCorrect answer is among: (A): 0, (B): 4, (C): 2, (D): 6.\n1. Let\'s solve the question step by step, print out each step. Pay attention to make use of information in both question and choices.\n2. Compare answer against the choices (A): 0, (B): 4, (C): 2, (D): 6, and decide which choice is selected. If answer matches a choice, select the choice i.e. one of "(A)", "(B)", "(C)" and "(D)" as final result; if answer doesn\'t match any choice, the answer is not correct, and final result is "(None)".\n3. print out final result, must in format "the answer is _final_result_" in the last line where _final_result_ is one of "(A)", "(B)", "(C)", "(D)" and "(None)", without any other text. \n\n',
 'Q: \nHere is a math question: "Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the index of <p> in S_5."\nCorrect answer is among: (A): 8, (B): 2, (C): 24, (D): 120.\n1. Let\'s solve the quest

In [82]:
ans_model

'1. p = (1, 2, 5, 4)(2, 3) means p(1) = 2, p(2) = 3, p(3) = 3, p(4) = 5, p(5) = 4.\n2. To find the index of <p> in S_5, we need to find the smallest positive integer n such that p^n = (1) in S_5.\n3. We can calculate p^2 as follows: p^2 = (2, 3)(1, 2, 5, 4) = (1, 3, 5)(2, 4).\n4. We see that p^2 is not equal to (1), so we continue to calculate p^3: p^3 = (1, 3, 5)(2, 4)(1, 2, 5, 4) = (3, 5)(1, 4, 2).\n5. We see that p^3 is not equal to (1), so we continue to calculate p^4: p^4 = (3, 5)(1, 4, 2)(1, 2, 5, 4) = (1, 5, 3)(4, 2).\n6. We see that p^4 is equal to (1), so the index of <p> in S_5 is 4.\nthe answer is (B)'

In [83]:
ans_pred

['A_model:\n1. Since Q(sqrt(2), sqrt(3), sqrt(18)) = Q(sqrt(2), sqrt(3)), we need to find the degree of the field extension Q(sqrt(2), sqrt(3)) over Q.\n\n2. The degree of the field extension Q(alpha) over Q is equal to the degree of the minimal polynomial of alpha over Q. \n\n3. The minimal polynomial of sqrt(2) over Q is x^2 - 2, which has degree 2. \n   The minimal polynomial of sqrt(3) over Q(sqrt(2)) is x^2 - 3, which also has degree 2. \n\n4. Therefore, the degree of the field extension Q(sqrt(2), sqrt(3)) over Q is 2*2 = 4.\n\n5. Finally, the answer is (B): 4\n',
 'A_model:\n1. p = (1, 2, 5, 4)(2, 3) means p(1) = 2, p(2) = 3, p(3) = 3, p(4) = 5, p(5) = 4.\n2. To find the index of <p> in S_5, we need to find the smallest positive integer n such that p^n = (1) in S_5.\n3. We can calculate p^2 as follows: p^2 = (2, 3)(1, 2, 5, 4) = (1, 3, 5)(2, 4).\n4. We see that p^2 is not equal to (1), so we continue to calculate p^3: p^3 = (1, 3, 5)(2, 4)(1, 2, 5, 4) = (3, 5)(1, 4, 2).\n5. We s

# KE

In [98]:
# Task name; it is interpolated into output file names elsewhere in the notebook.
task = 'abstract_algebra'

# Prompt for the "KE" experiment: before solving, the model is asked to categorise the
# question, recall the relevant skills and pick a method (steps 2-4), then to solve,
# map the result onto a choice and finish with a line "the answer is (X)".
# {input}, {A}, {B}, {C} and {D} are str.format placeholders. Note that the choices are
# labelled "A: ..." in the second line but "(A): ..." in step 6.
prompt_template_with_choices = """
Here is a math question: "{input}"
Correct answer is among: A: {A}, B: {B}, C: {C}, D: {D}.
Let's analyze the question from the following angles, print out each rationals in each step:
1. Read question and choices carefully.
2. According to math education syllabus, what category does the question belong to?
3. What domain specific problem solving skills and knowledge are commonly used to solve questions of the category?
4. Select the most suitable method to solve the question.
5. Solve the question step by step, pay attention to make use of information in both question and choices. 
6. Compare answer against the choices (A): {A}, (B): {B}, (C): {C}, (D): {D}, and decide which choice is selected. If answer matches a choice, select the choice i.e. one of "(A)", "(B)", "(C)" and "(D)" as final result; if answer doesn't match any choice, the answer is not correct, and final result is "(None)".
7. print out final result, must in format "the answer is _final_result_" in the last line where _final_result_ is one of "(A)", "(B)", "(C)", "(D)" and "(None)", without any other text. 
"""

# prompt_template_with_choices = """
# Here is a math question: "{input}"
# Correct answer is among: A: {A}, B: {B}, C: {C}, D: {D}.
# Let's analyze the question from the following angles, print out each rationals in each step:
# 1. Read question and choices carefully.
# 2. According to math education syllabus, what category does the question belong to?
# 3. What domain specific problem solving skills and knowledge are commonly used to solve questions of the category?
# 4. Select the most suitable method to solve the question.
# 5. Solve the question step by step, pay attention to make use of information in both question and choices. 
# 6. print out final result, must in format "the answer is _final_result_" in the last line where _final_result_ is one of "(A)", "(B)", "(C)", "(D)" and "(None)", without any other text. 
# """

# KE run: each test row is rendered through `prompt_template_with_choices` and sent as a
# single user message (no system message, no temperature argument). The rendered prompt
# itself is stored as the "Q:" section of the log record.
i = 0
with open('ke_outputs/test_gpt_3.5_turbo_%s.txt' % task, 'w', encoding='utf-8') as fd:
    for q_ in tqdm(abstract_algebra['test'], total=len(abstract_algebra['test'])):
        prompt_q = prompt_template_with_choices.format(**q_)
        response = completion_with_backoff(
            model="gpt-3.5-turbo",
            messages=[dict(role="user", content=prompt_q)],
        )
        ans_model = response.choices[0].message.content
        # Keep only the text up to the first line that states the answer.
        ans_, residual = extract_ans(ans_model)

        a = q_['target']
        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (prompt_q, ans_, a))
        i = i + 1



100%|██████████| 100/100 [07:36<00:00,  4.57s/it]


In [99]:
questions, ans_pred, ans_gold, marks  = parse_pred_ans('ke_outputs/test_gpt_3.5_turbo_%s.txt' % task)

num_q 100 correct 29 ratio 0.2900


In [97]:
marks

[0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1]

In [121]:
import re 

# First variant: final line must read "the answer is (X)".
# NOTE(review): this assignment is immediately overwritten by the one below, so running
# the cell as written leaves only the second variant in effect.
prompt_template_with_choices = """
Here is a math question: "{input}"
Correct answer is among: A: {A}, B: {B}, C: {C}, D: {D}.
Let's analyze the question from the following angles, print out each rationals in each step:
1. Read question and choices carefully.
2. According to math education syllabus, what category does the question belong to?
3. What domain specific problem solving skills and knowledge are commonly used to solve questions of the category?
4. Select the most suitable method to solve the question.
5. Solve the question step by step, pay attention to make use of information in both question and choices. 
6. print out final result, must in format "the answer is _final_result_" in the last line where _final_result_ is one of "(A)", "(B)", "(C)", "(D)" and "(None)", without any other text. 
"""


# Second variant (the one that takes effect): adds an explicit comparison step against
# the choices and asks for the final line in the form "Answer: (X)".
# {input}, {A}, {B}, {C} and {D} are str.format placeholders.
prompt_template_with_choices = """
Here is a math question: "{input}"
Correct answer is among: A: {A}, B: {B}, C: {C}, D: {D}.
Let's analyze the question from the following angles, print out each rationals in each step:
1. Read question and choices carefully.
2. According to math education syllabus, what category does the question belong to?
3. What domain specific problem solving skills and knowledge are commonly used to solve questions of the category?
4. Select the most suitable method to solve the question.
5. Solve the question step by step, pay attention to make use of information in both question and choices. 
6. Compare answer against the choices choice_A: {A}, choice_B: {B}, choice_C: {C}, choice_D: {D}, and decide which choice is selected. If answer matches a choice, select the choice as final result; if answer doesn't match any choice, the answer is not correct, and final result is "None".
7. print out final result in format "Answer: the final result" in the last line, where the final result is one of "(A)", "(B)", "(C)", "(D)" and "(None)", without any other text. 
"""


def parse_pred_ans_json(log_file_name):
    """Score a JSON answer sheet and write an annotated copy next to it.

    Each entry must provide ``'response'`` (model output) and ``'target'`` (gold
    letter). The choice is read from the last line of the stripped response,
    which is expected to look like ``Answer: (B)`` or ``Answer: B``. Every entry
    gets two new keys: ``'pred'`` (the letter, or the string ``"None"`` when no
    choice is found) and ``'score'`` (1 if ``pred`` equals the gold letter,
    else 0). A summary line is printed.

    Args:
        log_file_name: Path of the JSON file holding the list of entries.

    Returns:
        The list of entries with ``'pred'`` and ``'score'`` filled in. The same
        list is written to ``<name>_marked<ext>`` beside the input file.
    """
    with open(log_file_name, 'r', encoding='utf-8') as f:
        answer_sheet = json.load(f)

    # The word boundary after the letter stops ordinary words from being read as a
    # choice (e.g. "Answer: Definitely not listed" must not count as D).
    choice_re = re.compile(r'Answer: \(?([ABCD])\b\)?')

    total_count = 0
    correct_count = 0
    for question in answer_sheet:
        total_count += 1
        gold = question['target'].strip()
        last_line = question['response'].strip().split('\n')[-1]

        match = choice_re.search(last_line)
        selected_choice = match.group(1) if match else "None"

        question['pred'] = selected_choice
        question['score'] = 0
        if selected_choice == gold:
            correct_count += 1
            question['score'] = 1

    # An empty answer sheet reports an accuracy of 0 instead of dividing by zero.
    accuracy = correct_count / total_count if total_count else 0.0
    print("total count:", total_count, "correct count:", correct_count, "accuracy:", accuracy)

    # Insert "_marked" before the extension. A plain str.replace(".json", ...) returns
    # the input name unchanged when the name has no ".json", which would overwrite the
    # input file, and it also rewrites any earlier ".json" inside the path.
    root, ext = os.path.splitext(log_file_name)
    log_file_name_marked = root + "_marked" + ext
    with open(log_file_name_marked, 'w', encoding='utf-8') as f:
        json.dump(answer_sheet, f)
    return answer_sheet

In [106]:


response_log = []
log_file_name = './ke_outputs/test_gpt_3.5_turbo_abstract_algebra_1.json'

# KE run with a JSON log: one entry per test question holding the row's own fields plus
# its index, the rendered prompt and the raw model response.
try:
    for idx, q in tqdm(enumerate(abstract_algebra['test']), total=len(abstract_algebra['test'])):
        prompt = prompt_template_with_choices.format(**q)
        # Go through the retrying wrapper like the other runs in this notebook, so a
        # single transient API error does not abort the whole loop.
        response = completion_with_backoff(
            model="gpt-3.5-turbo",
            messages=[{"role": "user",
                       "content": prompt}],
        )
        # Key order: id, the dataset row's fields, prompt, response.
        result = {'id': idx, **q}
        result['prompt'] = prompt
        result['response'] = response.choices[0].message.content
        response_log.append(result)
finally:
    # Write whatever was collected even if the loop is interrupted, so completed
    # requests are not lost.
    with open(log_file_name, 'w', encoding='utf-8') as f:
        json.dump(response_log, f)

answer_sheet = parse_pred_ans_json(log_file_name)

100%|██████████| 100/100 [08:07<00:00,  4.87s/it]

total count: 100 correct count: 26 accuracy: 0.26





In [120]:
answer_sheet = parse_pred_ans_json(log_file_name)
answer_sheet

total count: 100 correct count: 44 accuracy: 0.44


[{'id': 0,
  'input': 'Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.',
  'A': '0',
  'B': '4',
  'C': '2',
  'D': '6',
  'target': 'B',
  'prompt': '\nHere is a math question: "Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q."\nCorrect answer is among: A: 0, B: 4, C: 2, D: 6.\nLet\'s analyze the question from the following angles, print out each rationals in each step:\n1. Read question and choices carefully.\n2. According to math education syllabus, what category does the question belong to?\n3. What domain specific problem solving skills and knowledge are commonly used to solve questions of the category?\n4. Select the most suitable method to solve the question.\n5. Solve the question step by step, pay attention to make use of information in both question and choices. \n6. print out final result, must in format "the answer is _final_result_" in the last line where _final_result_ is one of "(A)", "(B)", "(C)", "(

In [112]:
# Redo the last-line parsing by hand on the first answer-sheet entry, to inspect what
# the pattern picks up.
question = answer_sheet[0]
response = question['response'].strip()
gold = question['target'].strip()

last_line = response.split('\n')[-1]
# Only a parenthesised letter directly after "answer is " counts as a choice here.
match = re.search(r'answer is \((A|B|C|D)\)', last_line)
selected_choice = match.group(1) if match else "None"

selected_choice

'C'

In [111]:
last_line, match

('6. The answer is (C) 2.', None)