In [1]:
import os
import sys
sys.path.append('../../')

# Input dataset, output directory and API-key file, all relative to this notebook.
data_path = "../../data/math/test_demos.json"
result_path = "../../result/self_demos"
keys_file_path = "../../utils/raw_key.txt"

# Create the output directory if needed. exist_ok=True avoids the
# check-then-create race of `if not os.path.exists(...)` and fails early
# if the path exists but is not a directory.
os.makedirs(result_path, exist_ok=True)

# Tag used as the filename prefix for every intermediate result written below.
suffix = "math_gpt35"

## Load dataset

In [2]:
import json

# Read the dataset file and parse it as JSON.
with open(data_path, 'r') as dataset_file:
    raw_data = json.loads(dataset_file.read())

# Uncomment to run the pipeline on a two-item subset:
# raw_data = raw_data[:2]
len(raw_data)

2

In [3]:
# One flag per dataset item; later cells set a flag to 1 when a step fails for that item.
skip_list = [0 for _ in range(len(raw_data))]

In [4]:
# Display the first raw record to inspect its fields.
raw_data[0]

{'problem': 'The units digit of a three-digit number is 6. What is the probability that the number is divisible by 6? Express your answer as a common fraction.',
 'level': 'Level 5',
 'type': 'Number Theory',
 'solution': 'The common difference of the arithmetic sequence 106, 116, 126, ..., 996 is relatively prime to 3.  Therefore, given any three consecutive terms, exactly one of them is divisible by 3.  Since there are $1+(996-106)/10=90$ terms in the sequence, $90/3=30$ of them are divisible by 3.  Since every term is even, a term is divisible by 3 if and only if it is divisible by 6.  Therefore, the probability that a randomly selected term in the sequence is a multiple of 6 is $30/90=\\boxed{\\frac{1}{3}}$.',
 'demos': [{'problem': 'At a school, all 60 students play on at least one of three teams: Basketball, Soccer, and Mathletics. 8 students play all three sports, half the students play basketball, and the ratio of the size of the math team to the size of the basketball team to 

In [5]:
from tqdm import tqdm
import json

# Convert each raw record into the flat dict used by the prompt templates:
# the question, the reference answer, and the seed demos rendered both as
# question-only text and as question/answer text.
data = []

for raw_item in tqdm(raw_data):
    data.append({
        'Question': raw_item['problem'],
        'Answer': raw_item['answer'],
        'Demos_Q': ''.join(
            f"Question: {demo['problem']}\n\n" for demo in raw_item['demos']
        ),
        'Demos_QA': ''.join(
            f"Question: {demo['problem']}\nAnswer: {demo['solution']}\n\n"
            for demo in raw_item['demos']
        ),
    })

data[0]

100%|██████████| 2/2 [00:00<?, ?it/s]


{'Question': 'The units digit of a three-digit number is 6. What is the probability that the number is divisible by 6? Express your answer as a common fraction.',
 'Answer': '\\frac{1}{3}',
 'Demos_Q': 'Question: At a school, all 60 students play on at least one of three teams: Basketball, Soccer, and Mathletics. 8 students play all three sports, half the students play basketball, and the ratio of the size of the math team to the size of the basketball team to the size of the soccer team is $4:3:2$. How many students at the school play on exactly two teams?\n\nQuestion: The four-digit number $25AB$ is divisible by nine, with $A$ the tens digit and $B$ the units digit. How many different such four-digit numbers could $25AB$ represent?\n\nQuestion: A particular right pyramid has a square base, and each edge of the pyramid is four inches long. What is the volume of the pyramid in cubic inches? Express your answer as a decimal to the nearest hundredth.\n\n',
 'Demos_QA': "Question: At a sc

In [6]:
from utils.openai import OpenAIKey, create_response_chat

# Model name passed to every create_response_chat call below.
MODEL = "gpt-3.5-turbo"
# Built from the key file; later cells call openai_key.process_error(e) after a failed request.
openai_key = OpenAIKey(keys_file_path)

## Step 1: Query Understanding


In [7]:
# Step 1 prompt: asks for a one-line general understanding of the problem.
# Placeholder: {Question}.
step1_template = """In this task, you need to give a general understanding of mathematical problems, which can be applied to all similar questions in the same scenario.
There are 7 categories of topics: Intermediate Algebra, Precalculus, Number Theory, Geometry, Prealgebra, Algebra, Counting & Probability.

# Problem:
{Question}

# Instruction: Generate a general understanding.
Give a general understanding of this problem in one line. Highlight the general solution methodologies to solve this type of problems. Focus on the problem-solving approach without delving into specific numerical values or answers.
You can refer to this template for your understanding: This problem involves...To solve this type of problem..."""

In [8]:
# One step-1 prompt per item, in dataset order.
prompt_list = [
    step1_template.format(Question=item['Question'])
    for item in data
]

print(prompt_list[0])

In this task, you need to give a general understanding of mathematical problems, which can be applied to all similar questions in the same scenario.
There are 7 categories of topics: Intermediate Algebra, Precalculus, Number Theory, Geometry, Prealgebra, Algebra, Counting & Probability.

# Problem:
The units digit of a three-digit number is 6. What is the probability that the number is divisible by 6? Express your answer as a common fraction.

# Instruction: Generate a general understanding.
Give a general understanding of this problem in one line. Highlight the general solution methodologies to solve this type of problems. Focus on the problem-solving approach without delving into specific numerical values or answers.
You can refer to this template for your understanding: This problem involves...To solve this type of problem...


In [9]:
# Sanity check: number of prompts built for step 1.
print(len(prompt_list))

2


In [10]:
import re
from tqdm import tqdm

# Step 1: query the model once per item, with up to 5 attempts each.
# If all attempts fail, the item gets the placeholder 'None' and is flagged in skip_list.
raw_step1_result_list = []

for i in tqdm(range(len(data))):
    for attempt in range(1, 6):
        try:
            result = create_response_chat(
                MODEL,
                prompt_input=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt_list[i]}
                ],
                max_tokens=512,
                temperature=0
            )
        except Exception as e:
            if attempt == 5:
                # Out of attempts: keep list positions aligned with `data`.
                raw_step1_result_list.append('None')
                skip_list[i] = 1
            openai_key.process_error(e)
        else:
            raw_step1_result_list.append(result)
            break

100%|██████████| 2/2 [00:04<00:00,  2.30s/it]


In [11]:
# Persist the raw step-1 responses.
step1_output_path = os.path.join(result_path, f"{suffix}_step1.json")
with open(step1_output_path, "w") as output_file:
    json.dump(raw_step1_result_list, output_file, indent=4)

In [12]:
# Reload the raw step-1 responses from disk.
step1_input_path = os.path.join(result_path, f"{suffix}_step1.json")
with open(step1_input_path, 'r', encoding='utf8') as input_file:
    raw_step1_result_list = json.loads(input_file.read())

In [13]:
step1_result_list = []

def extract_main_info(text):
    """Collapse a multi-line response into one line.

    Each line is stripped of surrounding whitespace, blank lines are
    dropped, and the remaining lines are joined with single spaces.
    Returns '' when the text contains no non-blank line.
    """
    stripped_lines = (line.strip() for line in text.split('\n'))
    return ' '.join(line for line in stripped_lines if line)

# Normalize each raw step-1 response to a single line and flag unusable ones.
for i, raw_understanding in enumerate(raw_step1_result_list):

    clean_str = extract_main_info(raw_understanding)

    # 'None' is the placeholder the generation loop stores when every API
    # attempt failed. It must be treated as a failure here too; otherwise
    # results reloaded from the JSON cache with a fresh skip_list would pass
    # the literal text "None" on as a valid understanding.
    if clean_str != '' and clean_str != 'None':
        step1_result_list.append(clean_str)
    else:
        step1_result_list.append('None')
        skip_list[i] = 1

In [14]:
# Number of items flagged to skip after step 1.
sum(skip_list)

0

## Step 2: Query-aware Demo Generation


In [15]:
# Step 2 prompt: asks the model to recall one related problem with a worked answer.
# Placeholders: {Question}, {Understanding}, {Demos_QA}; `{{}}` renders as literal braces.
step2_template = """In this task, you need to recall mathematical problems. When presented with a math problem, recall relevant problems as examples. These examples are helpful in answering the initial problem.

# Problem:
## The initial problem:
{Question}

## The Understanding you can refer to:
{Understanding}

# Demonstration of format:
{Demos_QA}# Instruction: Recall relevant problem.
Recall one example of math problem that is relevant to the initial problem. Your problems should be distinct from the initial problem (e.g., involving different numbers and names). 
- After "Question: ", describe the problem you generate in one line.
- After "Answer: ", Explain the step-by-step solution and enclose the ultimate answer in \\boxed{{}}."""

In [16]:
# One step-2 prompt per item, pairing each cleaned understanding with its question and seed demos.
prompt_list = [
    step2_template.format(
        Demos_QA=item["Demos_QA"],
        Question=item['Question'],
        Understanding=generated_understanding
    )
    for generated_understanding, item in zip(step1_result_list, data)
]

print(prompt_list[0])

In this task, you need to recall mathematical problems. When presented with a math problem, recall relevant problems as examples. These examples are helpful in answering the initial problem.

# Problem:
## The initial problem:
The units digit of a three-digit number is 6. What is the probability that the number is divisible by 6? Express your answer as a common fraction.

## The Understanding you can refer to:
This problem involves determining the probability of a number being divisible by a certain number based on its units digit. To solve this type of problem, analyze the divisibility rules of the target number and consider the possible outcomes based on the given conditions.

# Demonstration of format:
Question: At a school, all 60 students play on at least one of three teams: Basketball, Soccer, and Mathletics. 8 students play all three sports, half the students play basketball, and the ratio of the size of the math team to the size of the basketball team to the size of the soccer 

In [17]:
# Sanity check: number of prompts built for step 2.
len(prompt_list)

2

In [18]:
# Step 2: sample up to 5 candidate demos per item at temperature 0.8.
# Each sample gets up to 10 attempts; a sample whose attempts all fail is
# simply omitted. Items already flagged in skip_list get the placeholder 'None'.
raw_step2_result_list = []

for i in tqdm(range(len(prompt_list))):
    if skip_list[i] == 1:
        raw_step2_result_list.append('None')
        continue

    demo_candidate = []

    for _ in range(5):
        for _attempt in range(10):
            try:
                result = create_response_chat(
                    MODEL,
                    prompt_input=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": prompt_list[i]}
                    ],
                    max_tokens=512,
                    temperature=0.8
                )
            except Exception as e:
                openai_key.process_error(e)
            else:
                demo_candidate.append(result)
                break

    raw_step2_result_list.append(demo_candidate)

  0%|          | 0/2 [00:00<?, ?it/s]

Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]

 50%|█████     | 1/2 [00:13<00:13, 13.98s/it]

Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]

100%|██████████| 2/2 [00:28<00:00, 14.04s/it]

Rate limit reached for key sk-[REDACTED]





In [19]:
# Persist the raw step-2 candidates.
step2_output_path = os.path.join(result_path, f"{suffix}_step2.json")
with open(step2_output_path, "w") as output_file:
    json.dump(raw_step2_result_list, output_file, indent=4)

In [20]:
# Reload the raw step-2 candidates from disk.
step2_input_path = os.path.join(result_path, f"{suffix}_step2.json")
with open(step2_input_path, 'r', encoding='utf8') as input_file:
    raw_step2_result_list = json.loads(input_file.read())

In [21]:
step2_result_list = []

def extract_main_info(text):
    """Extract a single "Question ... / Answer ..." pair from a generated demo.

    The response is accepted only if, after dropping blank lines, it has
    exactly one line starting with "question" and exactly one line starting
    with "answer" (case-insensitive, each longer than 20 characters), and
    the question line does not come after the answer line.

    Returns the question line, a newline, the answer line joined with all
    following lines by single spaces, and a trailing newline. Returns ''
    when the response does not fit that shape.
    """
    # Pull the content onto the label's line when the model broke the line
    # right after "Question:" / "Answer:".
    for broken, fixed in (
        ('Question:\n', 'Question: '),
        ('Question: \n', 'Question: '),
        ('Answer:\n', 'Answer: '),
        ('Answer: \n', 'Answer: '),
    ):
        text = text.replace(broken, fixed)

    lines = [stripped for stripped in map(str.strip, text.split('\n')) if stripped != '']

    Q_lines = [
        line for line in lines
        if re.match(r'^(question)', line, re.IGNORECASE) and len(line) > 20
    ]
    A_lines = [
        line for line in lines
        if re.match(r'^(answer)', line, re.IGNORECASE) and len(line) > 20
    ]

    if not (len(Q_lines) == 1 and len(A_lines) == 1):
        return ''

    question_line = Q_lines[0]
    answer_start = lines.index(A_lines[0])
    if lines.index(question_line) > answer_start:
        return ''

    # Everything from the answer line to the end is folded into one line.
    return question_line + '\n' + " ".join(lines[answer_start:]) + '\n'

# Clean and de-duplicate the sampled demos for each item. Keep the item only
# if more than two distinct well-formed demos remain; otherwise flag it.
for i, raw_candidates in enumerate(raw_step2_result_list):

    # Skipped items are stored as the string 'None', not a list of responses.
    # Treat that as "no candidates" instead of iterating over its characters.
    if not isinstance(raw_candidates, list):
        raw_candidates = []

    cleaned = (extract_main_info(demo) for demo in raw_candidates)

    # dict.fromkeys de-duplicates while preserving first-seen order.
    # list(set(...)) ordered the demos by string hash, which changes between
    # interpreter runs and made the "Example <k>" numbering irreproducible.
    demo_candidate = list(dict.fromkeys(
        clean_result for clean_result in cleaned if clean_result != ''
    ))

    if len(demo_candidate) > 2:
        step2_result_list.append(demo_candidate)
    else:
        step2_result_list.append(['None'])
        skip_list[i] = 1

In [22]:
# Number of items flagged to skip after step 2.
sum(skip_list)

2

In [23]:
# Persist the skip flags as they stand after step 2.
step2_skip_path = os.path.join(result_path, f"{suffix}_step2_skip.json")
with open(step2_skip_path, "w") as output_file:
    json.dump(skip_list, output_file, indent=4)

## Step 3: Post-checking and Refining of Demos


In [24]:
# Step 3 prompt: asks the model to check the generated demos and select up to two,
# answering with "<x>, <y>" serial numbers.
# Placeholders: {Question}, {generated_demonstration}.
step3_template = """In this task, you need to check the correctness of these math Q&A pairs and select the two best examples to keep, for answering the final problem.

# The final Problem:
{Question}

# Check List:
- The calculation process in solution must be correct and without ambiguity.
- The examples should be relevant and helpful in solving the final problem.

# Examples to be checked:
{generated_demonstration}# Instruction:
Select two best examples to keep. If there are not enough correct and helpful examples, just keep one.
For your answer:
- After "Selection: ", give the serial numbers of your choice in the format of <x>, <y>.
- After "Explanation: ", give the reason why you keep this example."""

In [25]:
# One step-3 prompt per item, listing its candidate demos as "Example <k>:" blocks (k starts at 1).
prompt_list = []

for generated_demonstration, item in zip(step2_result_list, data):
    demo_string = ''.join(
        f"Example <{serial}>:\n{demo}\n"
        for serial, demo in enumerate(generated_demonstration, start=1)
    )

    prompt_list.append(
        step3_template.format(
            Question=item['Question'],
            generated_demonstration=demo_string
        )
    )

print(prompt_list[0])

In this task, you need to check the correctness of these math Q&A pairs and select the two best examples to keep, for answering the final problem.

# The final Problem:
The units digit of a three-digit number is 6. What is the probability that the number is divisible by 6? Express your answer as a common fraction.

# Check List:
- The calculation process in solution must be correct and without ambiguity.
- The examples should be relevant and helpful in solving the final problem.

# Examples to be checked:
Example <1>:
None
# Instruction:
Select two best examples to keep. If there are not enough correct and helpful examples, just keep one.
For your answer:
- After "Selection: ", give the serial numbers of your choice in the format of <x>, <y>.
- After "Explanation: ", give the reason why you keep this example.


In [26]:
# Step 3: ask the model which demos to keep, with up to 5 attempts per item.
# Items already flagged in skip_list, or whose attempts all fail, get the placeholder 'None'.
raw_step3_result_list = []

for i in tqdm(range(len(prompt_list))):
    if skip_list[i] == 1:
        raw_step3_result_list.append('None')
        continue

    for attempt in range(1, 6):
        try:
            result = create_response_chat(
                MODEL,
                prompt_input=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt_list[i]}
                ],
                max_tokens=64,
                temperature=0
            )
        except Exception as e:
            if attempt == 5:
                # Out of attempts: keep list positions aligned with `data`.
                raw_step3_result_list.append('None')
                skip_list[i] = 1
            openai_key.process_error(e)
        else:
            raw_step3_result_list.append(result)
            break

100%|██████████| 2/2 [00:00<?, ?it/s]


In [27]:

# Persist the raw step-3 selections.
step3_output_path = os.path.join(result_path, f"{suffix}_step3.json")
with open(step3_output_path, "w") as output_file:
    json.dump(raw_step3_result_list, output_file, indent=4)

In [28]:

# Reload the raw step-3 selections from disk.
step3_input_path = os.path.join(result_path, f"{suffix}_step3.json")
with open(step3_input_path, 'r', encoding='utf8') as input_file:
    raw_step3_result_list = json.loads(input_file.read())

In [29]:
# Turn each step-3 response into the text of the selected demos.
# Exactly one entry is appended per item so that step3_result_list stays
# index-aligned with `data` and `skip_list`.
step3_result_list = []

for i in range(len(raw_step3_result_list)):
    if skip_list[i] == 1:
        step3_result_list.append('None')
        continue

    # Distinct "<k>" tags in order of first appearance. dict.fromkeys keeps
    # that order, whereas set() would order the selection by string hash.
    matches = list(dict.fromkeys(re.findall(r'<[1-5]>', raw_step3_result_list[i])))
    extracted_numbers = [int(match[1]) for match in matches]
    candidates = step2_result_list[i]

    # Valid means one or two selections, each naming an existing candidate.
    selection_is_valid = (
        1 <= len(extracted_numbers) <= 2
        and all(1 <= num <= len(candidates) for num in extracted_numbers)
    )

    if not selection_is_valid:
        # Previously an out-of-range number appended 'None' inside the inner
        # loop and then fell through to append `result` as well, which shifted
        # every later entry by one position.
        step3_result_list.append('None')
        skip_list[i] = 1
        continue

    step3_result_list.append(
        ''.join(candidates[num - 1] + '\n' for num in extracted_numbers)
    )

In [30]:
# Number of items flagged to skip after step 3.
sum(skip_list)

2

In [31]:
# Persist the skip flags as they stand after step 3.
step3_skip_path = os.path.join(result_path, f"{suffix}_step3_skip.json")
with open(step3_skip_path, "w") as output_file:
    json.dump(skip_list, output_file, indent=4)

## Step 4: Response Generation


In [32]:
# Step 4 prompt: final answer generation using the seed demos followed by the checked generated demos.
# Placeholders: {seed_demonstration}, {checked_demonstration}, {Question}; `{{}}` renders as literal braces.
step4_template = """Your task is to tackle mathematical problems step by step. You can refer to these demonstration to give your reasoning process.

# Demonstration:
{seed_demonstration}{checked_demonstration}# Instruction: Solve the following problem step by step.
Question: {Question}
Answer: Explain the step-by-step solution and enclose the ultimate answer in \\boxed{{}} here."""

In [33]:
# Fallback prompt for skipped items: same wording as step4_template but with the seed demos only.
# Placeholders: {seed_demonstration}, {Question}; `{{}}` renders as literal braces.
fewshot_template = """Your task is to tackle mathematical problems step by step. You can refer to these demonstration to give your reasoning process.

# Demonstration:
{seed_demonstration}# Instruction: Solve the following problem step by step.
Question: {Question}
Answer: Explain the step-by-step solution and enclose the ultimate answer in \\boxed{{}} here."""

In [34]:
# Final prompts: items that passed every earlier step get seed plus checked demos;
# flagged items fall back to the plain few-shot prompt with seed demos only.
prompt_list = []

for i, item in enumerate(data):
    if skip_list[i] != 1:
        prompt = step4_template.format(
            seed_demonstration=item["Demos_QA"],
            checked_demonstration=step3_result_list[i],
            Question=item['Question']
        )
    else:
        prompt = fewshot_template.format(
            seed_demonstration=item["Demos_QA"],
            Question=item['Question']
        )

    prompt_list.append(prompt)

print(prompt_list[0])

Your task is to tackle mathematical problems step by step. You can refer to these demonstration to give your reasoning process.

# Demonstration:
Question: At a school, all 60 students play on at least one of three teams: Basketball, Soccer, and Mathletics. 8 students play all three sports, half the students play basketball, and the ratio of the size of the math team to the size of the basketball team to the size of the soccer team is $4:3:2$. How many students at the school play on exactly two teams?
Answer: We have enough information to solve for the size of each team. There are $\dfrac{60}{2}=30$ members of the basketball team, $\dfrac{4}{3}(30)=40$ members of the math team, and $\dfrac{2}{3}(30)=20$ members of the soccer team. Adding these up gives us 90, so clearly we're overcounting since there are only 60 students. The number of times that each student is counted in this total is equal to the number of teams that student plays on. This means that all 60 students will be counted 

In [35]:
# Step 4: generate the final answer for every item, with up to 5 attempts each.
# If all attempts fail, the placeholder 'None' is stored as the answer.
step4_result_list = []

for prompt in tqdm(prompt_list):
    for attempt in range(1, 6):
        try:
            result = create_response_chat(
                MODEL,
                prompt_input=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=2048,
                temperature=0
            )
        except Exception as e:
            if attempt == 5:
                # Out of attempts: keep list positions aligned with `data`.
                step4_result_list.append('None')
            openai_key.process_error(e)
        else:
            step4_result_list.append(result)
            break

  0%|          | 0/2 [00:00<?, ?it/s]

Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]


 50%|█████     | 1/2 [00:01<00:01,  1.39s/it]

Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]
Rate limit reached for key sk-[REDACTED]


100%|██████████| 2/2 [00:02<00:00,  1.37s/it]

Rate limit reached for key sk-[REDACTED]





In [36]:
# Persist the final step-4 responses.
step4_output_path = os.path.join(result_path, f"{suffix}_step4.json")
with open(step4_output_path, "w") as output_file:
    json.dump(step4_result_list, output_file, indent=4)

## Evaluation


In [37]:
# Reload the step-4 responses from disk for evaluation.
step4_input_path = os.path.join(result_path, f"{suffix}_step4.json")
with open(step4_input_path, 'r', encoding='utf8') as input_file:
    result_list = json.loads(input_file.read())

In [38]:
from utils.evaluate import evaluate_math
# Print the score evaluate_math returns for the step-4 responses against `data`.
print(f"Accuracy: {evaluate_math(result_list, data)}%")

Accuracy: 0.0%
