In [None]:
import os

# Input dataset, output directory and API-key file. The paths are relative,
# so they resolve against the current working directory of the kernel.
data_path = "../../data/math/test_demos.json"
result_path = "../../result/self_demos"
keys_file_path = "../../utils/raw_keys.txt"

# exist_ok=True keeps the cell re-runnable and avoids the race between a
# separate os.path.exists() check and the directory creation.
os.makedirs(result_path, exist_ok=True)

# Prefix shared by every result file written into result_path below.
suffix = "math_gpt35"

## Load dataset

In [None]:
import json

# Read the whole dataset into memory. The encoding is given explicitly (as the
# later result-file reads in this notebook do) so that decoding does not
# depend on the platform's default locale encoding.
with open(data_path, 'r', encoding='utf8') as f:
    raw_data = json.load(f)

# raw_data = raw_data[:5]  # uncomment for a quick smoke run on a few items
len(raw_data)

In [None]:
# One flag per dataset item: 0 = run the full pipeline, 1 = a step failed for
# this item, so step 4 falls back to the plain few-shot prompt.
skip_list = [0 for _ in range(len(raw_data))]

In [None]:
# Display the first raw record as a quick look at its structure.
raw_data[0]

In [None]:
from tqdm import tqdm
import json

# Flatten each raw record into the fields the prompt templates need:
#   Question / Answer - the target problem and its reference answer
#   Demos_Q           - the seed demos' questions only
#   Demos_QA          - the seed demos as "Question / Answer" pairs
data = []

for raw_item in tqdm(raw_data):
    data.append({
        'Question': raw_item['problem'],
        'Answer': raw_item['answer'],
        'Demos_Q': ''.join(
            f"Question: {seed['problem']}\n\n" for seed in raw_item['demos']
        ),
        'Demos_QA': ''.join(
            f"Question: {seed['problem']}\nAnswer: {seed['solution']}\n\n"
            for seed in raw_item['demos']
        ),
    })

data[0]

In [None]:
from utils.openai import OpenAIKey, create_response_chat

# Chat model name passed to every create_response_chat call below.
MODEL = "gpt-3.5-turbo"
# Key handle built from keys_file_path; its process_error() method is invoked
# from the except branch of each retry loop in this notebook.
openai_key = OpenAIKey(keys_file_path)

## Step 1: Query Understanding


In [None]:
# Step-1 prompt: asks the model for a one-line, answer-free description of the
# problem type and how to approach it. {Question} is filled in via str.format.
step1_template = """In this task, you need to give a general understanding of mathematical problems, which can be applied to all similar questions in the same scenario.
There are 7 categories of topics: Intermediate Algebra, Precalculus, Number Theory, Geometry, Prealgebra, Algebra, Counting & Probability.

# Problem:
{Question}

# Instruction: Generate a general understanding.
Give a general understanding of this problem in one line. Highlight the general solution methodologies to solve this type of problems. Focus on the problem-solving approach without delving into specific numerical values or answers.
You can refer to this template for your understanding: This problem involves...To solve this type of problem..."""

In [None]:
# One step-1 prompt per dataset item, in dataset order.
prompt_list = [
    step1_template.format(Question=entry['Question'])
    for entry in data
]

print(prompt_list[0])

In [None]:
# Sanity check: number of prompts that were built.
print(len(prompt_list))

In [None]:
import re
from tqdm import tqdm

# Query the model once per item (deterministic, temperature 0). Each item gets
# up to five attempts; after the fifth failure the sentinel string 'None' is
# stored and the item is flagged in skip_list.
raw_step1_result_list = []

for i in tqdm(range(len(data))):
    for attempt in range(1, 6):
        try:
            raw_step1_result_list.append(
                create_response_chat(
                    MODEL,
                    prompt_input=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": prompt_list[i]}
                    ],
                    max_tokens=512,
                    temperature=0
                )
            )
        except Exception as e:
            if attempt == 5:
                raw_step1_result_list.append('None')
                skip_list[i] = 1
            # Hand every failure to the key handle before the next attempt.
            openai_key.process_error(e)
        else:
            break

In [None]:
# Persist the raw step-1 completions so they can be reloaded from disk.
step1_dump_path = os.path.join(result_path, f"{suffix}_step1.json")
with open(step1_dump_path, "w") as out_file:
    json.dump(raw_step1_result_list, out_file, indent=4)

In [None]:
# Reload the raw step-1 completions from disk.
step1_dump_path = os.path.join(result_path, f"{suffix}_step1.json")
with open(step1_dump_path, 'r', encoding='utf8') as saved:
    raw_step1_result_list = json.load(saved)

In [None]:
step1_result_list = []

def extract_main_info(text):
    """Collapse a multi-line completion into a single line.

    Each line is stripped of surrounding whitespace, blank lines are dropped,
    and the remaining pieces are joined with single spaces. Returns '' when
    the text holds nothing but whitespace.
    """
    pieces = (segment.strip() for segment in text.split('\n'))
    return ' '.join(piece for piece in pieces if piece)

# Normalise every raw step-1 completion to one line and flag unusable items.
for i, raw_understanding in enumerate(raw_step1_result_list):

    clean_str = extract_main_info(raw_understanding)

    # 'None' is the sentinel the query cell stores after all retries failed.
    # Treat it like an empty completion: when the raw results are reloaded
    # from disk in a fresh kernel, skip_list starts from zeros again, and the
    # sentinel would otherwise be passed on to step 2 as an "understanding".
    if clean_str != '' and clean_str != 'None':
        step1_result_list.append(clean_str)
    else:
        step1_result_list.append('None')
        skip_list[i] = 1

In [None]:
# Number of items flagged so far (flags are 0/1, so the sum is a count).
sum(skip_list)

## Step 2: Query-aware Demo Generation


In [None]:
# Step-2 prompt: asks the model to recall one related problem with a worked
# solution, using the seed demos ({Demos_QA}) as the format example.
# The doubled braces in \\boxed{{}} are str.format escapes for a literal "{}".
step2_template = """In this task, you need to recall mathematical problems. When presented with a math problem, recall relevant problems as examples. These examples are helpful in answering the initial problem.

# Problem:
## The initial problem:
{Question}

## The Understanding you can refer to:
{Understanding}

# Demonstration of format:
{Demos_QA}# Instruction: Recall relevant problem.
Recall one example of math problem that is relevant to the initial problem. Your problems should be distinct from the initial problem (e.g., involving different numbers and names). 
- After "Question: ", describe the problem you generate in one line.
- After "Answer: ", Explain the step-by-step solution and enclose the ultimate answer in \\boxed{{}}."""

In [None]:
# One step-2 prompt per item, pairing each cleaned understanding with its item.
prompt_list = [
    step2_template.format(
        Demos_QA=entry["Demos_QA"],
        Question=entry['Question'],
        Understanding=understanding
    )
    for understanding, entry in zip(step1_result_list, data)
]

print(prompt_list[0])

In [None]:
# Sanity check: number of step-2 prompts that were built.
len(prompt_list)

In [None]:
# Sample up to five candidate demos per item (temperature 0.8). Flagged items
# get the sentinel string 'None' instead of a list. Each sample is retried up
# to ten times; a sample whose retries all fail is simply left out.
raw_step2_result_list = []

for i in tqdm(range(len(prompt_list))):
    if skip_list[i] == 1:
        raw_step2_result_list.append('None')
        continue

    demo_candidate = []

    for _ in range(5):
        for _attempt in range(10):
            try:
                result = create_response_chat(
                    MODEL,
                    prompt_input=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": prompt_list[i]}
                    ],
                    max_tokens=512,
                    temperature=0.8
                )
            except Exception as e:
                openai_key.process_error(e)
                continue
            demo_candidate.append(result)
            break

    raw_step2_result_list.append(demo_candidate)

In [None]:
# Persist the raw step-2 candidates so they can be reloaded from disk.
step2_dump_path = os.path.join(result_path, f"{suffix}_step2.json")
with open(step2_dump_path, "w") as out_file:
    json.dump(raw_step2_result_list, out_file, indent=4)

In [None]:
# Reload the raw step-2 candidates from disk.
step2_dump_path = os.path.join(result_path, f"{suffix}_step2.json")
with open(step2_dump_path, 'r', encoding='utf8') as saved:
    raw_step2_result_list = json.load(saved)

In [None]:
step2_result_list = []

def extract_main_info(text):
    """Normalise one generated demo to "Question line + one-line Answer".

    Note: this replaces the step-1 helper of the same name defined earlier in
    the notebook.

    Returns '' unless the text contains exactly one line starting with
    "question" and exactly one starting with "answer" (case-insensitive, each
    longer than 20 characters) with the question first. Otherwise returns the
    question line, a newline, every line from the answer line onward joined
    with spaces, and a trailing newline.
    """
    # Re-attach a label to its content when a line break directly follows it.
    for label in ('Question', 'Answer'):
        text = text.replace(f'{label}:\n', f'{label}: ').replace(f'{label}: \n', f'{label}: ')

    stripped = (raw_line.strip() for raw_line in text.split('\n'))
    lines = [line for line in stripped if line != '']

    question_lines = [
        line for line in lines
        if re.match(r'^(question)', line, re.IGNORECASE) and len(line) > 20
    ]
    answer_lines = [
        line for line in lines
        if re.match(r'^(answer)', line, re.IGNORECASE) and len(line) > 20
    ]

    if not (len(question_lines) == 1 and len(answer_lines) == 1):
        return ''

    question_pos = lines.index(question_lines[0])
    answer_pos = lines.index(answer_lines[0])
    if question_pos > answer_pos:
        return ''

    return question_lines[0] + '\n' + " ".join(lines[answer_pos:]) + '\n'

# Clean every raw candidate, drop duplicates, and keep an item only when at
# least three usable demos remain; otherwise flag it in skip_list.
for i, raw_candidates in enumerate(raw_step2_result_list):

    # Items skipped during generation are stored as the string 'None' rather
    # than a list of completions; they contribute no candidates.
    if isinstance(raw_candidates, str):
        raw_candidates = []

    cleaned = [extract_main_info(demo) for demo in raw_candidates]

    # De-duplicate while keeping generation order. A set would yield a
    # different order in every Python process (string hashes are randomised),
    # but step 3 refers to these demos by position ("Example <n>") and its
    # answers are saved to disk, so the order must be reproducible.
    demo_candidate = list(dict.fromkeys(c for c in cleaned if c != ''))

    if len(demo_candidate) > 2:
        step2_result_list.append(demo_candidate)
    else:
        step2_result_list.append(['None'])
        skip_list[i] = 1

In [None]:
# Number of items flagged after step 2 (flags are 0/1, so the sum is a count).
sum(skip_list)

In [None]:
# Persist the skip flags as they stand after step 2.
step2_skip_path = os.path.join(result_path, f"{suffix}_step2_skip.json")
with open(step2_skip_path, "w") as out_file:
    json.dump(skip_list, out_file, indent=4)

## Step 3: Post-checking and Refining of Demos


In [None]:
# Step-3 prompt: shows the numbered candidate demos and asks the model to keep
# the best one or two, answering with serial numbers in the form "<x>, <y>".
step3_template = """In this task, you need to check the correctness of these math Q&A pairs and select the two best examples to keep, for answering the final problem.

# The final Problem:
{Question}

# Check List:
- The calculation process in solution must be correct and without ambiguity.
- The examples should be relevant and helpful in solving the final problem.

# Examples to be checked:
{generated_demonstration}# Instruction:
Select two best examples to keep. If there are not enough correct and helpful examples, just keep one.
For your answer:
- After "Selection: ", give the serial numbers of your choice in the format of <x>, <y>.
- After "Explanation: ", give the reason why you keep this example."""

In [None]:
# One step-3 prompt per item; candidates are numbered from 1 as "Example <n>".
prompt_list = []

for candidates, entry in zip(step2_result_list, data):

    numbered_examples = ''.join(
        f"Example <{serial}>:\n{candidate}\n"
        for serial, candidate in enumerate(candidates, start=1)
    )

    prompt_list.append(
        step3_template.format(
            Question=entry['Question'],
            generated_demonstration=numbered_examples
        )
    )

print(prompt_list[0])

In [None]:
# Ask the model which candidates to keep (deterministic, temperature 0).
# Flagged items get the sentinel 'None' without a query. Each remaining item
# gets up to five attempts; after the fifth failure it is flagged as well.
raw_step3_result_list = []

for i in tqdm(range(len(prompt_list))):
    if skip_list[i] == 1:
        raw_step3_result_list.append('None')
        continue

    failures = 0
    while failures < 5:
        try:
            result = create_response_chat(
                MODEL,
                prompt_input=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt_list[i]}
                ],
                max_tokens=64,
                temperature=0
            )
        except Exception as e:
            failures += 1
            if failures == 5:
                raw_step3_result_list.append('None')
                skip_list[i] = 1
            openai_key.process_error(e)
        else:
            raw_step3_result_list.append(result)
            break

In [None]:

# Persist the raw step-3 selections so they can be reloaded from disk.
step3_dump_path = os.path.join(result_path, f"{suffix}_step3.json")
with open(step3_dump_path, "w") as out_file:
    json.dump(raw_step3_result_list, out_file, indent=4)

In [None]:

# Reload the raw step-3 selections from disk.
step3_dump_path = os.path.join(result_path, f"{suffix}_step3.json")
with open(step3_dump_path, 'r', encoding='utf8') as saved:
    raw_step3_result_list = json.load(saved)

In [None]:
step3_result_list = []

def _resolve_selection(raw_selection, candidates):
    """Map a model reply containing "<x>, <y>" onto the candidate demos.

    Args:
        raw_selection: the raw step-3 completion text.
        candidates: the cleaned step-2 demos for the same item; serial
            number n refers to candidates[n - 1].

    Returns:
        The selected demos, each followed by a newline, concatenated in the
        order the serial numbers first appear in the reply; or None when the
        reply holds no serial number, more than two distinct ones, or one
        that is out of range for `candidates`.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # result does not depend on per-process set ordering.
    picks = list(dict.fromkeys(re.findall(r'<[1-5]>', raw_selection)))
    if not picks or len(picks) > 2:
        return None

    serials = [int(pick[1]) for pick in picks]
    if any(serial < 1 or serial > len(candidates) for serial in serials):
        return None

    return ''.join(candidates[serial - 1] + '\n' for serial in serials)

# Exactly one entry is appended per item so step3_result_list stays aligned
# with data / skip_list. (Previously an out-of-range serial number appended
# 'None' and then the partial result as well, shifting every later item.)
for i in range(len(raw_step3_result_list)):
    if skip_list[i] == 1:
        step3_result_list.append('None')
        continue

    checked_demos = _resolve_selection(raw_step3_result_list[i], step2_result_list[i])

    if checked_demos is None:
        step3_result_list.append('None')
        skip_list[i] = 1
    else:
        step3_result_list.append(checked_demos)

In [None]:
# Number of items flagged after step 3 (flags are 0/1, so the sum is a count).
sum(skip_list)

In [None]:
# Persist the skip flags as they stand after step 3.
step3_skip_path = os.path.join(result_path, f"{suffix}_step3_skip.json")
with open(step3_skip_path, "w") as out_file:
    json.dump(skip_list, out_file, indent=4)

## Step 4: Response Generation


In [None]:
# Step-4 prompt for items that passed every step: seed demos followed by the
# checked, model-generated demos, then the target question.
# The doubled braces in \\boxed{{}} are str.format escapes for a literal "{}".
step4_template = """Your task is to tackle mathematical problems step by step. You can refer to these demonstration to give your reasoning process.

# Demonstration:
{seed_demonstration}{checked_demonstration}# Instruction: Solve the following problem step by step.
Question: {Question}
Answer: Explain the step-by-step solution and enclose the ultimate answer in \\boxed{{}} here."""

In [None]:
# Fallback prompt for flagged items: same wording as step4_template but with
# the seed demos only (no {checked_demonstration} slot).
fewshot_template = """Your task is to tackle mathematical problems step by step. You can refer to these demonstration to give your reasoning process.

# Demonstration:
{seed_demonstration}# Instruction: Solve the following problem step by step.
Question: {Question}
Answer: Explain the step-by-step solution and enclose the ultimate answer in \\boxed{{}} here."""

In [None]:
# Final prompts: items that passed every step get seed + checked demos;
# flagged items fall back to the seed-only few-shot prompt.
prompt_list = []

for i in range(len(data)):
    entry = data[i]
    if skip_list[i] != 1:
        prompt = step4_template.format(
            seed_demonstration=entry["Demos_QA"],
            checked_demonstration=step3_result_list[i],
            Question=entry['Question']
        )
    else:
        prompt = fewshot_template.format(
            seed_demonstration=entry["Demos_QA"],
            Question=entry['Question']
        )

    prompt_list.append(prompt)

print(prompt_list[0])

In [None]:
# Generate the final answer for every item (deterministic, temperature 0).
# Each prompt gets up to five attempts; after the fifth failure the sentinel
# string 'None' is stored so the list stays aligned with prompt_list.
step4_result_list = []

for prompt in tqdm(prompt_list):
    for attempts_left in range(4, -1, -1):
        try:
            result = create_response_chat(
                MODEL,
                prompt_input=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=2048,
                temperature=0
            )
        except Exception as e:
            if attempts_left == 0:
                step4_result_list.append('None')
            openai_key.process_error(e)
            continue
        step4_result_list.append(result)
        break

In [None]:
# Persist the final responses; the evaluation section reads this file back.
step4_dump_path = os.path.join(result_path, f"{suffix}_step4.json")
with open(step4_dump_path, "w") as out_file:
    json.dump(step4_result_list, out_file, indent=4)

## Evaluation


In [None]:
# Load the saved step-4 responses for scoring.
step4_dump_path = os.path.join(result_path, f"{suffix}_step4.json")
with open(step4_dump_path, 'r', encoding='utf8') as saved:
    result_list = json.load(saved)

In [None]:
from utils.evaluate import evaluate_math
# Score the loaded responses against `data` (each entry carries the reference
# 'Answer') with the project's evaluate_math helper.
# NOTE(review): the output appends "%", so evaluate_math presumably returns a
# percentage rather than a 0-1 fraction — confirm against utils.evaluate.
print(f"Accuracy: {evaluate_math(result_list, data)}%")