In [None]:
import os

# Input and output locations used by this notebook, relative to the working
# directory it is run from.
data_path = "../../data/math/test_demos.json"
result_path = "../../result/zeroshot"
keys_file_path = "../../utils/raw_keys.txt"

# Create the output directory up front. `exist_ok=True` makes this a no-op when
# the directory is already there and avoids the check-then-create race of a
# separate `os.path.exists` test.
os.makedirs(result_path, exist_ok=True)

# Base name (without extension) of the JSON file the results are saved under.
suffix = "math_gpt35_cot"

## Load dataset

In [None]:
import json

# Read the raw examples. The encoding is spelled out so the file is decoded the
# same way on every platform instead of falling back to the locale default,
# matching how the results file is read back later in this notebook.
with open(data_path, 'r', encoding='utf8') as f:
    raw_data = json.load(f)
    

In [None]:
from tqdm import tqdm
import json

# Convert every raw example into the record layout used by the later cells:
# the question, its answer, and two text renderings of its demos.
data = []

for record in tqdm(raw_data):
    entry = {
        'Question': record['problem'],
        'Answer': record['answer'],
        # Demo questions only, one "Question: ..." paragraph per demo.
        'Demos_Q': ''.join(
            f"Question: {demo['problem']}\n\n" for demo in record['demos']
        ),
        # Demo questions, each followed by that demo's `solution` text.
        'Demos_QA': ''.join(
            f"Question: {demo['problem']}\nAnswer: {demo['solution']}\n\n"
            for demo in record['demos']
        ),
    }
    data.append(entry)
    

In [None]:
from utils.openai import OpenAIKey, create_response_chat

# Model name handed to `create_response_chat` for every request in this notebook.
MODEL = "gpt-3.5-turbo"
# Key handler built from the key file path; its `process_error` method is called
# whenever a request raises. NOTE(review): presumably it loads and rotates API
# keys -- confirm against utils.openai.
openai_key = OpenAIKey(keys_file_path)

## Zero-shot method

In [None]:
# Zero-shot step-by-step prompt. `{Question}` is filled in with `str.format`;
# the doubled braces `{{}}` are format escapes that render as a literal `{}`,
# and `\\` is a single backslash, so the final prompt asks for the answer
# inside \boxed{}.
zeroshot_template = """Your task is to tackle mathematical problems step by step.

# Instruction: Solve the following problem step by step.
Question: {Question}
Answer: Explain the solution and enclose the ultimate answer in \\boxed{{}} here."""

In [None]:
# Render one zero-shot prompt per example, in the same order as `data`.
prompt_list = [
    zeroshot_template.format(Question=example['Question'])
    for example in data
]

# Show the first rendered prompt as a quick sanity check.
print(prompt_list[0])

In [None]:
from tqdm import tqdm

# Maximum number of attempts per prompt before giving up on it. Kept in one
# place so the loop bound and the give-up check cannot drift apart.
MAX_RETRIES = 20
# Placeholder stored for a prompt whose every attempt failed, so that exactly
# one entry is appended to `result_list` per prompt.
FAILED_RESPONSE = 'None'

result_list = []

for prompt in tqdm(prompt_list):
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            result = create_response_chat(
                MODEL,
                prompt_input=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=512,
                temperature=0
            )
        except Exception as e:
            if attempt == MAX_RETRIES:
                # Out of attempts: store the placeholder and report it rather
                # than dropping the failure silently. `tqdm.write` prints
                # without corrupting the progress bar.
                result_list.append(FAILED_RESPONSE)
                tqdm.write(f"Giving up after {MAX_RETRIES} attempts: {e!r}")
            # Hand the error to the key handler after every failed attempt.
            openai_key.process_error(e)
        else:
            # Success: keep the response and move on to the next prompt.
            result_list.append(result)
            break

In [None]:
# Save the collected responses as indented JSON at <result_path>/<suffix>.json.
output_path = os.path.join(result_path, f"{suffix}.json")
with open(output_path, "w") as output_file:
    json.dump(result_list, output_file, indent=4)

## Evaluation

In [None]:
# Read the saved responses back from disk so evaluation works from the file.
saved_results_path = os.path.join(result_path, f"{suffix}.json")
with open(saved_results_path, mode='r', encoding='utf8') as saved_results:
    result_list = json.load(saved_results)

In [None]:
from utils.evaluate import evaluate_math

# Run `evaluate_math` on the responses and the prepared examples, then print
# the value it returns formatted as a percentage.
accuracy = evaluate_math(result_list, data)
print(f"Accuracy: {accuracy}%")