# GPT-3.5-Turbo on GSM8K

In [33]:
!pip install openai datasets



In [37]:
from openai import OpenAI
import re
import time

import numpy as np

from tqdm import tqdm
from datasets import load_dataset

In [38]:
# Credentials are taken from the OPENAI_API_KEY environment variable (the
# client's default lookup) instead of being hard-coded in the notebook.
client = OpenAI()

In [40]:
# GSM8K, 'main' configuration; the 'train' and 'test' splits are used below.
gsm8k = load_dataset('gsm8k', 'main')
gsm8k_test = gsm8k['test']

# Subset of the training split selected by the indices stored in validation_index.npy.
validation_index = np.load('./lib_prompt/validation_index.npy')
validation_data = gsm8k['train'].select(validation_index)

In [41]:
# Display the first training question as a quick look at the raw data.
gsm8k['train'][0]['question']

'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?'

In [42]:
# NOTE(review): redundant — gsm8k_test is already bound to this same split in the dataset-loading cell.
gsm8k_test = gsm8k['test']

In [43]:
# Load the few-shot prompt text; the context manager closes the file handle
# as soon as the contents have been read.
with open('./lib_prompt/prompt_hardest.txt') as prompt_file:
    prompt_complex = prompt_file.read()

In [44]:
# Show the few-shot prompt that is prepended to every test question.
print(prompt_complex)

Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?
Let's think step by step
Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.
For the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.
Angelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.
However, they need to include time for breaks and lunch. Every hour they want 

In [46]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_chain,
    wait_fixed
) 

# Waits between attempts: 3s, 3s, 3s, 5s, 5s, then 10s for every later retry.
# The attempt cap keeps a non-transient failure (e.g. an invalid API key) from
# retrying forever; reraise=True surfaces the underlying API exception rather
# than tenacity's RetryError wrapper.
@retry(wait=wait_chain(*[wait_fixed(3) for i in range(3)] +
                       [wait_fixed(5) for i in range(2)] +
                       [wait_fixed(10)]),
       stop=stop_after_attempt(10),
       reraise=True)
def completion_with_backoff(**kwargs):
    """Call the chat-completions endpoint, retrying failed requests.

    Args:
        **kwargs: forwarded unchanged to ``client.chat.completions.create``
            (model, messages, temperature, ...).

    Returns:
        The response object returned by the client.

    Raises:
        The last exception raised by the client once 10 attempts have failed.
    """
    return client.chat.completions.create(**kwargs)

In [49]:
def test_answer(pred_str, ans_str):
    pattern = '\d*\.?\d+'
    pred = re.findall(pattern, pred_str)
    if(len(pred) >= 1):
        # print(pred_str)
        pred = pred[-1]
        gold = re.findall(pattern, ans_str)
        # print(ans_str)
        gold = gold[-1]
        return pred == gold
    else: return False

def _score_record(q, am, a, questions, ans_pred, ans_gold):
    """Store one complete (question, prediction, gold) record; return 1 if it is correct, else 0."""
    if q is None or am is None or a is None:
        # Nothing parsed yet, or the record is incomplete: do not store or score it.
        return 0
    questions.append(q)
    ans_pred.append(am)
    ans_gold.append(a)
    return 1 if test_answer(am, a) else 0


def parse_pred_ans(filename):
    """Parse a results file and print the accuracy of the stored predictions.

    The file is a sequence of records; a line starting with 'Q: ' opens a
    question, 'A_model:' opens the model answer and 'A:' opens the gold
    answer. Any other line is appended to whichever section is currently open.

    Args:
        filename: path of the results file.

    Returns:
        A tuple ``(questions, ans_pred, ans_gold)`` of parallel lists holding
        the raw text of every complete record.

    Raises:
        ValueError: if a continuation line appears before any section marker.
    """
    with open(filename) as fd:
        lines = fd.readlines()

    q, am, a = None, None, None
    num_q, acc = 0, 0
    current_mode = 'none'
    questions, ans_pred, ans_gold = [], [], []

    for l in lines:
        if l.startswith('Q: '):
            # A new question closes the previous record.
            acc += _score_record(q, am, a, questions, ans_pred, ans_gold)
            current_mode = 'q'
            # Reset both answers so an incomplete record cannot inherit the
            # previous record's prediction or gold answer.
            q, am, a = l, None, None
            num_q += 1
        elif l.startswith('A_model:'):
            current_mode = 'am'
            am = l
        elif l.startswith('A:'):
            current_mode = 'a'
            a = l
        elif current_mode == 'q':
            q += l
        elif current_mode == 'am':
            am += l
        elif current_mode == 'a':
            a += l
        else:
            raise ValueError(current_mode)

    # The last record has no following 'Q: ' line to close it.
    acc += _score_record(q, am, a, questions, ans_pred, ans_gold)

    # Avoid dividing by zero when the file holds no questions.
    ratio = acc / num_q if num_q else 0.0
    print('num_q %d correct %d ratio %.4f' % (num_q, acc, ratio))
    return questions, ans_pred, ans_gold

def test_finished(ans_model):
    if('answer is' in ans_model): return True
    else: return False

def extract_ans(ans_model):
    """Split model output at the first line containing 'answer is'.

    Returns:
        A pair ``(ans, residual)``: ``ans`` is the text up to and including
        the first line that contains 'answer is' (the whole text when no line
        does), and ``residual`` is whatever follows it.
    """
    rows = ans_model.split('\n')
    cut = len(rows)  # without a marker line everything belongs to the answer
    for idx, row in enumerate(rows):
        if 'answer is' in row:
            cut = idx + 1
            break
    return '\n'.join(rows[:cut]), '\n'.join(rows[cut:])

In [50]:
# Few-shot prompt followed by the second test question as the query.
prompt_q = ''.join([prompt_complex, '\nQuestion: ', gsm8k_test[1]['question'], '\n'])

In [51]:
# Show the assembled prompt (few-shot examples plus the new question).
print(prompt_q)

Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?
Let's think step by step
Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.
For the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.
Angelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.
However, they need to include time for breaks and lunch. Every hour they want 

In [63]:
# Single sanity-check request. It goes through the `client` instance created
# earlier: only the `OpenAI` class is imported from the package, so the bare
# module name `openai` is undefined on a fresh kernel.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "Follow the given examples and answer the question."},
        {"role": "user", "content": prompt_q},
    ],
)

In [64]:
# Print the text of the first returned choice, with surrounding whitespace stripped.
print(response.choices[0].message.content.strip())

Let's think step by step.

1. Let's denote the total number of vacuum cleaners Melanie started with as \( x \).

2. She sold a third of her vacuum cleaners at the green house. So, she sold \( \frac{x}{3} \) vacuum cleaners there.
   Remaining vacuum cleaners: \( x - \frac{x}{3} = \frac{2x}{3} \).

3. She sold 2 more vacuum cleaners at the red house.
   Remaining vacuum cleaners: \( \frac{2x}{3} - 2 \).

4. She sold half of what was left at the orange house.
   Vacuum cleaners sold at the orange house: \( \frac{1}{2} \left( \frac{2x}{3} - 2 \right) \).
   Remaining vacuum cleaners after orange house: \( \frac{1}{2} \left( \frac{2x}{3} - 2 \right) \).

5. After all the selling, she had 5 vacuum cleaners left.
   So, \( \frac{1}{2} \left( \frac{2x}{3} - 2 \right) = 5 \).

Now we solve for \( x \):

\[
\frac{1}{2} \left( \frac{2x}{3} - 2 \right) = 5
\]

Multiply both sides by 2:

\[
\frac{2x}{3} - 2 = 10
\]

Add 2 to both sides:

\[
\frac{2x}{3} = 12
\]

Multiply both sides by 3:

\[
2x = 

# Complex Prompt Random Sampling, Acc 77.1

In [62]:
# Complex prompt, no temperature passed (API default sampling).
# Each test question becomes one "Q: / A_model: / A:" record in the output
# file, which is the layout parse_pred_ans reads back.
with open('outputs/test_gpt_3.5_turbo_complex.txt', 'w') as fd:
    for q, a in tqdm(zip(gsm8k_test['question'], gsm8k_test['answer']),
                     total=len(gsm8k_test['question'])):

        prompt_q = prompt_complex + '\nQuestion: ' + q + '\n'

        response = completion_with_backoff(
            # gpt-3.5-turbo, matching the notebook title, the output file name
            # and the other evaluation runs in this notebook.
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Follow the given examples and answer the question."},
                {"role": "user", "content": prompt_q},
            ]
        )
        ans_model = response.choices[0].message.content.strip()
        # Keep only the text up to the first "answer is" line; the rest is discarded.
        ans_, _ = extract_ans(ans_model)

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

  1%|          | 13/1319 [01:02<1:44:37,  4.81s/it]


KeyboardInterrupt: 

In [34]:
# Score the saved predictions; parse_pred_ans prints the question count,
# the number correct and the accuracy ratio. The returned lists are discarded.
_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_complex.txt')

num_q 1319 correct 1017 ratio 0.7710


# Complex Prompt Greedy Decoding, Acc 78.85

In [35]:
# Complex prompt, greedy decoding (temperature=0).
# Each test question becomes one "Q: / A_model: / A:" record in the output
# file, which is the layout parse_pred_ans reads back.
with open('outputs/test_gpt_3.5_turbo_complex_temp_0.txt', 'w') as fd:
    for q, a in tqdm(zip(gsm8k_test['question'], gsm8k_test['answer']),
                     total=len(gsm8k_test['question'])):

        prompt_q = prompt_complex + '\nQuestion: ' + q + '\n'

        response = completion_with_backoff(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Follow the given examples and answer the question."},
                {"role": "user", "content": prompt_q},
            ],
            temperature=0
        )
        # The openai>=1.0 client returns an object, not a dict: use attribute access.
        ans_model = response.choices[0].message.content
        # Keep only the text up to the first "answer is" line; the rest is discarded.
        ans_, _ = extract_ans(ans_model)

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1319/1319 [2:04:18<00:00,  5.65s/it]


In [36]:
# Score the saved predictions; parse_pred_ans prints the question count,
# the number correct and the accuracy ratio. The returned lists are discarded.
_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_complex_temp_0.txt')

num_q 1319 correct 1040 ratio 0.7885


# Baseline Prompt Greedy Decoding, Acc 74.98

In [8]:
# Load the baseline few-shot prompt; the context manager closes the file
# handle as soon as the contents have been read.
with open('../gsm8k/lib_prompt/prompt_original.txt') as prompt_file:
    prompt_original = prompt_file.read()

In [12]:
# Baseline prompt, greedy decoding (temperature=0).
# Each test question becomes one "Q: / A_model: / A:" record in the output
# file, which is the layout parse_pred_ans reads back.
with open('outputs/test_gpt_3.5_turbo_original_temp_0.txt', 'w') as fd:
    for q, a in tqdm(zip(gsm8k_test['question'], gsm8k_test['answer']),
                     total=len(gsm8k_test['question'])):

        prompt_q = prompt_original + '\nQuestion: ' + q + '\n'

        response = completion_with_backoff(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Follow the given examples and answer the question."},
                {"role": "user", "content": prompt_q},
            ],
            temperature=0
        )
        # The openai>=1.0 client returns an object, not a dict: use attribute access.
        ans_model = response.choices[0].message.content
        # Keep only the text up to the first "answer is" line; the rest is discarded.
        ans_, _ = extract_ans(ans_model)

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1319/1319 [2:36:17<00:00,  7.11s/it]


In [13]:
# Score the saved predictions; parse_pred_ans prints the question count,
# the number correct and the accuracy ratio. The returned lists are discarded.
_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_original_temp_0.txt')

num_q 1319 correct 989 ratio 0.7498


# Baseline Prompt, Dialog In-Context Learning, Acc 76.8

In [24]:
def make_dialog_prompt(prompt):
    """Turn a flat few-shot prompt into a list of chat messages.

    The prompt is split on blank lines. Every chunk except the last becomes a
    user turn (its first two lines) followed by an assistant turn (its
    remaining lines). The last chunk becomes the final user turn with
    "Let's think step by step" appended directly to it.

    Args:
        prompt: few-shot examples followed by the new question.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts beginning with the
        system instruction.
    """
    *demos, final_case = prompt.split("\n\n")

    messages = [
        {"role": "system", "content": "Follow the given examples and answer the question."}
    ]
    for demo in demos:
        demo_lines = demo.split("\n")
        messages.append({"role": "user", "content": "\n".join(demo_lines[:2])})
        messages.append({"role": "assistant", "content": "\n".join(demo_lines[2:])})

    messages.append({"role": "user", "content": final_case + "Let's think step by step"})
    return messages

In [30]:
# Baseline prompt, few-shot examples sent as alternating user/assistant turns,
# greedy decoding (temperature=0).
# Each test question becomes one "Q: / A_model: / A:" record in the output
# file, which is the layout parse_pred_ans reads back.
with open('outputs/test_gpt_3.5_turbo_original_dialog_icl.txt', 'w') as fd:
    for q, a in tqdm(zip(gsm8k_test['question'], gsm8k_test['answer']),
                     total=len(gsm8k_test['question'])):

        prompt_q = prompt_original + '\nQuestion: ' + q + '\n'
        dialog_prompt = make_dialog_prompt(prompt_q)

        response = completion_with_backoff(
            model="gpt-3.5-turbo",
            messages=dialog_prompt,
            temperature=0
        )
        # The openai>=1.0 client returns an object, not a dict: use attribute access.
        ans_model = response.choices[0].message.content
        # Keep only the text up to the first "answer is" line; the rest is discarded.
        ans_, _ = extract_ans(ans_model)

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1319/1319 [2:57:13<00:00,  8.06s/it]


In [31]:
# Score the saved predictions; parse_pred_ans prints the question count,
# the number correct and the accuracy ratio. The returned lists are discarded.
_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_original_dialog_icl.txt')

num_q 1319 correct 1013 ratio 0.7680


# Complex Prompt, Dialog In-Context Learning, Acc 74.91

In [34]:
# Complex prompt, few-shot examples sent as alternating user/assistant turns,
# greedy decoding (temperature=0).
# Each test question becomes one "Q: / A_model: / A:" record in the output
# file, which is the layout parse_pred_ans reads back.
with open('outputs/test_gpt_3.5_turbo_complex_dialog_icl.txt', 'w') as fd:
    for q, a in tqdm(zip(gsm8k_test['question'], gsm8k_test['answer']),
                     total=len(gsm8k_test['question'])):

        prompt_q = prompt_complex + '\nQuestion: ' + q + '\n'
        dialog_prompt = make_dialog_prompt(prompt_q)

        response = completion_with_backoff(
            model="gpt-3.5-turbo",
            messages=dialog_prompt,
            temperature=0
        )
        # The openai>=1.0 client returns an object, not a dict: use attribute access.
        ans_model = response.choices[0].message.content
        # Keep only the text up to the first "answer is" line; the rest is discarded.
        ans_, _ = extract_ans(ans_model)

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1319/1319 [2:43:01<00:00,  7.42s/it]


In [35]:
# Score the saved predictions; parse_pred_ans prints the question count,
# the number correct and the accuracy ratio. The returned lists are discarded.
_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_complex_dialog_icl.txt')

num_q 1319 correct 988 ratio 0.7491
