# GPT-3.5-Turbo on GSM8K

In [1]:
import openai
import re
import time

import numpy as np

from tqdm import tqdm
from datasets import load_dataset

In [17]:
openai.api_key = ""

In [4]:
gsm8k = load_dataset('gsm8k', 'main')
validation_index = np.load('../gsm8k/lib_prompt/validation_index.npy')
validation_data = gsm8k['train'].select(validation_index)
gsm8k_test = gsm8k['test']

In [5]:
gsm8k['train'][0]['question']

'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?'

In [6]:
gsm8k_test = gsm8k['test']

In [7]:
prompt_complex = open('../gsm8k/lib_prompt/prompt_hardest.txt').read()

In [8]:
print(prompt_complex)

Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?
Let's think step by step
Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.
For the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.
Angelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.
However, they need to include time for breaks and lunch. Every hour they want 

In [10]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_chain,
    wait_fixed
) 

@retry(wait=wait_chain(*[wait_fixed(3) for i in range(3)] +
                       [wait_fixed(5) for i in range(2)] +
                       [wait_fixed(10)]))
def completion_with_backoff(**kwargs):
    return openai.ChatCompletion.create(**kwargs)

In [11]:
def test_answer(pred_str, ans_str):
    pattern = '\d*\.?\d+'
    pred = re.findall(pattern, pred_str)
    if(len(pred) >= 1):
        # print(pred_str)
        pred = pred[-1]
        gold = re.findall(pattern, ans_str)
        # print(ans_str)
        gold = gold[-1]
        return pred == gold
    else: return False

def parse_pred_ans(filename):
    with open(filename) as fd: lines = fd.readlines()
    am, a = None, None
    num_q, acc = 0, 0
    current_mode = 'none'
    questions = []
    ans_pred = []
    ans_gold = []
    for l in lines:
        if(l.startswith('Q: ')):
            if(am is not None and a is not None):
                questions.append(q)
                ans_pred.append(am)
                ans_gold.append(a)
                if(test_answer(am, a)):
                    acc += 1
            current_mode = 'q'
            q = l
            num_q += 1
        elif(l.startswith('A_model:')):
            current_mode = 'am'
            am = l
        elif(l.startswith('A:')):
            current_mode = 'a'
            a = l
        else:
            if(current_mode == 'q'): q += l
            elif(current_mode == 'am'): am += l
            elif(current_mode == 'a'): a += l
            else:
                raise ValueError(current_mode)
                
    questions.append(q)
    ans_pred.append(am)
    ans_gold.append(a)
    if(test_answer(am, a)):
        acc += 1
    print('num_q %d correct %d ratio %.4f' % (num_q, acc, float(acc / num_q)))
    return questions, ans_pred, ans_gold

def test_finished(ans_model):
    if('answer is' in ans_model): return True
    else: return False

def extract_ans(ans_model):
    ans_model = ans_model.split('\n')
    ans = []
    residual = []
    for li, al in enumerate(ans_model):
        ans.append(al)
        if('answer is' in al):
            break
    residual = list(ans_model[li + 1:])
    ans = '\n'.join(ans)
    residual = '\n'.join(residual)
    return ans, residual

In [12]:
prompt_q = prompt_complex + '\nQuestion: ' + gsm8k_test[1]['question'] + '\n'

In [13]:
print(prompt_q)

Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?
Let's think step by step
Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.
For the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.
Angelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.
However, they need to include time for breaks and lunch. Every hour they want 

In [27]:
response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
        {"role": "system", "content": "Follow the given examples and answer the question."},
        {"role": "user", "content": prompt_q},
    ]
)

print(response.choices[0].message)

ChatCompletionMessage(content="Let's think step by step:\n\nIf a robe takes 2 bolts of blue fiber, and half that much white fiber, it means it takes 2 + 1 = 3 bolts in total.\nTherefore, a robe takes 3 bolts in total.\n\nThe answer is 3.", role='assistant', function_call=None, tool_calls=None)


In [28]:
response.choices[0].message.content

"Let's think step by step:\n\nIf a robe takes 2 bolts of blue fiber, and half that much white fiber, it means it takes 2 + 1 = 3 bolts in total.\nTherefore, a robe takes 3 bolts in total.\n\nThe answer is 3."

In [29]:
print(response.choices[0].message.content)

Let's think step by step:

If a robe takes 2 bolts of blue fiber, and half that much white fiber, it means it takes 2 + 1 = 3 bolts in total.
Therefore, a robe takes 3 bolts in total.

The answer is 3.


## CoT Compression

In [48]:
def test(cot_example, question):

    prompt_q = cot_example + '\nQuestion: ' + question + '\n'
    response = client.chat.completions.create(
      model="gpt-3.5-turbo",
      messages=[
            {"role": "system", "content": "Follow the given examples and answer the question."},
            {"role": "user", "content": prompt},
        ]
    )

    print("[ propmt ]")
    print(prompt_q)
    print("\n[ answer ]")
    print(response.choices[0].message.content)

In [49]:
from llmlingua import PromptCompressor
from pprint import pprint 

llm_lingua = PromptCompressor(device_map="mps")

## phi-2 model
phi_llm_lingua = PromptCompressor("microsoft/phi-2", device_map="mps")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [50]:
compressed_prompt = llm_lingua.compress_prompt(prompt_complex, instruction="", question="", target_token=300)
phi_compressed_prompt = phi_llm_lingua.compress_prompt(prompt_complex, instruction="", question="", target_token=300)

Token indices sequence length is longer than the specified maximum sequence length for this model (2207 > 2048). Running this sequence through the model will result in indexing errors


In [51]:
compressed_prompt['compressed_prompt']

': Angelo andanie to plan many hours study their test2 of4 to They should study study total week they1 hour,3 breaks and\ns think step\nanie eachters hours hours\nate1 hours each hours\n andanie to start with planning study,\n to. they to203ute\nunch for  minutes hour3\nSo andanie study study,5 to for\nThe4\n\n: Marks team52.  score teams\ns,5\n team also63, they8\n and as point\n together0\ns scoreds scored4 half57\nThe is The\n: B has two as than deck she2 more what of have she\n\n haveella manybles as0\n Bella25 will\n more deck, have\n2+0The\n4 fruitanas first2 many there\n\n\n,8\nets are3anges theThe3 basket4\nThe1\n:13ruits between cost\n\n, then62\n1If\n of is and price\n can)2\n\n:y a large,0,, many the most\n\n,0y0 total\n,:0 Heanged into\n boxestersbox0He50ese three to3, 1000 - 480 = 520 do not like to play basketball.\nThe percentage of the school that do not like to play basketball is 520/1000 * 100 = 52\nThe answer is 52\n'

In [52]:
phi_compressed_prompt['compressed_prompt']

' = 52\nThe answer is 52\n'

In [53]:
test(compressed_prompt['compressed_prompt'], gsm8k_test[1]['question'])

[ propmt ]
: Angelo andanie to plan many hours study their test2 of4 to They should study study total week they1 hour,3 breaks and
s think step
anie eachters hours hours
ate1 hours each hours
 andanie to start with planning study,
 to. they to203ute
unch for  minutes hour3
So andanie study study,5 to for
The4

: Marks team52.  score teams
s,5
 team also63, they8
 and as point
 together0
s scoreds scored4 half57
The is The
: B has two as than deck she2 more what of have she

 haveella manybles as0
 Bella25 will
 more deck, have
2+0The
4 fruitanas first2 many there


,8
ets are3anges theThe3 basket4
The1
:13ruits between cost

, then62
1If
 of is and price
 can)2

:y a large,0,, many the most

,0y0 total
,:0 Heanged into
 boxestersbox0He50ese three to3, 1000 - 480 = 520 do not like to play basketball.
The percentage of the school that do not like to play basketball is 520/1000 * 100 = 52
The answer is 52

Question: A robe takes 2 bolts of blue fiber and half that much white fiber.  How m

In [54]:
test(phi_compressed_prompt['compressed_prompt'], gsm8k_test[1]['question'])

[ propmt ]
 = 52
The answer is 52

Question: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?


[ answer ]
The answer is correct. The process and calculations provided are accurate and logical. Good job!


In [31]:
i = 0
with open('outputs/test_gpt_3.5_turbo_complex.txt', 'w') as fd:
    for q, a in tqdm(zip(gsm8k_test['question'], gsm8k_test['answer']), 
                               total=len(gsm8k_test['question'])):
        
        prompt_q = prompt_complex + '\nQuestion: ' + q + '\n'  
        
        response = client.chat.completions.create(
          model="gpt-3.5-turbo",
          messages=[
                {"role": "system", "content": "Follow the given examples and answer the question."},
                {"role": "user", "content": prompt_q},
            ]
        )
        
        ans_model = response.choices[0].message.content
        ans_, residual = extract_ans(ans_model)
            
        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))
        i += 1
        # if(i == 2): break

 19%|███████████████████████▏                                                                                                    | 246/1319 [11:24<49:47,  2.78s/it]


KeyboardInterrupt: 

In [34]:
_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_complex.txt')

num_q 1319 correct 1017 ratio 0.7710
