# GPT-3.5-Turbo on GSM8K

In [1]:
import openai
import re
import time
import getpass

import numpy as np

from tqdm import tqdm
from datasets import load_dataset

In [2]:
# Read the OpenAI API key from an interactive, non-echoing prompt instead of hard-coding it.
openai.api_key = getpass.getpass()

In [3]:
# Load the 'main' configuration of the GSM8K dataset.
gsm8k = load_dataset('gsm8k', 'main')
# Row indices into the train split, stored alongside the prompt library.
# presumably these mark a held-out validation subset — verify against the file's producer.
validation_index = np.load('../gsm8k/lib_prompt/validation_index.npy')
validation_data = gsm8k['train'].select(validation_index)
# Test split iterated by the evaluation loops in this notebook.
gsm8k_test = gsm8k['test']

In [4]:
gsm8k['train'][0]['question']

'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?'

In [5]:
# NOTE(review): repeats the assignment already made in the data-loading cell; safe to re-run but redundant.
gsm8k_test = gsm8k['test']

In [6]:
# Few-shot prompt text used by the "complex prompt" experiments.
# Read it inside a context manager so the file handle is closed deterministically.
with open('../gsm8k/lib_prompt/prompt_hardest.txt') as prompt_file:
    prompt_complex = prompt_file.read()

In [7]:
print(prompt_complex)

Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?
Let's think step by step
Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.
For the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.
Angelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.
However, they need to include time for breaks and lunch. Every hour they want 

In [8]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_chain,
    wait_fixed
) 

@retry(wait=wait_chain(*(wait_fixed(seconds) for seconds in (3, 3, 3, 5, 5, 10))))
def completion_with_backoff(**kwargs):
    """Send a chat-completion request through the retry decorator.

    All keyword arguments are forwarded unchanged to
    ``openai.chat.completions.create`` and its return value is returned.

    The waits configured between attempts are 3, 3, 3, 5, 5 and 10 seconds.
    No ``stop`` argument is given to ``retry``.
    NOTE(review): with tenacity's defaults this presumably retries forever
    (repeating the final 10 s wait), including on non-transient errors such
    as a bad API key — confirm against the tenacity documentation.
    """
    return openai.chat.completions.create(**kwargs)

In [9]:
def test_answer(pred_str, ans_str):
    pattern = '\d*\.?\d+'
    pred = re.findall(pattern, pred_str)
    if(len(pred) >= 1):
        # print(pred_str)
        pred = pred[-1]
        gold = re.findall(pattern, ans_str)
        # print(ans_str)
        gold = gold[-1]
        return pred == gold
    else: return False

def parse_pred_ans(filename):
    """Parse a saved transcript file and print its accuracy summary.

    The file is read as a sequence of records. A line starting with 'Q: '
    opens a new record, a line starting with 'A_model:' begins the model
    answer, and a line starting with 'A:' begins the reference answer. Any
    other line is appended to whichever section is currently open.

    Args:
        filename: Path of the transcript file to read.

    Returns:
        A tuple ``(questions, ans_pred, ans_gold)`` of parallel lists holding
        the raw text of every record that has both a model answer and a
        reference answer.

    Raises:
        ValueError: If a continuation line appears before any section header.
    """
    with open(filename) as fd:
        lines = fd.readlines()

    questions, ans_pred, ans_gold = [], [], []
    q, am, a = None, None, None
    num_q, acc = 0, 0
    current_mode = 'none'

    def _record(question, pred, gold):
        """Store one record if it is complete; return 1 when it is correct, else 0."""
        if question is None or pred is None or gold is None:
            return 0
        questions.append(question)
        ans_pred.append(pred)
        ans_gold.append(gold)
        return 1 if test_answer(pred, gold) else 0

    for l in lines:
        if l.startswith('Q: '):
            # Close the previous record before starting a new one.
            acc += _record(q, am, a)
            current_mode = 'q'
            # Reset both answers so a record with a missing section cannot
            # inherit text from the record before it.
            q, am, a = l, None, None
            num_q += 1
        elif l.startswith('A_model:'):
            current_mode = 'am'
            am = l
        elif l.startswith('A:'):
            current_mode = 'a'
            a = l
        elif current_mode == 'q':
            q += l
        elif current_mode == 'am':
            am += l
        elif current_mode == 'a':
            a += l
        else:
            raise ValueError(current_mode)

    # The last record has no following 'Q: ' line to close it.
    acc += _record(q, am, a)

    # Guard against an empty file (no 'Q: ' line at all).
    ratio = acc / num_q if num_q else 0.0
    print('num_q %d correct %d ratio %.4f' % (num_q, acc, ratio))
    return questions, ans_pred, ans_gold

def test_finished(ans_model):
    if('answer is' in ans_model): return True
    else: return False

def extract_ans(ans_model):
    """Split model output at the first line containing 'answer is'.

    Args:
        ans_model: Model output text.

    Returns:
        A tuple ``(ans, residual)``. ``ans`` is every line up to and
        including the first one that contains 'answer is'; ``residual`` is
        everything after it. When no line contains the phrase, ``ans`` is the
        whole text and ``residual`` is the empty string.
    """
    rows = ans_model.split('\n')
    # Index of the first matching line; fall back to the last line when none matches.
    cut = next(
        (idx for idx, row in enumerate(rows) if 'answer is' in row),
        len(rows) - 1,
    )
    return '\n'.join(rows[:cut + 1]), '\n'.join(rows[cut + 1:])

In [10]:
# Build a single prompt for a manual check: the few-shot text followed by the test question at index 1.
prompt_q = prompt_complex + '\nQuestion: ' + gsm8k_test[1]['question'] + '\n'

In [11]:
print(prompt_q)

Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?
Let's think step by step
Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.
For the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.
Angelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.
However, they need to include time for breaks and lunch. Every hour they want 

In [12]:
# One direct request (no retry wrapper) to inspect the raw reply for prompt_q.
single_check_messages = [
    {"role": "system", "content": "Follow the given examples and answer the question."},
    {"role": "user", "content": prompt_q},
]
response = openai.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=single_check_messages,
)

In [13]:
response.choices[0].message.content

'It takes 2 bolts of blue fiber and half that much white fiber, which means it takes 2 bolts + (1/2)*2 bolts = 2 + 1 = 3 bolts in total. \nThus, the robe takes 3 bolts in total.'

In [14]:
print(response.choices[0].message.content)

It takes 2 bolts of blue fiber and half that much white fiber, which means it takes 2 bolts + (1/2)*2 bolts = 2 + 1 = 3 bolts in total. 
Thus, the robe takes 3 bolts in total.


# Complex Prompt Random Sampling, Acc 77.1

In [15]:
# Complex prompt, no temperature passed to the API.
test_pairs = zip(gsm8k_test['question'], gsm8k_test['answer'])
with open('outputs/test_gpt_3.5_turbo_complex.txt', 'w') as fd:
    for i, (q, a) in enumerate(tqdm(test_pairs, total=len(gsm8k_test['question'])), start=1):
        prompt_q = prompt_complex + '\nQuestion: ' + q + '\n'
        messages = [
            {"role": "system", "content": "Follow the given examples and answer the question."},
            {"role": "user", "content": prompt_q},
        ]

        response = completion_with_backoff(model="gpt-3.5-turbo", messages=messages)
        ans_model = response.choices[0].message.content
        # Only the part up to the first 'answer is' line is written out.
        ans_, residual = extract_ans(ans_model)

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))
        # Stop after two questions.
        if i == 2:
            break

  0%|          | 1/1319 [00:05<1:58:46,  5.41s/it]


In [16]:
# Score the saved transcript; the returned lists are discarded and only the printed summary is used.
_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_complex.txt')

num_q 2 correct 2 ratio 1.0000


# Complex Prompt Greedy Decoding, Acc 78.85

In [17]:
# Complex prompt, temperature=0.
test_pairs = zip(gsm8k_test['question'], gsm8k_test['answer'])
with open('outputs/test_gpt_3.5_turbo_complex_temp_0.txt', 'w') as fd:
    for i, (q, a) in enumerate(tqdm(test_pairs, total=len(gsm8k_test['question'])), start=1):
        prompt_q = prompt_complex + '\nQuestion: ' + q + '\n'
        messages = [
            {"role": "system", "content": "Follow the given examples and answer the question."},
            {"role": "user", "content": prompt_q},
        ]

        response = completion_with_backoff(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0,
        )
        ans_model = response.choices[0].message.content
        # Only the part up to the first 'answer is' line is written out.
        ans_, residual = extract_ans(ans_model)

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))
        # Stop after two questions.
        if i == 2:
            break

  0%|          | 1/1319 [00:03<1:22:43,  3.77s/it]


In [18]:
# Score the saved transcript; the returned lists are discarded and only the printed summary is used.
_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_complex_temp_0.txt')

num_q 2 correct 2 ratio 1.0000


# Baseline Prompt Greedy Decoding, Acc 74.98

In [19]:
# Few-shot prompt text used by the "baseline prompt" experiments.
# Read it inside a context manager so the file handle is closed deterministically.
with open('../gsm8k/lib_prompt/prompt_original.txt') as prompt_file:
    prompt_original = prompt_file.read()

In [20]:
# Baseline prompt, temperature=0.
test_pairs = zip(gsm8k_test['question'], gsm8k_test['answer'])
with open('outputs/test_gpt_3.5_turbo_original_temp_0.txt', 'w') as fd:
    for i, (q, a) in enumerate(tqdm(test_pairs, total=len(gsm8k_test['question'])), start=1):
        prompt_q = prompt_original + '\nQuestion: ' + q + '\n'
        messages = [
            {"role": "system", "content": "Follow the given examples and answer the question."},
            {"role": "user", "content": prompt_q},
        ]

        response = completion_with_backoff(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0,
        )
        ans_model = response.choices[0].message.content
        # Only the part up to the first 'answer is' line is written out.
        ans_, residual = extract_ans(ans_model)

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))
        # Stop after two questions.
        if i == 2:
            break

  0%|          | 1/1319 [00:04<1:41:50,  4.64s/it]


In [21]:
# Score the saved transcript; the returned lists are discarded and only the printed summary is used.
_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_original_temp_0.txt')

num_q 2 correct 2 ratio 1.0000


# Baseline Prompt, Dialog In-Context Learning, Acc 76.8

In [22]:
def make_dialog_prompt(prompt):
    """Turn a flat few-shot prompt into a list of chat messages.

    The prompt is split on blank lines ("\\n\\n"). Every chunk except the last
    becomes a user message (its first two lines) followed by an assistant
    message (its remaining lines). The last chunk becomes the final user
    message with "Let's think step by step" appended.

    Args:
        prompt: Few-shot prompt text ending with the question to answer.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts that starts with the
        system instruction.
    """
    *examples, final_case = prompt.split("\n\n")
    messages = [
        {"role": "system", "content": "Follow the given examples and answer the question."}
    ]
    for example in examples:
        rows = example.split("\n")
        messages.append({"role": "user", "content": "\n".join(rows[:2])})
        messages.append({"role": "assistant", "content": "\n".join(rows[2:])})
    messages.append({"role": "user", "content": final_case + "Let's think step by step"})
    return messages

In [23]:
# Baseline prompt sent as a multi-turn dialog, temperature=0.
test_pairs = zip(gsm8k_test['question'], gsm8k_test['answer'])
with open('outputs/test_gpt_3.5_turbo_original_dialog_icl.txt', 'w') as fd:
    for i, (q, a) in enumerate(tqdm(test_pairs, total=len(gsm8k_test['question'])), start=1):
        prompt_q = prompt_original + '\nQuestion: ' + q + '\n'
        dialog_prompt = make_dialog_prompt(prompt_q)

        response = completion_with_backoff(
            model="gpt-3.5-turbo",
            messages=dialog_prompt,
            temperature=0,
        )
        ans_model = response.choices[0].message.content
        # Only the part up to the first 'answer is' line is written out.
        ans_, residual = extract_ans(ans_model)

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))
        # Stop after two questions.
        if i == 2:
            break

  0%|          | 0/1319 [00:00<?, ?it/s]

  0%|          | 1/1319 [00:03<1:12:43,  3.31s/it]


In [24]:
# Score the saved transcript; the returned lists are discarded and only the printed summary is used.
_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_original_dialog_icl.txt')

num_q 2 correct 2 ratio 1.0000


# Complex Prompt, Dialog In-Context Learning, Acc (not reported)

In [25]:
# Score the saved transcript for the complex-prompt dialog run.
# NOTE(review): no cell visible in this notebook writes this file, so it must already exist on disk — confirm where it is generated.
_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_complex_dialog_icl.txt')

num_q 2 correct 2 ratio 1.0000


In [26]:
from llmlingua import PromptCompressor

# Build the prompt compressor from the microsoft/phi-2 checkpoint, placed on CPU.
llm_lingua = PromptCompressor("microsoft/phi-2", device_map = 'cpu')
# Compress the baseline few-shot prompt with a target of 200 tokens.
# The result is indexed by 'compressed_prompt' in later cells.
compressed_prompt = llm_lingua.compress_prompt(prompt_original, instruction="", question="", target_token=200)

# > {'compressed_prompt': 'Question: Sam bought a dozen boxes, each with 30 highlighter pens inside, for $10 each box. He reanged five of boxes into packages of sixlters each and sold them $3 per. He sold the rest theters separately at the of three pens $2. How much did make in total, dollars?\nLets think step step\nSam bought 1 boxes x00 oflters.\nHe bought 12 * 300ters in total\nSam then took 5 boxes 6ters0ters.\nHe sold these boxes for 5 *5\nAfterelling these  boxes there were 3030 highlighters remaining.\nThese form 330 / 3 = 110 groups of three pens.\nHe sold each of these groups for $2 each, so made 110 * 2 = $220 from them.\nIn total, then, he earned $220 + $15 = $235.\nSince his original cost was $120, he earned $235 - $120 = $115 in profit.\nThe answer is 115',
#  'origin_tokens': 2365,
#  'compressed_tokens': 211,
#  'ratio': '11.2x',
#  'saving': ', Saving $0.1 in GPT-4.'}

## Alternative compressor models.
## phi-2 (this is the model already loaded above):
#llm_lingua = PromptCompressor("microsoft/phi-2", device_map = 'cpu')

## Or use a quantized model, like TheBloke/Llama-2-7b-Chat-GPTQ, which only needs <8GB of GPU memory.
## Before that, you need to pip install optimum auto-gptq
#llm_lingua = PromptCompressor("TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"}, device_map = 'cpu')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [27]:
compressed_prompt['compressed_prompt']

"Question: are 15 in the. Grove workers will plant trees inve. After they, be trees the plant\nLet's think step\n trees originally\nThen there after some\nSo there 6\nThe 6\n\nQuestion: If are 3 the lot and cars are lot are\n arrive\n\n: Leah had 32ates and her sister had 42. they ate 35 many pieces left\nOriginallyates\nSo had\nThe: Jason had 20. gave Denny. has\n step stepJason\n he 12 gave 8: has five. For got two toys his does\n, then that is\nThe is were nine the. installed, from are\n\n ofSo\n 20: had 58. On lost lost\n tues: $. She bought five bagels for $3 each. How much money does she have left?\nLet's think step by step\nOlivia had 23 dollars.\n5 bagels for 3 dollars each will be 5 x 3 = 15 dollars.\nSo she has 23 - 15 dollars left.\n23 - 15 is 8.\nThe answer is 8.\n"

In [34]:
# Compressed baseline prompt sent as a multi-turn dialog, temperature=0.
test_pairs = zip(gsm8k_test['question'], gsm8k_test['answer'])
with open('outputs/test_gpt_3.5_turbo_compressed_original_dialog_icl.txt', 'w') as fd:
    for i, (q, a) in enumerate(tqdm(test_pairs, total=len(gsm8k_test['question'])), start=1):
        prompt_q = compressed_prompt['compressed_prompt'] + '\nQuestion: ' + q + '\n'
        dialog_prompt = make_dialog_prompt(prompt_q)

        response = completion_with_backoff(
            model="gpt-3.5-turbo",
            messages=dialog_prompt,
            temperature=0,
        )
        ans_model = response.choices[0].message.content
        # Only the part up to the first 'answer is' line is written out.
        ans_, residual = extract_ans(ans_model)

        fd.write('Q: %s\nA_model:\n%s\nA:\n%s\n\n' % (q, ans_, a))
        # Stop after ten questions.
        if i == 10:
            break

  0%|          | 0/1319 [00:00<?, ?it/s]

  1%|          | 9/1319 [00:23<56:52,  2.61s/it]


In [29]:
# Score the saved transcript for the compressed-prompt dialog run.
# NOTE(review): the saved output reports num_q 2, but the loop that writes this file stops after 10 questions
# and carries a later execution count — the output looks stale; re-run this cell after regenerating the file.
_, _, _ = parse_pred_ans('outputs/test_gpt_3.5_turbo_compressed_original_dialog_icl.txt')

num_q 2 correct 2 ratio 1.0000


# 수학 선생님과 역사 선생님

In [30]:
import requests

def generate_response(prompt, model="llama3", timeout=None):
    """Send a non-streaming generate request to the local server and return the parsed JSON reply.

    Args:
        prompt: Prompt text to send.
        model: Model name placed in the request payload.
        timeout: Optional timeout in seconds passed to ``requests.post``.
            The default ``None`` waits indefinitely, as before.

    Returns:
        The decoded JSON body of the response. Callers in this notebook read
        its 'response' entry.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }
    headers = {
        "Content-Type": "application/json"
    }

    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    # Fail here on an HTTP error instead of returning an error body that
    # only breaks later when the caller indexes ['response'].
    response.raise_for_status()

    return response.json()

In [31]:
model = "llama3"
# Two questions in one string: "What are Newton's laws? How did World War I end?"
question = "뉴턴의 법칙은 무엇입니까? 1차 세계대전은 어떻게 끝났습니까?"
# The prompt asks for separate answers from a math teacher and a history teacher,
# in Korean, in five lines or fewer in total.
prompt = f"""
다음 질문에 대해 수학 선생님과 역사 선생님의 대답을 각각 말해주세요.
총 5줄 이하로 한글로 말해주세요.

질문 : "{question}"
"""

response_role = generate_response(prompt, model=model)

# Show the question next to the 'response' entry of the reply.
print(f"Q. {question} \n A. ", response_role['response'])

Q. 뉴턴의 법칙은 무엇입니까? 1차 세계대전은 어떻게 끝났습니까? 
 A.  Here are the answers:

**Math Teacher:** 뉴턴의 법칙은 물체가 운동할 때 가속도는 그물체의 질량과 힘이 작용하는 방향과 inversely proportional한 관계에 있다. (Newton's laws state that an object's acceleration is inversely proportional to its mass and the force acting upon it, in the direction of the force.)

**History Teacher:** 제1차 세계대전은 1918년에 끝났다. 독일이 무조건항복을 선언한 후에 전쟁이 종료되었으며, treaties like Versailles Treaty were signed. (The First World War ended in 1918 when Germany unconditionally surrendered and the Treaty of Versailles was signed.)


In [32]:
# Send the same prompt and question (defined in the previous cell) to the phi3 model for comparison.
model = "phi3"
response_role = generate_response(prompt, model=model)

print(f"Q. {question} \n A. ", response_role['response'])

Q. 뉴턴의 법칙은 무엇입니까? 1차 세계대전은 어떻게 끝났습니까? 
 A.   수학 선생님:
"뉴턴의 법칙은 수학에서는 자주 사용되어 있지만, 정부의 경제적인 권한을 기준으로 보여집니다. 1차 세계대전의 끝은 1592년에 이미 달아졌습니다."

역사 선생님:
"뉴턴의 법칙은 국가 발전과 정치를 관리하는 기본 원침입니다. 1차 세계대전의 끝은 1592년에 한편, 이러한 참조로 제공되지 않습니다."


# 정렬 문제

In [33]:
model = "llama3"
# Here `question` is a Python list; the f-string below embeds its repr in the prompt.
question = [1,5,2,3,7,8,23,7,1,4,6]
# The prompt tells the model it is given a list to sort and asks for the list in ascending order.
prompt = f"""
당신은 리스트를 입력으로 받고 해당 리스트를 정렬하는 역할을 부여받았습니다.
오름차순으로 정렬한 리스트를 답변해주세요.

리스트: "{question}"
"""

response_role = generate_response(prompt, model=model)

# Show the input list next to the 'response' entry of the reply.
print(f"Q. {question} \n A. ", response_role['response'])

Q. [1, 5, 2, 3, 7, 8, 23, 7, 1, 4, 6] 
 A.  😊

The list is: [1, 5, 2, 3, 7, 8, 23, 7, 1, 4, 6]

In ascending order (오름차순), the sorted list is:

[1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 23]
