In [2]:
!pip install groq python-dotenv numpy tqdm datasets



In [3]:
from groq import Groq
from dotenv import load_dotenv
from datasets import load_dataset

import os
from tqdm import tqdm
import re
import random
import pprint

from typing import List, Dict, Any

load_dotenv()
random.seed(0)

client = Groq()
gsm8k_dataset = load_dataset("gsm8k", "main")

gsm8k_train = gsm8k_dataset["train"]
gsm8k_test  = gsm8k_dataset["test"]

In [81]:
def generate_response_using_Llama(
        prompt: str,
        model: str = "llama-3.1-8b-instant"
    ):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that solves math problems."
                },
                {
                    "role": "user", 
                    "content": prompt
                }
            ],
            model=model,
            temperature=0.3, ### 수정해도 됩니다!
            stream=False
        )
        return chat_completion.choices[0].message.content
    
    except Exception as e:
        print(f"API call error: {str(e)}")
        return None

#### 응답 잘 나오는지 확인해보기

In [5]:
response = generate_response_using_Llama(
    prompt="Hello world!",
)
print(response)

Hello. What math problem would you like help with today?


#### GSM8K 데이터셋 확인해보기

In [6]:
print("[Question]")
for l in gsm8k_test['question'][0].split("."):
    print(l)
print("="*100)
print("[Answer]")
print(gsm8k_test['answer'][0])

[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18


#### Util 함수들
- extract_final_answer: LLM의 응답을 parse하여 최종 결과만 추출 (정답과 비교하기 위해)
- run_benchmark_test: 벤치마크 테스트
- save_final_result: 결과물 제출을 위한 함수

In [7]:
### 수정해도 됩니다!
def extract_final_answer(response: str):
    regex = r"(?:Answer:|Model response:)\s*\$?([0-9,]+)\b|([0-9,]+)\s*(meters|cups|miles|minutes)"
    matches = re.finditer(regex, response, re.MULTILINE)
    results = [match.group(1) if match.group(1) else match.group(2).replace(",", "") for match in matches]

    if len(results) == 0:
        additional_regex = r"\$?([0-9,]+)"
        additional_matches = re.finditer(additional_regex, response, re.MULTILINE)
        results.extend([match.group(1).replace(",", "") for match in additional_matches])

    return results[-1] if results else None

In [8]:
### 수정해도 됩니다!
def run_benchmark_test(
        dataset,
        prompt: str,
        model: str = "llama-3.1-8b-instant",
        num_samples: int = 50,
        VERBOSE: bool = False
    ):
    correct = 0
    total   = 0
    results = []

    for i in tqdm(range(min(num_samples, len(dataset)))):
        question = dataset[i]["question"]
        correct_answer = float(re.findall(r'\d+(?:\.\d+)?', dataset[i]["answer"].split('####')[-1])[0])

        response = generate_response_using_Llama(
            prompt=prompt.format(question=question),
            model=model
        )

        if response:
            if VERBOSE:
                print("="*50)
                print(response)
                print("="*50)
            predicted_answer = extract_final_answer(response)

            if isinstance(predicted_answer, str):
                predicted_answer = float(predicted_answer.replace(",", ""))
            
            diff = abs(predicted_answer - correct_answer)
            is_correct = diff < 1e-5 if predicted_answer is not None else False
            
            if is_correct:
                correct += 1
            total += 1
            
            results.append({
                'question': question,
                'correct_answer': correct_answer,
                'predicted_answer': predicted_answer,
                'response': response,
                'correct': is_correct
            })

            if (i + 1) % 5 == 0:
                current_acc = correct/total if total > 0 else 0
                print(f"Progress: [{i+1}/{num_samples}]")
                print(f"Current Acc.: [{current_acc:.2%}]")

    return results, correct/total if total > 0 else 0

In [9]:
def save_final_result(results: List[Dict[str, Any]], accuracy: float, filename: str) -> None:
    result_str = f"====== ACCURACY: {accuracy} ======\n\n"
    result_str += f"[Details]\n"
    
    for idx, result in enumerate(results):
        result_str += f"Question {idx+1}: {result['question']}\n"
        result_str += f"Correct Answer: {result['correct_answer']}\n"
        result_str += f"Predicted Answer: {result['predicted_answer']}\n"
        result_str += f"Correct: {result['correct']}\n\n"
    
    with open(filename, "w", encoding="utf-8") as f:
        f.write(result_str)

#### Direct prompting with few-shot example

In [85]:
def construct_direct_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )

    prompt = (
        "Instruction:\nSolve the following mathematical question and generate ONLY the answer after a tag, 'Answer:' without any rationale.\n"
    )

    for i in range(num_examples):
        cur_question = train_dataset['question'][i]
        cur_answer = train_dataset['answer'][i].split("####")[-1].strip()

        prompt += f"\n[Example {i+1}]\n"
        prompt += f"Question:\n{cur_question}\n"
        prompt += f"Answer:{cur_answer}\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"

    return prompt

In [48]:
### 어떤 방식으로 저장되는지 확인해보세요!
PROMPT = construct_direct_prompt(3)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=10
)
save_final_result(results, accuracy, "example.txt")

 50%|█████     | 5/10 [00:09<00:08,  1.79s/it]

Progress: [5/10]
Current Acc.: [80.00%]


100%|██████████| 10/10 [00:12<00:00,  1.21s/it]

Progress: [10/10]
Current Acc.: [60.00%]





In [17]:
# TODO: 0 shot, 3 shot, 5 shot direct prompting을 통해 벤치마크 테스트를 한 후, 각각 direct_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 direct_prompting_5.txt
# 항상 num_samples=50 입니다!

In [86]:
shot = 0

PROMPT = construct_direct_prompt(shot)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, f"direct_prompting_{shot}.txt")

 10%|█         | 5/50 [00:01<00:16,  2.78it/s]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [00:03<00:15,  2.66it/s]

Progress: [10/50]
Current Acc.: [60.00%]


 30%|███       | 15/50 [00:05<00:16,  2.06it/s]

Progress: [15/50]
Current Acc.: [73.33%]


 40%|████      | 20/50 [00:08<00:17,  1.76it/s]

Progress: [20/50]
Current Acc.: [70.00%]


 50%|█████     | 25/50 [00:09<00:08,  2.82it/s]

Progress: [25/50]
Current Acc.: [72.00%]


 60%|██████    | 30/50 [00:11<00:06,  2.95it/s]

Progress: [30/50]
Current Acc.: [73.33%]


 70%|███████   | 35/50 [00:12<00:04,  3.32it/s]

Progress: [35/50]
Current Acc.: [77.14%]


 80%|████████  | 40/50 [00:14<00:03,  2.61it/s]

Progress: [40/50]
Current Acc.: [75.00%]


 90%|█████████ | 45/50 [00:16<00:01,  3.05it/s]

Progress: [45/50]
Current Acc.: [75.56%]


100%|██████████| 50/50 [00:18<00:00,  2.76it/s]

Progress: [50/50]
Current Acc.: [78.00%]





In [None]:
shot = 3

PROMPT = construct_direct_prompt(shot)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, f"direct_prompting_{shot}.txt")

In [None]:
shot = 5

PROMPT = construct_direct_prompt(shot)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, f"direct_prompting_{shot}.txt")

### Chain-of-Thought prompting with few-shot example
```text
[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
====================================================================================================
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18
```

[Answer] 아래의 정답을 도출하는 과정을 예시로 달아주면 CoT의 few shot이 되겠죠?

In [88]:
def construct_CoT_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )
    prompt = ("Solve the following math problems step-by-step.\n"
        "CRITICAL: You MUST end your response with '#### ' followed by the numeric answer.\n"
        "Do not include any text after the '#### [number]' format.\n"
        "Example: 'The final total is 50. #### 50'\n\n")

    for i in sampled_indices:
        question = train_dataset['question'][i]
        answer = train_dataset['answer'][i]
        prompt += f"Question:\n{question}\n"
        prompt += f"Answer:\n{answer}\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"

    return prompt

In [12]:
# TODO: 0 shot, 3 shot, 5 shot CoT prompting을 통해 벤치마크 테스트를 한 후, 각각 CoT_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 CoT_prompting_5.txt
# 항상 num_samples=50 입니다!

shots = [0, 3, 5]
for shot in shots:
    PROMPT = construct_CoT_prompt(shot)
    VERBOSE = False

    results, accuracy = run_benchmark_test(
        dataset=gsm8k_test,
        prompt=PROMPT,
        VERBOSE=VERBOSE,
        num_samples=50
    )
    save_final_result(results, accuracy, f"CoT_prompting_{shot}.txt")

 10%|█         | 5/50 [00:02<00:24,  1.82it/s]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██        | 10/50 [01:12<08:55, 13.39s/it]

Progress: [10/50]
Current Acc.: [70.00%]


 30%|███       | 15/50 [02:13<06:38, 11.38s/it]

Progress: [15/50]
Current Acc.: [73.33%]


 40%|████      | 20/50 [03:24<06:44, 13.50s/it]

Progress: [20/50]
Current Acc.: [70.00%]


 50%|█████     | 25/50 [04:41<05:58, 14.34s/it]

Progress: [25/50]
Current Acc.: [68.00%]


 60%|██████    | 30/50 [06:00<05:16, 15.82s/it]

Progress: [30/50]
Current Acc.: [70.00%]


 70%|███████   | 35/50 [07:05<03:24, 13.66s/it]

Progress: [35/50]
Current Acc.: [71.43%]


 80%|████████  | 40/50 [08:24<02:33, 15.34s/it]

Progress: [40/50]
Current Acc.: [70.00%]


 90%|█████████ | 45/50 [09:37<01:20, 16.08s/it]

Progress: [45/50]
Current Acc.: [68.89%]


100%|██████████| 50/50 [10:45<00:00, 12.91s/it]

Progress: [50/50]
Current Acc.: [70.00%]





In [90]:
shots = [5]
for shot in shots:
    PROMPT = construct_CoT_prompt(shot)
    VERBOSE = False

    results, accuracy = run_benchmark_test(
        dataset=gsm8k_test,
        prompt=PROMPT,
        VERBOSE=VERBOSE,
        num_samples=50
    )
    save_final_result(results, accuracy, f"CoT_prompting_{shot}.txt")

 10%|█         | 5/50 [00:02<00:19,  2.26it/s]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██        | 10/50 [00:05<00:25,  1.59it/s]

Progress: [10/50]
Current Acc.: [60.00%]


 30%|███       | 15/50 [00:07<00:17,  1.95it/s]

Progress: [15/50]
Current Acc.: [73.33%]


 40%|████      | 20/50 [00:10<00:14,  2.05it/s]

Progress: [20/50]
Current Acc.: [75.00%]


 50%|█████     | 25/50 [00:11<00:09,  2.53it/s]

Progress: [25/50]
Current Acc.: [72.00%]


 60%|██████    | 30/50 [00:14<00:08,  2.44it/s]

Progress: [30/50]
Current Acc.: [73.33%]


 70%|███████   | 35/50 [00:16<00:06,  2.39it/s]

Progress: [35/50]
Current Acc.: [77.14%]


 80%|████████  | 40/50 [00:18<00:04,  2.03it/s]

Progress: [40/50]
Current Acc.: [72.50%]


 90%|█████████ | 45/50 [00:20<00:02,  1.94it/s]

Progress: [45/50]
Current Acc.: [73.33%]


100%|██████████| 50/50 [00:22<00:00,  2.18it/s]

Progress: [50/50]
Current Acc.: [76.00%]





### Construct your prompt!!

목표: 본인만의 프롬프트를 통해 정답률을 더 끌어올려보기!
- gsm8k의 train 데이터셋에서 예시를 가져온 다음 (자유롭게!)
- 그 예시들에 대한 풀이 과정을 만들어주세요!
- 모든 것들이 자유입니다! Direct Prompting, CoT Prompting을 한 결과보다 정답률만 높으면 돼요.

In [91]:
for i in range(100):
    print("[Question]")
    for l in gsm8k_train['question'][i].split("."):
        print(l)
    print("="*100)
    print("[Answer]")
    print(gsm8k_train['answer'][i])

[Question]
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May
 How many clips did Natalia sell altogether in April and May?
[Answer]
Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72
[Question]
Weng earns $12 an hour for babysitting
 Yesterday, she just did 50 minutes of babysitting
 How much did she earn?
[Answer]
Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.
Working 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.
#### 10
[Question]
Betty is saving money for a new wallet which costs $100
 Betty has only half of the money she needs
 Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents
 How much more money does Betty need to buy the wallet?
[Answer]
In the beginning, Betty has only 100 / 2 = $<<100/2=50>>50.
Betty's grandparents gave her 15 * 2 = $<<15*2=30>>30.
This means, Betty needs 100 - 50 - 30 - 15 = $<<10

In [None]:
def construct_my_prompt(num_examples: int):
    FIXED_EXAMPLES = [
        {
        "question": "The file, 90 megabytes in size, downloads at the rate of 5 megabytes per second for its first 60 megabytes, and then 10 megabytes per second thereafter. How long, in seconds, does it take to download entirely?",
        "answer": "The first 60 megabytes take 60/5=12 seconds. There are 90-60=30 remaining megabytes. The remaining 30 megabytes take 30/10=3 seconds. And 12+3=15 seconds. #### 15"
        },
        {
            "question": "Jesse and Mia are competing in a week long race. They have one week to run 30 miles. On the first three days Jesse averages (2/3) of a mile. On day four she runs 10 miles. Mia averages 3 miles a day over the first 4 days. What is the average of their average that they have to run over the final three days?",
            "answer": "Jesse runs 2 miles in the first three days because 3 x (2/3) = 2. Jesse has 18 miles left to run because 30 - 10 - 2 = 18. Jesse has to run an average of 6 miles a day because 18 / 3 = 6. Mia runs 12 miles over the first four days because 4 x 3 = 12. She has 18 miles left to run because 30 - 12 = 18. She has to run six miles a day because 18 / 3 = 6. The total they both have to run is 12 miles a day. The average they have to run per day on average is 6 miles because 12 / 2 = 6. #### 6"
        },
        {
            "question": "Gerald spends $100 a month on baseball supplies. His season is 4 months long. He wants to use the months he's not playing baseball to save up by raking, shoveling, and mowing lawns. He charges $10 for each. How many chores does he need to average a month to save up for his supplies?",
            "answer": "He needs to save up $400 because 4 x 100 = 400. He has 8 months to earn this money because 12 - 4 = 8. He needs to earn $50 a month because 400 / 8 = 50. He needs to do 5 tasks a month because 50 / 10 = 5. #### 5"
        },
        {
            "question": "Tim rides his bike back and forth to work for each of his 5 workdays. His work is 20 miles away. He also goes for a weekend bike ride of 200 miles. If he can bike at 25 mph how much time does he spend biking a week?",
            "answer": "He bikes 20*2=40 miles each day for work. So he bikes 40*5=200 miles for work. That means he bikes a total of 200+200=400 miles for work. So he bikes a total of 400/25=16 hours. #### 16"
        },
        {
            "question": "Bailey starts with a certain amount of money. Then she receives a weekly allowance of $5 for 8 weeks. At the end of the 8 weeks, if she has a total of $100, how much money did Bailey start with?",
            "answer": "In 8 weeks, Bailey receives $5 * 8 = $40 in allowance. Bailey started with $100 - $40 = $60. #### 60"
        }
    ]

    prompt = (
        "Instruction:\n"
        "You are a world-renowned mathematics coach known for your logical precision and clarity. One mistake means elimination.\n"
            "1. Read the question and identify every single numerical constraint.\n"
            "2. Reason through the problem step-by-step, explaining your logic clearly.\n"
            "3. Before finalizing, verify the calculation one last time.\n\n"   

            "CRITICAL: You MUST end your response exactly at the numeric answer. "
            "Absolute silence after the '#### [number]' is required. No 'Hope this helps' or 'Therefore'.\n\n"
    )

    for i in range(min(num_examples, len(FIXED_EXAMPLES))):
        example = FIXED_EXAMPLES[i]
        
        prompt += f"Question:\n{example['question']}\n"
        prompt += f"Answer:\n{example['answer']}\n"
        prompt += "===\n"
    
    prompt += "\nQuestion:\n{question}\nAnswer:"

    return prompt

In [84]:
shot = 0
PROMPT = construct_my_prompt(shot)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, f"My_prompting_{shot}.txt")

 10%|█         | 5/50 [00:10<02:23,  3.19s/it]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [00:20<01:02,  1.55s/it]

Progress: [10/50]
Current Acc.: [50.00%]


 30%|███       | 15/50 [00:23<00:24,  1.42it/s]

Progress: [15/50]
Current Acc.: [60.00%]


 40%|████      | 20/50 [00:26<00:20,  1.50it/s]

Progress: [20/50]
Current Acc.: [70.00%]


 50%|█████     | 25/50 [00:29<00:13,  1.91it/s]

Progress: [25/50]
Current Acc.: [76.00%]


 60%|██████    | 30/50 [00:31<00:11,  1.78it/s]

Progress: [30/50]
Current Acc.: [80.00%]


 70%|███████   | 35/50 [00:34<00:06,  2.28it/s]

Progress: [35/50]
Current Acc.: [82.86%]


 80%|████████  | 40/50 [00:37<00:05,  1.70it/s]

Progress: [40/50]
Current Acc.: [80.00%]


 90%|█████████ | 45/50 [00:46<00:06,  1.28s/it]

Progress: [45/50]
Current Acc.: [80.00%]


100%|██████████| 50/50 [00:49<00:00,  1.02it/s]

Progress: [50/50]
Current Acc.: [82.00%]





In [52]:
shot = 3
PROMPT = construct_my_prompt(shot)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, f"My_prompting_{shot}.txt")

 10%|█         | 5/50 [00:02<00:25,  1.77it/s]

Progress: [5/50]
Current Acc.: [100.00%]


 20%|██        | 10/50 [00:06<00:24,  1.64it/s]

Progress: [10/50]
Current Acc.: [80.00%]


 30%|███       | 15/50 [00:09<00:21,  1.60it/s]

Progress: [15/50]
Current Acc.: [80.00%]


 40%|████      | 20/50 [00:11<00:17,  1.68it/s]

Progress: [20/50]
Current Acc.: [80.00%]


 50%|█████     | 25/50 [00:14<00:12,  2.03it/s]

Progress: [25/50]
Current Acc.: [80.00%]


 60%|██████    | 30/50 [00:17<00:10,  1.92it/s]

Progress: [30/50]
Current Acc.: [83.33%]


 70%|███████   | 35/50 [00:20<00:08,  1.85it/s]

Progress: [35/50]
Current Acc.: [85.71%]


 80%|████████  | 40/50 [00:23<00:06,  1.67it/s]

Progress: [40/50]
Current Acc.: [82.50%]


 90%|█████████ | 45/50 [00:26<00:02,  1.75it/s]

Progress: [45/50]
Current Acc.: [84.44%]


100%|██████████| 50/50 [00:28<00:00,  1.74it/s]

Progress: [50/50]
Current Acc.: [84.00%]





In [None]:
shot = 5
PROMPT = construct_my_prompt(shot)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=50
)
save_final_result(results, accuracy, f"My_prompting_{shot}.txt")

### 보고서 작성하기
#### 아래의 내용이 포함되면 됩니다!

1. Direct Prompting, CoT Prompting, My Prompting을 0 shot, 3 shot, 5 shot 정답률을 표로 보여주세요!
2. CoT Prompting이 Direct Prompting에 비해 왜 좋을 수 있는지에 대해서 서술해주세요!
3. 본인이 작성한 프롬프트 기법이 CoT에 비해서 왜 더 좋을 수 있는지에 대해서 설명해주세요!
4. 최종적으로, `PROMPTING.md`에 보고서를 작성해주세요!