In [2]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from datasets import load_dataset
import torch
import re
import random
import sys
import os

parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

from utils.helper import get_device, extract_answer_gsm8k, create_cot_prompt

# Load model and tokenizer
model_name = "gpt2"  # You can also try "gpt2-medium", "gpt2-large", or "gpt2-xl"
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

# set device based on machine used to run the code can be cuda, mps or cpu
device = get_device()

# Make sure padding token is set
# GPT-2 doesn't have a pad token by default, so we use the EOS token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Load GSM8K dataset
gsm8k = load_dataset("gsm8k", "main")
test_set = gsm8k["test"]
train_set = gsm8k["train"]

# For demonstration, let's use a sample from GSM8K
sample_idx = 5  # You can change this to try different examples
sample = test_set[sample_idx]
question = sample["question"]
reference_answer = sample["answer"]

print(f"Question: {question}")
print(f"Reference answer: {reference_answer}")

# Create 5-shot prompt
cot_prompt = create_cot_prompt(list(train_set), n_shot=5)
prompt = f"Question: {cot_prompt} Let's think step by step. At the end, you MUST write the answer as an integer after '####'. Ensure your answer is fully written before stopping.\n"
print(prompt)

# Generate answer with GPT-2
# GPT-2 may need more specific generation parameters
inputs = tokenizer(prompt, return_tensors="pt").to(device)
generated_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=512,
    temperature=0.7,  # Higher temperature for GPT-2 often helps with math
    do_sample=False,
    top_p=0.9,
    top_k=50,  # Adding top-k filtering for better control
    repetition_penalty=1.2,  # Help reduce repetitions
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id
)

response = tokenizer.decode(generated_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

# Post-process the response to handle repetition or early stopping
def post_process_response(response):
    # If response contains a #### followed by new content, trim it
    if "####" in response:
        parts = response.split("####")
        # Keep the content before #### and the first part after ####
        main_answer = parts[0] + "####" + parts[1].split("\n\n")[0]
        return main_answer
    return response

# Apply post-processing
processed_response = post_process_response(response)

print("\nGPT-2 Raw Solution:")
print(response)

print("\nGPT-2 Processed Solution:")
print(processed_response)

# Extract the numerical answer for evaluation
predicted_answer = extract_answer_gsm8k(processed_response)
reference_numeric = extract_answer_gsm8k(reference_answer)

print(f"\nExtracted answer: {predicted_answer}")
print(f"Reference numeric answer: {reference_numeric}")

# Check if the answer is correct
if predicted_answer is not None and reference_numeric is not None:
    # Allow for small floating point differences
    if abs(predicted_answer - reference_numeric) < 1e-6:
        print("✓ Correct!")
    else:
        print("✗ Incorrect")
else:
    print("Could not extract a numeric answer")

Question: Kylar went to the store to buy glasses for his new apartment. One glass costs $5, but every second glass costs only 60% of the price. Kylar wants to buy 16 glasses. How much does he need to pay for them?
Reference answer: The discount price of one glass is 60/100 * 5 = $<<60/100*5=3>>3.
If every second glass is cheaper, that means Kylar is going to buy 16 / 2 = <<16/2=8>>8 cheaper glasses.
So for the cheaper glasses, Kylar is going to pay 8 * 3 = $<<8*3=24>>24.
And for the regular-priced glasses, Kylar will pay 8 * 5 = $<<8*5=40>>40.
So in total Kylar needs to pay 24 + 40 = $<<24+40=64>>64 for the glasses he wants to buy.
#### 64
Question: Question: For every 12 cans you recycle, you receive $0.50, and for every 5 kilograms of newspapers, you receive $1.50. If your family collected 144 cans and 20 kilograms of newspapers, how much money would you receive?
Answer: There are 144/12 = <<144/12=12>>12 sets of 12 cans that the family collected.
So, the family would receive $0.50 x




GPT-2 Raw Solution:

GPT-2 Processed Solution:
# 1 2 ####

Extracted answer: 2.0
Reference numeric answer: 64.0
✗ Incorrect


In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import re
import random
import json
import os
from typing import List, Dict, Any

# Load model and tokenizer
model_name = "gpt2"  # You can also try "gpt2-medium", "gpt2-large", or "gpt2-xl"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

# Make sure padding token is set
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Function to download GSM8K dataset from GitHub
def download_gsm8k():
    import requests
    import os
    
    # Create data directory if it doesn't exist
    os.makedirs("gsm8k_data", exist_ok=True)
    
    # URLs for train and test data
    train_url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train.jsonl"
    test_url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
    
    # Download train set
    train_response = requests.get(train_url)
    with open("gsm8k_data/train.jsonl", "wb") as f:
        f.write(train_response.content)
    
    # Download test set
    test_response = requests.get(test_url)
    with open("gsm8k_data/test.jsonl", "wb") as f:
        f.write(test_response.content)
    
    print("GSM8K dataset downloaded successfully.")

# Function to load the GSM8K dataset from files
def load_gsm8k_from_file():
    train_data = []
    test_data = []
    
    # Check if files exist, download if not
    if not (os.path.exists("gsm8k_data/train.jsonl") and os.path.exists("gsm8k_data/test.jsonl")):
        download_gsm8k()
    
    # Load train data
    with open("gsm8k_data/train.jsonl", "r") as f:
        for line in f:
            train_data.append(json.loads(line))
    
    # Load test data
    with open("gsm8k_data/test.jsonl", "r") as f:
        for line in f:
            test_data.append(json.loads(line))
    
    print(f"Loaded {len(train_data)} training examples and {len(test_data)} test examples")
    return train_data, test_data

# Load our GSM8K dataset
train_set, test_set = load_gsm8k_from_file()

# Function to extract the numerical answer
def extract_answer(text):
    if "####" in text:
        answer_part = text.split("####")[-1].strip()
        numbers = re.findall(r'[-+]?\d*\.?\d+', answer_part)
        if numbers:
            return float(numbers[0])
    
    # Fallback: extract the last number in the full text
    numbers = re.findall(r'[-+]?\d*\.?\d+', text)
    if numbers:
        return float(numbers[-1])
    
    return None

# Function to create 5-shot prompt
def create_five_shot_prompt(train_examples: List[Dict[str, Any]], question: str):
    # Set random seed for reproducibility
    random.seed(42)
    
    # Sample 5 examples from the training set
    shot_examples = random.sample(train_examples, 5)
    
    # Build the few-shot prompt
    prompt = ""
    for ex in shot_examples:
        prompt += f"Question: {ex['question']}\n"
        prompt += f"Answer: {ex['answer']}\n\n"
    
    # Add the current question with clear instructions for GPT-2
    prompt += f"Question: {question}\n"
    prompt += f"Answer: Let's solve this step by step. I'll write the final answer after ####.\n"
    
    return prompt

# For demonstration, let's use a sample from GSM8K
sample_idx = 5  # You can change this to try different examples
sample = test_set[sample_idx]
question = sample["question"]
reference_answer = sample["answer"]

print(f"Question: {question}")
print(f"Reference answer: {reference_answer}")

# Create 5-shot prompt
five_shot_prompt = create_five_shot_prompt(train_set, question)

print("\n5-Shot Prompt (excerpt):")
# Print the beginning and end of the prompt if it's long
if len(five_shot_prompt) > 500:
    print(five_shot_prompt[:250] + "...\n..." + five_shot_prompt[-250:])
else:
    print(five_shot_prompt)

# Generate answer with GPT-2
inputs = tokenizer(five_shot_prompt, return_tensors="pt").to("cuda")
generated_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=512,
    temperature=0.7,
    do_sample=True,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.2,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id
)

response = tokenizer.decode(generated_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

# Post-process the response to handle repetition or early stopping
def post_process_response(response):
    # If response contains a #### followed by new content, trim it
    if "####" in response:
        parts = response.split("####")
        # Keep the content before #### and the first part after ####
        main_answer = parts[0] + "####" + parts[1].split("\n\n")[0]
        return main_answer
    return response

# Apply post-processing
processed_response = post_process_response(response)

print("\nGPT-2 Raw Solution:")
print(response)

print("\nGPT-2 Processed Solution:")
print(processed_response)

# Extract the numerical answer for evaluation
predicted_answer = extract_answer(processed_response)
reference_numeric = extract_answer(reference_answer)

print(f"\nExtracted answer: {predicted_answer}")
print(f"Reference numeric answer: {reference_numeric}")

# Check if the answer is correct
if predicted_answer is not None and reference_numeric is not None:
    # Allow for small floating point differences
    if abs(predicted_answer - reference_numeric) < 1e-6:
        print("✓ Correct!")
    else:
        print("✗ Incorrect")
else:
    print("Could not extract a numeric answer")
    

GSM8K dataset downloaded successfully.
Loaded 7473 training examples and 1319 test examples
Question: Kylar went to the store to buy glasses for his new apartment. One glass costs $5, but every second glass costs only 60% of the price. Kylar wants to buy 16 glasses. How much does he need to pay for them?
Reference answer: The discount price of one glass is 60/100 * 5 = $<<60/100*5=3>>3.
If every second glass is cheaper, that means Kylar is going to buy 16 / 2 = <<16/2=8>>8 cheaper glasses.
So for the cheaper glasses, Kylar is going to pay 8 * 3 = $<<8*3=24>>24.
And for the regular-priced glasses, Kylar will pay 8 * 5 = $<<8*5=40>>40.
So in total Kylar needs to pay 24 + 40 = $<<24+40=64>>64 for the glasses he wants to buy.
#### 64

5-Shot Prompt (excerpt):
Question: For every 12 cans you recycle, you receive $0.50, and for every 5 kilograms of newspapers, you receive $1.50. If your family collected 144 cans and 20 kilograms of newspapers, how much money would you receive?
Answer: There 