In [1]:
import torch
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import re
import wandb
from datetime import datetime

# Configuration
config = {
    # Model settings
    "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "max_length": 512,  # Increased to handle longer few-shot prompts
    
    # Dataset settings
    "num_samples": 100,  # Set to None to use the full dataset
    "n_shot": 8,  # Number of examples for few-shot prompting
    
    # Generation settings
    "temperature": 0.7,
    "top_p": 0.9,
    "num_beams": 4,
    
    # Experiment tracking
    "run_name": f"deepseek-gsm8k-8shot-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
    "tags": ["gsm8k", "evaluation", "deepseek", "qwen", "few-shot"]
}

# Initialize Weights & Biases with config
wandb.init(
    project="deepseek-gsm8k-benchmark", 
    name=config["run_name"],
    config=config,
    tags=config["tags"]
)

# Set device
DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# Load GSM8K dataset
def load_gsm8k():
    print("Loading GSM8K dataset...")
    dataset = load_dataset("gsm8k", "main")
    
    # We'll use examples from train split for few-shot demonstrations
    train_set = dataset["train"]
    test_set = dataset["test"]
    
    if config["num_samples"]:
        # Randomly sample a subset if num_samples is specified
        indices = np.random.choice(len(test_set), min(config["num_samples"], len(test_set)), replace=False)
        test_set = test_set.select(indices)
    
    print(f"Loaded {len(train_set)} training examples and {len(test_set)} test examples")
    return train_set, test_set

# Load model and tokenizer
def load_model():
    print(f"Loading {config['model_name']} model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(config["model_name"], use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(config["model_name"])
    
    # Handle padding token if not already set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    
    model.to(DEVICE)
    return model, tokenizer

# Extract answer from model's response
def extract_answer(text):
    # First, try to extract the answer after #### delimiter
    if "####" in text:
        answer_part = text.split("####")[-1].strip()
        
        # Remove thousand separators and units
        answer_part = re.sub(r'[$,]', '', answer_part)
        answer_part = re.sub(r'(\d+)\s*(?:dollars|units|kg|m|cm|km|mph|lbs|inches|feet|tons|hours|minutes)', r'\1', answer_part)
        
        # Extract the numerical answer
        numbers = re.findall(r'[-+]?\d*\.?\d+', answer_part)
        if numbers:
            return float(numbers[0])  # Take the first number after ####
    
    # Fallback: extract the last number in the full text
    # Remove thousand separators and units first
    cleaned_text = re.sub(r'[$,]', '', text)
    cleaned_text = re.sub(r'(\d+)\s*(?:dollars|units|kg|m|cm|km|mph|lbs|inches|feet|tons|hours|minutes)', r'\1', cleaned_text)
    numbers = re.findall(r'[-+]?\d*\.?\d+', cleaned_text)
    if numbers:
        return float(numbers[-1])
    
    # If no number found
    return None

# Create 8-shot prompt with examples from the training set
def create_few_shot_prompt(train_examples, question, n_shot=8):
    # Set random seed for reproducibility
    random.seed(42)
    
    # Sample n examples from the training set
    shot_examples = random.sample(train_examples, n_shot)
    
    # Build the few-shot prompt
    prompt = ""
    for ex in shot_examples:
        prompt += f"Question: {ex['question']}\n"
        prompt += f"Answer: {ex['answer']}\n\n"
    
    # Add the current question and instruction
    prompt += f"Question: {question}\n"
    prompt += f"Answer: Let's think step by step. At the end, I'll write the answer as a number after '####'.\n"
    
    return prompt

# Generate answer with DeepSeek model using 8-shot prompting
def generate_answer(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(DEVICE)
    
    with torch.no_grad():
        output = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=len(inputs["input_ids"][0]) + config["max_length"],
            temperature=config["temperature"],
            top_p=config["top_p"],
            num_beams=config["num_beams"],
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
        )
    
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Extract just the generated response (after the last "Answer: ")
    try:
        generated_text = response.split("Answer: ")[-1].strip()
    except:
        generated_text = response
        
    return generated_text

# Main evaluation function
def evaluate_gsm8k_few_shot():
    # Load dataset
    train_set, test_set = load_gsm8k()
    
    # Prepare training examples list for few-shot prompting
    train_examples = []
    for ex in train_set:
        train_examples.append({
            "question": ex["question"],
            "answer": ex["answer"]
        })
    
    # Load model and tokenizer
    model, tokenizer = load_model()
    
    results = []
    correct_count = 0
    
    for idx, example in enumerate(tqdm(test_set, desc="Evaluating")):
        question = example["question"]
        target_answer = float(extract_answer(example["answer"]))
        
        # Create 8-shot prompt
        prompt = create_few_shot_prompt(train_examples, question, n_shot=config["n_shot"])
        
        # Generate response
        model_response = generate_answer(model, tokenizer, prompt)
        
        # Log the full response
        print(f"\nQuestion {idx+1}: {question}")
        print(f"Model response: {model_response}")
        
        # Extract answer from response
        predicted_answer = extract_answer(model_response)
        
        # Check if correct (allowing for minor floating point differences)
        is_correct = False
        if predicted_answer is not None and target_answer is not None:
            # For integer answers, check exact match
            if target_answer.is_integer() and predicted_answer.is_integer():
                is_correct = int(predicted_answer) == int(target_answer)
            else:
                # For floating point, allow small relative error
                relative_error = abs(predicted_answer - target_answer) / (abs(target_answer) + 1e-10)
                is_correct = relative_error < 0.01  # 1% relative error tolerance
        
        if is_correct:
            correct_count += 1
        
        results.append({
            "question": question,
            "target_answer": target_answer,
            "model_response": model_response,
            "predicted_answer": predicted_answer,
            "is_correct": is_correct
        })
        
        # Log to wandb
        wandb.log({
            "running_accuracy": correct_count / (idx + 1),
            "example_idx": idx
        })
    
    # Calculate and log final accuracy
    accuracy = correct_count / len(test_set)
    print(f"\nFinal accuracy: {accuracy:.2%} ({correct_count}/{len(test_set)})")
    
    # Log final metrics to wandb
    wandb.log({
        "final_accuracy": accuracy,
    })
    
    # Save detailed results to CSV
    results_df = pd.DataFrame(results)
    results_file = f"deepseek_gsm8k_{config['n_shot']}shot_results.csv"
    results_df.to_csv(results_file, index=False)
    print(f"Detailed results saved to {results_file}")
    
    # Log results file to wandb
    wandb.save(results_file)
    
    return accuracy, results

if __name__ == "__main__":
    try:
        accuracy, results = evaluate_gsm8k_few_shot()
        print(f"Evaluation completed successfully with accuracy: {accuracy:.2%}")
    except Exception as e:
        print(f"Error during evaluation: {e}")
        wandb.log({"error": str(e)})
    finally:
        wandb.finish()

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjonathantiedchen[0m ([33mmaster_thesis_math_lm[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Using device: mps
Loading GSM8K dataset...
Loaded 7473 training examples and 100 test examples
Loading deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B model and tokenizer...


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Evaluating:   0%|          | 0/100 [00:16<?, ?it/s]


Error during evaluation: MPS backend out of memory (MPS allocated: 8.82 GB, other allocations: 9.15 GB, max allowed: 18.13 GB). Tried to allocate 229.28 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).


0,1
error,MPS backend out of m...


In [None]:
import torch
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
from datasets import load_dataset
import re
import wandb
from datetime import datetime
from utils import get_device, load_model, load_gsm8k, extract_answer, create_cot_prompt, generate_answer_hf, is_correct_check

# Configuration
config = {
    # Model settings
    "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "max_length": 512,  # Increased for few-shot CoT
    
    # Dataset settings
    "num_samples": 100,  # Set to None to use the full dataset
    "n_shot": 4,  # Number of examples for few-shot prompting
    
    # Generation settings
    "temperature": 0.7,
    "top_p": 0.9,
    "num_beams": 4,
    
    # Experiment tracking
    "run_name": f"gpt2-gsm8k-8shot-cot-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
    "tags": ["gsm8k", "evaluation", "r1", "few-shot", "cot"]
}


# Initialize Weights & Biases with config
wandb.init(
    project="deepseek-gsm8k-benchmark", 
    name=config["run_name"],
    config=config,
    tags=config["tags"]
)


# Set device
DEVICE = get_device()

print(f"Using device: {DEVICE}")

# Main evaluation function
def evaluate_gsm8k():
    # Load dataset
    train_set, test_set = load_gsm8k(config)
    
    # Prepare training examples list for few-shot prompting
    train_examples = []
    for ex in train_set:
        train_examples.append({
            "question": ex["question"],
            "answer": ex["answer"]
        })
    
    # Load model and tokenizer
    model, tokenizer = load_model(config, DEVICE)
    
    results = []
    correct_count = 0
    
    for idx, example in enumerate(tqdm(test_set, desc="Evaluating")):
        question = example["question"]
        
        # Extract target answer, forcing it to be a float
        target_answer = extract_answer(example["answer"])
        print(f"\n\n{'-'*80}")
        print(f"Example {idx+1}:")
        print(f"Question: {question}")
        print(f"Target answer: {target_answer}")
        
        # Create and print the full CoT prompt
        prompt = create_cot_prompt(train_examples, question, n_shot=config["n_shot"])
        print(f"\nFull CoT Prompt:\n{'-'*40}\n{prompt}\n{'-'*40}\n")

        # Generate and print the full model response
        model_response = generate_answer_hf(model, tokenizer, prompt, config, DEVICE, model_type="gpt2")
        print(f"\nFull Model Response:\n{'-'*40}\n{model_response}\n{'-'*40}\n")
        
        # Extract answer from response
        predicted_answer = extract_answer(model_response)
        print(f"Extracted predicted answer: {predicted_answer}")
        
        # Fix for the integer error: Ensure both are converted to float for comparison
        is_correct = is_correct_check(predicted_answer, target_answer)
        
        print(f"Is correct? {is_correct}")
        
        if is_correct:
            correct_count += 1
        
        results.append({
            "question": question,
            "target_answer": target_answer,
            "model_response": model_response,
            "predicted_answer": predicted_answer,
            "is_correct": is_correct
        })
        
        # Log to wandb
        wandb.log({
            "running_accuracy": correct_count / (idx + 1),
            "example_idx": idx
        })
        
        # Early stop option for debugging (remove for full evaluation)
        if idx == 3 and 'debug' in config and config['debug']:
            break
    
    # Calculate and log final accuracy
    accuracy = correct_count / len(results)
    print(f"\nFinal accuracy: {accuracy:.2%} ({correct_count}/{len(results)})")
    
    # Log final metrics to wandb
    wandb.log({
        "final_accuracy": accuracy,
    })
    
    # Save detailed results to CSV
    results_df = pd.DataFrame(results)
    results_file = f"{config['model_name']}_gsm8k_{config['n_shot']}shot_results.csv"
    results_df.to_csv(results_file, index=False)
    print(f"Detailed results saved to {results_file}")
    
    # Log results file to wandb
    wandb.save(results_file)
    
    return accuracy, results

if __name__ == "__main__":
    try:
        accuracy, results = evaluate_gsm8k()
        print(f"Evaluation completed successfully with accuracy: {accuracy:.2%}")
    except Exception as e:
        print(f"Error during evaluation: {e}")
        wandb.log({"error": str(e)})
    finally:
        wandb.finish()