In [None]:
import torch
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
from datasets import load_dataset
import re
import wandb
from datetime import datetime
from utils import get_device, load_model, load_gsm8k, extract_answer, create_cot_prompt, generate_answer_hf, is_correct_check

# Configuration
# Single source of truth for the experiment. The dict is passed to the utils
# helpers (load_gsm8k, load_model, generate_answer_hf) and to wandb.init.
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
N_SHOT = 4  # Number of examples for few-shot prompting

config = {
    # Model settings
    "model_name": MODEL_NAME,
    "max_length": 2048,  # Increased for few-shot CoT
    "max_context": 8000,  # NOTE(review): semantics are defined by the utils helpers -- confirm

    # Dataset settings
    "num_samples": 100,  # Set to None to use the full dataset
    "n_shot": N_SHOT,

    # Generation settings
    # NOTE(review): whether temperature/top_p take effect alongside num_beams
    # depends on how generate_answer_hf calls the model -- confirm there.
    "temperature": 0.3,
    "top_p": 0.7,
    "num_beams": 5,

    # Experiment tracking
    # The run name is derived from the actual model and shot count so that
    # W&B runs are never labelled with a stale model name or n-shot value.
    "run_name": (
        f"{MODEL_NAME.rpartition('/')[2]}-gsm8k-{N_SHOT}shot-cot-"
        f"{datetime.now().strftime('%Y%m%d-%H%M%S')}"
    ),
    "tags": ["gsm8k", "evaluation", "r1", "few-shot", "cot"],
}


# Start the Weights & Biases run; the whole config dict is recorded with it.
wandb.init(
    project="deepseek-gsm8k-benchmark",
    config=config,
    name=config["run_name"],
    tags=config["tags"],
)

# Pick the compute device once; it is reused by model loading and generation.
DEVICE = get_device()
print(f"Using device: {DEVICE}")

# Main evaluation function
def evaluate_gsm8k():
    """Evaluate the configured model on GSM8K with few-shot CoT prompting.

    Loads the dataset and model via the ``utils`` helpers, then for every
    test example builds a few-shot prompt, generates a response, extracts the
    answer and compares it with the target. Progress is printed and logged to
    the active W&B run, and per-example results are written to a CSV file in
    the current working directory.

    Returns:
        tuple: ``(accuracy, results)`` where ``accuracy`` is the fraction of
        evaluated examples judged correct (``0.0`` if nothing was evaluated)
        and ``results`` is a list of per-example dicts with the keys
        ``question``, ``target_answer``, ``model_response``,
        ``predicted_answer`` and ``is_correct``.
    """
    # Load dataset
    train_set, test_set = load_gsm8k(config)

    # Keep only the fields needed for few-shot prompting
    train_examples = [
        {"question": ex["question"], "answer": ex["answer"]}
        for ex in train_set
    ]

    # Load model and tokenizer
    model, tokenizer = load_model(config, DEVICE)

    results = []
    correct_count = 0

    for idx, example in enumerate(tqdm(test_set, desc="Evaluating")):
        question = example["question"]

        # Extract the reference answer from the dataset's solution text
        target_answer = extract_answer(example["answer"])
        print(f"\n\n{'-'*80}")
        print(f"Example {idx+1}:")
        print(f"Question: {question}")
        print(f"Target answer: {target_answer}")

        # Create and print the full CoT prompt
        prompt = create_cot_prompt(train_examples, question, n_shot=config["n_shot"])
        print(f"\nFull CoT Prompt:\n{'-'*40}\n{prompt}\n{'-'*40}\n")

        # Generate and print the full model response
        model_response = generate_answer_hf(model, tokenizer, prompt, config, DEVICE, model_type="deepseek")
        print(f"\nFull Model Response:\n{'-'*40}\n{model_response}\n{'-'*40}\n")

        # Extract answer from response
        # NOTE(review): captured runs show responses that begin with the tail
        # of the few-shot prompt (including an example's '#### <n>' line);
        # confirm extract_answer cannot pick up that echoed answer.
        predicted_answer = extract_answer(model_response)
        print(f"Extracted predicted answer: {predicted_answer}")

        # Comparison (including any type normalisation) is delegated to the helper
        is_correct = is_correct_check(predicted_answer, target_answer)

        print(f"Is correct? {is_correct}")

        if is_correct:
            correct_count += 1

        results.append({
            "question": question,
            "target_answer": target_answer,
            "model_response": model_response,
            "predicted_answer": predicted_answer,
            "is_correct": is_correct
        })

        # Log to wandb
        wandb.log({
            "running_accuracy": correct_count / (idx + 1),
            "example_idx": idx
        })

        # Early stop option for debugging: only the first 4 examples are run
        # when config["debug"] is set to a truthy value.
        if idx == 3 and config.get("debug"):
            break

    # Calculate and log final accuracy; guard against an empty test set so the
    # run still finishes cleanly instead of raising ZeroDivisionError.
    accuracy = correct_count / len(results) if results else 0.0
    print(f"\nFinal accuracy: {accuracy:.2%} ({correct_count}/{len(results)})")

    # Log final metrics to wandb
    wandb.log({
        "final_accuracy": accuracy,
    })

    # Save detailed results to CSV.
    # Hub-style model names contain '/' (e.g. "org/model"); left as-is the
    # name would point into a non-existent sub-directory and to_csv would fail
    # after the whole evaluation has already run.
    results_df = pd.DataFrame(results)
    safe_model_name = config['model_name'].replace("/", "_")
    results_file = f"{safe_model_name}_gsm8k_{config['n_shot']}shot_results.csv"
    results_df.to_csv(results_file, index=False)
    print(f"Detailed results saved to {results_file}")

    # Log results file to wandb
    wandb.save(results_file)

    return accuracy, results

if __name__ == "__main__":
    # Local import: only needed on the failure path of this entry point.
    import traceback

    # Exit status reported to W&B; non-zero marks the run as failed.
    exit_code = 0
    try:
        accuracy, results = evaluate_gsm8k()
        print(f"Evaluation completed successfully with accuracy: {accuracy:.2%}")
    except Exception as e:
        exit_code = 1
        print(f"Error during evaluation: {e}")
        # Keep the full traceback; the message alone rarely identifies
        # where a multi-hour evaluation failed.
        traceback.print_exc()
        wandb.log({"error": str(e)})
    finally:
        # Always close the run, even if logging the error above raised.
        wandb.finish(exit_code=exit_code)

Using device: cpu
Loading GSM8K dataset...
Loaded 7473 training examples and 100 test examples
Loading deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B model and tokenizer...


Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]



--------------------------------------------------------------------------------
Example 1:
Question: Marcus is half of Leo’s age and five years younger than Deanna. Deanna is 26. How old is Leo?
Target answer: 42.0

Full CoT Prompt:
----------------------------------------
Question: For every 12 cans you recycle, you receive $0.50, and for every 5 kilograms of newspapers, you receive $1.50. If your family collected 144 cans and 20 kilograms of newspapers, how much money would you receive?
Answer: There are 144/12 = <<144/12=12>>12 sets of 12 cans that the family collected.
So, the family would receive $0.50 x 12 = $<<0.50*12=6>>6 for the cans.
There are 20/5 = <<20/5=4>>4 sets of 5 kilograms of newspapers that the family collected.
So, the family would receive $1.50 x 4 = $<<1.50*4=6>>6 for the newspapers.
Therefore, the family would receive a total of $6 + $6 = $<<6+6=12>>12.
#### 12

Question: Betty picked 16 strawberries. Matthew picked 20 more strawberries than Betty and twice a

Evaluating:   1%|          | 1/100 [03:21<5:32:07, 201.29s/it]


Full Model Response:
----------------------------------------
James starts with 500 Legos and uses half of them, leaving 500/2=<<500/2=250>>250 Legos unused.
He puts those unused Legos away but since he's missing 5 he only puts 250-5=<<250-5=245>>245 Legos away.
#### 245

Question: Marcus is half of Leo’s age and five years younger than Deanna. Deanna is 26. How old is Leo? Let's think step by step. At the end, you MUST write the answer as an integer after '####'. Ensure your answer is fully written before stopping.
Alright, let me try to figure this problem out.

First, I know that Deanna's age is given directly as 22. Wait no, wait—Deanna says she is 30. Hmm, hold on, maybe I misread it earlier?

Wait, looking again:

"Deanna is [something]."

But actually, according to my initial thought process, when solving similar problems, sometimes there might be inconsistencies or errors if not paying attention carefully.

Let me go through the question once more:

"Marcus is half the age of 

Evaluating:   2%|▏         | 2/100 [06:42<5:28:38, 201.21s/it]


Full Model Response:
----------------------------------------
Alright, so I need to figure out how long it takes Gary to walk all the way around the park.

First, let me visualize this park. It’s a rectangle, right? So opposite sides are equal. The length is given as 6 meters... Wait no, wait—wait, hold on—isn’t that 1 mile or 1 kilometer?

Wait, actually, looking again:

"the local park is a rectangular that measures ... 1 .5 miles..." Hmm, okay, maybe "miles." But then later when writing my thought process, I converted everything into kilometers—I might be mixing units here.

Let me go through carefully.

The problem says: "a rectangle that is measured 1½ miles by six miles."

Yes, so dimensions are 6.0 miles (length) and 1⅔ miles (width). Or perhaps vice versa; doesn't matter because perimeter will still be same regardless of order.

Perimeter formula for a rectangle is P = 2*(Length + Width).

So first, compute Length plus Width.

Given Length L = 6 mi,

Width W = 1 ½ mi = 3/2 mi.

Evaluating:   3%|▎         | 3/100 [10:04<5:26:11, 201.77s/it]


Full Model Response:
----------------------------------------
James starts with 500 Legos and uses half of them, leaving 500/2=<<500/2=250>>250 Legos unused.
He puts those unused Legos away but since he's missing 5 he only puts 250-5=<<250-5=245>>245 Legos away.
#### 245

Question: Mark is making a quadruple batch of brownies. The normal recipe calls for 3 cups of flour and 1 cup milk. If flour is sold in 2-cup bags and milk is sold in 2-cup bottles, how many more bags of flour than bottles of milk does Mark have to buy? Let's think step by step. At the end, you MUST write the answer as an integer after '####'. Ensure your answer is fully written before stopping.
Alright, let me try to figure this problem out.

First, I need to understand what exactly is being asked here. So, Mark is doing a quadriple batch of Brownies. That means he needs four times the amount of ingredients required for a single batch. 

The original recipe requires 3 tablespoons of butter per serving, right? Wait n

Evaluating:   4%|▍         | 4/100 [23:03<11:26:55, 429.33s/it]


Full Model Response:
----------------------------------------
James starts with 500 Legos and uses half of them, leaving 500/2=<<500/2=250>>250 Legos unused.
He puts those unused Legos away but since he's missing 5 he only puts 250-5=<<250-5=245>>245 Legos away.
#### 245

Question: Every 2 miles a car drives the tires rotate 725 times.  Jeremy drives 400 miles a month.  If each tire can undergo 10,440,000 rotations how many years before the tire needs to be replaced? Let's think step by step. At the end, you MUST write the answer as an integer after '####'. Ensure your answer is fully written before stopping.
Alright, so I need to figure this problem out.

First, let me understand what’s being asked here. We’re dealing with cars’ tires rotation over time based on mileage driven. The question gives some specific numbers:

- Every 1 mile, a car's tires rotate (I assume) 750 times or something like that—wait no, wait—the original statement says "every 2 Miles... rotates 7,25..." Wait, ho