In [1]:
#THIS IS FOR USING JUST ONE GPU
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


In [2]:
!pip install transformers datasets evaluate



In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
import re
from tqdm import tqdm

# Load the tokenizer and the base Phi model (not fine-tuned)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5")

# Set the model to evaluation mode and move it to GPU if available
model.eval().to("cuda" if torch.cuda.is_available() else "cpu")

# Load the official GSM8K test set (1,319 examples)
gsm8k = load_dataset("gsm8k", "main", split="test")

# Print how many test examples are being evaluated
print(f"GSM8K test set size: {len(gsm8k)} examples")

# Initialize counter for correct predictions
correct = 0

# Boolean flag to track whether we’ve printed the first example
first_printed = False

# Loop through each example in the dataset
for item in tqdm(gsm8k, desc="Evaluating base Phi model (zero-shot)..."):
    
    # Extract the question and strip any whitespace
    question = item["question"].strip()

    # Use regular expressions to extract numbers from the gold answer
    # Take the last number found as the final numeric answer
    gold_answer = re.findall(r"\d+", item["answer"])
    gold = gold_answer[-1] if gold_answer else None

    # Build the zero-shot prompt (no examples, no reasoning, just the question)
    prompt = question + "\nA:"
    
    # Tokenize the prompt and move it to the model's device (CPU or GPU)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Use the model to generate a prediction, limiting the output length
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50)
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract any numbers from the model’s output
    # Take the last number found as the model's predicted answer
    pred_numbers = re.findall(r"\d+", decoded)
    prediction = pred_numbers[-1] if pred_numbers else None

    # For the first example only, print the full comparison
    if not first_printed:
        print("\nFirst Example:")
        print(f"Question: {question}")
        print(f"Expected (gold): {gold}")
        print(f"Predicted: {prediction}")
        first_printed = True

    # Count the prediction as correct if it matches the gold answer
    if prediction == gold:
        correct += 1

# After all predictions, calculate and print the accuracy
total = len(gsm8k)
accuracy = correct / total * 100
print(f"\nCorrect predictions: {correct}/{total}")
print(f"Accuracy of base Phi model (zero-shot): {accuracy:.2f}%")


  from .autonotebook import tqdm as notebook_tqdm


GSM8K test set size: 1319 examples


Evaluating base Phi model (zero-shot)...:   0%|          | 1/1319 [00:00<18:58,  1.16it/s]


First Example:
Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Expected (gold): 18
Predicted: 224


Evaluating base Phi model (zero-shot)...: 100%|██████████| 1319/1319 [17:31<00:00,  1.25it/s]


Correct predictions: 48/1319
Accuracy of base Phi model (zero-shot): 3.64%



