In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# --- Authenticate with Hugging Face ---
# We retrieve the token stored in Kaggle Secrets.
try:
    secrets = UserSecretsClient()
    hf_token = secrets.get_secret("HF_TOKEN")
    login(token=hf_token, add_to_git_credential=False)
    print("Successfully logged into Hugging Face.")
except Exception as e:
    print(f"Could not log in to Hugging Face. Please ensure your HF_TOKEN secret is set correctly. Error: {e}")

In [None]:
!pip install git+https://github.com/huggingface/transformers.git

In [None]:
!pip install git+https://github.com/huggingface/transformers.git
!pip install --upgrade timm

print("Libraries installed. PLEASE RESTART YOUR RUNTIME NOW.")

In [2]:
import torch
from transformers import AutoProcessor, Gemma3nForConditionalGeneration
import transformers
import timm

# Verify the versions to confirm the new installations are active
print(f"Transformers version: {transformers.__version__}")
print(f"Timm version: {timm.__version__}")

print("\nLoading model and processor onto the first GPU (cuda:0)...")

model = Gemma3nForConditionalGeneration.from_pretrained(
    "google/gemma-3n-E2B",
    torch_dtype=torch.bfloat16,
    device_map="cuda:0" # Load to first gpu
)

processor = AutoProcessor.from_pretrained("google/gemma-3n-E2B")

print("\nModel and processor loaded successfully!")
# You can verify which GPU the model is on
print(f"Model is on device: {model.device}")

Transformers version: 4.55.0.dev0
Timm version: 1.0.19

Loading model and processor onto the first GPU (cuda:0)...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/196 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]


Model and processor loaded successfully!
Model is on device: cuda:0


In [3]:
from datasets import load_dataset 
# --- Load the HellaSwag benchmark dataset ---
# We let the library automatically select the 'default' configuration for HellaSwag.
# https://huggingface.co/datasets/Rowan/hellaswag/viewer/default/validation
dataset = load_dataset("hellaswag", split="validation")

print(f"✅ HellaSwag validation set loaded successfully. It contains {len(dataset)} examples.")
print("\n--- Example Entry ---")
# Print the first example to see the structure
print(f"Context: {dataset[0]['ctx']}")
print(f"Endings: {dataset[0]['endings']}")
print(f"Correct Label: {dataset[0]['label']}")

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/24.4M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/6.11M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/6.32M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/39905 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10003 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10042 [00:00<?, ? examples/s]

✅ HellaSwag validation set loaded successfully. It contains 10042 examples.

--- Example Entry ---
Context: A man is sitting on a roof. he
Endings: ['is using wrap to wrap a pair of skis.', 'is ripping level tiles off.', "is holding a rubik's cube.", 'starts pulling up roofing on a roof.']
Correct Label: 3


In [7]:
# --- 1. Ensure the dataset sample is ready ---
print(f"Full dataset has {len(dataset)} examples.")
sampled_dataset = dataset.shuffle(seed=42).select(range(100))
print(f"Now evaluating on a random sample of {len(sampled_dataset)} examples...")
print("-" * 30)

# --- 2. Initialize counters and start the loop ---
correct_predictions = 0
total_predictions = 0

from tqdm.auto import tqdm

for example in tqdm(sampled_dataset):
    context = example["ctx"]
    endings = example["endings"]
    correct_ending_index = int(example["label"])

    log_likelihoods = []

    # Calculate the likelihood of each ending
    for ending in endings:
        input_text = context + " " + ending
        
        # --- Call the processor directly ---
        # It returns a dictionary with 'input_ids', 'attention_mask', etc.
        inputs = processor(text=input_text, return_tensors="pt").to(model.device)

        with torch.no_grad():
            # --- Pass the entire 'inputs' dictionary to the model ---
            # The **inputs syntax unpacks the dictionary into arguments (input_ids=..., attention_mask=...)
            # We also provide the labels for loss calculation.
            outputs = model(**inputs, labels=inputs["input_ids"])
            
            log_likelihood = -outputs.loss.item()
            log_likelihoods.append(log_likelihood)

    # The model's prediction is the ending with the highest log likelihood
    predicted_index = log_likelihoods.index(max(log_likelihoods))

    # Check if the prediction was correct
    if predicted_index == correct_ending_index:
        correct_predictions += 1
    
    total_predictions += 1
 
print("Evaluation loop finished.")

# --- 3. Calculate and print the final accuracy ---
accuracy = (correct_predictions / total_predictions) * 100
print("-" * 30)
print(f"Evaluation finished on {total_predictions} examples.")
print(f"Correct predictions: {correct_predictions}")
print(f"Accuracy: {accuracy:.2f}%")

Full dataset has 10042 examples.
Now evaluating on a random sample of 100 examples...
------------------------------


  0%|          | 0/100 [00:00<?, ?it/s]

Evaluation loop finished.
------------------------------
Evaluation finished on 100 examples.
Correct predictions: 64
Accuracy: 64.00%


In [8]:
# --- 1. Define the number of samples for evaluation ---
num_eval_samples = 1000

print(f"Full dataset has {len(dataset)} examples.")

# Shuffle the dataset and select a new, larger random sample.
sampled_dataset_1000 = dataset.shuffle(seed=42).select(range(num_eval_samples))

print(f"Creating a new evaluation run on {len(sampled_dataset_1000)} random examples...")
print("-" * 30)


# --- 2. Initialize counters and start the loop ---
correct_predictions = 0
total_predictions = 0

from tqdm.auto import tqdm

# Loop over the new 1000-example sub-dataset
for example in tqdm(sampled_dataset_1000):
    context = example["ctx"]
    endings = example["endings"]
    correct_ending_index = int(example["label"])

    log_likelihoods = []

    # Calculate the likelihood of each ending
    for ending in endings:
        input_text = context + " " + ending
        
        # Call the processor to tokenize the text and move to the model's device
        inputs = processor(text=input_text, return_tensors="pt").to(model.device)

        with torch.no_grad():
            # Pass the processor's output to the model to get the loss
            outputs = model(**inputs, labels=inputs["input_ids"])
            log_likelihood = -outputs.loss.item()
            log_likelihoods.append(log_likelihood)

    # The model's prediction is the ending with the highest log likelihood
    predicted_index = log_likelihoods.index(max(log_likelihoods))

    # Check if the prediction was correct
    if predicted_index == correct_ending_index:
        correct_predictions += 1
    
    total_predictions += 1
 
print("Evaluation loop finished.")

# --- 3. Calculate and print the final accuracy for this run ---
accuracy = (correct_predictions / total_predictions) * 100
print("-" * 30)
print(f"Evaluation finished on {total_predictions} examples.")
print(f"Correct predictions: {correct_predictions}")
print(f"Accuracy: {accuracy:.2f}%")

Full dataset has 10042 examples.
Creating a new evaluation run on 1000 random examples...
------------------------------


  0%|          | 0/1000 [00:00<?, ?it/s]

Evaluation loop finished.
------------------------------
Evaluation finished on 1000 examples.
Correct predictions: 668
Accuracy: 66.80%


In [9]:
# --- 1. Prepare for full dataset evaluation ---
print(f"Starting evaluation on the full dataset of {len(dataset)} examples.")
print("-" * 30)


# --- 2. Initialize counters and start the loop ---
correct_predictions = 0
total_predictions = 0

from tqdm.auto import tqdm

# Loop over the ENTIRE dataset. No sampling is needed.
for example in tqdm(dataset):
    context = example["ctx"]
    endings = example["endings"]
    correct_ending_index = int(example["label"])

    log_likelihoods = []

    # Calculate the likelihood of each ending
    for ending in endings:
        input_text = context + " " + ending
        
        # Call the processor to tokenize the text and move to the model's device
        inputs = processor(text=input_text, return_tensors="pt").to(model.device)

        with torch.no_grad():
            # Pass the processor's output to the model to get the loss
            outputs = model(**inputs, labels=inputs["input_ids"])
            log_likelihood = -outputs.loss.item()
            log_likelihoods.append(log_likelihood)

    # The model's prediction is the ending with the highest log likelihood
    predicted_index = log_likelihoods.index(max(log_likelihoods))

    # Check if the prediction was correct
    if predicted_index == correct_ending_index:
        correct_predictions += 1
    
    total_predictions += 1
 
print("Full evaluation loop finished.")

# --- 3. Calculate and print the final accuracy ---
accuracy = (correct_predictions / total_predictions) * 100
print("-" * 30)
print(f"Evaluation finished on {total_predictions} examples.")
print(f"Correct predictions: {correct_predictions}")
print(f"Final Model Accuracy: {accuracy:.2f}%")

Starting evaluation on the full dataset of 10042 examples.
------------------------------


  0%|          | 0/10042 [00:00<?, ?it/s]

Full evaluation loop finished.
------------------------------
Evaluation finished on 10042 examples.
Correct predictions: 6902
Final Model Accuracy: 68.73%
