In [1]:
from google.colab import files
uploaded = files.upload()

Saving Algebra_train.csv to Algebra_train.csv


In [2]:
# Step 0: Install required libraries (run this cell only once)
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [3]:
# Step 1: Import necessary libraries
import re
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments
)

In [4]:
# Step 2: Load your dataset from CSV.
# Assumes your CSV is tab-delimited with columns:
# problem, solution, answer, level, unique_id
csv_file = "Algebra_train.csv"  # Adjust path if necessary
df = pd.read_csv(csv_file, sep="\t", header=None,
                 names=["problem", "solution", "answer", "level", "unique_id"])
# Create a Hugging Face Dataset
dataset = Dataset.from_pandas(df)


In [5]:
# Step 3: Define a helper function to extract the final answer.
def extract_final_answer(text):
    """
    This function attempts to extract the final answer from the generated text.
    It first looks for a LaTeX-style \boxed{...} pattern.
    If not found, it looks for a marker like "Final Answer:".
    Otherwise, it falls back to taking the last token.
    """
    # Try to extract what is inside \boxed{...}
    match = re.search(r'\\boxed\{([^}]*)\}', text)
    if match:
        return match.group(1).strip()
    # Look for "Final Answer:" marker if available
    if "Final Answer:" in text:
        return text.split("Final Answer:")[-1].strip().split()[0]
    # Fallback: take the last token (this may need refinement)
    tokens = text.strip().split()
    return tokens[-1] if tokens else text

In [6]:
# Step 4: Define a function to tokenize our dataset.
def tokenize_function(example, tokenizer):
    # The tokenizer returns input_ids and attention_mask.
    return tokenizer(example["problem"], truncation=True)


In [7]:
# Step 5: Create a custom Trainer that uses model.generate for predictions.
class GenerationTrainer(Trainer):
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        input_ids = inputs["input_ids"]
        # Use max_new_tokens to allow additional tokens beyond the input length.
        generated_tokens = model.generate(
            input_ids.to(model.device),
            max_new_tokens=50,  # Generate up to 50 new tokens; adjust as needed.
            num_beams=1,
            do_sample=False
        )
        return (None, generated_tokens, inputs.get("labels"))


In [8]:
# Step 6: Define a compute_metrics function for evaluation.
def compute_metrics(eval_preds):
    """
    This function compares the extracted final answer from the generated text
    with the ground truth answer from the dataset and computes an accuracy score.
    """
    generated_tokens = eval_preds[1]
    # Use the global_tokenizer (declared globally in evaluate_model)
    decoded_preds = global_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    # Use the global ground_truths set later from the dataset.
    correct = 0
    total = 0
    for pred_text, truth in zip(decoded_preds, ground_truths):
        final_ans = extract_final_answer(pred_text)
        if final_ans.strip() == truth.strip():
            correct += 1
        total += 1
    accuracy = correct / total if total > 0 else 0.0
    return {"accuracy": accuracy}


In [9]:
# Step 7: Define a function to evaluate a given model.
def evaluate_model(model_name):
    print(f"Loading model and tokenizer for: {model_name}")
    # Load tokenizer and model. Using torch.float16 for GPU efficiency.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    # Set padding_side to "left" for decoder-only models.
    tokenizer.padding_side = "left"
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
    model.to("cuda")

    # Tokenize the dataset (while preserving the answer field)
    tokenized_dataset = dataset.map(lambda ex: tokenize_function(ex, tokenizer), batched=True)

    # Declare global variables to be used in compute_metrics.
    global ground_truths, global_tokenizer
    ground_truths = tokenized_dataset["answer"]
    global_tokenizer = tokenizer

    # Define training arguments.
    training_args = TrainingArguments(
        output_dir="./results",
        per_device_eval_batch_size=2,  # Adjust based on your GPU memory.
        do_train=False,
        do_eval=True,
        evaluation_strategy="no",  # We'll call predict() manually.
        logging_steps=1,
        report_to=[],
    )

    # Instantiate our GenerationTrainer.
    trainer = GenerationTrainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        eval_dataset=tokenized_dataset,
    )

    # Call predict() with the dataset as a positional argument.
    results = trainer.predict(tokenized_dataset)
    print(f"Results for {model_name}: {results.metrics}")
    return model, tokenizer, trainer


In [None]:
# Step 8: Evaluate each of the two models.
model_name_1 = "Qwen/Qwen2-0.5B-Instruct"
model_name_2 = "Qwen/Qwen2-1.5B-Instruct"

print("Evaluating Qwen2-0.5B-Instruct...")
model1, tokenizer1, trainer1 = evaluate_model(model_name_1)

print("\nEvaluating Qwen2-1.5B-Instruct...")
model2, tokenizer2, trainer2 = evaluate_model(model_name_2)


Evaluating Qwen2-0.5B-Instruct...
Loading model and tokenizer for: Qwen/Qwen2-0.5B-Instruct


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Map:   0%|          | 0/2824 [00:00<?, ? examples/s]

  trainer = GenerationTrainer(
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Results for Qwen/Qwen2-0.5B-Instruct: {'test_model_preparation_time': 0.0037, 'test_runtime': 2080.1975, 'test_samples_per_second': 1.358, 'test_steps_per_second': 0.679}

Evaluating Qwen2-1.5B-Instruct...
Loading model and tokenizer for: Qwen/Qwen2-1.5B-Instruct


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Map:   0%|          | 0/2824 [00:00<?, ? examples/s]

  trainer = GenerationTrainer(


In [None]:
# Step 9: (Optional) Interactive testing loop.
def interactive_test(model, tokenizer):
    model.to("cuda")
    print("\nInteractive testing (type 'quit' to exit):")
    while True:
        user_input = input("Enter a problem: ")
        if user_input.lower() == 'quit':
            break
        inputs = tokenizer(user_input, return_tensors="pt").to("cuda")
        output = model.generate(**inputs, max_new_tokens=50, num_beams=1, do_sample=False)
        decoded = tokenizer.decode(output[0], skip_special_tokens=True)
        final_ans = extract_final_answer(decoded)
        print("Model answer:", final_ans)
        print("Full generated text:", decoded, "\n")

# For interactive testing, choose the model you want:
print("\nTesting interactive mode for Qwen2-0.5B-Instruct:")
interactive_test(model1, tokenizer1)

# Uncomment the following lines to test the second model interactively:
# print("\nTesting interactive mode for Qwen2-1.5B-Instruct:")
# interactive_test(model2, tokenizer2)