In [1]:
from google.colab import files
uploaded = files.upload()

Saving Algebra.jsonl to Algebra.jsonl


In [2]:
# =============================================================
# Step 0: Install Required Libraries
# =============================================================
!pip install transformers datasets evaluate


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.

In [3]:
# =============================================================
# Step 1: Import Libraries and Set Environment Variables
# =============================================================
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset

# Disable wandb logging to avoid API key prompts.
os.environ["WANDB_DISABLED"] = "true"

In [4]:
# =============================================================
# Step 2: Define Your Dataset Files and Load Them
# =============================================================
# You can define multiple subject datasets here.
# For demonstration, we use one example: "Algebra".
# If you have CSV files, set dataset_format="csv" and adjust load_dataset parameters.
subject_datasets = {
    "Algebra": {
        "file": "Algebra.jsonl",  # Change to "Algebra_train.csv" for CSV
        "format": "json"          # Change to "csv" if needed
    }
}

# A dictionary to hold loaded datasets.
loaded_datasets = {}

# Loop over each subject and load the dataset.
for subject, info in subject_datasets.items():
    print(f"Loading dataset for subject: {subject}")
    loaded_datasets[subject] = load_dataset(info["format"], data_files=info["file"])["train"]


Loading dataset for subject: Algebra


Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# =============================================================
# Step 3: Preprocess the Dataset
# =============================================================
# Add a "prompt" field (for zero-shot generation) to each example.
# NOTE: The prompt now instructs the model to output only the final answer without any explanation.
def add_prompt(example):
    # Modified prompt: instruct the model to output only the final answer.
    example["prompt"] = f"Problem: {example['problem']}\nAnswer (provide only the final answer, no explanation):"
    return example

# Process all subject datasets.
for subject, dataset in loaded_datasets.items():
    loaded_datasets[subject] = dataset.map(add_prompt)


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [6]:
# =============================================================
# Step 4: Define a Custom Generation Trainer for Zero-Shot Evaluation
# =============================================================
class GenerationTrainer(Trainer):
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        # Move input_ids to the appropriate device.
        input_ids = inputs["input_ids"].to(model.device)
        # Generate a limited number of tokens (adjust max_new_tokens as needed).
        # Here we use 30 tokens to reduce the chance of extra chain-of-thought reasoning.
        generated_tokens = model.generate(input_ids, max_new_tokens=30)
        return (None, generated_tokens, None)

In [7]:
# =============================================================
# Step 5: Define Compute Metrics Function (Overall Accuracy & F1)
# =============================================================
def compute_f1(pred, gold):
    pred_tokens = set(pred.split())
    gold_tokens = set(gold.split())
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 0.0
    common = pred_tokens.intersection(gold_tokens)
    if len(common) == 0:
        return 0.0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

def compute_metrics(eval_preds, tokenizer, eval_dataset):
    preds, _ = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Clean up predictions by extracting text after our answer prompt and normalizing.
    cleaned_preds = []
    for text in decoded_preds:
        # We expect the model to follow the instruction; if it outputs extra text,
        # we take the part after the marker.
        split_text = text.split("Answer (provide only the final answer, no explanation):")
        answer_text = split_text[-1] if len(split_text) > 1 else text
        cleaned_preds.append(answer_text.strip().lower())

    # Gold answers should be in the "answer" field.
    gold_answers = [ex["answer"].strip().lower() for ex in eval_dataset]

    # Compute exact-match Accuracy.
    correct = sum(1 for pred, gold in zip(cleaned_preds, gold_answers) if pred == gold)
    accuracy = correct / len(gold_answers)

    # Compute token-level F1 for each example and average.
    f1_scores = [compute_f1(pred, gold) for pred, gold in zip(cleaned_preds, gold_answers)]
    avg_f1 = sum(f1_scores) / len(f1_scores)

    return {"accuracy": accuracy, "f1": avg_f1}


In [8]:
# =============================================================
# Step 6: Setup TrainingArguments for Evaluation
# =============================================================
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=1,
    evaluation_strategy="no",  # We trigger evaluation manually.
    report_to="none",
    fp16=True  # Use half precision to reduce GPU memory usage.
)



In [9]:
# =============================================================
# Step 7: Loop Over Models and Subjects to Evaluate
# =============================================================
# Define the models to test.
model_names = [
    "Qwen/Qwen2-0.5B-Instruct",
    "Qwen/Qwen2-1.5B-Instruct"
]

evaluation_results = {}
# Save the first loaded model for interactive testing.
interactive_model = None
interactive_tokenizer = None

# Loop over each model and each subject dataset.
for model_name in model_names:
    print(f"\nLoading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Loop over each subject dataset.
    for subject, dataset in loaded_datasets.items():
        print(f"\nEvaluating model {model_name} on subject {subject} dataset...")
        # Tokenize the dataset using the current tokenizer.
        tokenized_dataset = dataset.map(
            lambda examples: tokenizer(examples["prompt"], truncation=True, max_length=512),
            batched=False
        )

        # Instantiate the GenerationTrainer.
        trainer = GenerationTrainer(
            model=model,
            args=training_args,
            eval_dataset=tokenized_dataset,
            tokenizer=tokenizer,
            compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer, dataset)
        )

        results = trainer.evaluate()
        evaluation_results[f"{model_name}__{subject}"] = results
        print(f"Results for model {model_name} on subject {subject}:")
        print(f"  Accuracy: {results.get('eval_accuracy', results.get('accuracy', 0.0)):.4f}")
        print(f"  F1 Score: {results.get('eval_f1', results.get('f1', 0.0)):.4f}")

        # Use the first evaluated model (first subject of the first model) for interactive testing.
        if interactive_model is None:
            interactive_model = model
            interactive_tokenizer = tokenizer



Loading model: Qwen/Qwen2-0.5B-Instruct


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]


Evaluating model Qwen/Qwen2-0.5B-Instruct on subject Algebra dataset...


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

  trainer = GenerationTrainer(
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Results for model Qwen/Qwen2-0.5B-Instruct on subject Algebra:
  Accuracy: 0.0000
  F1 Score: 0.0000

Loading model: Qwen/Qwen2-1.5B-Instruct


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]


Evaluating model Qwen/Qwen2-1.5B-Instruct on subject Algebra dataset...


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Results for model Qwen/Qwen2-1.5B-Instruct on subject Algebra:
  Accuracy: 0.0000
  F1 Score: 0.0000


In [10]:
# =============================================================
# Step 8: Optionally Save Evaluation Results
# =============================================================
output_folder = "./evaluation_results"
os.makedirs(output_folder, exist_ok=True)
results_path = os.path.join(output_folder, "evaluation_results.json")
with open(results_path, "w", encoding="utf-8") as f:
    json.dump(evaluation_results, f, indent=4)
print(f"\nSaved evaluation results to {results_path}")



Saved evaluation results to ./evaluation_results/evaluation_results.json


In [11]:
# =============================================================
# Step 9: Interactive Zero-Shot Testing
# =============================================================
def interactive_test(model, tokenizer):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    print("\nInteractive Test (type 'exit' to quit):")
    while True:
        user_input = input("Problem: ")
        if user_input.lower() == "exit":
            break
        # Use the same modified prompt as in training.
        prompt = f"Problem: {user_input}\nAnswer (provide only the final answer, no explanation):"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        outputs = model.generate(**inputs, max_new_tokens=30)
        full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Extract the final answer based on our prompt.
        answer = full_text.split("Answer (provide only the final answer, no explanation):")[-1].strip()
        print("Generated Answer:", answer)


In [12]:
# =============================================================
# Step 10: Run Interactive Testing Using the First Evaluated Model
# =============================================================
if interactive_model is not None and interactive_tokenizer is not None:
    print("\nInteractive testing using the first evaluated model:")
    interactive_test(interactive_model, interactive_tokenizer)
else:
    print("Interactive model not available.")


Interactive testing using the first evaluated model:

Interactive Test (type 'exit' to quit):
Problem: Connie is starting an exercise program. On June 1, she will do 25 sit-ups. Each day after that, she will increase her number of sit-ups by four. On which date during the month of June will Connie first do more than 100 sit-ups in one day?
Generated Answer: To find out when Connie first does more than 100 sit-ups in one day, we need to calculate how many sit-ups she has done
Problem: 5+10
Generated Answer: 15

Solution: To find the sum of 5 and 10, you simply add these two numbers together:

\(5 +
Problem: How many vertical asymptotes does the graph of $y=\frac{2}{x^2+x-6}$ have?
Generated Answer: The vertical asymptote occurs when the denominator is equal to zero.
So we need to find the values of $x$ for which $x^2
Problem: The sum of the squares of three consecutive positive even numbers is $12296$. Find the product of the three numbers divided by $8$.
Generated Answer: Let's denote