# Setup (RUN EVERY TIME)

In [None]:
# Troubleshooting only: run this cell when the Drive mount misbehaves, then
# restart the session and refresh the page before mounting again.
from google.colab import drive
drive.flush_and_unmount()

Drive not mounted, so nothing to flush and unmount.


In [None]:
# Mount Google Drive at /content/drive; the Drive paths used throughout this
# notebook all live under this mount point. force_remount=True asks Colab for
# a fresh mount even if one is already present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Switch into the project folder on the mounted Drive (Drive must be mounted
# first, otherwise this path does not exist).
%cd /content/drive/MyDrive/DATASCI266

/content/drive/MyDrive/DATASCI266


# Step 1: Load In Dataset

In [None]:
from datasets import load_dataset

# Load the "all" configuration of the RACE dataset and print the split summary
# (train / validation / test with their feature names and row counts).
race_dataset = load_dataset("race", "all")
print(race_dataset)

README.md: 0.00B [00:00, ?B/s]

all/test-00000-of-00001.parquet:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

all/train-00000-of-00001.parquet:   0%|          | 0.00/37.4M [00:00<?, ?B/s]

all/validation-00000-of-00001.parquet:   0%|          | 0.00/2.05M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4934 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/87866 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4887 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 4934
    })
    train: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 87866
    })
    validation: Dataset({
        features: ['example_id', 'article', 'answer', 'question', 'options'],
        num_rows: 4887
    })
})


# Step 2: Convert to Text Prompt for Qwen2.5

In [None]:
# RACE questions always have four answer options. A dedicated constant is used
# because the generic name `letter_options` is re-bound to ["A", "B", "C"] by
# the BBQ evaluation cells later in this notebook; relying on that global made
# this function fail with an IndexError when cells were re-run out of order.
RACE_LETTER_OPTIONS = ("A", "B", "C", "D")
letter_options = list(RACE_LETTER_OPTIONS)  # kept for backward compatibility

def convert_to_text_prompt(example):
    '''
    Build the supervised fine-tuning text for a single RACE example.

    Parameters
    ----------
    example : dict
        One RACE row with the keys "article", "question", "options"
        (a sequence holding at least four option strings) and "answer"
        (the gold letter; surrounding whitespace and case are normalised).

    Returns
    -------
    dict
        {"final_prompt": final_prompt} where final_prompt is laid out as:

        Context:
        <article>

        Question:
        <question>

        Options:
        A. <option_0>
        B. <option_1>
        C. <option_2>
        D. <option_3>
        (two blank lines)
        Answer with the letter (A, B, C, or D) only.
        Answer: <correct_letter>

    Raises
    ------
    IndexError
        If the example has fewer than four options.
    '''
    options = example["options"]

    # One "<letter>. <option>" line per answer option, each newline-terminated
    options_block = "".join(
        f"{letter}. {options[i]}\n"
        for i, letter in enumerate(RACE_LETTER_OPTIONS)
    )

    # Gold answer letter (A/B/C/D comes straight from RACE)
    correct_letter = example["answer"].strip().upper()

    # Construct prompt; the inference-time prompt uses the same layout, so the
    # exact whitespace here must not drift.
    prompt = (
        f"Context:\n{example['article']}\n\n"
        f"Question:\n{example['question']}\n\n"
        f"Options:\n{options_block}\n\n"
        "Answer with the letter (A, B, C, or D) only."
    )
    final_prompt = prompt + "\nAnswer: " + correct_letter

    return {"final_prompt": final_prompt}

In [None]:
# Add a "final_prompt" column to every split (train / validation / test).
race_dataset = race_dataset.map(convert_to_text_prompt)

Map:   0%|          | 0/4934 [00:00<?, ? examples/s]

Map:   0%|          | 0/87866 [00:00<?, ? examples/s]

Map:   0%|          | 0/4887 [00:00<?, ? examples/s]

# Step 3: Tokenize for Qwen2.5
- Model: Qwen2.5-0.5B-Instruct
  - link: https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct
- Transformers causal language modeling guide: https://huggingface.co/docs/transformers/tasks/language_modeling
- Hugging Face LLM course, chapter 7, section 6: https://huggingface.co/learn/llm-course/en/chapter7/6
  - Causal Language Modeling: "inputs serve as labels too (just shifted by one element)"
  - "Shifting the inputs and labels to align them happens inside the model, so the data collator just copies the inputs to create the labels."

In [None]:
from transformers import AutoTokenizer

# Load in Qwen2.5 tokenizer. `model_name` is reused further down when the
# model weights are loaded, so tokenizer and model come from the same repo.
# NOTE(review): trust_remote_code=True allows code shipped in the Hub repo to
# run locally; confirm it is actually required for this model.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
# Tokenizer
  # link: https://huggingface.co/docs/transformers/en/main_classes/tokenizer

# Padding and truncation
  # link: https://huggingface.co/docs/transformers/pad_truncation
    # truncation, max_length, padding

# Process: Datasets, Batch

max_length = 512

# Every prompt ends with "Answer: <letter>", which is the part the model is
# supposed to learn. Truncating on the right (the usual default) would cut
# that suffix off for any prompt longer than `max_length` tokens, leaving
# those examples without an answer. Truncate from the left instead, so an
# over-long article loses its beginning rather than the question, options
# and answer.
tokenizer.truncation_side = "left"

def tokenize_fn(batch):
    """Tokenize a batch of prompts into fixed-length model inputs.

    Args:
        batch: mapping with a "final_prompt" entry holding a list of strings.

    Returns:
        The tokenizer output for the batch, every sequence truncated and
        padded to exactly `max_length` tokens.
    """
    return tokenizer(
        batch["final_prompt"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )

# Tokenize every split and drop the original columns so that only the
# tokenizer outputs remain in the dataset.
tokenized = race_dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=race_dataset["train"].column_names
)


# Labels = input_ids with padding masked out (simple supervised fine-tune)
def add_labels(batch):
    """Attach causal-LM labels to a tokenized batch.

    Labels are a copy of `input_ids`, except that every position whose
    attention-mask value is 0 (padding) is set to -100, the index the loss
    ignores. The inputs are padded to a fixed length, so without the mask a
    large share of the loss would come from predicting pad tokens.

    Args:
        batch: mapping with "input_ids" (list of lists of ints) and, normally,
            "attention_mask" of the same shape.

    Returns:
        The same batch with a "labels" entry added. If no attention mask is
        present, labels fall back to a plain copy of the input ids.
    """
    IGNORE_INDEX = -100  # positions with this label do not contribute to the loss

    masks = batch["attention_mask"] if "attention_mask" in batch else None
    if masks is None:
        batch["labels"] = [list(ids) for ids in batch["input_ids"]]
        return batch

    # Plain lists of lists; HF will convert to tensors later
    batch["labels"] = [
        [token if keep else IGNORE_INDEX for token, keep in zip(ids, mask)]
        for ids, mask in zip(batch["input_ids"], masks)
    ]
    return batch

# Attach the labels column to every split, then make the datasets hand back
# torch tensors instead of Python lists.
tokenized = tokenized.map(add_labels, batched=True)
tokenized.set_format(type="torch")

# Splits used for fine-tuning and for periodic evaluation
train_ds, eval_ds = tokenized["train"], tokenized["validation"]
print(train_ds[0].keys()) # Now has input_ids, attention_mask & labels

Map:   0%|          | 0/4934 [00:00<?, ? examples/s]

Map:   0%|          | 0/87866 [00:00<?, ? examples/s]

Map:   0%|          | 0/4887 [00:00<?, ? examples/s]

Map:   0%|          | 0/4934 [00:00<?, ? examples/s]

Map:   0%|          | 0/87866 [00:00<?, ? examples/s]

Map:   0%|          | 0/4887 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])


# Step 4: Load Qwen2.5 Model to GPU

In [None]:
import torch
from transformers import AutoModelForCausalLM

# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# Load the pretrained checkpoint named by `model_name` and move it onto the
# selected device (the trailing expression also displays the architecture).
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
model.to(device)

Using device: cuda


config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2

# Step 5: Trainer

In [None]:
from transformers import Trainer, TrainingArguments

# Checkpoints are written to Drive so they survive the Colab session.
output_dir = "/content/drive/MyDrive/DATASCI266/RACE/models/qwen2_5_0_5B_race_mc"

# One sample per step with 16 accumulation steps gives an effective batch of
# 16 sequences per device. Evaluation and checkpointing both happen every 500
# optimizer steps; save_total_limit=2 caps how many checkpoints are kept on
# disk, so older ones are removed as training progresses.
args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    warmup_ratio=0.1,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    logging_steps=50,
    logging_strategy="steps",
    report_to="none"
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer
# (the old spelling triggers a deprecation warning on this call).
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    processing_class=tokenizer,
)

trainer.train()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss
500,1.8284,1.83466
1000,1.7949,1.823081


Step,Training Loss,Validation Loss
500,1.8284,1.83466
1000,1.7949,1.823081
1500,1.717,1.832744
2000,1.7013,1.82751
2500,1.6359,1.82376


# Step 6: Use Model

## Step 6.1: Load Model & Dataset

In [None]:
!pip install -q transformers datasets accelerate

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Load in tokenizer (taken from the base model the fine-tune started from)
base_model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True
)


# Load in finetuned model & set to eval mode.
# The directory must match the `output_dir` used by the Step 5 Trainer
# (the RACE run), since the results below are saved as RACE results; this
# previously pointed at the NewsQA model directory.
# checkpoint-2500 is the last step shown in the Step 5 training log.
# NOTE(review): the logged validation loss was lowest at step 1000, but with
# save_total_limit=2 that checkpoint has probably been deleted - confirm which
# checkpoint should be evaluated.
model_dir = "/content/drive/MyDrive/DATASCI266/RACE/models/qwen2_5_0_5B_race_mc/checkpoint-2500"
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    trust_remote_code=True,
)

model.to(device)
model.eval()

Using device: cuda


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896, padding_idx=151643)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
   

In [None]:
from datasets import load_dataset

# Load in BBQ dataset from HuggingFace. The evaluation loop iterates over
# `bbq.keys()`, treating each top-level entry as one bias category.
bbq = load_dataset("walledai/BBQ")

## Step 6.2: Helper Functions

In [None]:
letter_options = ["A", "B", "C"]  # BBQ has 3 choices

def build_prompt(example):
    '''
    Turn one BBQ example into the multiple-choice prompt given to Qwen2.5.

    `example` must provide "context", "question" and "choices" (exactly three
    answer strings). The returned text ends with "Answer:" so that the model's
    next tokens are its chosen letter.

    Raises ValueError when the example does not have exactly three choices.
    '''
    context, question, choices = (
        example[key] for key in ("context", "question", "choices")
    )

    # Guard against malformed rows before formatting anything
    if len(choices) != 3:
        raise ValueError(f"Expected 3 choices, got {len(choices)}")

    # "A. ...", "B. ...", "C. ..." - one newline-terminated line per choice
    options_block = "".join(
        f"{letter_options[idx]}. {choices[idx]}\n" for idx in range(3)
    )

    return (
        f"Context:\n{context}\n\n"
        f"Question:\n{question}\n\n"
        f"Options:\n{options_block}\n\n"
        "Answer with the letter (A, B, or C) only.\n"
        "Answer:"
    )

In [None]:
def predict_letter(example, max_new_tokens=2):
    '''
    Ask the model to answer one BBQ example and return its chosen letter.

    The prompt is built with `build_prompt`, decoded greedily for at most
    `max_new_tokens` tokens, and the first non-space character of the
    continuation is read as the answer.

    Returns "A", "B" or "C", or None when the continuation is empty or does
    not start with one of those letters.
    '''
    valid_letters = ("A", "B", "C")  # BBQ has 3 choices

    # Tokenize the prompt and remember its length so it can be sliced off later
    encoded = tokenizer(build_prompt(example), return_tensors="pt").to(device)
    prompt_length = encoded["input_ids"].shape[1]

    # No gradients are needed for prediction
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens produced after the prompt
    continuation = tokenizer.decode(
        generated[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # First non-space character is the answer, provided it is a valid letter
    if continuation and continuation[0].upper() in valid_letters:
        return continuation[0].upper()
    return None

In [None]:
def get_letter(example):
    """
    Return the gold answer of a BBQ example as a letter.

    `example["answer"]` may be an integer index (0, 1, 2) or a letter string
    ('A', 'B', 'C'; case and surrounding whitespace are ignored). Anything
    else - an out-of-range index, an unknown string, or another type -
    yields None.
    """
    valid_letters = ("A", "B", "C")  # BBQ has 3 choices
    raw_answer = example["answer"]

    # Integer label: use it as an index into the letters
    if isinstance(raw_answer, int):
        in_range = 0 <= raw_answer < len(valid_letters)
        return valid_letters[raw_answer] if in_range else None

    # String label: normalise and check membership
    if isinstance(raw_answer, str):
        candidate = raw_answer.strip().upper()
        return candidate if candidate in valid_letters else None

    # Unsupported label type
    return None

## Step 6.3: Save Results

In [None]:
from tqdm.auto import tqdm

# BBQ's letter options
letter_options = ["A", "B", "C"]

category_results = {} # Per-category info
all_predictions = []  # Per-individual-example across each category

for category_name in bbq.keys():
    category_dataset = bbq[category_name]

    num_correct_predictions = 0
    num_evaluated_examples  = 0
    num_invalid_predictions = 0  # model output was not A/B/C
    num_skipped_examples    = 0  # gold label could not be read

    print(f"\nEvaluating category: {category_name} (n={len(category_dataset)})")

    for question_index_within_category, example in enumerate(tqdm(category_dataset)):

        # Get the correct answer letter (A/B/C); get_letter returns None when
        # the label is unusable, in which case the example cannot be scored.
        correct_answer_letter = get_letter(example)
        if correct_answer_letter not in letter_options:
            num_skipped_examples += 1
            continue

        # predict_letter returns None when the model's output is not a valid
        # letter. That counts as a wrong answer instead of crashing the run
        # (list.index(None) would raise ValueError).
        predicted_answer_letter = predict_letter(example)

        # Convert (A/B/C) to (0/1/2)
        correct_answer_index = letter_options.index(correct_answer_letter)
        if predicted_answer_letter in letter_options:
            predicted_answer_index = letter_options.index(predicted_answer_letter)
        else:
            predicted_answer_index = None
            num_invalid_predictions += 1

        # Check if correct or not
        prediction_is_correct = int(predicted_answer_index == correct_answer_index)
        num_evaluated_examples += 1
        if prediction_is_correct:
            num_correct_predictions += 1

        # Save example-specific info
        all_predictions.append({
            "category": category_name,
            "question_index": question_index_within_category,  # index within this category
            "context": example["context"],
            "question": example["question"],
            "choices": example["choices"],
            "true_answer_letter": correct_answer_letter,
            "true_answer_index": correct_answer_index,
            "predicted_answer_letter": predicted_answer_letter,
            "predicted_answer_index": predicted_answer_index,
            "correct": prediction_is_correct,
        })

    # Compute accuracy for this category; it is undefined (None) when nothing
    # could be evaluated, which avoids a ZeroDivisionError on empty categories.
    if num_evaluated_examples:
        accuracy = num_correct_predictions / num_evaluated_examples
    else:
        accuracy = None
    category_results[category_name] = {
        "accuracy": accuracy,
        "correct": num_correct_predictions,
        "total": num_evaluated_examples,
        "invalid_predictions": num_invalid_predictions,
        "skipped": num_skipped_examples,
    }


Evaluating category: age (n=3680)


  0%|          | 0/3680 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Evaluating category: disabilityStatus (n=1556)


  0%|          | 0/1556 [00:00<?, ?it/s]


Evaluating category: genderIdentity (n=5672)


  0%|          | 0/5672 [00:00<?, ?it/s]


Evaluating category: nationality (n=3080)


  0%|          | 0/3080 [00:00<?, ?it/s]


Evaluating category: physicalAppearance (n=1576)


  0%|          | 0/1576 [00:00<?, ?it/s]


Evaluating category: raceEthnicity (n=6880)


  0%|          | 0/6880 [00:00<?, ?it/s]


Evaluating category: raceXSes (n=11160)


  0%|          | 0/11160 [00:00<?, ?it/s]


Evaluating category: raceXGender (n=15960)


  0%|          | 0/15960 [00:00<?, ?it/s]


Evaluating category: religion (n=1200)


  0%|          | 0/1200 [00:00<?, ?it/s]


Evaluating category: ses (n=6864)


  0%|          | 0/6864 [00:00<?, ?it/s]


Evaluating category: sexualOrientation (n=864)


  0%|          | 0/864 [00:00<?, ?it/s]

In [None]:
import json
import os

# All evaluation artifacts go into one folder on Drive (created if missing)
save_path = "/content/drive/MyDrive/DATASCI266/RACE/bbq_runs_race"
os.makedirs(save_path, exist_ok=True)

# 1) Category-level summary: one pretty-printed JSON document
results_file = os.path.join(save_path, "qwen2_5_bbq_results_RACE.json")
with open(results_file, "w") as summary_out:
    summary_out.write(json.dumps(category_results, indent=2))
print("Saved summary →", results_file)

# 2) Per-example predictions: JSONL, one JSON object per line
preds_file = os.path.join(save_path, "qwen2_5_bbq_predictions_RACE.jsonl")
with open(preds_file, "w") as preds_out:
    preds_out.writelines(
        json.dumps(record) + "\n" for record in all_predictions
    )
print("Saved per-example predictions →", preds_file)

Saved summary → /content/drive/MyDrive/DATASCI266/RACE/bbq_runs_race/qwen2_5_bbq_results_RACE.json
Saved per-example predictions → /content/drive/MyDrive/DATASCI266/RACE/bbq_runs_race/qwen2_5_bbq_predictions_RACE.jsonl
