# Step 1: Setup (imports, configuration, device)

In [4]:
import json
import torch
import random
import os
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding
)
from peft import get_peft_model, LoraConfig, TaskType
from tqdm import tqdm 

# --- Paths ---------------------------------------------------------------
# Local snapshot of the base checkpoint that gets fine-tuned.
MODEL_NAME = "/Volumes/SSD/llm-projects/hugging_face/hub/models--Qwen--Qwen3-0.6B/snapshots/c1899de289a04d12100db370d81485cdf75e47ca"
DATASET_PATH = "verifier_dataset_train.json"
OUTPUT_DIR = "./verifier_lora"
# File name of the resumable training state, stored inside OUTPUT_DIR.
CHECKPOINT_FILE = "training_state.pt"

# --- Batching ------------------------------------------------------------
MAX_LENGTH = 256
BATCH_SIZE = 2
# Gradients are accumulated over this many batches per optimizer step,
# i.e. the effective batch size is BATCH_SIZE * GRAD_ACCUMULATION_STEPS.
GRAD_ACCUMULATION_STEPS = 16

# --- Optimisation --------------------------------------------------------
EPOCHS = 4
LEARNING_RATE = 1e-3  # previously tried: 5e-4
DEBUG_SAMPLE_SIZE = 1000  # Set to None for full run

# --- Device: prefer CUDA, then Apple MPS, otherwise CPU ------------------
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Running on device: {device}")

Running on device: mps


# Step 2: Tokenizer and dataset

In [5]:
# Tokenizer that ships with the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Batches are padded later on, so a pad token must exist; reuse EOS when the
# checkpoint does not define one.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token

class VerifierDataset(Dataset):
    """Flatten question records into one (prompt, label) sample per candidate answer.

    Every record in ``raw_data_list`` is read through its ``'question'``,
    ``'answers'`` and ``'answer_labels'`` keys. Answers and labels are paired
    positionally with ``zip``, so surplus entries on the longer side are ignored.

    Tokenization happens lazily in ``__getitem__`` and applies truncation to
    ``max_length`` but no padding.
    """

    def __init__(self, raw_data_list, tokenizer, max_length=512):
        """Build the flat sample list and remember the tokenizer settings."""
        self.samples = []
        for record in raw_data_list:
            question_text = record['question']
            candidates = record['answers']
            flags = record['answer_labels']
            self.samples.extend(
                {
                    "text": f"Question: {question_text}\nAnswer: {candidate}",
                    # Stored as float (0.0 / 1.0) because the loss is BCE on logits.
                    "label": float(flag),
                }
                for candidate, flag in zip(candidates, flags)
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of (question, answer) pairs."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its ids, attention mask and float label."""
        sample = self.samples[idx]
        tokenized = self.tokenizer(
            sample["text"], truncation=True, max_length=self.max_length, padding=False
        )
        features = {field: tokenized[field] for field in ("input_ids", "attention_mask")}
        features["labels"] = torch.tensor(sample["label"], dtype=torch.float)
        return features
    
print(f"Loading data from {DATASET_PATH}...")
# Fail fast: carrying on with an empty list only postpones the failure to
# loader construction / training, where the error no longer points at the
# missing file.
if not os.path.exists(DATASET_PATH):
    raise FileNotFoundError(f"Training dataset not found: {DATASET_PATH}")
with open(DATASET_PATH, 'r', encoding='utf-8') as f:
    raw_questions = json.load(f)

if DEBUG_SAMPLE_SIZE: raw_questions = raw_questions[:DEBUG_SAMPLE_SIZE]
if not raw_questions:
    raise ValueError(f"No questions loaded from {DATASET_PATH}; nothing to train on.")

# Fixed seed so the train/validation split is the same on every run.
random.seed(42)
random.shuffle(raw_questions)

# Split at the question level (90/10) so that all candidate answers of one
# question land on the same side of the split.
split_idx = int(0.9 * len(raw_questions))
if split_idx == 0 and len(raw_questions) > 0: split_idx = 1
train_questions = raw_questions[:split_idx]
val_questions = raw_questions[split_idx:]

train_dataset = VerifierDataset(train_questions, tokenizer, MAX_LENGTH)
val_dataset = VerifierDataset(val_questions, tokenizer, MAX_LENGTH)

# Samples are tokenized without padding; the collator pads each batch to its
# own longest sequence.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Pinned host memory only speeds up host-to-GPU copies on CUDA. On other
# backends (this notebook was run on MPS) it brings no benefit, so request it
# only when CUDA is actually present.
use_pinned_memory = torch.cuda.is_available()
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collator, pin_memory=use_pinned_memory)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collator, pin_memory=use_pinned_memory)

print(f"Train Samples: {len(train_dataset)} | Val Samples: {len(val_dataset)}")


Loading data from verifier_dataset_train.json...
Train Samples: 28800 | Val Samples: 3200


# Step 3: Model with LoRA and optimizer

In [None]:
print("Loading model...")
# num_labels=1: the classification head emits a single logit per sequence,
# which is what the BCE-with-logits loss below consumes.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=1,
    device_map=device,
    # `torch_dtype` is deprecated in the installed transformers release (it
    # prints "`torch_dtype` is deprecated! Use `dtype` instead!"), so use the
    # replacement keyword.
    dtype=torch.bfloat16
)
# The sequence-classification head needs to know which id is padding; keep it
# in sync with the tokenizer (which may have fallen back to EOS).
model.config.pad_token_id = tokenizer.pad_token_id

# Apply LoRA: low-rank adapters on the attention query/value projections.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1, #0.05
    target_modules=["q_proj", "v_proj"],
    # use_dora=True #DoRA option, maybe for bigger model
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# The scheduler advances once per optimizer step, i.e. once per
# GRAD_ACCUMULATION_STEPS batches, not once per batch.
num_training_steps = len(train_loader) * EPOCHS // GRAD_ACCUMULATION_STEPS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
loss_fn = torch.nn.BCEWithLogitsLoss() # Binary cross-entropy applied directly to the raw logit



`torch_dtype` is deprecated! Use `dtype` instead!


Loading model...


Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at /Volumes/SSD/llm-projects/hugging_face/hub/models--Qwen--Qwen3-0.6B/snapshots/c1899de289a04d12100db370d81485cdf75e47ca and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,147,904 || all params: 597,198,848 || trainable%: 0.1922


# Step 4: Resume from the checkpoint

In [7]:
# Training starts at epoch 0 unless a saved training state says otherwise.
start_epoch = 0
checkpoint_path = os.path.join(OUTPUT_DIR, CHECKPOINT_FILE)

if not os.path.exists(checkpoint_path):
    print("No checkpoint found. Starting fresh.")
    # Make sure the directory the training loop saves into exists.
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
else:
    print(f"Found checkpoint: {checkpoint_path}")
    print("Loading state to resume training...")
    checkpoint = torch.load(checkpoint_path, map_location=device)

    # Restore, in order: model weights, optimizer state, LR-schedule position.
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    # The stored epoch is the last one that was saved, so continue with the next.
    start_epoch = checkpoint['epoch'] + 1

    print(f"Resuming from Epoch {start_epoch + 1}")

Found checkpoint: ./verifier_lora_adapter1/training_state.pt
Loading state to resume training...
Resuming from Epoch 3


# Step 5: Training

In [1]:
print("\nStarting Manual Training Loop...")
print(f"Effective Batch Size: {BATCH_SIZE * GRAD_ACCUMULATION_STEPS}")

num_train_batches = len(train_loader)

for epoch in range(start_epoch, EPOCHS):
    # ------------------------------------------------------------------
    # Train for one epoch with gradient accumulation.
    # ------------------------------------------------------------------
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    progress_bar = tqdm(enumerate(train_loader), total=num_train_batches, desc=f"Epoch {epoch+1}/{EPOCHS}")

    for step, batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])

        # Drop the trailing logit dimension so predictions line up with the labels.
        logits = outputs.logits.squeeze(-1)

        loss = loss_fn(logits, batch['labels'])
        # Scale so the accumulated gradient is the mean over the window, not the sum.
        loss = loss / GRAD_ACCUMULATION_STEPS

        loss.backward()

        # Step on every full accumulation window AND on the last batch of the
        # epoch. Without the second condition, the gradients of a trailing
        # partial window were silently thrown away by the zero_grad() at the
        # start of the next epoch whenever the number of batches was not a
        # multiple of GRAD_ACCUMULATION_STEPS.
        window_complete = (step + 1) % GRAD_ACCUMULATION_STEPS == 0
        is_last_batch = (step + 1) == num_train_batches
        if window_complete or is_last_batch:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Undo the accumulation scaling so the reported loss is per batch.
        current_loss = loss.item() * GRAD_ACCUMULATION_STEPS
        total_loss += current_loss
        progress_bar.set_postfix({'loss': current_loss})

    avg_train_loss = total_loss / num_train_batches if num_train_batches else 0.0

    # ------------------------------------------------------------------
    # Validate: accuracy of thresholding the sigmoid score at 0.5.
    # ------------------------------------------------------------------
    model.eval()
    val_correct = 0
    val_total = 0

    print(f"Validating Epoch {epoch+1}...")
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
            logits = outputs.logits.squeeze(-1)

            probs = torch.sigmoid(logits)

            predictions = (probs > 0.5).float()
            val_correct += (predictions == batch['labels']).sum().item()
            val_total += len(batch['labels'])

    val_acc = val_correct / val_total if val_total > 0 else 0
    print(f"Epoch {epoch+1} Finished | Train Loss: {avg_train_loss:.4f} | Val Acc: {val_acc:.2%}")

    # ------------------------------------------------------------------
    # Save: the adapter + tokenizer for inference, and the full training
    # state (model/optimizer/scheduler/epoch) for resuming.
    # ------------------------------------------------------------------
    print("Saving Checkpoint...")

    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
    }, checkpoint_path)
    print("Checkpoint Saved.\n")

print("Done.")

  from .autonotebook import tqdm as notebook_tqdm


Running on device: mps
Loading data from verifier_dataset_train.json...


`torch_dtype` is deprecated! Use `dtype` instead!


Train Samples: 28800 | Val Samples: 3200
Loading model...


Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at /Volumes/SSD/llm-projects/hugging_face/hub/models--Qwen--Qwen3-0.6B/snapshots/c1899de289a04d12100db370d81485cdf75e47ca and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,147,904 || all params: 597,198,848 || trainable%: 0.1922
No checkpoint found. Starting fresh.

Starting Manual Training Loop...
Effective Batch Size: 32


Epoch 1/4: 100%|██████████| 14400/14400 [5:28:48<00:00,  1.37s/it, loss=0.0287]    


Validating Epoch 1...
Epoch 1 Finished | Train Loss: 0.3449 | Val Acc: 80.06%
Saving Checkpoint...
Checkpoint Saved.



Epoch 2/4: 100%|██████████| 14400/14400 [5:28:36<00:00,  1.37s/it, loss=0.0437]    


Validating Epoch 2...
Epoch 2 Finished | Train Loss: 0.1828 | Val Acc: 81.94%
Saving Checkpoint...
Checkpoint Saved.



Epoch 3/4:   1%|          | 84/14400 [02:17<5:41:22,  1.43s/it, loss=0.00189] 

# Step 6-1: Testing (Best-of-N selection)

In [1]:
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
from tqdm import tqdm
import os


BASE_MODEL = "/Volumes/SSD/llm-projects/hugging_face/hub/models--Qwen--Qwen3-0.6B/snapshots/c1899de289a04d12100db370d81485cdf75e47ca"
ADAPTER_PATH = "./verifier_lora"
TEST_DATA_PATH = "verifier_dataset_test.json"
MAX_LENGTH = 256
BATCH_SIZE = 8

# Set to 100 to check only the first 100 questions. Set to None for full test.
DEBUG_TEST_SIZE = 100

device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Running evaluation on: {device}")

# ==============================================================================
# 1. Load Model & Adapter
# ==============================================================================
print("Loading Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading Base Model...")
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=1,
    device_map=device,
    # Evaluate in the same precision the adapter was trained against: the
    # training cell loads the base model in bfloat16, so loading it in float16
    # here would score candidates with different numerics than training saw.
    # `dtype` replaces the deprecated `torch_dtype` keyword (see the
    # deprecation warning in this notebook's output).
    dtype=torch.bfloat16
)
model.config.pad_token_id = tokenizer.pad_token_id

print(f"Loading Trained Adapter from {ADAPTER_PATH}...")
model = PeftModel.from_pretrained(model, ADAPTER_PATH)
model.eval()

# ==============================================================================
# 2. Load & Slice Test Data
# ==============================================================================
print(f"Loading Test Data from {TEST_DATA_PATH}...")
if not os.path.exists(TEST_DATA_PATH):
    # Keep going with an empty list: the loop below is skipped and the final
    # report prints "No data processed."
    print(f"❌ Error: {TEST_DATA_PATH} not found. Please run generation script with ds['test'] first.")
    test_data = []
else:
    # Explicit UTF-8, matching how the training cell reads its dataset; the
    # platform default encoding is locale-dependent.
    with open(TEST_DATA_PATH, 'r', encoding='utf-8') as f:
        test_data = json.load(f)

# --- SLICING LOGIC ---
if DEBUG_TEST_SIZE and len(test_data) > DEBUG_TEST_SIZE:
    print(f"⚠️ DEBUG MODE: Evaluating only the first {DEBUG_TEST_SIZE} questions.")
    test_data = test_data[:DEBUG_TEST_SIZE]
else:
    print(f"Evaluating full dataset: {len(test_data)} questions.")

print(f"Loaded {len(test_data)} test questions.")

# ==============================================================================
# 3. Evaluation Loop (Best-of-N)
# ==============================================================================
# Per-question counters:
#   correct_base     - the first candidate is labelled correct
#   correct_verifier - the candidate with the highest verifier score is correct
#   correct_oracle   - at least one candidate is labelled correct
stats = dict.fromkeys(("total", "correct_base", "correct_verifier", "correct_oracle"), 0)

print("Starting Evaluation...")

for entry in tqdm(test_data):
    question, answers, labels = entry['question'], entry['answers'], entry['answer_labels']

    # Nothing to rank for this question.
    if not answers:
        continue

    # Same prompt template as used for training.
    inputs_text = [f"Question: {question}\nAnswer: {ans}" for ans in answers]

    # Score every candidate, BATCH_SIZE prompts at a time.
    scores = []
    with torch.no_grad():
        for start in range(0, len(inputs_text), BATCH_SIZE):
            chunk = inputs_text[start : start + BATCH_SIZE]
            encodings = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=MAX_LENGTH,
            ).to(device)

            chunk_logits = model(**encodings).logits.squeeze(-1)
            scores += torch.sigmoid(chunk_logits).cpu().numpy().tolist()

    best_idx = np.argmax(scores)

    stats["total"] += 1

    # Metric 1: Base model - the first candidate stands in for an unranked pick.
    stats["correct_base"] += int(labels[0] == 1)

    # Metric 2: Verifier - the top-scored candidate.
    stats["correct_verifier"] += int(labels[best_idx] == 1)

    # Metric 3: Oracle - is any candidate correct at all?
    stats["correct_oracle"] += int(sum(labels) > 0)

# ==============================================================================
# 4. Final Report
# ==============================================================================
if not stats["total"] > 0:
    print("No data processed.")
else:
    total = stats["total"]
    # Turn the raw counters into fractions of evaluated questions.
    acc_base = stats["correct_base"] / total
    acc_ver = stats["correct_verifier"] / total
    acc_oracle = stats["correct_oracle"] / total
    # Gain of verifier-ranked selection over always taking the first candidate.
    improvement = acc_ver - acc_base

    report_lines = [
        "\n" + "="*60,
        "FINAL EVALUATION REPORT",
        "="*60,
        f"Total Questions Evaluated: {total}",
        f"Base Model Accuracy (Greedy):   {acc_base:.2%}  (Random/First guess)",
        f"Verifier Accuracy (Best-of-N):  {acc_ver:.2%}   (Your Trained Model)",
        f"Oracle Accuracy (Theoretical):  {acc_oracle:.2%}   (Perfect Selection)",
        "-" * 60,
        f"Verifier Improvement: {improvement:+.2%}",
    ]
    print("\n".join(report_lines))

    verdict = (
        "✅ SUCCESS: Your Verifier is helping!"
        if improvement > 0
        else "❌ FAILURE: Your Verifier is hurting or not learning."
    )
    print(verdict)

  from .autonotebook import tqdm as notebook_tqdm


Running evaluation on: mps
Loading Tokenizer...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading Base Model...


Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at /Volumes/SSD/llm-projects/hugging_face/hub/models--Qwen--Qwen3-0.6B/snapshots/c1899de289a04d12100db370d81485cdf75e47ca and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading Trained Adapter from ./verifier_lora_adapter...
Loading Test Data from verifier_dataset_test.json...
⚠️ DEBUG MODE: Evaluating only the first 100 questions.
Loaded 100 test questions.
Starting Evaluation...


100%|██████████| 100/100 [09:24<00:00,  5.64s/it]


FINAL EVALUATION REPORT
Total Questions Evaluated: 100
Base Model Accuracy (Greedy):   44.00%  (Random/First guess)
Verifier Accuracy (Best-of-N):  64.00%   (Your Trained Model)
Oracle Accuracy (Theoretical):  92.00%   (Perfect Selection)
------------------------------------------------------------
Verifier Improvement: +20.00%
✅ SUCCESS: Your Verifier is helping!





# Step 6-2: Testing with abstention

In [2]:
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
from tqdm import tqdm
import os

# ==============================================================================
# CONFIGURATION
# ==============================================================================
BASE_MODEL = "/Volumes/SSD/llm-projects/hugging_face/hub/models--Qwen--Qwen3-0.6B/snapshots/c1899de289a04d12100db370d81485cdf75e47ca"
ADAPTER_PATH = "./verifier_lora"
TEST_DATA_PATH = "verifier_dataset_test.json"
MAX_LENGTH = 256
BATCH_SIZE = 8

# Set to None for full run, or 100 for quick test
DEBUG_TEST_SIZE = 100

# Threshold for saying "None of these are correct": if even the best candidate
# scores below this, the verifier abstains instead of picking an answer.
CONFIDENCE_THRESHOLD = 0.5

device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Running evaluation on: {device}")

# ==============================================================================
# 1. Load Model & Adapter
# ==============================================================================
print("Loading Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading Base Model...")
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=1,
    device_map=device,
    # Evaluate in the same precision the adapter was trained against: the
    # training cell loads the base model in bfloat16, so loading it in float16
    # here would score candidates with different numerics than training saw.
    # `dtype` replaces the deprecated `torch_dtype` keyword (see the
    # deprecation warning in this notebook's output).
    dtype=torch.bfloat16
)
model.config.pad_token_id = tokenizer.pad_token_id

print(f"Loading Trained Adapter from {ADAPTER_PATH}...")
model = PeftModel.from_pretrained(model, ADAPTER_PATH)
model.eval()

# ==============================================================================
# 2. Load & Slice Test Data
# ==============================================================================
print(f"Loading Test Data from {TEST_DATA_PATH}...")
if not os.path.exists(TEST_DATA_PATH):
    # Keep going with an empty list: the loop below is skipped and the final
    # report prints "No data processed."
    print(f"❌ Error: {TEST_DATA_PATH} not found.")
    test_data = []
else:
    # Explicit UTF-8, matching how the training cell reads its dataset; the
    # platform default encoding is locale-dependent.
    with open(TEST_DATA_PATH, 'r', encoding='utf-8') as f:
        test_data = json.load(f)

# Optionally restrict the run to the first DEBUG_TEST_SIZE questions.
if DEBUG_TEST_SIZE and len(test_data) > DEBUG_TEST_SIZE:
    print(f"⚠️ DEBUG MODE: Evaluating only the first {DEBUG_TEST_SIZE} questions.")
    test_data = test_data[:DEBUG_TEST_SIZE]
else:
    print(f"Evaluating full dataset: {len(test_data)} questions.")

print(f"Loaded {len(test_data)} test questions.")

# ==============================================================================
# 3. Evaluation Loop (Best-of-N with Abstention)
# ==============================================================================
# Per-question counters:
#   correct_base     - the first candidate is labelled correct
#   correct_verifier - verifier picked a correct candidate, or abstained when
#                      no candidate was correct
#   correct_oracle   - at least one candidate is labelled correct
#   abstain_correct  - verifier abstained and there was indeed no correct answer
#   abstain_wrong    - verifier abstained although a correct answer existed
stats = dict.fromkeys(
    (
        "total",
        "correct_base",
        "correct_verifier",
        "correct_oracle",
        "abstain_correct",
        "abstain_wrong",
    ),
    0,
)

print("Starting Evaluation...")

for entry in tqdm(test_data):
    # labels: 1 = Correct, 0 = Wrong
    question, answers, labels = entry['question'], entry['answers'], entry['answer_labels']

    # Nothing to rank for this question.
    if not answers:
        continue

    # --- Prepare Inputs (same prompt template as training) ---
    inputs_text = [f"Question: {question}\nAnswer: {ans}" for ans in answers]

    # --- Score Candidates, BATCH_SIZE prompts at a time ---
    scores = []
    with torch.no_grad():
        for start in range(0, len(inputs_text), BATCH_SIZE):
            chunk = inputs_text[start : start + BATCH_SIZE]
            encodings = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=MAX_LENGTH,
            ).to(device)

            chunk_logits = model(**encodings).logits.squeeze(-1)
            # Sigmoid maps each logit to a score between 0.0 and 1.0.
            scores += torch.sigmoid(chunk_logits).cpu().numpy().tolist()

    # --- Decision Logic ---
    best_idx = np.argmax(scores)
    best_score = scores[best_idx]

    # Is there actually ANY correct answer among the candidates?
    has_correct_answer = (sum(labels) > 0)

    # Metric 1: Base model - the first candidate.
    stats["correct_base"] += int(labels[0] == 1)

    # Metric 2: Oracle - what a perfect selector could achieve.
    stats["correct_oracle"] += int(has_correct_answer)

    # Metric 3: Verifier, allowed to abstain when its best score is too low.
    abstained = best_score < CONFIDENCE_THRESHOLD
    if abstained and not has_correct_answer:
        # Correct abstention: there truly was no correct answer.
        stats["correct_verifier"] += 1
        stats["abstain_correct"] += 1
    elif abstained:
        # Failed abstention: a correct answer existed but was rejected.
        stats["abstain_wrong"] += 1
    elif labels[best_idx] == 1:
        # The verifier committed to a candidate and it is correct.
        stats["correct_verifier"] += 1

    stats["total"] += 1

# ==============================================================================
# 4. Final Report
# ==============================================================================
if not stats["total"] > 0:
    print("No data processed.")
else:
    total = stats["total"]
    # Turn the raw counters into fractions of evaluated questions.
    acc_base = stats["correct_base"] / total
    acc_ver = stats["correct_verifier"] / total
    acc_oracle = stats["correct_oracle"] / total

    report_lines = [
        "\n" + "="*60,
        "FINAL EVALUATION REPORT (With Abstention)",
        "="*60,
        f"Total Questions: {total}",
        f"Base Model Accuracy:    {acc_base:.2%}",
        f"Oracle Accuracy:        {acc_oracle:.2%}",
        f"Verifier Accuracy:      {acc_ver:.2%}  <-- (Includes correct 'None' predictions)",
        "-" * 60,
        f"Verifier Improvement:   {acc_ver - acc_base:+.2%}",
        "-" * 60,
        "Abstention Stats:",
        f"  - Correctly Abstains: {stats['abstain_correct']} times (Saved us from wrong answers)",
        f"  - Wrongly Abstains:   {stats['abstain_wrong']} times (Missed a good answer)",
    ]
    print("\n".join(report_lines))

Running evaluation on: mps
Loading Tokenizer...
Loading Base Model...


Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at /Volumes/SSD/llm-projects/hugging_face/hub/models--Qwen--Qwen3-0.6B/snapshots/c1899de289a04d12100db370d81485cdf75e47ca and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading Trained Adapter from ./verifier_lora_adapter...
Loading Test Data from verifier_dataset_test.json...
⚠️ DEBUG MODE: Evaluating only the first 100 questions.
Loaded 100 test questions.
Starting Evaluation...


100%|██████████| 100/100 [09:58<00:00,  5.98s/it]


FINAL EVALUATION REPORT (With Abstention)
Total Questions: 100
Base Model Accuracy:    44.00%
Oracle Accuracy:        92.00%
Verifier Accuracy:      66.00%  <-- (Includes correct 'None' predictions)
------------------------------------------------------------
Verifier Improvement:   +22.00%
------------------------------------------------------------
Abstention Stats:
  - Correctly Abstains: 3 times (Saved us from wrong answers)
  - Wrongly Abstains:   8 times (Missed a good answer)



