In [1]:
# debug_qa_windows.py

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# 1) Local checkpoint + JSON paths
CHECKPOINT    = "qa_roberta_checkpoint"    # your saved model folder
ORIGINAL_JSON = "train.json"            # your original QA JSON

# 2) Reload tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, local_files_only=True)
model     = AutoModelForQuestionAnswering.from_pretrained(CHECKPOINT, local_files_only=True)
model.eval()

# 3) Load & concatenate full context
data = load_dataset("json", data_files={"full": ORIGINAL_JSON}, field="data")["full"]
record  = data[0]  # the race you trained on
context = "\n\n".join(p["context"] for p in record["paragraphs"])
print(f"[Loaded context length: {len(context)} chars]\n")

# 4) Question to debug
question = "Who won the race?"
q_tokens = tokenizer.tokenize(question)
c_tokens = tokenizer.tokenize(context)
print(f"Question → {len(q_tokens)} tokens")
print(f"Context  → {len(c_tokens)} tokens")




# 5) Tokenize into sliding windows (mirror training)
enc = tokenizer(
    question,
    context,
    max_length=384,
    truncation="only_second",       # only cut the context to max_length
    stride=128,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding="max_length",              # pad all windows to same length
    return_tensors="pt"
)

num_windows = enc.input_ids.size(0)
print(f"→ Generated {num_windows} windows\n")
lens = [len(win) for win in enc["input_ids"]]
print("Window lengths:", lens)
print("windows × length:", enc.input_ids.shape)
# 6) Loop & debug
best_score = -1e9
best_span  = (None, None, None)  # (win_i, start_idx, end_idx)

for win_i in range(num_windows):
    input_ids      = enc.input_ids[win_i : win_i+1]
    attention_mask = enc.attention_mask[win_i : win_i+1]
    offsets        = enc.offset_mapping[win_i].tolist()

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        start_logits = outputs.start_logits[0]
        end_logits   = outputs.end_logits[0]

    # Pick the highest start/end token (bruteforce)
    s_idx = int(torch.argmax(start_logits).item())
    e_idx = int(torch.argmax(end_logits).item())
    score = (start_logits[s_idx] + end_logits[e_idx]).item()

    # CLS token is always at position 0 in each window
    cls_id = input_ids[0, 0].item()

    # Print debug info for this window
    print(f"Window {win_i}:")
    print(f"  CLS token id     = {cls_id}")
    print(f"  Chosen start_idx = {s_idx}, end_idx = {e_idx}")
    print(f"  Combined score   = {score:.4f}")
    print(f"  Raw start_logits[:5] = {[round(x.item(),3) for x in start_logits[:5]]}")
    print(f"  Raw   end_logits[:5] = {[round(x.item(),3) for x in end_logits[:5]]}")
    print("  Offsets[0:5]         =", offsets[:5])
    print("  Token IDs [0:5]      =", input_ids[0].tolist()[:5], "...", input_ids[0].tolist()[-5:])
    print("")

    # Track best span across all windows
    if score > best_score:
        best_score = score
        best_span  = (win_i, s_idx, e_idx)

# 7) Report the global best
win_i, s_idx, e_idx = best_span
char_start = enc.offset_mapping[win_i][s_idx][0]
char_end   = enc.offset_mapping[win_i][e_idx][1]
answer     = context[char_start:char_end]

print("=== GLOBAL BEST SPAN ===")
print(f"Window {win_i}, tokens [{s_idx}, {e_idx}], score {best_score:.4f}")
print(f"Answer text: {repr(answer)}")


Token indices sequence length is longer than the specified maximum sequence length for this model (3465 > 512). Running this sequence through the model will result in indexing errors


[Loaded context length: 9555 chars]

Question → 5 tokens
Context  → 3465 tokens
→ Generated 14 windows

Window lengths: [384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384, 384]
windows × length: torch.Size([14, 384])
Window 0:
  CLS token id     = 0
  Chosen start_idx = 0, end_idx = 0
  Combined score   = 17.7573
  Raw start_logits[:5] = [8.908, -8.746, -8.745, -8.765, -8.729]
  Raw   end_logits[:5] = [8.849, -8.732, -8.73, -8.743, -8.71]
  Offsets[0:5]         = [[0, 0], [0, 3], [4, 7], [8, 11], [12, 16]]
  Token IDs [0:5]      = [0, 12375, 351, 5, 1015] ... [25, 365, 6, 7074, 2]

Window 1:
  CLS token id     = 0
  Chosen start_idx = 0, end_idx = 0
  Combined score   = 17.7523
  Raw start_logits[:5] = [8.906, -8.738, -8.733, -8.731, -8.754]
  Raw   end_logits[:5] = [8.846, -8.691, -8.695, -8.689, -8.705]
  Offsets[0:5]         = [[0, 0], [0, 3], [4, 7], [8, 11], [12, 16]]
  Token IDs [0:5]      = [0, 12375, 351, 5, 1015] ... [17058, 35, 42960, 4, 2]

Window 2:
  CLS toke