# Qwen3-4B-Instruct-2507 Base Model Format Testing

Test whether the base model can follow the expected `<think>...</think>` format for SRL training.

## 1. Install Dependencies

In [None]:
!pip install -q transformers accelerate torch

## 2. Load Base Model

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face model id used for both the tokenizer and the model in this notebook.
MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"

print(f"Loading model: {MODEL_NAME}")

# Load tokenizer
# NOTE(review): trust_remote_code=True permits code shipped in the model repo to
# run locally — confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="left",
)
# Fall back to the EOS token for padding when the tokenizer defines no pad token;
# the generation calls later in this notebook pass tokenizer.pad_token_id explicitly.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Tokenizer loaded. Vocab size: {tokenizer.vocab_size}")

In [None]:
# Load model
print("Loading model (this may take a minute)...")

# Weights are requested in bfloat16, placement is delegated to device_map="auto",
# and attention uses the "sdpa" implementation.
# NOTE(review): trust_remote_code=True permits code shipped in the model repo to
# run locally — confirm it is actually required for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="sdpa",
)
# Put the model in evaluation mode; the cells below only call generate().
model.eval()

# Reports the device of the first parameter only.
print(f"Model loaded on: {next(model.parameters()).device}")
if torch.cuda.is_available():
    # memory_allocated(0) is queried for CUDA device index 0 only.
    print(f"GPU memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

## 3. Define Prompt Format and Parsing

In [None]:
from typing import List, Optional, Tuple

# Instruction text placed at the top of every raw SRL prompt. It is kept verbatim
# from training, so do not reword it here.
PROMPT_PREAMBLE = (
    "You are a helpful assistant for solving mathematical problems. "
    "A user will provide a math problem, which may include a partial solution. "
    "Your task is to continue the solution by providing the very next logical step. "
    "A user will ask you to solve a task. You should first draft your thinking process "
    "(inner monologue). Then, generate the solution. "
    "Your response format must follow the template below:\n"
    "<think> Your thoughts or/and draft, like working through an exercise on scratch paper. "
    "Be as casual and as long as you want until you are confident to generate a correct solution. </think>\n"
    "Provide only the single, next step to continue the solution. Do not solve the entire problem."
)


def build_srl_prompt(
    problem: str,
    previous_steps: List[str] = None,
    step_title: Optional[str] = None,
) -> str:
    """Assemble the raw (non-chat) SRL prompt as a newline-joined string.

    Layout: the preamble, a blank line, a ``Problem:`` header, the stripped
    problem text, a blank line, then each previous step (stripped) on its own
    line and finally the stripped ``step_title`` when one is given.

    Args:
        problem: Problem statement; surrounding whitespace is removed.
        previous_steps: Steps already taken, in order. ``None`` means no steps.
        step_title: Optional title of the step to be generated; skipped when
            falsy.

    Returns:
        The complete prompt text.
    """
    steps = () if previous_steps is None else previous_steps

    lines = [PROMPT_PREAMBLE, "", "Problem:", problem.strip(), ""]
    lines.extend(step.strip() for step in steps)
    if step_title:
        lines.append(step_title.strip())

    return "\n".join(lines)


def parse_model_output(text: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Split a model completion into ``(thought, action)``.

    The completion is cut at the first ``</think>``. Everything after it
    (stripped) is the action. The thought is the text before it, starting
    after the first ``<think>`` that precedes the closing tag; when no opening
    tag precedes it, the whole prefix is taken as the thought.

    Args:
        text: Decoded model output, or ``None``.

    Returns:
        ``(thought, action)`` as stripped strings (either may be empty), or
        ``(None, None)`` when ``text`` is ``None`` or has no ``</think>``.
    """
    if text is None:
        return (None, None)

    before_close, close_tag, after_close = text.partition("</think>")
    if not close_tag:
        return (None, None)  # Format error: the closing tag is mandatory.

    # Look for the opening tag only in the text *before* the closing tag. A stray
    # "<think>" later in the output must not wipe out the thought text.
    _, open_tag, after_open = before_close.partition("<think>")
    thought = after_open if open_tag else before_close

    return (thought.strip(), after_close.strip())


print("Functions defined.")

## 4. Generation Function

In [None]:
def generate_step(
    problem: str,
    previous_steps: List[str] = None,
    max_new_tokens: int = 512,
    temperature: float = 0.7,
    do_sample: bool = True,
    verbose: bool = True,
):
    """Run one raw-prompt generation and check the ``<think>`` output format.

    The prompt comes from ``build_srl_prompt`` and is tokenized as plain text
    (no chat template). Only the newly generated tokens are decoded and handed
    to ``parse_model_output``.

    Args:
        problem: Problem statement for the prompt.
        previous_steps: Steps already taken; ``None`` is treated as no steps.
        max_new_tokens: Upper bound on generated tokens.
        temperature: Passed to ``generate`` only when ``do_sample`` is true;
            otherwise ``None`` is passed.
        do_sample: Sample when true, otherwise decode without sampling.
        verbose: Print the prompt, the raw output and a format report.

    Returns:
        Dict with ``raw_output``, ``thought``, ``action`` and ``is_valid``
        (true when the parser returned an action).
    """
    banner = "=" * 80
    rule = "-" * 40

    def _header(title):
        # Section title framed by two banner lines.
        print(banner)
        print(title)
        print(banner)

    steps = [] if previous_steps is None else previous_steps
    prompt = build_srl_prompt(problem, steps)

    if verbose:
        _header("PROMPT:")
        print(prompt)
        print()

    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = encoded["input_ids"].shape[1]

    sampling_temperature = temperature if do_sample else None
    with torch.no_grad():
        sequences = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            temperature=sampling_temperature,
            do_sample=do_sample,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Drop the prompt tokens so only the continuation is decoded.
    completion_ids = sequences[0][prompt_len:]
    generated = tokenizer.decode(completion_ids, skip_special_tokens=True)

    if verbose:
        _header("RAW OUTPUT:")
        print(generated)
        print()

    thought, action = parse_model_output(generated)

    if verbose:
        _header("FORMAT ANALYSIS:")

        print(f"  Has <think>:  {'<think>' in generated}")
        print(f"  Has </think>: {'</think>' in generated}")

        if thought is None and action is None:
            print("  ❌ FORMAT ERROR: Missing </think> tag!")
        else:
            print("  ✓ Valid format")
            print(f"  Thought: {len(thought)} chars")
            print(f"  Action:  {len(action)} chars")

        if thought:
            # Long thoughts are shown as a 500-character preview.
            preview = thought if len(thought) <= 500 else thought[:500] + "..."
            print("\nTHOUGHT:")
            print(rule)
            print(preview)

        if action:
            print("\nACTION (step):")
            print(rule)
            print(action)

    return {
        "raw_output": generated,
        "thought": thought,
        "action": action,
        "is_valid": action is not None,
    }


print("Generation function ready.")

## 5. Test Base Model Formatting

In [None]:
# Test 1: Simple problem
# All other arguments keep generate_step's defaults, so the prompt, raw output
# and format analysis are printed (verbose=True).
result1 = generate_step(
    problem="Calculate the derivative of f(x) = x^3 + 2x^2 - 5x + 1",
    do_sample=False,  # Greedy for reproducibility
)

In [None]:
# Test 2: Problem with previous step
# The supplied step is appended after the problem text in the prompt, so the
# model is asked for the step that follows it.
result2 = generate_step(
    problem="Solve the equation: 2x + 5 = 13",
    previous_steps=["Step 1: Subtract 5 from both sides: 2x + 5 - 5 = 13 - 5"],
    do_sample=False,
)

In [None]:
# Test 3: Simple arithmetic
# No previous steps are given, so the prompt ends right after the problem text.
result3 = generate_step(
    problem="What is 15% of 80?",
    do_sample=False,
)

In [None]:
# Test 4: Algebra
# Same settings as the earlier tests (no sampling, verbose report), different problem type.
result4 = generate_step(
    problem="Simplify: 3(x + 4) - 2(x - 1)",
    do_sample=False,
)

## 6. Batch Test Summary

In [None]:
# Problems used for the raw-prompt batch run below.
test_problems = [
    "What is 2 + 2?",
    "Find the area of a circle with radius 5.",
    "Solve x^2 = 16",
    "What is the derivative of sin(x)?",
    "Calculate 25 × 4",
]

print("Running batch test with greedy decoding...\n")

# One result dict per problem, in the same order as test_problems.
results = []
for i, problem in enumerate(test_problems, 1):
    print(f"Test {i}: {problem[:50]}...")
    # verbose=False suppresses the per-run report; only a one-line status is shown here.
    result = generate_step(problem, do_sample=False, verbose=False)
    results.append(result)
    status = "✓" if result["is_valid"] else "❌"
    print(f"  {status} Valid: {result['is_valid']}")
    if not result["is_valid"]:
        # Show the start of the raw output to help see why parsing failed.
        print(f"  Raw output preview: {result['raw_output'][:100]}...")
    print()

# Summary
# Count and percentage of runs for which the parser returned an action.
valid = sum(1 for r in results if r["is_valid"])
print("=" * 60)
print(f"SUMMARY: {valid}/{len(results)} valid format ({100*valid/len(results):.0f}%)")
print("=" * 60)

## 7. Test with Chat Template (Qwen's Native Format)

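Qwen3 Instruct models expect chat-formatted input, so the raw-prompt tests above may not reflect how the model behaves in its native format. This section repeats the test with the prompt built through the tokenizer's chat template instead.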

In [None]:
def generate_with_chat_template(
    problem: str,
    previous_steps: List[str] = None,
    max_new_tokens: int = 512,
    do_sample: bool = False,
    verbose: bool = True,
):
    """Run one generation through the tokenizer's chat template and check the format.

    A system message states the ``<think>`` response template. The user
    message carries the problem, any previous steps, and a request for the
    next step. Only the newly generated tokens are decoded and handed to
    ``parse_model_output``.

    Args:
        problem: Problem statement placed in the user message.
        previous_steps: Steps already taken; omitted from the message when
            ``None`` or empty.
        max_new_tokens: Upper bound on generated tokens.
        do_sample: Sample when true, otherwise decode without sampling.
        verbose: Print the templated prompt, the raw output and a format report.

    Returns:
        Dict with ``raw_output``, ``thought``, ``action`` and ``is_valid``
        (true when the parser returned an action).
    """
    banner = "=" * 80

    def _header(title):
        # Section title framed by two banner lines.
        print(banner)
        print(title)
        print(banner)

    # System message: role description plus the required response template.
    system_msg = (
        "You are a helpful assistant for solving mathematical problems. "
        "Your task is to provide the very next logical step to continue a solution. "
        "Your response format must follow this template:\n"
        "<think> Your thoughts or draft. </think>\n"
        "[Your single next step here]\n\n"
        "Provide only ONE step. Do not solve the entire problem."
    )

    # User message: problem, optional history, then the request itself.
    user_pieces = [f"Problem: {problem}"]
    if previous_steps:
        user_pieces.append("\n\nPrevious steps:\n" + "\n".join(previous_steps))
    user_pieces.append("\n\nProvide the next step:")
    user_content = "".join(user_pieces)

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_content},
    ]

    # Render to text (not token ids) and open the assistant turn.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    if verbose:
        _header("PROMPT (with chat template):")
        print(prompt)
        print()

    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = encoded["input_ids"].shape[1]

    with torch.no_grad():
        sequences = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Drop the prompt tokens so only the continuation is decoded.
    completion_ids = sequences[0][prompt_len:]
    generated = tokenizer.decode(completion_ids, skip_special_tokens=True)

    if verbose:
        _header("RAW OUTPUT:")
        print(generated)
        print()

    thought, action = parse_model_output(generated)

    if verbose:
        _header("FORMAT ANALYSIS:")
        print(f"  Has <think>:  {'<think>' in generated}")
        print(f"  Has </think>: {'</think>' in generated}")

        if action is None:
            print("  ❌ FORMAT ERROR")
        else:
            print("  ✓ Valid format")
            # Actions longer than 200 characters are shown truncated.
            shown = f"{action[:200]}..." if len(action) > 200 else action
            print(f"  Action: {shown}")

    return {
        "raw_output": generated,
        "thought": thought,
        "action": action,
        "is_valid": action is not None,
    }


print("Chat template function ready.")

In [None]:
# Test with chat template
# Same derivative problem as Test 1, so the two prompt styles can be compared directly.
result_chat = generate_with_chat_template(
    problem="Calculate the derivative of f(x) = x^3 + 2x^2 - 5x + 1",
    do_sample=False,
)

In [None]:
# Test with previous step using chat template
# The step is placed in the user message under a "Previous steps:" heading.
result_chat2 = generate_with_chat_template(
    problem="Solve: 2x + 5 = 13",
    previous_steps=["Step 1: Subtract 5 from both sides: 2x = 8"],
    do_sample=False,
)

## 8. Compare Both Approaches

In [None]:
# NOTE: this rebinds test_problems, replacing the five-item list from the batch test cell.
test_problems = [
    "What is 2 + 2?",
    "Find the area of a circle with radius 5.",
    "Solve x^2 = 16",
]

print("Comparing raw prompt vs chat template...\n")
# Table header; the problem column is 40 characters wide.
print(f"{'Problem':<40} {'Raw Prompt':<12} {'Chat Template':<12}")
print("=" * 70)

for problem in test_problems:
    # Run both prompt styles on the same problem without sampling and without per-run reports.
    r1 = generate_step(problem, do_sample=False, verbose=False)
    r2 = generate_with_chat_template(problem, do_sample=False, verbose=False)
    
    # One mark per approach: ✓ when the parser returned an action, ❌ otherwise.
    s1 = "✓" if r1["is_valid"] else "❌"
    s2 = "✓" if r2["is_valid"] else "❌"
    
    print(f"{problem:<40} {s1:<12} {s2:<12}")

## 9. Clean Up

In [None]:
# Free memory if needed
# import gc
# del model
# gc.collect()
# torch.cuda.empty_cache()
# print("Memory cleared.")