In [None]:
# Uninstall any existing torch / vllm, then install the pinned versions.
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip uninstall -y torch
%pip uninstall -y vllm

# Alternative without the CUDA 12.4 wheel index:
# %pip install -q torch==2.6.0
%pip install -q torch==2.6.0+cu124 --index-url https://download.pytorch.org/whl/cu124
%pip install -q vllm==0.8.5.post1


In [None]:
# %pip targets the running kernel's environment.
# NOTE(review): the reason for capping pyzmq below 26 is not recorded in this notebook.
%pip install -U "pyzmq<26"

In [None]:
# Pinned flashinfer build from the cu124 / torch2.6 wheel index.
# Unpinned alternative:
# %pip install flashinfer-python -i https://flashinfer.ai/whl/cu124/torch2.6
%pip install flashinfer-python==0.2.2 -i https://flashinfer.ai/whl/cu124/torch2.6

In [1]:
# Sanity check after installation: print the torch and vllm versions that the
# kernel actually imports, then whether CUDA is usable and how many GPUs torch sees.
import torch
print(torch.__version__)

# Imported after the torch print so any import-time log line from vllm
# appears between the two version numbers.
import vllm
print(vllm.__version__)

# Expected versions (matching the pins in the install cell):
# 2.6.0+cu124
# 0.8.5.post1

print(torch.cuda.is_available())
print(torch.cuda.device_count())

2.6.0+cu124
INFO 06-27 11:14:04 [__init__.py:239] Automatically detected platform cuda.
0.8.5.post1
True
4


In [2]:
import psutil, torch, textwrap

def gib(b):
    """Convert a byte count to GiB."""
    return b / 1024**3


# ---- host memory ----
cpu = psutil.virtual_memory()
cpu_report_lines = [
    "",
    "CPU RAM",
    f"  Total     : {gib(cpu.total):5.1f} GiB",
    f"  Available : {gib(cpu.available):5.1f} GiB",
    f"  Used      : {gib(cpu.used):5.1f} GiB ({cpu.percent:.1f}%)",
    "",
]
print("\n".join(cpu_report_lines))

# ---- per-GPU memory ----
print("GPU VRAM")
for i in range(torch.cuda.device_count()):
    # mem_get_info returns (free, total) in bytes; both are converted to GiB.
    free, total = (gib(n_bytes) for n_bytes in torch.cuda.mem_get_info(i))
    props = torch.cuda.get_device_properties(i)
    gpu_tot = gib(props.total_memory)
    gpu_used = total - free
    gpu_report_lines = [
        f"GPU {i} - {props.name}",
        f"  Total     : {gpu_tot:5.1f} GiB",
        f"  Free      : {free:5.1f} GiB",
        f"  Used      : {gpu_used:5.1f} GiB ({gpu_used/gpu_tot*100:.1f}%)",
        "",
    ]
    print("\n".join(gpu_report_lines))



CPU RAM
  Total     : 251.6 GiB
  Available : 240.3 GiB
  Used      :   9.1 GiB (4.5%)

GPU VRAM
GPU 0 - NVIDIA RTX A6000
  Total     :  47.5 GiB
  Free      :  47.3 GiB
  Used      :   0.3 GiB (0.5%)

GPU 1 - NVIDIA RTX A6000
  Total     :  47.5 GiB
  Free      :  47.3 GiB
  Used      :   0.3 GiB (0.5%)

GPU 2 - NVIDIA RTX A6000
  Total     :  47.5 GiB
  Free      :  47.3 GiB
  Used      :   0.3 GiB (0.5%)

GPU 3 - NVIDIA RTX A6000
  Total     :  47.5 GiB
  Free      :  47.3 GiB
  Used      :   0.3 GiB (0.5%)



In [4]:
from datasets import load_dataset
import pandas as pd
import sqlite3
import os

DB_FILE = 'dataset_4qwen3.db'

# Build the SQLite file only once; later runs reuse the existing database.
if os.path.exists(DB_FILE):
    print(f"'{DB_FILE}' already exists")
else:
    # Final column layout; 'trace' starts empty for every row.
    columns = ["question", "answer", "trace"]

    # Load and process s1K-1.1 dataset ("solution" is stored as "answer")
    s1 = load_dataset("simplescaling/s1K-1.1", split="train")
    df_s1 = (
        pd.DataFrame(s1)[["question", "solution"]]
        .rename(columns={"solution": "answer"})
        .assign(trace="")
    )

    # Load and process LIMO dataset
    limo = load_dataset("GAIR/LIMO", split="train")
    df_limo = pd.DataFrame(limo)[["question", "answer"]].assign(trace="")

    # Concatenate the two real frames directly instead of seeding the concat
    # with an empty DataFrame (concatenating empty entries is deprecated in
    # recent pandas and only adds a FutureWarning here).
    df = pd.concat([df_s1, df_limo], ignore_index=True)[columns]

    # Save as SQLite database; always release the connection, even if the
    # write fails part-way.
    conn = sqlite3.connect(DB_FILE)
    try:
        df.to_sql('dataset', conn, if_exists='replace', index=False)
    finally:
        conn.close()
    print(f"Dataset saved as {DB_FILE}")


Dataset saved as dataset_4qwen3.db


In [1]:
import pandas as pd
import sqlite3

DB_FILE = 'dataset_4qwen3.db'

# Read the SQLite database back into a DataFrame; the connection is closed
# even when the query raises.
conn = sqlite3.connect(DB_FILE)
try:
    df = pd.read_sql('SELECT * FROM dataset', conn)
finally:
    conn.close()

print(f"Loaded {len(df)} rows")
print(f"Columns: {df.columns.tolist()}")

# Summarise how many 'trace' cells are still unfilled.
trace = df['trace']
null_count = trace.isnull().sum()
empty_string_count = (trace == '').sum()

# The .str accessor only exists for text-like columns, so guard it once and
# reuse the result for both the whitespace count and the total.
if pd.api.types.is_object_dtype(trace) or pd.api.types.is_string_dtype(trace):
    # Null cells stay null after strip() and compare unequal to '', so they
    # are not counted twice in the total below.
    whitespace_only_count = trace.str.strip().eq('').sum()
else:
    whitespace_only_count = 0
total_empty = null_count + whitespace_only_count

print(f"Null/NaN values: {null_count}")
print(f"Empty strings: {empty_string_count}")
print(f"Whitespace-only strings: {whitespace_only_count}")
print(f"Total empty rows: {total_empty}")

df.head()

Loaded 1817 rows
Columns: ['question', 'answer', 'trace']
Null/NaN values: 0
Empty strings: 1817
Whitespace-only strings: 1817
Total empty rows: 1817


Unnamed: 0,question,answer,trace
0,"Given a rational number, write it as a fractio...",128,
1,Let $ \mathcal{H}$ be an infinite-dimensiona...,1. **Consider a countable subset \( S_0 \subse...,
2,Find the remainder when $9 \times 99 \times 99...,109,
3,Compute the mean molecular speed v in the heav...,167.0,
4,Two capacitors with capacitance values $C_{1}=...,1.3,


In [2]:
# GPU indices, as strings, that are written to CUDA_VISIBLE_DEVICES while the
# large / small engine is being created (see visible_gpus and setup_engines).
LARGE_GPU_INDEX = "3"
SMALL_GPU_INDEX = "1"

# Reverse Speculative Decoding with vLLM in Token ID Space
#
# - Large model generates candidate tokens, small model validates them based on:
#   * Token must be in small model's top-20 predictions
#   * Token probability must exceed 0.01 threshold in small model's token distribution
# - Operates entirely in token ID space using vLLM's TokensPrompt to avoid Unicode issues (U+FFFD)
#   that commonly occur with rare mathematical symbols in reasoning traces when using BPE tokenization
# - Vocabulary matching mask ensures only compatible tokens are proposed
# - Special token mapping handles tokens unique to the small model's vocabulary by translating
#   them to equivalent token sequences for the large model

# --------------------------- imports ---------------------------------------
import os, html, uuid, asyncio, contextlib, nest_asyncio, logging
from IPython.display import HTML, display

import torch
from huggingface_hub import snapshot_download
from vllm import TokensPrompt
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import SamplingParams, RequestOutputKind

# Patch asyncio so coroutines can be awaited inside the notebook's already-running event loop.
nest_asyncio.apply()
# Inference only: turn off autograd globally.
torch.set_grad_enabled(False)
# Silence log records at INFO level and below for all loggers.
logging.disable(logging.INFO)

# --------------------------- configuration ---------------------------------
# Hugging Face repo ids of the two models; both are fetched with snapshot_download.
LARGE_MODEL_NAME  = "simplescaling/s1.1-7B"
SMALL_MODEL_NAME  = "Qwen/Qwen3-0.6B"

# Sampling temperatures passed to the SamplingParams of each engine.
LARGE_TEMPERATURE = 0.7
SMALL_TEMPERATURE = 0.7
# max_model_len given to both engines.
MAX_SEQ_LEN       = 8192
# Default generation budget: the context limit minus 1024 tokens.
# NOTE(review): the 1024 looks like head-room reserved for the prompt — confirm;
# nothing here checks that the prompt actually fits in it.
MAX_NEW_TOKENS    = MAX_SEQ_LEN - 1024

# ---------------- utility: temporarily set visible GPUs --------------------
@contextlib.contextmanager
def visible_gpus(devices: str):
    """Temporarily set CUDA_VISIBLE_DEVICES for the duration of a ``with`` block.

    Args:
        devices: Value to place in CUDA_VISIBLE_DEVICES (e.g. ``"3"`` or ``"0,1"``).

    On exit the previous state is restored exactly: if the variable was unset
    before entering, it is removed again rather than being set to an empty
    string (an empty CUDA_VISIBLE_DEVICES hides every GPU from code that
    initialises CUDA afterwards).
    """
    # None means "was not set at all", which is different from "".
    original = os.environ.get("CUDA_VISIBLE_DEVICES")
    os.environ["CUDA_VISIBLE_DEVICES"] = devices
    print(f"\nCUDA_VISIBLE_DEVICES = {devices}")
    try:
        yield
    finally:
        if original is None:
            os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = original

# --------------------------- engine setup ----------------------------------
async def setup_engines():
    """Start both vLLM engines and build the vocabulary bookkeeping used by the decode loop.

    Populates these module-level globals:
      large_engine / small_engine       -- AsyncLLMEngine instances
      large_tokenizer / small_tokenizer -- tokenizers obtained from each engine
      large_vocab_size / small_vocab_size
      vocab_match_mask -- float tensor, 1.0 at every token ID whose token string
                          is identical in both tokenizers, 0.0 elsewhere
      token_mapping    -- small-model-only token ID -> list of large-model token IDs

    Also prints the vocabulary sizes, every mismatching ID, and how the large
    tokenizer splits "<think>" and "</think>".
    """
    global large_engine, small_engine, large_tokenizer, small_tokenizer
    global large_vocab_size, small_vocab_size, vocab_match_mask, token_mapping
    
    # Resolve both checkpoints to local snapshot directories.
    large_checkpoint = snapshot_download(LARGE_MODEL_NAME)
    small_checkpoint = snapshot_download(SMALL_MODEL_NAME)

    # Each engine is created while CUDA_VISIBLE_DEVICES is restricted to its own GPU index.
    with visible_gpus(LARGE_GPU_INDEX):
        print("torch sees", torch.cuda.device_count(), "GPU(s)")
        large_engine = AsyncLLMEngine.from_engine_args(
            AsyncEngineArgs(model=large_checkpoint, 
                            tensor_parallel_size=1,
                            max_model_len=MAX_SEQ_LEN, 
                            gpu_memory_utilization=0.60,
                            dtype="float16"),
            start_engine_loop=True)
        
        large_tokenizer = await large_engine.get_tokenizer()

    with visible_gpus(SMALL_GPU_INDEX):
        print("torch sees", torch.cuda.device_count(), "GPU(s)")
        small_engine = AsyncLLMEngine.from_engine_args(
            AsyncEngineArgs(model=small_checkpoint, 
                            tensor_parallel_size=1,
                            max_model_len=MAX_SEQ_LEN, 
                            gpu_memory_utilization=0.20,
                            dtype="bfloat16"),
            start_engine_loop=True)
        
        small_tokenizer = await small_engine.get_tokenizer()

    # Get model configs using async methods
    large_model_config = await large_engine.get_model_config()
    small_model_config = await small_engine.get_model_config()
    
    large_vocab_size = large_model_config.get_vocab_size()
    small_vocab_size = small_model_config.get_vocab_size()
    
    print(f"Large vocab size: {large_vocab_size}")
    print(f"Small vocab size: {small_vocab_size}")
    print(f"Difference      : {abs(large_vocab_size - small_vocab_size)}")

    # Mask is sized to the larger vocabulary; IDs beyond the smaller vocabulary
    # are never visited by the loop below and therefore stay 0.0 (unmatched).
    vocab_match_mask = torch.zeros(max(large_vocab_size, small_vocab_size), dtype=torch.float32)
    mismatches = []

    # Compare token strings ID by ID over the range both vocabularies share.
    for idx in range(min(large_vocab_size, small_vocab_size)):
        large_token = large_tokenizer.convert_ids_to_tokens(idx)
        small_token = small_tokenizer.convert_ids_to_tokens(idx)
        
        if large_token == small_token:
            vocab_match_mask[idx] = 1.0
        else:
            mismatches.append((idx, large_token, small_token))

    print(f"Unmatched tokens: {len(mismatches)}")

    # Print every mismatch (a token missing from one tokenizer is shown as "None")
    print(f"\n{'ID':>6}  {'Large token':<25}  Small token")
    for idx, large_token, small_token in mismatches:
        large_token  = "None" if large_token is None else str(large_token)
        small_token  = "None" if small_token is None else str(small_token)
        print(f"{idx:6}  {large_token:<25}  {small_token}")

    # show how the large tokenizer splits "<think>" and "</think>"
    for token_str in ("<think>", "</think>"):
        token_ids   = large_tokenizer.encode(token_str, add_special_tokens=False)
        token_pieces = [large_tokenizer.convert_ids_to_tokens(token_id) for token_id in token_ids]

        print(f"\nTokenization of {token_str!r} by the large tokenizer:")
        print(f"{'ID':>6}  Token piece")
        for token_id, token_piece in zip(token_ids, token_pieces):
            print(f"{token_id:6}  {token_piece}")
    
    # Create mapping for tokens that only exist in small model
    # When small model generates these tokens, we need to translate them
    # to equivalent token sequences for the large model
    # NOTE(review): these IDs are hard-coded. The printout above only covers the
    # two <think> entries; the <tool_response> sequences are not checked in
    # this function — confirm them against large_tokenizer.encode(...).
    token_mapping = {
        151665: [27, 14172, 9655, 29],   # <tool_response>  --> <, tool, _response, >
        151666: [522, 14172, 9655, 29],  # </tool_response> --> </, tool, _response, >
        151667: [13708, 766, 29],        # <think>          --> <th, ink, >
        151668: [522, 26865, 29],        # </think>         --> </, think, >
    }

# --------------------------- sampling params -------------------------------
def _single_token_params(temperature):
    """Build the SamplingParams shared by both engines; only the temperature varies.

    Every request asks for exactly one new token together with 20 logprobs,
    delivered as a DELTA output.
    """
    return SamplingParams(
        temperature=temperature,
        max_tokens=1,
        top_p=0.95,
        logprobs=20,
        output_kind=RequestOutputKind.DELTA,
    )


large_sampling_params = _single_token_params(LARGE_TEMPERATURE)
small_sampling_params = _single_token_params(SMALL_TEMPERATURE)

# -------------------------- helper functions -------------------------------
def html_heatmap(token_ids, records, tokenizer):
    """Render prompt + generated tokens as an HTML <pre> block coloured by small-model probability.

    Args:
        token_ids: Full ID sequence; the last len(records) entries are treated
            as generated tokens, everything before them as the prompt.
        records: One dict per generated token; the keys 'small_probability'
            and 'fallback' are read here.
        tokenizer: Object whose decode(list_of_ids) returns text.

    Returns:
        IPython.display.HTML. Prompt text is grey. Generated text goes from
        black (probability >= 0.2) towards bright red as the probability
        approaches 0, and is underlined when any token in the group was a fallback.
    """
    # Probabilities are mapped linearly onto the red channel over this range.
    probability_min, probability_max = 0.0, 0.2
    
    def colour(probability):
        # At or above the upper bound: plain black.
        if probability >= probability_max: 
            return "rgb(0,0,0)"
        # Lower probability -> more red.
        red = int(255 * (probability_max - probability) / (probability_max - probability_min))
        return f"rgb({red},0,0)"
    
    spans = []
    # Index of the first generated token inside token_ids.
    prompt_length = len(token_ids) - len(records)
    
    # Find token groups that form complete characters
    # (a group is a half-open index range [group_start, group_end)).
    token_groups = []
    i = 0
    
    while i < len(token_ids):
        # Start with current token
        group_start = i
        group_end = i + 1
        
        # Expand the group until we have a valid UTF-8 sequence
        while group_end <= len(token_ids):
            # Try decoding the current group
            group_text = tokenizer.decode(token_ids[group_start:group_end])
            
            # U+FFFD in the decoded text marks an incomplete byte sequence.
            if '\ufffd' not in group_text:
                # Valid decode, but check if we should include more tokens
                if group_end < len(token_ids):
                    # Check if adding the next token changes the decode
                    extended_text = tokenizer.decode(token_ids[group_start:group_end+1])
                    current_plus_next = group_text + tokenizer.decode([token_ids[group_end]])
                    
                    # If decoding jointly differs from decoding separately, or the
                    # next token alone is not decodable, it belongs to this group.
                    if extended_text != current_plus_next or '\ufffd' in current_plus_next:
                        # Next token is part of this character, continue
                        group_end += 1
                        continue
                
                # We have a complete group
                break
            else:
                # Invalid decode, need more tokens
                group_end += 1
                if group_end > len(token_ids):
                    # Reached end with incomplete sequence
                    group_end = len(token_ids)
                    break
        
        # Store the group
        token_groups.append((group_start, group_end))
        i = group_end
    
    # Now render each group
    for group_start, group_end in token_groups:
        # Decode the group
        text = tokenizer.decode(token_ids[group_start:group_end])
        
        # Groups that decode to an empty string produce no span.
        if not text:
            continue
        
        # Escape for HTML and keep runs of spaces visible.
        escaped = html.escape(text).replace(" ", "&nbsp;")
        
        # Check if this group is entirely in prompt
        if group_end - 1 < prompt_length:
            # All tokens in group are from prompt - gray
            spans.append(f"<span style='color:#666;'>{escaped}</span>")
        else:
            # At least one token is generated
            # Find minimum probability among generated tokens in this group
            min_prob = 1.0
            any_fallback = False
            
            for token_idx in range(group_start, group_end):
                # Prompt tokens inside a mixed group have no record and are skipped.
                if token_idx >= prompt_length:
                    record = records[token_idx - prompt_length]
                    min_prob = min(min_prob, record['small_probability'])
                    any_fallback = any_fallback or record['fallback']
            
            style = f"color:{colour(min_prob)};"
            if any_fallback:
                style += " text-decoration:underline;"
            spans.append(f"<span style='{style}'>{escaped}</span>")
    
    return HTML("<pre style='white-space:pre-wrap; line-height:1.45; "
                "font-family:inherit; background:#fff; padding:8px; "
                "border:1px solid #ddd;'>" + "".join(spans) + "</pre>")

# ------------------------- core decode loop --------------------------------
async def one_step(engine, sampling_params, context_ids):
    """Submit one request to ``engine`` and return the first completion it produces.

    The context is handed over as token IDs (TokensPrompt) rather than text,
    so no decode / re-encode round trip can corrupt partial tokens.

    Args:
        engine: Engine object exposing an async ``generate`` method.
        sampling_params: SamplingParams for the request.
        context_ids: List of token IDs forming the full context.

    Returns:
        ``outputs[0]`` of the first item yielded by the engine's generator.
    """
    token_prompt = TokensPrompt(prompt_token_ids=context_ids)
    request_id = str(uuid.uuid4())  # a fresh ID for every request
    stream = engine.generate(token_prompt, sampling_params, request_id=request_id)
    first_result = await anext(stream)
    return first_result.outputs[0]

async def mixed_decode(prompt: str, max_new_tokens: int = MAX_NEW_TOKENS):
    """Generate tokens one at a time: the large model proposes, the small model vets.

    At each step both engines are queried in parallel for a single token with
    logprobs. A candidate is drawn from the large model's returned logprobs
    (restricted to IDs flagged in ``vocab_match_mask``) and accepted if the
    small model also returned it with probability >= PROB_THRESHOLD. After
    NUM_TRIALS unsuccessful draws, or when the large model offers no usable
    candidate at all, the token is sampled from the small model's returned
    distribution instead (``fallback=True``).

    Relies on the globals created by ``setup_engines``.

    Args:
        prompt: Prompt text; tokenized with the small tokenizer.
        max_new_tokens: Upper bound on the number of generated tokens.

    Yields:
        dict with keys ``idx`` (1-based step), ``token_text``, ``token_id``,
        ``fallback``, ``large_probability`` and ``small_probability``.
        Generation stops after the small tokenizer's EOS token is yielded.
    """
    # Tokenize the prompt to IDs using the small tokenizer
    context_ids_small = small_tokenizer.encode(prompt)
    context_ids_large = []

    # The large model's context is the same ID sequence, except that tokens
    # existing only in the small vocabulary are expanded via token_mapping.
    for token_id in context_ids_small:
        if token_id in token_mapping:
            context_ids_large.extend(token_mapping[token_id])
        else:
            context_ids_large.append(token_id)

    step_index = 0
    PROB_THRESHOLD = 0.01   # minimum small-model probability for acceptance
    NUM_TRIALS = 5          # candidate draws before falling back

    for _ in range(max_new_tokens):
        large_output, small_output = await asyncio.gather(
            one_step(large_engine, large_sampling_params, context_ids_large),
            one_step(small_engine, small_sampling_params, context_ids_small))

        # Debug aid: dump the raw outputs of the first three steps.
        if step_index < 3:
            print(f"  large_output: {large_output}")
            print(f"  small_output: {small_output}")

        # Extract probabilities from large model output - logprobs is a list
        large_logprobs_dict = large_output.logprobs[0]
        large_probs = {}
        for token_id, logprob in large_logprobs_dict.items():
            if vocab_match_mask[token_id] > 0:  # Only include vocab-matched tokens
                large_probs[token_id] = torch.exp(torch.tensor(logprob.logprob)).item()

        # Extract probabilities from small model output - compact dict for lookup
        small_logprobs_dict = small_output.logprobs[0]
        small_probs = {}
        for token_id, logprob in small_logprobs_dict.items():
            small_probs[token_id] = torch.exp(torch.tensor(logprob.logprob)).item()

        # Try to accept a token from large model's distribution.
        # torch.multinomial raises on an empty or zero-mass pool, so in that
        # case skip straight to the fallback instead of crashing the run.
        fallback = True
        if large_probs:
            idx_pool = torch.tensor(list(large_probs.keys()))
            prob_pool = torch.tensor(list(large_probs.values()))
            pool_mass = prob_pool.sum()
            if pool_mass > 0:
                prob_pool = prob_pool / pool_mass  # Normalize for sampling
                for _trial in range(NUM_TRIALS):
                    pool_idx = torch.multinomial(prob_pool, 1).item()
                    candidate_token_id = idx_pool[pool_idx].item()
                    if candidate_token_id in small_probs and small_probs[candidate_token_id] >= PROB_THRESHOLD:
                        chosen_id = candidate_token_id
                        fallback = False
                        break

        # Fallback: sample from small model if no acceptance
        if fallback:
            idx_pool = torch.tensor(list(small_probs.keys()))
            prob_pool = torch.tensor(list(small_probs.values()))
            prob_pool = prob_pool / prob_pool.sum()
            pool_idx = torch.multinomial(prob_pool, 1).item()
            chosen_id = idx_pool[pool_idx].item()

        # Get token text for display only
        chosen_text = small_tokenizer.decode([chosen_id])
        # 0.0 means the token was not among the logprobs that model returned
        # (or, for the large model, was masked out as a vocabulary mismatch).
        large_probability = large_probs.get(chosen_id, 0.0)
        small_probability = small_probs.get(chosen_id, 0.0)

        step_index += 1
        record = dict(
            idx=step_index,
            token_text=chosen_text,  # For display only
            token_id=chosen_id,
            fallback=fallback,
            large_probability=large_probability,
            small_probability=small_probability,
        )
        yield record

        # '*' after the step number marks a fallback token.
        print(f"{step_index:4d}{'*' if fallback else ' '}\t"
              f"{large_probability:.4f}\t{small_probability:.4f}\t"
              f"{chosen_id}\t'{chosen_text}'",
              flush=True)

        # Append to context in ID space for both models
        context_ids_small.append(chosen_id)

        # For large model, check if we need to map the token
        if chosen_id in token_mapping:
            # Append the mapped token sequence
            context_ids_large.extend(token_mapping[chosen_id])
        else:
            # Regular token, just append
            context_ids_large.append(chosen_id)

        if chosen_id == small_tokenizer.eos_token_id:
            break

# ---------------------- high-level convenience -----------------------------
async def run_mixed_decode(prompt: str, max_new_tokens: int = MAX_NEW_TOKENS):
    """Run ``mixed_decode`` to completion and display the result.

    Prints the per-step table header, collects every yielded record, shows
    the heatmap and the fully decoded text, and finally reports how many
    tokens were fallbacks.

    Args:
        prompt: Prompt text handed to ``mixed_decode``.
        max_new_tokens: Upper bound on generated tokens.
    """
    print("-"*80)
    print("Step\tL_Prob\tS_Prob\tTok_ID\tTok_Txt")
    records = []
    # Prompt IDs followed by every generated ID, all in the small tokenizer's ID space.
    prompt_ids = small_tokenizer.encode(prompt)
    all_token_ids = prompt_ids.copy()

    async for record in mixed_decode(prompt, max_new_tokens):
        records.append(record)
        all_token_ids.append(record['token_id'])

    print("-"*80)

    # Display the properly decoded full text with heatmap
    display(html_heatmap(all_token_ids, records, small_tokenizer))

    # Also display the full decoded text for verification
    full_text = small_tokenizer.decode(all_token_ids)
    print("\nFull decoded text:")
    print(full_text)

    fallback_count = sum(record['fallback'] for record in records)
    # Nothing may have been generated (e.g. max_new_tokens == 0); report 0 %
    # instead of dividing by zero.
    fallback_percent = fallback_count / len(records) * 100 if records else 0.0
    print(f"\nFallback tokens: {fallback_count}/{len(records)} "
          f"({fallback_percent:.2f} %)")

# ------------------------ fire up the engines ------------------------------
await setup_engines()

# --------------------------- example usage ---------------------------------
system_prompt = (
    f"You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\n"
    f"You must respond to every query in the following manner:\n"
    f"First, provide a step-by-step logical exploration of the problem.\n"
    f"Then, provide a clear and direct response based on your reasoning, with the final answer enclosed in \\boxed{{}}."
)

question = df['question'].iloc[0]

input = (
    f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
    f"<|im_start|>user\n{question}<|im_end|>\n"
    f"<|im_start|>assistant\n<think>"
)

await run_mixed_decode(input)

INFO 06-27 11:24:37 [__init__.py:239] Automatically detected platform cuda.


Fetching 20 files:   0%|          | 0/20 [00:00<?, ?it/s]

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]


CUDA_VISIBLE_DEVICES = 3
torch sees 1 GPU(s)


Loading safetensors checkpoint shards:   0% Completed | 0/7 [00:00<?, ?it/s]



CUDA_VISIBLE_DEVICES = 1
torch sees 1 GPU(s)


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Large vocab size: 152064
Small vocab size: 151936
Difference      : 128
Unmatched tokens: 4

    ID  Large token                Small token
151665  None                       <tool_response>
151666  None                       </tool_response>
151667  None                       <think>
151668  None                       </think>

Tokenization of '<think>' by the large tokenizer:
    ID  Token piece
 13708  <th
   766  ink
    29  >

Tokenization of '</think>' by the large tokenizer:
    ID  Token piece
   522  </
 26865  think
    29  >
--------------------------------------------------------------------------------
Step	L_Prob	S_Prob	Tok_ID	Tok_Txt
  large_output: CompletionOutput(index=0, text='Alright', token_ids=[71486], cumulative_logprob=-0.8970334529876709, logprobs=[{71486: Logprob(logprob=-0.8970334529876709, rank=2, decoded_token='Alright'), 32313: Logprob(logprob=-0.8189084529876709, rank=1, decoded_token='Okay'), 35439: Logprob(logprob=-2.147033452987671, rank=3, decoded_tok


Full decoded text:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
You must respond to every query in the following manner:
First, provide a step-by-step logical exploration of the problem.
Then, provide a clear and direct response based on your reasoning, with the final answer enclosed in \boxed{}.<|im_end|>
<|im_start|>user
Given a rational number, write it as a fraction in lowest terms and calculate the product of the resulting numerator and denominator. For how many rational numbers between 0 and 1 will $20_{}^{}!$ be the resulting product?<|im_end|>
<|im_start|>assistant
<think>
Okay, let me try to tackle this problem. The question is asking: Given a rational number between 0 and 1, write it as a fraction in lowest terms and calculate the product of the numerator and denominator. Then, how many such rational numbers will result in the product 20!?

First, let me make sure I understand the problem correctly. A rational number between 0 and 1

In [4]:
# --------------------------- example usage ---------------------------------
system_prompt = (
    f"You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\n"
    f"You must respond to every query in the following manner:\n"
    f"First, provide a step-by-step logical exploration of the problem.\n"
    f"Then, provide a clear and direct response based on your reasoning, with the final answer enclosed in \\boxed{{}}."
)

question = df['question'].iloc[0]

input = (
    f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
    f"<|im_start|>user\n{question}<|im_end|>\n"
    f"<|im_start|>assistant\n<think>"
)

await run_mixed_decode(input)

--------------------------------------------------------------------------------
Step	L_Prob	S_Prob	Tok_ID	Tok_Txt
  large_output: CompletionOutput(index=0, text='Okay', token_ids=[32313], cumulative_logprob=-0.8107389211654663, logprobs=[{32313: Logprob(logprob=-0.8107389211654663, rank=1, decoded_token='Okay'), 71486: Logprob(logprob=-0.9044889211654663, rank=2, decoded_token='Alright'), 35439: Logprob(logprob=-2.154489040374756, rank=3, decoded_token='ĠOkay'), 97593: Logprob(logprob=-3.498239040374756, rank=4, decoded_token='ĠAlright'), 5338: Logprob(logprob=-7.107614040374756, rank=5, decoded_token='First'), 3925: Logprob(logprob=-7.326364040374756, rank=6, decoded_token='OK'), 11578: Logprob(logprob=-7.420114040374756, rank=7, decoded_token='Ok'), 6771: Logprob(logprob=-8.076363563537598, rank=8, decoded_token='ĠLet'), 510: Logprob(logprob=-8.170113563537598, rank=9, decoded_token=':Ċ'), 10402: Logprob(logprob=-8.279488563537598, rank=10, decoded_token='ĠOK'), 10061: Logprob(logpr


Full decoded text:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
You must respond to every query in the following manner:
First, provide a step-by-step logical exploration of the problem.
Then, provide a clear and direct response based on your reasoning, with the final answer enclosed in \boxed{}.<|im_end|>
<|im_start|>user
Given a rational number, write it as a fraction in lowest terms and calculate the product of the resulting numerator and denominator. For how many rational numbers between 0 and 1 will $20_{}^{}!$ be the resulting product?<|im_end|>
<|im_start|>assistant
<think>
Okay, let me try to figure this out. The problem says: given a rational number between 0 and 1, write it as a fraction in lowest terms and then multiply the numerator and the denominator. Then, we need to find how many such rational numbers will result in the product 20! (that's 20 factorial). Hmm, okay. Let me break this down step by step.

First, I need to underst

In [9]:
# Plain string literals: there is nothing to interpolate in the system prompt.
system_prompt = (
    "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\n"
    "You must respond to every query in the following manner:\n"
    "First, provide a step-by-step logical exploration of the problem.\n"
    "Then, provide a clear and direct response based on your reasoning, with the final answer enclosed in \\boxed{}."
)

question = df['question'].iloc[0]

# Hand-built chat prompt ending in an opened <think> tag.
# Named `prompt` so the builtin input() is not shadowed in the kernel.
prompt = (
    f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
    f"<|im_start|>user\n{question}<|im_end|>\n"
    f"<|im_start|>assistant\n<think>"
)

print(prompt)

# Show the token IDs the small tokenizer produces for the hand-built prompt.
model_inputs = small_tokenizer([prompt], return_tensors="pt")
print(model_inputs)


<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
You must respond to every query in the following manner:
First, provide a step-by-step logical exploration of the problem.
Then, provide a clear and direct response based on your reasoning, with the final answer enclosed in \boxed{}.<|im_end|>
<|im_start|>user
Given a rational number, write it as a fraction in lowest terms and calculate the product of the resulting numerator and denominator. For how many rational numbers between 0 and 1 will $20_{}^{}!$ be the resulting product?<|im_end|>
<|im_start|>assistant
<think>
{'input_ids': tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
            624,   2610,   1969,   5889,    311,   1449,   3239,    304,    279,
           2701,  11566,    510,   5338,     11,   3410,    264,   3019,  14319,
          29208,  19819,  26403,    315,    279,  

In [10]:
from transformers import AutoTokenizer

model_name = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

system_prompt = (
    f"You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\n"
    f"You must respond to every query in the following manner:\n"
    f"First, provide a step-by-step logical exploration of the problem.\n"
    f"Then, provide a clear and direct response based on your reasoning, with the final answer enclosed in \\boxed{{}}."
)

question = df['question'].iloc[0]

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": question}
]

# Let the tokenizer's own chat template build the prompt, for comparison
# with the hand-built prompt string used elsewhere in this notebook.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True
)

# The opening <think> tag is appended by hand after the templated text.
# Named `prompt` so the builtin input() is not shadowed in the kernel.
prompt = text + '<think>'
print(prompt)

model_inputs = tokenizer([prompt], return_tensors="pt")
print(model_inputs)

<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
You must respond to every query in the following manner:
First, provide a step-by-step logical exploration of the problem.
Then, provide a clear and direct response based on your reasoning, with the final answer enclosed in \boxed{}.<|im_end|>
<|im_start|>user
Given a rational number, write it as a fraction in lowest terms and calculate the product of the resulting numerator and denominator. For how many rational numbers between 0 and 1 will $20_{}^{}!$ be the resulting product?<|im_end|>
<|im_start|>assistant
<think>
{'input_ids': tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
            624,   2610,   1969,   5889,    311,   1449,   3239,    304,    279,
           2701,  11566,    510,   5338,     11,   3410,    264,   3019,  14319,
          29208,  19819,  26403,    315,    279,  