In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from dotenv import load_dotenv

# Hugging Face Hub id of the checkpoint used throughout this notebook.
model_name = "Qwen/Qwen3-0.6B"

# Read environment variables from the .env file one directory above the notebook.
# NOTE(review): presumably this supplies Hugging Face credentials/settings — confirm what the file holds.
load_dotenv(dotenv_path="../.env")

In [None]:
# Load the causal LM and its matching tokenizer from the same checkpoint.
# "auto" leaves the choice of dtype and of device placement to the library.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Show the concrete classes that were loaded:
#   tokenizer -> converts text into token ids
#   model     -> the language model itself
print(type(tokenizer), type(model), sep="\n")

In [None]:
# Prepare the model input: wrap the user prompt in the chat format Qwen expects,
# show both the formatted string and its token ids, then build the tensors for generate().
prompt = "What's the meaning of machine learning?"
messages = [{"role": "user", "content": prompt}]

text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,  # return the formatted string instead of token ids
    add_generation_prompt=True,  # adds the <|im_start|>assistant\n tag -> signals the model to start its response
    enable_thinking=True,  # switch between thinking and non-thinking modes (default is True)
)  # converts the conversation into the format that Qwen expects

print("--- Formatting string ---")
print(text)
# With tokenize=False the template returns the formatted string (not token ids):
# <|im_start|>user\nWhat's the meaning of machine learning?<|im_end|>\n<|im_start|>assistant\n

# Render the same conversation again, this time as token ids, so that the section
# below really shows ids (previously it printed `text` a second time).
chat_token_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,  # return token ids instead of a string
    add_generation_prompt=True,
    enable_thinking=True,
    return_dict=True,  # return input_ids together with attention_mask
)

print("--- token ids ---")
print(chat_token_ids)
# With tokenize=True (and return_dict=True) the recorded output was:
# {'input_ids': [151644, 872, 198, 3838, 594, 279, 7290, 315, 5662, 6832, 30, 151645, 198, 151644, 77091, 198], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
# input_ids = numerical token ids
# Lining the ids up with the formatted string above suggests:
#       -> 151644 = <|im_start|> and 151645 = <|im_end|> (chat format markers, i.e. special tokens)
#       -> 872 = "user", 77091 = "assistant" (role indicators), 198 = "\n"
#       -> in general, special tokens can also be BOS (beginning of sequence) / EOS (end of sequence) markers

# attention_mask = binary mask showing which tokens the model should pay attention to
# 1 = "pay attention to this token"
# 0 = "ignore this token" (used for padding)
# Our example: [1, 1, 1, 1, ...] - all 1s means no padding, all tokens are real content

# What's the meaning of machine learning?
#          ↓ (chat template + tokenization)
# [151644, 872, 198, 3838, 594, 279, 7290, 315, 5662, 6832, 30, 151645, 198, 151644, 77091, 198]
#  ^ special token    ^--------- prompt text tokens --------^   ^ special token

# Tokenize the formatted string as a batch of one and move the tensors to the model's device.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
model_inputs  # display the tensors (input_ids, attention_mask) that will be passed to generate()

In [None]:
# Sanity check before generation: the device the model sits on, and that `text`
# is a plain string (it was produced with tokenize=False) together with its value.
(
    model.device,
    type(text),
    text,
)

In [None]:
# Text completion: let the model continue the prepared prompt.
# The limit counts newly generated tokens only (not prompt + output), so it is an upper bound.
new_token_budget = 32768
generated_ids = model.generate(**model_inputs, max_new_tokens=new_token_budget)  # model_inputs unpacks to input_ids, attention_mask
# generated_ids: tensor with the complete token-id sequence, i.e. the original prompt
# followed by the newly generated tokens (at most 32768 of them).

In [None]:
# Keep only the newly generated token ids: drop the leading prompt tokens from the
# first (and only) sequence in the batch and convert the remainder to a plain list.
prompt_length = model_inputs.input_ids.shape[1]
output_ids = generated_ids[0, prompt_length:].tolist()

In [None]:
# Split the generated tokens into the thinking part and the final answer.
# 151668 is the id of the </think> token; everything up to and including its last
# occurrence is thinking content. If it never occurs, index stays 0 and the whole
# output is treated as the answer.
think_end_id = 151668
end_positions = [pos for pos, token_id in enumerate(output_ids) if token_id == think_end_id]
index = end_positions[-1] + 1 if end_positions else 0

reasoning_ids, answer_ids = output_ids[:index], output_ids[index:]
thinking_content = tokenizer.decode(reasoning_ids, skip_special_tokens=True).strip("\n")
content = tokenizer.decode(answer_ids, skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("---" * 20)
print("content:", content)

In [None]:
# Display the prompt's token ids as the tensor that was fed to generate().
model_inputs["input_ids"]