In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model_name = "Qwen/Qwen3-0.6B"

# 1. Load the tokenizer and the 4-bit quantized base model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Quantization is requested through an explicit BitsAndBytesConfig: passing
# `load_in_4bit=True` directly to `from_pretrained` is the deprecated spelling
# of the same request. Only `load_in_4bit` is set, so every other quantization
# option keeps its library default, as before.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # NOTE(review): allows execution of code shipped with the checkpoint;
    # may be unnecessary if the installed transformers supports Qwen3 natively — confirm.
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),  # bitsandbytes QLoRA
    device_map="auto",
)
# Prepare for k-bit training (freeze layers, cast norms, etc.)
model = prepare_model_for_kbit_training(model)

# 2. Attach LoRA adapters to the attention projections (q/k/v/o).
lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()

In [None]:
# Show the default sampling temperature stored in the model's generation config.
print(model.generation_config.temperature)  # likely 0.6 — value comes from the checkpoint; confirm on run

In [None]:
# prepare the model input
prompt = "Toula went to the bakery and bought various types of pastries. She bought 3 dozen donuts which cost $68 per dozen, 2 dozen mini cupcakes which cost $80 per dozen, and 6 dozen mini cheesecakes for $55 per dozen. How much was the total cost?"
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False # Switches between thinking and non-thinking modes. Default is True.
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# conduct text completion
# Greedy decoding (`do_sample=False`). `temperature` only has meaning when
# sampling, and 0.0 is not a valid sampling temperature, so it is not passed.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768,
    do_sample=False,
)
# Keep only the newly generated tokens by slicing off the prompt prefix.
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

# parsing thinking content
try:
    # rindex finding 151668 (</think>): position just past the LAST occurrence
    index = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    # No </think> token in the output: treat everything as the final answer.
    index = 0

thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)

In [None]:
########################

In [None]:
# ─── Cell 1: Imports & Quantized + LoRA-Wrapped Model Loading ───
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, prepare_model_for_kbit_training

# 1. Quantization config: 4-bit NF4 weights, double quantization, fp16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# 2. Load base in 4-bit; `device_map="auto"` decides where the weights live.
base = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-0.6B",
    device_map="auto",
    quantization_config=bnb_config
)
# NOTE(review): quantized weights may be stored packed, in which case numel()
# under-counts the logical parameter count — confirm before quoting this number.
print("Base params:", sum(p.numel() for p in base.parameters()))

# 3. Prep for k-bit LoRA
base = prepare_model_for_kbit_training(base)

# 4. Load your adapter (kept trainable so the trainable count below is non-zero)
peft_model = PeftModel.from_pretrained(
    base,
    "../outputs/qwen3-0.6B-gsm8k-lora/adapter",
    is_trainable=True
)

# 5. Inspect parameter counts
total = sum(p.numel() for p in peft_model.parameters())
trainable = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
print(f"Total: {total:,} | Trainable: {trainable:,} ({100*trainable/total:.2f}%)")

# Device that later cells move their input tensors to.
# The model itself is NOT moved here: `device_map="auto"` already placed it at
# load time, and calling `.to()` on a bitsandbytes-quantized, dispatched model
# is redundant at best and rejected or warned about on some library versions.
device = "cuda" if torch.cuda.is_available() else "cpu"
peft_model.eval()

In [2]:
# ─── Cell 2: Tokenizer & Generate Function ───
# 6. Tokenizer
# Loads the tokenizer of the "Qwen/Qwen3-0.6B" checkpoint, requesting the fast
# implementation. The name `tokenizer` is read as a global by `generate` below.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", use_fast=True)

# 7. Inference helper
def generate(prompt: str,
             max_new_tokens: int = 128,
             temperature: float = 0.7,
             do_sample: bool = True) -> str:
    """Generate text for ``prompt`` with the global ``peft_model``.

    The prompt is tokenized as-is (no chat template is applied), moved to the
    global ``device`` and passed to ``peft_model.generate`` with gradients
    disabled.

    Args:
        prompt: Raw text to feed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. Only forwarded when sampling is
            actually used; a value that is ``None`` or not strictly positive
            selects greedy decoding.
        do_sample: Whether to sample. Ignored (greedy decoding is used) when
            ``temperature`` is not strictly positive.

    Returns:
        The decoded first returned sequence (``out[0]``), with special tokens
        skipped.
    """
    # Sampling requires a strictly positive temperature; otherwise fall back
    # to greedy decoding instead of forwarding a contradictory combination.
    use_sampling = bool(do_sample) and temperature is not None and temperature > 0

    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": use_sampling,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if use_sampling:
        # `temperature` is a sampling-only knob; passing it under greedy
        # decoding only produces a generation-config warning.
        gen_kwargs["temperature"] = temperature

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        out = peft_model.generate(**inputs, **gen_kwargs)
    return tokenizer.decode(out[0], skip_special_tokens=True)


In [None]:
# ─── Cell 3: Run Some Examples ───
# `generate(...)` from the previous cell can be called from any later cell.

# Word problem used as a smoke test for the loaded adapter.
prompt = (
    "Weng earns $12 an hour for babysitting. "
    "Yesterday, she just did 50 minutes of babysitting. "
    "How much did she earn?"
)
# Deterministic (non-sampling) run with a 256-token budget.
print(
    generate(
        prompt,
        max_new_tokens=256,
        temperature=0.0,
        do_sample=False,
    )
)
