## AMD GPU Setup

In [1]:
# Disable TensorFlow's oneDNN custom ops for this kernel.
# NOTE: `!export VAR=...` runs in a temporary subshell that exits immediately,
# so it never changes the environment of the running kernel. Setting the
# variable through os.environ makes it visible to libraries imported later.
import os

os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

In [2]:
import os

# Force the ROCm runtime to treat the GPU as gfx 11.0.0. This is set before
# torch is imported in the cells below.
os.environ.update(HSA_OVERRIDE_GFX_VERSION='11.0.0')

# Read the value back from the process environment to confirm it took effect.
gfx_version = os.environ.get('HSA_OVERRIDE_GFX_VERSION')
print(f"HSA_OVERRIDE_GFX_VERSION is set to: {gfx_version}")

HSA_OVERRIDE_GFX_VERSION is set to: 11.0.0


In [3]:
import warnings
warnings.filterwarnings("ignore")

-------------------------------------------------------

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [14]:
model_name = "Qwen/Qwen3-0.6B"

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the tokenizer and the causal-LM weights for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

# Echo the selected device as the cell output.
device

device(type='cuda')

In [3]:
# Display the loaded model's module tree (layers, projections, norms).
model

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwe

In [4]:
prompt = "Give me a short introduction to large language model."

# A single-turn conversation: one user message carrying the prompt.
user_turn = dict(role="user", content=prompt)
messages = [user_turn]

In [5]:

# Render the conversation into the model's chat format as a plain string
# (tokenize=False) and append the assistant generation prompt.
# enable_thinking switches between thinking and non-thinking modes; default is True.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)

# Tokenize as a batch of one and move the tensors to the model's device.
model_inputs = tokenizer([text], return_tensors="pt")
model_inputs = model_inputs.to(model.device)


In [6]:
# Inspect the tokenized prompt (input_ids and attention_mask) built in the previous cell.
model_inputs

{'input_ids': tensor([[151644,    872,    198,  35127,    752,    264,   2805,  16800,    311,
           3460,   4128,   1614,     13, 151645,    198, 151644,  77091,    198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [7]:
# Length of the tokenized prompt (batch of one, so take the sequence dimension).
prompt_length = model_inputs.input_ids.shape[1]

# Generate up to 2768 new tokens for the prompt.
generated_ids = model.generate(**model_inputs, max_new_tokens=2768)

# Drop the prompt tokens from the front and keep the continuation as a
# plain Python list of token ids.
output_ids = generated_ids[0, prompt_length:].tolist()


In [8]:
# Number of newly generated tokens (the prompt tokens were already sliced off).
len(output_ids)

255

In [9]:
# Parse the thinking content: find the split point, which is one past the
# LAST occurrence of token 151668 (</think>). Falls back to 0 when the
# token is absent, so everything is then treated as the final answer.
index = next(
    (pos + 1 for pos in reversed(range(len(output_ids))) if output_ids[pos] == 151668),
    0,
)
    

In [10]:
# Split position between thinking and answer tokens; 0 means no </think> token was found.
index

190

In [11]:

# Split the generated ids at the </think> boundary computed earlier.
thinking_ids = output_ids[:index]
answer_ids = output_ids[index:]

# Decode each part to text, dropping special tokens and trimming
# leading/trailing newlines only.
thinking_content = tokenizer.decode(thinking_ids, skip_special_tokens=True)
thinking_content = thinking_content.strip("\n")
content = tokenizer.decode(answer_ids, skip_special_tokens=True)
content = content.strip("\n")


In [12]:
# Decoded text of the tokens up to and including the last </think> token.
thinking_content

'<think>\nOkay, the user wants a short introduction to a large language model. Let me start by recalling what I know about LLMs. They\'re big, powerful models used for tasks like writing, translation, etc. I should mention their capabilities.\n\nI need to keep it concise. Maybe start with the basics: "Large Language Models (LLMs) are AI systems designed to understand and generate human language." Then talk about their main functions, like text generation, translation, and answering questions. \n\nWait, should I include something about training data? Probably yes. LLMs are trained on vast amounts of text to understand context. Also, mention their applications: creative writing, customer service, etc. \n\nMake sure the tone is friendly and informative. Avoid jargon. Keep it simple. Let me check if I\'m missing anything. Oh, maybe mention that they can be customized for specific tasks. Alright, that should cover it.\n</think>'

In [13]:
# Decoded text of the tokens after the </think> boundary (the final answer).
content

'Large Language Models (LLMs) are AI systems designed to understand and generate human language. They are trained on vast amounts of text to grasp context and learn patterns, enabling tasks like writing, translation, and answering questions. These models can be customized for specific applications, making them versatile tools for creative and professional tasks.'