In [None]:
!pip list | grep neuron

In [2]:
import torch

from transformers import AutoTokenizer, GenerationConfig
from modeling_qwen import Qwen2InferenceConfig, NeuronQwen2ForCausalLM
from neuronx_distributed_inference.models.config import NeuronConfig, OnDeviceSamplingConfig
from neuronx_distributed_inference.utils.hf_adapter import HuggingFaceGenerationAdapter, load_pretrained_config

In [3]:
model_path = "/home/ubuntu/model_hf_qwq/qwq/"
traced_model_path = "/home/ubuntu/traced_model_qwq/qwq/"

In [None]:
from huggingface_hub import HfFolder
HfFolder.save_token("YOUR TOKEN HERE")

In [None]:
from huggingface_hub import snapshot_download

snapshot_download("Qwen/QwQ-32B", local_dir=model_path)

In [11]:
from modeling_qwen import Qwen2InferenceConfig, NeuronQwen2ForCausalLM

def run_qwq_compile():
    # Initialize configs and tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="right")
    tokenizer.pad_token = tokenizer.eos_token

    generation_config = GenerationConfig.from_pretrained(model_path)
    generation_config_kwargs = {
        "do_sample": True,
        "top_k": 1,
        "pad_token_id": tokenizer.pad_token_id,
    }
    generation_config.update(**generation_config_kwargs)
 
    neuron_config = NeuronConfig(
        tp_degree=8,
        batch_size=1,
        max_context_length=4096,
        seq_len=8096,
        on_device_sampling_config=OnDeviceSamplingConfig(top_k=5),
        enable_bucketing=True,
        context_encoding_buckets=[128, 1024, 4096],
        token_generation_buckets=[128, 1024, 8096],
        flash_decoding_enabled=False,
        torch_dtype=torch.bfloat16,
        fused_qkv=False,
        attn_cls="NeuronQwen2Attention"
    )
    config = Qwen2InferenceConfig(
        neuron_config,
        load_config=load_pretrained_config(model_path),
    )
    
    # Compile and save model.
    print("\nCompiling and saving model...")
    model = NeuronQwen2ForCausalLM(model_path, config)
    model.compile(traced_model_path)
    tokenizer.save_pretrained(traced_model_path)

In [None]:
run_qwq_compile()

# Run inference

In [None]:
model = NeuronQwen2ForCausalLM(traced_model_path)
model.load(traced_model_path)

In [29]:
tokenizer = AutoTokenizer.from_pretrained(traced_model_path)
tokenizer.pad_token = tokenizer.eos_token
generation_config = GenerationConfig.from_pretrained(model_path)
generation_config_kwargs = {
    "do_sample": True,
    "temperature": 0.9,
    "top_k": 5,
    "pad_token_id": tokenizer.pad_token_id,
}
generation_config.update(**generation_config_kwargs)
generation_model = HuggingFaceGenerationAdapter(model)

In [32]:
# Define a list of prompts
prompts = [
    "How many r's are in the word \"strawberry\"",
]

# Create messages for each prompt
messages_list = [
    [{"role": "user", "content": prompt}] for prompt in prompts
]

# Apply chat template to each set of messages
texts = [
    tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    ) for messages in messages_list
]

# Tokenize the batch of texts
model_inputs = tokenizer(texts, return_tensors="pt", padding=True)

In [33]:
print("\nGenerating outputs...")
outputs = generation_model.generate(
    **model_inputs,
    generation_config=generation_config,
    max_length=model.config.neuron_config.max_length,
)
output_tokens = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)

In [None]:
from IPython.display import display
display("Generated outputs:")
for i, output_token in enumerate(output_tokens):
    display(f"Output {i}: {output_token}")

'Generated outputs:'

'Output 0: user\nHow many r\'s are in the word "strawberry"\nassistant\n<think>\nOkay, so I need to figure out how many times the letter \'r\' appears in the word "strawberry." Let me start by writing down the word and looking at each letter one by one. \n\nFirst, I\'ll spell out "strawberry" to make sure I have all the letters right. S-T-R-A-W-B-E-R-R-Y. Wait, let me check that again. Sometimes I might miss a letter. Let me count the letters as I write them:\n\n1. S\n2. T\n3. R\n4. A\n5. W\n6. B\n7. E\n8. R\n9. R\n10. Y\n\nHmm, so that\'s 10 letters in total. Now, I need to count how many times \'R\' shows up. Let me go through each letter again and note the positions where \'R\' is.\n\nStarting from the first letter:\n1. S – not an R\n2. T – not an R\n3. R – that\'s the first R\n4. A – no\n5. W – no\n6. B – no\n7. E – no\n8. R – second R\n9. R – third R\n10. Y – no\n\nWait a second, so after the first R at position 3, the next R is at position 8, and then another at 9? Let me confirm

In [37]:
model.reset()

In [38]:
del model

Test Token Output

In [3]:
dir = '/opt/aws_neuronx_venv_pytorch_2_5_nxd_inference/lib/python3.10/site-packages/neuronx_distributed_inference/'
!cp modeling_qwen.py {dir}

In [None]:
!cp {dir}/inference_demo.py .

# Add the following to the inference_demo.py we just copied to our working directory

```
from .modeling_qwen import NeuronQwen2ForCausalLM

MODEL_TYPES = {
    "llama": {"causal-lm": NeuronLlamaForCausalLM},
    "mixtral": {"causal-lm": NeuronMixtralForCausalLM},
    "dbrx": {"causal-lm": NeuronDbrxForCausalLM},
    "qwen": {'causal-lm': NeuronQwen2ForCausalLM} #add this line
}
```

In [None]:
!cp ./inference_demo.py {dir}/inference_demo.py

# Restart your kernel

In [None]:
!inference_demo \
    --model-type qwen \
    --task-type causal-lm \
    run \
    --model-path /home/ubuntu/model_hf_qwq/qwq/ \
    --compiled-model-path /home/ubuntu/traced_model_qwq/qwq/ \
    --torch-dtype bfloat16 \
    --tp-degree 8 \
    --batch-size 1 \
    --max-context-length 32 \
    --seq-len 64 \
    --on-device-sampling \
    --enable-bucketing \
    --top-k 1 \
    --do-sample \
    --pad-token-id 32000 \
    --prompt "To be, or not to be" \
    --check-accuracy-mode token-matching \
    --benchmark