In [1]:
!pip list | grep neuron

libneuronxla                  2.2.3493.0+78c3e78c
neuronx-cc                    2.18.121.0+9e31e41a
neuronx-distributed           0.12.12111+cdd84048
neuronx-distributed-inference 0.3.5591+f50feae2
torch-neuronx                 2.6.0.2.7.5413+113e6810


In [None]:
import torch
from transformers import AutoTokenizer, GenerationConfig
from neuronx_distributed_inference.models.config import NeuronConfig, OnDeviceSamplingConfig
from neuronx_distributed_inference.utils.hf_adapter import HuggingFaceGenerationAdapter, load_pretrained_config

In [2]:
model_path = "/home/ubuntu/model_hf_qwen/qwen2/"
traced_model_path = "/home/ubuntu/traced_model_qwen/qwen2"

In [None]:
from huggingface_hub import snapshot_download

snapshot_download("Qwen/QwQ-32B", local_dir=model_path)

In [None]:
from modeling_qwen_v2 import Qwen2InferenceConfig, NeuronQwen2ForCausalLM

def run_qwen2_compile():
    # Initialize configs and tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="right")
    tokenizer.pad_token = tokenizer.eos_token

    generation_config = GenerationConfig.from_pretrained(model_path)
    generation_config_kwargs = {
        "do_sample": False,
        "top_k": 1,
        "pad_token_id": tokenizer.pad_token_id,
    }
    generation_config.update(**generation_config_kwargs)
 
    neuron_config = NeuronConfig(
        tp_degree=8,
        batch_size=1,
        max_context_length=128,
        seq_len=256,
        enable_bucketing=True,
        context_encoding_buckets=[128],
        token_generation_buckets=[256],
        flash_decoding_enabled=False,
        torch_dtype=torch.bfloat16,
        fused_qkv=False,
        attn_kernel_enabled=True,
        attn_cls="NeuronQwen2Attention"
    )
    config = Qwen2InferenceConfig(
        neuron_config,
        load_config=load_pretrained_config(model_path),
    )
    
    # Compile and save model.
    print("\nCompiling and saving model...")
    model = NeuronQwen2ForCausalLM(model_path, config)
    model.compile(traced_model_path)
    tokenizer.save_pretrained(traced_model_path)

In [None]:
run_qwen2_compile()

In [None]:
from modeling_qwen_v2 import Qwen2InferenceConfig, NeuronQwen2ForCausalLM

model = NeuronQwen2ForCausalLM(traced_model_path)
model.load(traced_model_path)

In [None]:
config = model.get_config_cls()
config.get_neuron_config_cls()

In [9]:
model.config.num_attention_heads

40

In [10]:
model.config.num_key_value_heads

8

In [11]:
model.config.hidden_size

5120

In [12]:
tokenizer = AutoTokenizer.from_pretrained(traced_model_path)
tokenizer.pad_token = tokenizer.eos_token
generation_config = GenerationConfig.from_pretrained(model_path)
generation_config_kwargs = {
    "do_sample": True,
    "temperature": 0.9,
    "top_k": 5,
    "pad_token_id": tokenizer.pad_token_id,
}

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt")
generation_model = HuggingFaceGenerationAdapter(model)
generated_ids = generation_model.generate(
    **model_inputs,
    max_new_tokens=128
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


"Okay, the user wants a short introduction to large language models. Let me start by defining what a large language model is. I should mention that they are AI systems trained on vast amounts of text data. Maybe include that they use deep learning, specifically transformer architectures.\n\nI need to highlight their capabilities, like generating text, understanding context, and performing various tasks such as answering questions, writing stories, or coding. It's important to note their scale—large parameter counts and extensive training data. \n\nAlso, touch on their applications: customer service, content creation, research, etc. Maybe mention some examples like GPT, BERT, or"

In [13]:
model.reset()

# Run Benchmarks

In [1]:
model_path = "/home/ubuntu/model_hf_qwen/qwen2"
traced_model_path = "/home/ubuntu/traced_model_qwen/qwen2/logit"

In [None]:
dir = '/opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/lib/python3.10/site-packages/neuronx_distributed_inference/'
!cp modeling_qwen2.py {dir}

# Edit the inference_demo.py file to include the following:

```python
from .modeling_qwen2 import NeuronQwen2ForCausalLM

MODEL_TYPES = {
    "llama": {"causal-lm": NeuronLlamaForCausalLM},
    "mixtral": {"causal-lm": NeuronMixtralForCausalLM},
    "dbrx": {"causal-lm": NeuronDbrxForCausalLM},
    'qwen2': {"causal-lm": NeuronQwen2ForCausalLM}
}
```

In [8]:
!inference_demo \
    --model-type qwen2 \
    --task-type causal-lm \
    run \
    --model-path /home/ubuntu/model_hf_qwen/qwen2 \
    --compiled-model-path /home/ubuntu/traced_model_qwen/qwen2/logit \
    --torch-dtype bfloat16 \
    --tp-degree 8 \
    --batch-size 1 \
    --max-context-length 16 \
    --seq-len 32 \
    --top-k 1 \
    --pad-token-id 151645 \
    --prompt "To be, or not to be" \
    --check-accuracy-mode logit-matching \
    --benchmark

  from neuronx_distributed.modules.moe.blockwise import (
  from neuronx_distributed.modules.moe.blockwise import (
  from neuronx_distributed.modules.moe.blockwise import (
  from neuronx_distributed_inference.modules.custom_calls import neuron_cumsum
  return fn(*args, **kwargs)
  from neuronx_distributed_inference.modules.attention.gqa import GQA, GroupQueryAttention_QKV
  from neuronx_distributed_inference.modules.attention.gqa import GQA, GroupQueryAttention_QKV
  from neuronx_distributed_inference.modules.attention.gqa import GQA, GroupQueryAttention_QKV
  from neuronx_distributed_inference.modules.attention.attention_base import NeuronAttentionBase
  from neuronx_distributed_inference.modules.attention.attention_base import NeuronAttentionBase
  from neuronx_distributed_inference.models.dbrx.modeling_dbrx import NeuronDbrxForCausalLM
  from neuronx_distributed_inference.models.mixtral.modeling_mixtral import NeuronMixtralForCausalLM
  from .modeling_mllama_vision import NeuronMl