In [3]:
import os
import json
import torch

# import the bits you need
from qwen3_manual import QwenMoe, load_weights, generate
from tokenizers import Tokenizer

# 1) load config & tokenizer
model_dir = "/home/zheng/LLMs/Qwen3-30B-A3B"
with open(os.path.join(model_dir, "config.json")) as f:
    cfg = json.load(f)
tokenizer = Tokenizer.from_file(os.path.join(model_dir, "tokenizer.json"))

# 2) build the model
model = QwenMoe(cfg).to(memory_format=torch.channels_last)
#   (the script sets TARGET_DTYPE from its CLI; 
#    if you want float32, set torch.TARGET_DTYPE before loading)

# 3) load the weights
print("Loading weights…")
load_weights(model, model_dir)




Loading weights…
#0  loading model-00016-of-00016.safetensors …
#1  loading model-00001-of-00016.safetensors …
#2  loading model-00004-of-00016.safetensors …
#3  loading model-00005-of-00016.safetensors …
#4  loading model-00006-of-00016.safetensors …
#5  loading model-00007-of-00016.safetensors …
#6  loading model-00002-of-00016.safetensors …
#7  loading model-00008-of-00016.safetensors …
#8  loading model-00009-of-00016.safetensors …
#9  loading model-00010-of-00016.safetensors …
#10  loading model-00011-of-00016.safetensors …
#11  loading model-00012-of-00016.safetensors …
#12  loading model-00013-of-00016.safetensors …
#13  loading model-00014-of-00016.safetensors …
#14  loading model-00015-of-00016.safetensors …
#15  loading model-00003-of-00016.safetensors …


QwenMoe(
  (model): _QwenBody(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-47): 48 x DecoderLayer(
        (self_attn): QwenAttention(
          (q_proj): Linear(in_features=2048, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2048, bias=False)
          (q_norm): RMSNorm()
          (k_norm): RMSNorm()
        )
        (mlp): SparseMoe(
          (gate): Linear(in_features=2048, out_features=128, bias=False)
          (experts): ModuleList(
            (0-127): 128 x ExpertMLP(
              (gate_proj): Linear(in_features=2048, out_features=768, bias=False)
              (up_proj): Linear(in_features=2048, out_features=768, bias=False)
              (down_proj): Linear(in_features=768, out_features=2048, bias=False)
              (act): SiLU()
            )
  

In [26]:
# 4) prepare input IDs
prompt = "Hello from notebook"
input_ids = torch.tensor([tokenizer.encode(prompt).ids], dtype=torch.long)

# 5) generate
out_ids = generate(
    model, input_ids,
    max_new=4,
    temperature=0.8,
    top_p=0.95,
    top_k=50,
    eos=tokenizer.token_to_id("<|im_end|>")
)

# 6) decode & inspect
generated = tokenizer.decode(out_ids[0].tolist())
print(generated)

Hello from notebook演示常用常用常用


In [27]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_dir)
# Prepare the model input
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True # Switch between thinking and non-thinking modes. Default is True.
)
model_inputs = tokenizer([text], return_tensors="pt")

# Initialize the input ids for the generation
input_ids = model_inputs['input_ids']

# 5) generate
out_ids = generate(
    model, input_ids,
    max_new=16,
    temperature=0,
    top_p=0.95,
    top_k=50,
    eos=tokenizer.eos_token_id
)

# 6) decode & inspect
generated = tokenizer.decode(out_ids[0].tolist())
print(generated)

<|im_start|>user
Give me a short introduction to large language model.<|im_end|>
<|im_start|>assistant
 hashtag webinar deadline webinar webinar deadline免费免费免费免费免费免费免费免费免费免费


In [None]:
# ───────────────────────────────────────────────────────────────
# 0.  Imports & paths
# ───────────────────────────────────────────────────────────────
import os, json, gc, math, torch, safetensors.torch as st
from tokenizers import Tokenizer
from transformers import AutoModelForCausalLM, AutoConfig

from qwen3_manual import QwenMoe, load_weights        # ← your script


model_dir = "/home/zheng/LLMs/Qwen3-30B-A3B"
tok = Tokenizer.from_file(os.path.join(model_dir, "tokenizer.json"))

# ───────────────────────────────────────────────────────────────
# 1.  Build & load *our* model (once)
# ───────────────────────────────────────────────────────────────
with open(os.path.join(model_dir, "config.json")) as f:
    cfg = json.load(f)

our_model = model

print("✓  our model ready")

# ───────────────────────────────────────────────────────────────
# 2.  Safer generate()  (handles temperature = 0)
# ───────────────────────────────────────────────────────────────
@torch.no_grad()
def generate(model, input_ids, max_new=16, temperature=0.7,
             top_p=0.95, top_k=50, eos=None):
    for _ in range(max_new):
        logits = model(input_ids)[:, -1, :]
        if temperature == 0:                       # greedy
            next_token = logits.argmax(-1, keepdim=True)
        else:
            logits = logits / temperature
            top_k = min(top_k, logits.size(-1))
            topk_vals, topk_idx = torch.topk(logits, top_k, dim=-1)
            probs = torch.softmax(topk_vals, -1)
            next_token = topk_idx.gather(-1, torch.multinomial(probs, 1))
        input_ids = torch.cat([input_ids, next_token], dim=-1)
        if eos is not None and next_token.item() == eos:
            break
    return input_ids

# ───────────────────────────────────────────────────────────────
# 3.  Quick sanity: greedy generation
# ───────────────────────────────────────────────────────────────
prompt = "<|im_start|>user\nGive me a short introduction to large language model.<|im_end|>\n<|im_start|>assistant\n"
ids = torch.tensor([tok.encode(prompt).ids])
print(tok.decode(generate(our_model, ids, max_new=32, temperature=0)[0].tolist()))

# ───────────────────────────────────────────────────────────────
# 4.  Helper: run *our* network up to layer N
# ───────────────────────────────────────────────────────────────
def ours_hidden(input_ids, upto):
    h = our_model.model.embed_tokens(input_ids)
    for i in range(upto + 1):
        h = our_model.model.layers[i](h, our_model.model.rope)
    return our_model.model.norm(h)

# ───────────────────────────────────────────────────────────────
# 5.  Helper: run HF **single layer** without loading the whole net
# ───────────────────────────────────────────────────────────────
# ── fixed helper: run a single HF layer without loading the whole model ──
# ── helper: run a single HF layer without loading full model ────────────
# ── helper: run a single HF layer, robust to 4.48 → 4.51 APIs ────────────────
# ── run a single HF layer without loading the whole model ─────────────
def hf_one_layer(input_ids, layer_idx):
    import gc, json, torch, safetensors.torch as st
    from transformers import AutoConfig, AutoModelForCausalLM

    # 1) config trimmed to 1 decoder layer
    cfg = AutoConfig.from_pretrained(model_dir)
    cfg.num_hidden_layers = 1
    hf = AutoModelForCausalLM.from_config(cfg, torch_dtype=torch.bfloat16).eval()

    # 2) load only needed weights
    wanted = [
        "model.embed_tokens",
        f"model.layers.{layer_idx}",
        "model.norm",
        "model.rotary_emb",
    ]
    fmap = json.load(open(os.path.join(model_dir, "model.safetensors.index.json")))["weight_map"]
    names = [n for n in fmap if any(n.startswith(w) for w in wanted)]
    shard_to_names = {}
    for n in names:
        shard_to_names.setdefault(fmap[n], []).append(n)

    sd = hf.state_dict()
    for shard, keys in shard_to_names.items():
        tensors = st.load_file(os.path.join(model_dir, shard), device="cpu")
        for k in keys:
            sd[k].copy_(tensors[k].to(sd[k].dtype))
        del tensors
        gc.collect()

    # 3) forward through the single layer (let HF handle RoPE internally)
    with torch.no_grad():
        h = hf.model.embed_tokens(input_ids)          # [B,T,D]
        pos_ids = torch.arange(h.size(1), device=h.device).unsqueeze(0)
        h = hf.model.layers[0](h, position_ids=pos_ids)   # no attention_mask
        h = hf.model.norm(h)

    del hf, sd
    gc.collect()
    return h


    del hf, state; gc.collect()
    return h


# ───────────────────────────────────────────────────────────────
# 6.  Compare layers one-by-one (always <4 GB at a time)
# ───────────────────────────────────────────────────────────────
ids = torch.tensor([tok.encode("Hello").ids])
for L in range(cfg["num_hidden_layers"]):
    ours = ours_hidden(ids, L)
    ref  = hf_one_layer(ids, L)
    if torch.allclose(ours, ref, atol=1e-2, rtol=1e-2):
        print(f"✓ layer {L} matches")
    else:
        print(f"❌ divergence at layer {L}")
        break


✓  our model ready
user
Give me a short introduction to large language model.
assistant
 hashtag webinar deadline webinar webinar deadline免费免费免费免费免费免费免费免费免费免费免费免费免费免费免费免费免费免费免费免费免费免费免费订阅免费订阅


TypeError: Qwen3MoeRotaryEmbedding.forward() missing 1 required positional argument: 'x'