In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import torch
device = "cuda" # target device name; NOTE(review): not referenced again in this cell — placement is delegated to device_map="auto" below

# Hugging Face Hub id of the checkpoint to load.
model_name = "Qwen/Qwen1.5-MoE-A2.7B"

# 4-bit QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                 # QLoRA = 4-bit base model
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",         # QLoRA uses nf4
    bnb_4bit_use_double_quant=True
)

# Tokenizer and model are both resolved from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the causal LM with the quantization config above; the library decides
# module placement because of device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Smoke test: switch to eval mode and generate a short continuation of "Hello!".
model.eval()
# Move the tokenized prompt to the device the model reports.
inputs = tokenizer("Hello!", return_tensors="pt").to(model.device)

# No gradients are needed for generation.
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=20)

# Decode the first (only) returned sequence, dropping special tokens.
print(tokenizer.decode(out[0], skip_special_tokens=True))



Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Hello! I'm a 20 year old girl from the Netherlands. I'm a student in the field


In [4]:
# Dump the module tree to inspect the architecture and see which layers
# show up as 4-bit `Linear4bit` modules.
print(model)

Qwen2MoeForCausalLM(
  (model): Qwen2MoeModel(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-23): 24 x Qwen2MoeDecoderLayer(
        (self_attn): Qwen2MoeSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): Qwen2MoeRotaryEmbedding()
        )
        (mlp): Qwen2MoeSparseMoeBlock(
          (gate): Linear4bit(in_features=2048, out_features=60, bias=False)
          (experts): ModuleList(
            (0-59): 60 x Qwen2MoeMLP(
              (gate_proj): Linear4bit(in_features=2048, out_features=1408, bias=False)
              (up_proj): Linear4bit(in_features=2048, out_features=1408, bias=False)
              (down_proj): Linear4bit(in_features=1408, out_fea

In [19]:
def find_all_router_layers(model):
    """Collect the MoE router (gate) modules of ``model``.

    A module counts as a router when the trailing components of its dotted
    name are one of:

    * ``mlp.gate``
    * ``mlp.shared_expert_gate``
    * ``mlp.gate.base_layer`` / ``mlp.shared_expert_gate.base_layer``
      (the wrapped layer when an adapter wraps the gate)

    Matching is done on whole name components rather than substrings, so
    look-alikes such as ``mlp.gate_proj`` or other children nested under a
    gate are not returned.

    Args:
        model: Object exposing ``named_modules()`` that yields
            ``(name, module)`` pairs (e.g. a ``torch.nn.Module``).

    Returns:
        tuple[list[str], list]: ``(router_layers, modules)`` — two parallel
        lists holding the matched names and the corresponding modules, in
        ``named_modules()`` order.
    """
    router_suffixes = (
        ("mlp", "gate"),
        ("mlp", "shared_expert_gate"),
        ("mlp", "gate", "base_layer"),
        ("mlp", "shared_expert_gate", "base_layer"),
    )

    router_layers = []
    modules = []
    for name, module in model.named_modules():
        parts = tuple(name.split("."))
        # Compare the tail of the path against each accepted suffix.
        if any(parts[-len(suffix):] == suffix for suffix in router_suffixes):
            router_layers.append(name)
            modules.append(module)
    return router_layers, modules

# Gather every router module, then peek at the quantization state of the
# first one only (enough to see how the 4-bit weights are stored).
routers, modules = find_all_router_layers(model)
first_router = modules[0]
print(first_router.quant_state.__dict__)

{'absmax': tensor([ 51, 198,  52,  ...,  59, 205, 192], dtype=torch.uint8), 'shape': torch.Size([60, 2048]), 'code': tensor([-1.0000, -0.6962, -0.5251, -0.3949, -0.2844, -0.1848, -0.0911,  0.0000,
         0.0796,  0.1609,  0.2461,  0.3379,  0.4407,  0.5626,  0.7230,  1.0000]), 'dtype': torch.float16, 'blocksize': 64, 'quant_type': 'nf4', 'offset': tensor(0.1111), 'state2': <bitsandbytes.functional.QuantState object at 0x7329c81a58b0>, 'nested': True}


In [24]:
import bitsandbytes as bnb

def get_gate_weight_4bit(gate):
    """Dequantize the packed 4-bit weight of a bitsandbytes ``Linear4bit`` layer.

    Args:
        gate: A quantized linear layer, e.g. the MoE router
            ``Linear4bit(in_features=2048, out_features=60, bias=False)``.

    Returns:
        torch.Tensor: Dense weight matrix of shape
        ``(gate.out_features, gate.in_features)`` in the dtype recorded in
        the quant state.

    Raises:
        ValueError: If no quant state can be found on either the weight
            parameter or the module (layer not quantized yet).
    """
    # The quant state is looked up on the weight parameter first and on the
    # module second. NOTE(review): bitsandbytes appears to keep the
    # authoritative copy on the Params4bit weight, with the module attribute
    # as a mirror that may be unset — confirm against the installed version.
    quant_state = getattr(gate.weight, "quant_state", None)
    if quant_state is None:
        quant_state = getattr(gate, "quant_state", None)
    if quant_state is None:
        raise ValueError(
            "No quant_state found on the layer or its weight; "
            "is the module quantized and moved to its device?"
        )

    # 1. Packed quantized weight (Params4bit -> uint8 tensor), stored as
    #    (N, 1) or (N,); flatten to 1-D before handing it to the kernel.
    qweight = gate.weight.data.view(-1)

    # 2. Dequantize using the stored quant state. The quant type is taken
    #    from the state itself instead of being hard-coded to "nf4".
    W = bnb.functional.dequantize_4bit(
        qweight,
        quant_state=quant_state,
        quant_type=quant_state.quant_type,
    )

    # 3. Restore the original matrix layout (out_features, in_features);
    #    reshape (rather than view) also copes with non-contiguous results.
    return W.reshape(gate.out_features, gate.in_features)


def get_router_l2_norms(model, verbose=True):
    """Compute the per-row L2 norm of every MoE router weight matrix.

    Only modules whose dotted name ends in the components ``mlp.gate`` are
    processed. Matching whole components (instead of a substring test) keeps
    look-alikes such as ``mlp.gate_proj`` or children nested under a gate
    from being passed to the 4-bit dequantizer.

    Args:
        model: Object exposing ``named_modules()``; matched modules must be
            accepted by ``get_gate_weight_4bit``.
        verbose: When true (default), print the module and the intermediate
            tensor shapes for each router, as a debugging aid.

    Returns:
        dict[str, torch.Tensor]: Maps each router's module name to a CPU
        tensor with one L2 norm per weight row (one per expert logit).
    """
    log = print if verbose else (lambda *args, **kwargs: None)
    layer_norms = {}

    for name, module in model.named_modules():
        if name.split(".")[-2:] != ["mlp", "gate"]:
            continue

        log(f"name: {name} module: {module}")
        weight = module.weight  # packed 4-bit storage, not (out, in)
        log(f"weights {weight.shape}")
        weight_dequantization = get_gate_weight_4bit(module)
        log(f"weight after dequantization {weight_dequantization.shape}")
        # Upcast to float32 before reducing to avoid half-precision error.
        norms = torch.norm(weight_dequantization.float(), dim=1)
        log(f" norms {norms.shape}")
        layer_norms[name] = norms.detach().cpu()

    return layer_norms

router_norms = get_router_l2_norms(model)

# One line per router: module name followed by its vector of per-expert norms.
for layer_name in router_norms:
    print(layer_name, router_norms[layer_name])

name: model.layers.0.mlp.gate module: Linear4bit(in_features=2048, out_features=60, bias=False)
weights torch.Size([61440, 1])
weight after dequantization torch.Size([60, 2048])
 norms torch.Size([60])
name: model.layers.1.mlp.gate module: Linear4bit(in_features=2048, out_features=60, bias=False)
weights torch.Size([61440, 1])
weight after dequantization torch.Size([60, 2048])
 norms torch.Size([60])
name: model.layers.2.mlp.gate module: Linear4bit(in_features=2048, out_features=60, bias=False)
weights torch.Size([61440, 1])
weight after dequantization torch.Size([60, 2048])
 norms torch.Size([60])
name: model.layers.3.mlp.gate module: Linear4bit(in_features=2048, out_features=60, bias=False)
weights torch.Size([61440, 1])
weight after dequantization torch.Size([60, 2048])
 norms torch.Size([60])
name: model.layers.4.mlp.gate module: Linear4bit(in_features=2048, out_features=60, bias=False)
weights torch.Size([61440, 1])
weight after dequantization torch.Size([60, 2048])
 norms torch.S