In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Single source of truth for the checkpoint, so the model and the tokenizer
# are always loaded from the same repository.
MODEL_ID = "NousResearch/Llama-2-7b-chat-hf"

# 7B parameters in half precision (2 bytes per parameter): roughly 13 GB of
# weights end up on the GPU(s).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",           # place the weights automatically across the available GPUs
    torch_dtype=torch.float16,   # half precision to save GPU memory
    low_cpu_mem_usage=True,      # reduce CPU memory use while loading
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

  warn(f"Failed to load image Python extension: {e}")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Echo the loaded model so the notebook renders its module hierarchy
# (embedding, decoder layers, attention and MLP projections).
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      

| 组件                   | 是否加载到显存   | 说明                   |
| -------------------- | --------- | -------------------- |
| **模型权重（parameters）** | ✅         | 所有 `float16` 精度的权重张量 |
| **模型缓存（KV Cache）**   | ❌（初始为空）   | 运行推理时才会动态分配          |
| **Tokenizer**        | ❌         | 只在 CPU 上，轻量级         |
| **前向计算中间激活值**        | ❌（运行时才产生） | 每次推理临时分配             |


In [3]:
# Echo the tokenizer so the notebook renders its configuration
# (vocabulary size, padding/truncation side, special tokens).
tokenizer

LlamaTokenizerFast(name_or_path='NousResearch/Llama-2-7b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	32000: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
}

In [5]:
# List every parameter of the 5th decoder layer (layers[4]): name, shape, dtype.
fifth_layer = model.model.layers[4]
for name, param in fifth_layer.named_parameters():
    # The name is left-justified in a 30-character field; a longer name
    # overflows the field and pushes the rest of its row to the right.
    line = f"{name:30s} {param.shape}  dtype={param.dtype}"
    print(line)

self_attn.q_proj.weight        torch.Size([4096, 4096])  dtype=torch.float16
self_attn.k_proj.weight        torch.Size([4096, 4096])  dtype=torch.float16
self_attn.v_proj.weight        torch.Size([4096, 4096])  dtype=torch.float16
self_attn.o_proj.weight        torch.Size([4096, 4096])  dtype=torch.float16
mlp.gate_proj.weight           torch.Size([11008, 4096])  dtype=torch.float16
mlp.up_proj.weight             torch.Size([11008, 4096])  dtype=torch.float16
mlp.down_proj.weight           torch.Size([4096, 11008])  dtype=torch.float16
input_layernorm.weight         torch.Size([4096])  dtype=torch.float16
post_attention_layernorm.weight torch.Size([4096])  dtype=torch.float16


In [6]:
# Peek at the first 10 values of the layers[4] q_proj weight, in flattened order.
# Slice BEFORE moving to the CPU so only those 10 values are copied off the
# device, instead of transferring the whole 4096 x 4096 matrix first.
q_w = model.model.layers[4].self_attn.q_proj.weight.detach().flatten()[:10].cpu()
print(q_w)

tensor([-0.0167,  0.0033, -0.0019, -0.0051, -0.0019, -0.0066,  0.0121,  0.0112,
         0.0027,  0.0228], dtype=torch.float16)
