In [1]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import os  # environment proxy settings
# Point both the http_proxy and https_proxy environment variables at a local
# proxy address (presumably so that model downloads go through it — confirm).
os.environ["http_proxy"] = "http://127.0.0.1:7890"
os.environ["https_proxy"] = "http://127.0.0.1:7890"

In [None]:
# Load GPT-2 without a dtype override (all float32) and report its
# memory footprint converted from bytes to MiB.
model = AutoModelForCausalLM.from_pretrained("gpt2")
footprint_mib = model.get_memory_footprint() / 2**20
print(footprint_mib)

486.7002410888672


In [10]:
# Tally the total and the trainable parameter counts in a single pass.
total_params = 0
trainable_params = 0
for p in model.parameters():
    n_elements = p.numel()
    total_params += n_elements
    if p.requires_grad:
        trainable_params += n_elements

print(f"总参数量 : {total_params:,}")
print(f"可训练量 : {trainable_params:,}")

总参数量 : 124,439,808
可训练量 : 124,439,808


float32 一个参数占4字节

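0.1B 参数 × 4 字节 ≈ 0.5 GB，其中参数本身约 474.7 MiB：

$124{,}439{,}808 \times 4 / 2^{20} \approx 474.7$

`get_memory_footprint()` 默认还会把 buffer 一并计入（GPT-2 每层注意力各有一个 1024×1024 的因果掩码 buffer，12 层合计约 12 MiB），所以总占用为 $474.7 + 12 \approx 486.7$ MiB。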

In [13]:
# Reload GPT-2 with every weight in half precision (float16) and report
# the memory footprint converted from bytes to MiB.
model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float16)
footprint_mib = model.get_memory_footprint() / 2**20
print(footprint_mib)

249.3501205444336


In [None]:
# torch.int8
# Mixed-precision quantization:
# small, precision-sensitive tensors (bias, norm, embedding) are skipped automatically,
# and only the large weight matrices (c_attn.weight, c_proj.weight, c_fc.weight, ...) are quantized.
#
# The quantization settings are passed as a BitsAndBytesConfig through
# `quantization_config`; the bare `load_in_8bit=True` keyword is deprecated
# and will be removed in a future transformers version.
model = AutoModelForCausalLM.from_pretrained(
    'gpt2',
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
print(model.get_memory_footprint() / (1024**2))

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


168.3501205444336


In [16]:
# Print the dtype, qualified name and device of every registered parameter.
for param_name, param in model.named_parameters():
    print(param.dtype, param_name, param.device)

torch.float16 transformer.wte.weight cuda:0
torch.float16 transformer.wpe.weight cuda:0
torch.float16 transformer.h.0.ln_1.weight cuda:0
torch.float16 transformer.h.0.ln_1.bias cuda:0
torch.int8 transformer.h.0.attn.c_attn.weight cuda:0
torch.float16 transformer.h.0.attn.c_attn.bias cuda:0
torch.int8 transformer.h.0.attn.c_proj.weight cuda:0
torch.float16 transformer.h.0.attn.c_proj.bias cuda:0
torch.float16 transformer.h.0.ln_2.weight cuda:0
torch.float16 transformer.h.0.ln_2.bias cuda:0
torch.int8 transformer.h.0.mlp.c_fc.weight cuda:0
torch.float16 transformer.h.0.mlp.c_fc.bias cuda:0
torch.int8 transformer.h.0.mlp.c_proj.weight cuda:0
torch.float16 transformer.h.0.mlp.c_proj.bias cuda:0
torch.float16 transformer.h.1.ln_1.weight cuda:0
torch.float16 transformer.h.1.ln_1.bias cuda:0
torch.int8 transformer.h.1.attn.c_attn.weight cuda:0
torch.float16 transformer.h.1.attn.c_attn.bias cuda:0
torch.int8 transformer.h.1.attn.c_proj.weight cuda:0
torch.float16 transformer.h.1.attn.c_proj.bi

In [None]:
# Print the object identity of the output-head weight and of the token-embedding
# weight; equal ids mean both attributes refer to the same tensor object.
for weight in (model.lm_head.weight, model.transformer.wte.weight):
    print(id(weight))

140604414518752
140604414518752


In [17]:
# Display the module tree of the currently loaded model.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Linear8bitLt(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear8bitLt(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear8bitLt(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear8bitLt(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, eleme

In [22]:
# Show whether the lm_head weight is flagged as requiring gradients.
model.lm_head.weight.requires_grad

True

- transformer.wte.weight、transformer.wpe.weight：torch.float16
- h.0 - h.11
    - ln_1.weight, ln_1.bias, ln_2.weight, ln_2.bias: torch.float16
    - attn
        - c_attn.weight: torch.int8
            - bias: torch.float16
        - c_proj.weight: torch.int8
            - bias: torch.float16
    - mlp
        - c_fc.weight: torch.int8
            - bias: torch.float16
        - c_proj.weight: torch.int8
            - bias: torch.float16
- ln_f.weight, ln_f.bias: torch.float16

lm_head 权重与 token embedding 权重 在 GPT-2 里**共享同一矩阵**
结构图里虽然出现 lm_head，但它 不是独立参数，而是 transformer.wte.weight 的 引用（alias），所以 named_parameters() 自动去重只列一次

---

demo

In [25]:
# Inspect the autocast context-manager class exposed under torch.cuda.amp.
torch.cuda.amp.autocast

torch.cuda.amp.autocast_mode.autocast

In [29]:
# Create some tensors in the default dtype (here assumed to be float32).
a_float32 = torch.rand((8, 8), device="cuda")
b_float32 = torch.rand((8, 8), device="cuda")
c_float32 = torch.rand((8, 8), device="cuda")
d_float32 = torch.rand((8, 8), device="cuda")

# Enable automatic mixed precision. torch.autocast(device_type="cuda") replaces
# the deprecated torch.cuda.amp.autocast() entry point.
with torch.autocast(device_type="cuda"):
    # Inside this context PyTorch automatically runs suitable ops (e.g. matrix
    # multiplication, convolution) in float16 (half precision), while
    # precision-sensitive ops (e.g. normalization, softmax) stay in float32.
    e = torch.mm(a_float32, b_float32)
    print('in autocast', e.dtype, e.device)

    f = torch.mm(d_float32, e)
    # Report f's own device (the original printed e.device here).
    print('in autocast', f.dtype, f.device)

# After leaving autocast, dtypes are no longer converted automatically:
# call .float() explicitly to compute in float32 ...
g_float32 = torch.mm(d_float32, f.float())
print('out autocast', g_float32.dtype, g_float32.device)

# ... or call .half() explicitly so both operands are float16.
h_float16 = torch.mm(d_float32.half(), f)
print('out autocast', h_float16.dtype, h_float16.device)

in autocast torch.float16 cuda:0
in autocast torch.float16 cuda:0
out autocast torch.float32 cuda:0
out autocast torch.float16 cuda:0


  with torch.cuda.amp.autocast(): #开启「自动混合精度」环境


## 前置知识

#### ✅ FP16（半精度）优势：
- **动态范围足够**：虽然FP16表示范围比FP32小，但在大多数深度学习计算中是够用的。
- **计算快**：在带 Tensor Core 的 GPU 上，FP16 的计算吞吐量最高可达 FP32 的约**8倍**。
- **内存效率高**：
  - 内存带宽提升**2倍**（因为数据更小，传输更快）。
  - 显存占用减半（**1/2x**），可以训练更大的模型或batch size。

#### ⚠️ 但FP16也有问题：
- **精度不足**：某些操作（如累加、指数运算、权重更新）在FP16下容易**下溢（underflow）**或**精度不够**。
  - 举例：`1 + 0.0001` 在FP16下直接等于`1`，因为`0.0001`被“吃掉”了。
  - 权重更新时，如果更新量相对权重太小（比如权重约为 1 时，更新量小于 $2^{-11} \approx 0.00049$），FP16直接忽略，**模型不学习**。

---

### ✅ 所以需要混合精度（Mixed Precision）：
- **用FP16做大部分计算**（快、省内存）。
- **用FP32做关键操作**（保证精度）：
  - **累加（reductions）**
  - **指数运算（exponentiation）**
  - **权重更新（weight updates）**

---

> **FP16快但“粗心”，FP32慢但“细心”，混合精度训练就是让他俩分工合作，既快又准。**

In [26]:
import torch
# float16 keeps 10 fraction bits, so the spacing between representable values
# just above 1.0 is 2**-10. Adding 2**-12 (less than half that spacing)
# rounds straight back to 1.0.
# torch.tensor(..., dtype=..., device=...) is used instead of the legacy
# torch.cuda.HalfTensor constructor.
a = torch.tensor([2**-12], dtype=torch.float16, device='cuda')
one = torch.tensor([1.0], dtype=torch.float16, device='cuda')
print(a)              # tensor([0.0002], device='cuda:0', dtype=torch.float16)
print(one + a)        # tensor([1.], device='cuda:0', dtype=torch.float16)
print(one + a == one) # tensor([True], device='cuda:0')

tensor([0.0002], device='cuda:0', dtype=torch.float16)
tensor([1.], device='cuda:0', dtype=torch.float16)
tensor([True], device='cuda:0')


In [27]:
import torch
# Same experiment as the float16 case, but in float32: 2**-12 is large enough
# relative to 1.0 to survive the addition.
a = torch.tensor([2**-12], dtype=torch.float32, device='cuda')
one = torch.tensor([1.0], dtype=torch.float32, device='cuda')
print(a)              # tensor([0.0002], device='cuda:0')
print(one + a)        # tensor([1.0002], device='cuda:0')
print(one + a == one) # tensor([False], device='cuda:0')

tensor([0.0002], device='cuda:0')
tensor([1.0002], device='cuda:0')
tensor([False], device='cuda:0')


In [28]:
# The exact value of 4096 elements that are each 16 summed together.
4096*16

65536

In [29]:
# torch.float16
# Build 4096 elements equal to 16 in one step with torch.full, instead of
# allocating an uninitialized legacy torch.cuda.HalfTensor and filling it afterwards.
a = torch.full((4096,), 16, dtype=torch.float16, device='cuda')
a.sum()

tensor(inf, device='cuda:0', dtype=torch.float16)

In [30]:
# torch.float32
# Build 4096 elements equal to 16 in one step with torch.full, instead of
# allocating an uninitialized legacy torch.cuda.FloatTensor and filling it afterwards.
a = torch.full((4096,), 16, dtype=torch.float32, device='cuda')
a.sum()

tensor(65536., device='cuda:0')

In [31]:
# float16 weight of 1.0 plus an update of 1e-4, created with torch.tensor
# rather than the legacy torch.cuda.HalfTensor constructor.
para = torch.tensor([1.], dtype=torch.float16, device='cuda')
update = torch.tensor([.0001], dtype=torch.float16, device='cuda')
para + update

tensor([1.], device='cuda:0', dtype=torch.float16)

In [33]:
# float16 weight of 1.0 plus an update of 1e-3, created with torch.tensor
# rather than the legacy torch.cuda.HalfTensor constructor.
para = torch.tensor([1.], dtype=torch.float16, device='cuda')
update = torch.tensor([.001], dtype=torch.float16, device='cuda')
para + update

tensor([1.0010], device='cuda:0', dtype=torch.float16)

In [34]:
# float32 weight of 1.0 plus an update of 1e-4, created with torch.tensor
# rather than the legacy torch.cuda.FloatTensor constructor.
para = torch.tensor([1.], dtype=torch.float32, device='cuda')
update = torch.tensor([.0001], dtype=torch.float32, device='cuda')
para + update

tensor([1.0001], device='cuda:0')

In [None]:
# float32 weight of 1.0 plus an update of 1e-4, created with torch.tensor
# rather than the legacy torch.cuda.FloatTensor constructor.
# NOTE(review): this cell repeats the previous float32 / 1e-4 experiment with
# the same values — confirm whether a different update size was intended.
para = torch.tensor([1.], dtype=torch.float32, device='cuda')
update = torch.tensor([.0001], dtype=torch.float32, device='cuda')
para + update

混合精度训练的核心循环：
- FP16 跑前向+反向（快）；
- FP32 做梯度累加与权重更新（准）；
- 再把权重压回 FP16 继续下一轮。

### loss scaling

``` python
scaler = GradScaler()

# forward: run the model and the loss computation under autocast
with autocast():
    output = model(input)
    loss = loss_fn(output, target)

# backward: backpropagate the scaled loss, then let the scaler drive the
# optimizer step and refresh its scale factor
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
```

---

### master-weights-scale

``` python
# Compute gradients
loss.backward()

# Copy the computed gradients from the float16 model to the float32 model
for param, param_float32 in zip(model.parameters(), model_float32.parameters()):
    if param.grad is not None:
        # Cast the gradient to float32 and multiply it by scale_factor.
        # NOTE(review): if the loss was multiplied by a loss scale before
        # backward(), scale_factor presumably has to be the inverse of that
        # scale so the master gradient is unscaled — confirm.
        param_float32.grad = param.grad.float() * scale_factor  # apply gradient scaling

# Update the master weights (float32 model)
optimizer.step()
```