In [1]:
import torch
import torch.nn as nn
import bitsandbytes as bnb 
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model 

  warn(f"Failed to load image Python extension: {e}")


In [2]:
%load_ext watermark

In [3]:
%watermark --iversions

transformers: 4.46.3
torch       : 2.4.1
bitsandbytes: 0.41.3
peft        : 0.13.2



In [4]:
from watermark import watermark
print(watermark(packages='peft,torch,loralib,transformers,accelerate,datasets'))

peft        : 0.13.2
torch       : 2.4.1
loralib     : 0.1.2
transformers: 4.46.3
accelerate  : 1.0.1
datasets    : 3.1.0



In [5]:
import os#环境代理设置
os.environ["http_proxy"] = "http://127.0.0.1:7890"
os.environ["https_proxy"] = "http://127.0.0.1:7890"

In [6]:
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-7b1", 
    load_in_4bit=True
)

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-7b1")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
model.config

BloomConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "bigscience/bloom-7b1",
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "BloomForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_softmax_in_fp32": true,
  "bias_dropout_fusion": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_dropout": 0.0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "masked_softmax_fusion": true,
  "model_type": "bloom",
  "n_head": 32,
  "n_inner": null,
  "n_layer": 30,
  "offset_alibi": 100,
  "pad_token_id": 3,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float32",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "fp4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_thresho

In [8]:
model.get_input_embeddings()

Embedding(250880, 4096)

In [9]:
tokenizer

BloomTokenizerFast(name_or_path='bigscience/bloom-7b1', vocab_size=250680, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

freeze original weights

In [10]:
list(model.parameters())[0].dtype

torch.float16

In [11]:
model.lm_head.weight.dtype

torch.float16

In [12]:
for name, p in model.named_parameters():
    print(f"{name:40s} {p.dtype}  {p.numel():>10,d}")

transformer.word_embeddings.weight       torch.float16  1,027,604,480
transformer.word_embeddings_layernorm.weight torch.float16       4,096
transformer.word_embeddings_layernorm.bias torch.float16       4,096
transformer.h.0.input_layernorm.weight   torch.float16       4,096
transformer.h.0.input_layernorm.bias     torch.float16       4,096
transformer.h.0.self_attention.query_key_value.weight torch.uint8  25,165,824
transformer.h.0.self_attention.query_key_value.bias torch.float16      12,288
transformer.h.0.self_attention.dense.weight torch.uint8   8,388,608
transformer.h.0.self_attention.dense.bias torch.float16       4,096
transformer.h.0.post_attention_layernorm.weight torch.float16       4,096
transformer.h.0.post_attention_layernorm.bias torch.float16       4,096
transformer.h.0.mlp.dense_h_to_4h.weight torch.uint8  33,554,432
transformer.h.0.mlp.dense_h_to_4h.bias   torch.float16      16,384
transformer.h.0.mlp.dense_4h_to_h.weight torch.uint8  33,554,432
transformer.h.0.mlp.d

### 1. 8-bit 量化在 Transformers 里是怎么做的
- **只有 `nn.Linear` 的权重张量被量化成 INT8**（1 字节）。  
- **量化参数（scale/zero-point）和非线性层、bias、LayerNorm 等**仍保持 **float16**（2 字节）。  
- 量化后的 `Linear` 模块被替换成 `bnb.nn.Int8Params`，但 **模块外壳仍注册为 `nn.Parameter`**，因此 `model.parameters()` 遍历到的**第一层往往就是非量化的 float16 参数**（例如 embed/LayerNorm/bias）。

---

### 2. 为什么 `.dtype` 不直接显示 `torch.int8`
- **Int8Params 的 `.dtype` 被重载为原始反量化后的类型**（float16），方便框架内部做类型推断；  
- 真实存储是 **1 字节 + 缩放系数**，可通过：
  ```python
  p = model.transformer.h[0].self_attn.q_proj.weight
  print(p.dtype)          # torch.float16 （外壳类型）
  print(p.CB.dtype)       # torch.int8   （真实量化数据）
  print(p.SCB.dtype)      # torch.float16（缩放系数）
  ```

---

In [13]:
len(list(model.parameters()))

365

In [14]:
# for i, param in enumerate(model.parameters()):
#     param.requires_grad = False
#     if param.ndim == 1: #把一维的小参数（bias、LayerNorm 等）单独升到 float32，保证数值稳定，其余权重保持原精度不动
#         param.data = param.data.to(torch.float32)

In [15]:
for name, p in model.named_parameters():
    print(f"{name:40s} {p.dtype}  {p.numel():>10,d}")

transformer.word_embeddings.weight       torch.float16  1,027,604,480
transformer.word_embeddings_layernorm.weight torch.float16       4,096
transformer.word_embeddings_layernorm.bias torch.float16       4,096
transformer.h.0.input_layernorm.weight   torch.float16       4,096
transformer.h.0.input_layernorm.bias     torch.float16       4,096
transformer.h.0.self_attention.query_key_value.weight torch.uint8  25,165,824
transformer.h.0.self_attention.query_key_value.bias torch.float16      12,288
transformer.h.0.self_attention.dense.weight torch.uint8   8,388,608
transformer.h.0.self_attention.dense.bias torch.float16       4,096
transformer.h.0.post_attention_layernorm.weight torch.float16       4,096
transformer.h.0.post_attention_layernorm.bias torch.float16       4,096
transformer.h.0.mlp.dense_h_to_4h.weight torch.uint8  33,554,432
transformer.h.0.mlp.dense_h_to_4h.bias   torch.float16      16,384
transformer.h.0.mlp.dense_4h_to_h.weight torch.uint8  33,554,432
transformer.h.0.mlp.d

In [16]:
model.transformer.h[0].self_attention.dense.weight.shape

torch.Size([8388608, 1])

**微调超大模型时的标准“省显存 + 解冻”组合**

---

### 1. `model.gradient_checkpointing_enable()`
- **启用“梯度检查点”**（Gradient Checkpointing）。  
- 前向时**不保存中间激活**，反向到对应层再临时重算一次前向，**显存换时间**。  
- 通常可把显存占用 **砍 30-50 %**，适合 batch 较大或卡较小的情况。

---

### 2. `model.enable_input_require_grads()`
- 把 **输入 embedding 层**的 `requires_grad` 设成 `True`。  
- 默认冻结（`requires_grad=False`）时，**最前面一层的梯度不会回传**，也就无法：  
  – 做 **LoRA/AdaLoRA**（需要输入梯度）  
  – 对 **prompt-tuning / prefix-tuning** 等优化输入嵌入的场景  
- 调用后，模型会强制给 `inputs_embeds` 打开梯度开关，保证这些算法能正常 backward。

---

### 3. 一句话总结
> **“先开检查点省显存，再开输入梯度开关，让输入嵌入也能回传梯度，后面才能放心做 LoRA/prompt-tuning 等微调。”**

In [17]:
model.gradient_checkpointing_enable()  
model.enable_input_require_grads()

In [18]:
# class CastOutputToFloat(nn.Sequential):
#     def forward(self, x):
#         return super().forward(x).to(torch.float32) #将输出参数转为float32，权重参数不变
# model.lm_head = CastOutputToFloat(model.lm_head)

LoRA Adapters

In [19]:
def print_trainable_parameters(mode):
    trainable_params = 0
    all_param = 0
    for _,param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"训练参数: {trainable_params} ||所有参数: {all_param} || 训练参数占比%: {100 * trainable_params / all_param}"
    )

In [20]:
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=4,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [21]:
model = get_peft_model(model, config)

In [22]:
print_trainable_parameters(model)

训练参数: 1966080 ||所有参数: 4051083264 || 训练参数占比%: 0.04853220415071676


pipline

In [23]:
import transformers
from datasets import load_dataset
dataset = load_dataset("Abirate/english_quotes")

In [24]:
dataset

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags'],
        num_rows: 2508
    })
})

In [25]:
dataset['train']['quote'][:5]

['“Be yourself; everyone else is already taken.”',
 "“I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.”",
 "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”",
 '“So many books, so little time.”',
 '“A room without books is like a body without a soul.”']

In [26]:
dataset['train']['author'][:5]

['Oscar Wilde',
 'Marilyn Monroe',
 'Albert Einstein',
 'Frank Zappa',
 'Marcus Tullius Cicero']

In [27]:
dataset['train']['tags'][:5]

[['be-yourself',
  'gilbert-perreira',
  'honesty',
  'inspirational',
  'misattributed-oscar-wilde',
  'quote-investigator'],
 ['best', 'life', 'love', 'mistakes', 'out-of-control', 'truth', 'worst'],
 ['human-nature',
  'humor',
  'infinity',
  'philosophy',
  'science',
  'stupidity',
  'universe'],
 ['books', 'humor'],
 ['books', 'simile', 'soul']]

In [28]:
def merge(row):
    row['prediction'] = row['quote'] + ' ->: ' + str(row['tags'])
    return row
dataset['train'] = dataset['train'].map(merge)

In [29]:
dataset

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'prediction'],
        num_rows: 2508
    })
})

In [30]:
train_dataset = dataset.map(lambda x:tokenizer(x['prediction']), batched=True, remove_columns=['quote', 'author', 'tags', 'prediction'])

In [31]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2508
    })
})

In [32]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [33]:
trainer = Trainer(
    model=model, 
    train_dataset=train_dataset['train'],
    args=TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=200,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="bloom-7b1-lora-0927"
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False 
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss


OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/jiangzirou/.local/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 84, in _worker
    output = module(*input, **kwargs)
  File "/home/jiangzirou/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/jiangzirou/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/jiangzirou/.local/lib/python3.8/site-packages/peft/peft_model.py", line 1644, in forward
    return self.base_model(
  File "/home/jiangzirou/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/jiangzirou/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/jiangzirou/.local/lib/python3.8/site-packages/peft/tuners/tuners_utils.py", line 197, in forward
    return self.model.forward(*args, **kwargs)
  File "/home/jiangzirou/.local/lib/python3.8/site-packages/accelerate/hooks.py", line 170, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/jiangzirou/.local/lib/python3.8/site-packages/transformers/models/bloom/modeling_bloom.py", line 999, in forward
    loss = loss_fct(
  File "/home/jiangzirou/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/jiangzirou/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/jiangzirou/.local/lib/python3.8/site-packages/torch/nn/modules/loss.py", line 1188, in forward
    return F.cross_entropy(input, target, weight=self.weight,
  File "/home/jiangzirou/.local/lib/python3.8/site-packages/torch/nn/functional.py", line 3104, in cross_entropy
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.54 GiB. GPU 0 has a total capacity of 23.64 GiB of which 324.44 MiB is free. Including non-PyTorch memory, this process has 23.32 GiB memory in use. Of the allocated memory 18.80 GiB is allocated by PyTorch, and 4.20 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
