# Transformers 模型量化技术：GPTQ

## 使用 GPTQ 量化模型
### 使用`GPTQ`算法支持的默认数据集来量化

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
import torch

# Hub id of the base checkpoint to quantize. The variable is spelled `mode_...`
# (not `model_...`) on purpose: other cells read this exact name.
mode_name_or_path = "facebook/opt-2.7b"

# GPTQ settings: 4-bit weights, group size 128, the built-in "wikitext2"
# dataset, and desc_act (activation-order quantization) switched off.
quantization_config = GPTQConfig(bits=4, group_size=128, dataset="wikitext2", desc_act=False)

In [None]:
# Layer-by-layer quantization: handing a GPTQConfig to from_pretrained makes
# the checkpoint get quantized as part of loading it.
quant_model = AutoModelForCausalLM.from_pretrained(
    mode_name_or_path, device_map="auto", quantization_config=quantization_config
)

### 检查量化模型正确性

In [None]:
# Inspect the first decoder layer's query projection to confirm it was
# quantized. OPTForCausalLM keeps its decoder under `.model`, so the path is
# `model.decoder.layers`; `quant_model.decoder` raises AttributeError.
quant_model.model.decoder.layers[0].self_attn.q_proj.__dict__

In [None]:
# Save the quantized model weights to the local directory
# "models/opt-2.7b-gptq".

quant_model.save_pretrained("models/opt-2.7b-gptq")

#### 使用 GPU 加载模型并生成文本

In [None]:
# Tokenizer of the original (unquantized) checkpoint; reused by later cells.
tokenizer = AutoTokenizer.from_pretrained(mode_name_or_path)

text = "Merry Christmas! I'm glad to"
# Move the inputs to the device the model actually lives on instead of a
# hard-coded GPU index, so the cell does not depend on the model being on GPU 0.
inputs = tokenizer(text, return_tensors="pt").to(quant_model.device)

# Generate up to 64 new tokens and print the decoded text without special tokens.
out = quant_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))

### 使用自定义数据集量化模型（灵活可扩展，前提是准备好数据）

In [None]:
# Quantize with a user-supplied dataset (a list of strings) instead of one of
# the built-in dataset names.
# NOTE(review): the original note on this cell said "this is not needed" --
# presumably it referred to the re-import below, since earlier cells already
# import these names. The imports are kept so the cell runs on its own.
from transformers import AutoModelForCausalLM, GPTQConfig, AutoTokenizer
import torch  # needed for torch.float16 below

model_name_or_path = "facebook/opt-2.7b"
custom_dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]

custom_quantization_config = GPTQConfig(
    bits=4,
    group_size=128,
    desc_act=False,
    dataset=custom_dataset
)

# Load from the `model_name_or_path` defined in this cell. The original read
# the differently spelled global `mode_name_or_path`, which left this cell's
# own variable unused and made the result depend on whichever checkpoint id an
# earlier cell last assigned (e.g. the OPT-6.7B homework cells).
custom_quant_model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                          quantization_config=custom_quantization_config,
                                                          torch_dtype=torch.float16,
                                                          device_map="auto")

In [None]:
# Generate text with the model quantized on the custom dataset.
# `tokenizer` is the one created in an earlier cell.
text = "Merry Christmas! I'm glad to"
# Move the inputs to the model's own device rather than a hard-coded GPU index.
inputs = tokenizer(text, return_tensors="pt").to(custom_quant_model.device)

# Generate up to 64 new tokens and print the decoded text without special tokens.
out = custom_quant_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))

In [None]:
# week 4 作业-2

# HOMEWORK: 使用GPTQ算法量化OPT-6.7B模型
# 作业可选，需要GPU超过16G的显存

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
import torch

# Hub id of the larger checkpoint used for the homework. The variable keeps the
# `mode_...` spelling because the following cells read this exact name.
mode_name_or_path = "facebook/opt-6.7b"

# GPTQ settings: 4-bit weights, group size 128, the built-in "wikitext2"
# dataset, and desc_act (activation-order quantization) switched off.
quantization_config = GPTQConfig(bits=4, group_size=128, dataset="wikitext2", desc_act=False)

In [None]:
# Layer-by-layer quantization.
# Loading with a GPTQConfig is the step that actually quantizes the model.
quant_model = AutoModelForCausalLM.from_pretrained(
    mode_name_or_path, device_map="auto", quantization_config=quantization_config
)

In [None]:
# Verify that the model was quantized by inspecting the first decoder layer's
# query projection. OPTForCausalLM keeps its decoder under `.model`, so the
# path is `model.decoder.layers`; `quant_model.decoder` raises AttributeError.
quant_model.model.decoder.layers[0].self_attn.q_proj.__dict__

In [None]:
# Save the quantized model weights to the local directory
# "models/opt-6.7b-gptq".

quant_model.save_pretrained("models/opt-6.7b-gptq")

In [None]:
# Generate text with the quantized model (the model is quantized in this
# notebook, not trained).

tokenizer = AutoTokenizer.from_pretrained(mode_name_or_path)

text = "Merry Christmas! I'm glad to"
# Move the inputs to the device the model actually lives on instead of a
# hard-coded GPU index, so the cell does not depend on the model being on GPU 0.
inputs = tokenizer(text, return_tensors="pt").to(quant_model.device)

# Generate up to 64 new tokens and print the decoded text without special tokens.
out = quant_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))