## 第八章作业

#### 1. 使用 GPTQ 量化 OPT-6.7B 模型。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/quantization/AutoGPTQ_opt-2.7b.ipynb ）

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_name = 'facebook/opt-6.7b'
gptq_model_dir = 'models/opt-6.7b-gptq'

quantization_config = GPTQConfig(
    bits=4,
    group_size=128,
    dataset='wikitext2',
    desc_act=False
)


In [None]:
gptq_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map='auto'
)

In [15]:
gptq_model.model.decoder.layers[0].self_attn.q_proj.__dict__

{'training': True,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict([('qweight',
               tensor([[-1533367701,  2022282410, -1182414405,  ..., -1537760937,
                          968190105, -1218869078],
                       [-1973762678, -1990888091,  1451669112,  ...,  1767327094,
                         1485273242, -1769109111],
                       [ -890521657,  1705355194,  2042256023,  ...,  1401453177,
                         -963081656, -1212573545],
                       ...,
                       [-1192544404,   697191045,  1432856694,  ...,  1967820506,
                        -1482119368, -1787262823],
                       [-1736931943, -1753576812,  2027985786,  ..., -1757504693,
                         2090308806, -1987483739],
                       [ 1549191317,  1151064006, -1735993498,  ..., -1317428394,
                        -1182375288, -1199925157]], device='cuda:0', dtype=torch.int32)),
              ('qzeros',
               tensor(

In [4]:
gptq_model.save_pretrained(gptq_model_dir)

In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

text = "Merry Chrismas! I'm glad to"
inputs = tokenizer(text, return_tensors='pt').to(0)
out = gptq_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))



Merry Chrismas! I'm glad to see you're still around.
I'm still around, just not as much as I used to be.


In [17]:
tokenizer.batch_decode(out, skip_special_tokens=True)

["Merry Chrismas! I'm glad to see you're still around.\nI'm still around, just not as much as I used to be."]

#### 2. 使用 AWQ 量化 Facebook OPT-6.7B 模型。Facebook OPT 模型地址： https://huggingface.co/facebook?search_models=opt

课程代码： https://github.com/DjangoPeng/LLM-quickstart/blob/main/quantization/AWQ_opt-2.7b.ipynb

 https://github.com/DjangoPeng/LLM-quickstart/blob/main/quantization/AWQ-opt-125m.ipynb

In [13]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, AwqConfig, AutoModelForCausalLM

model_name = 'facebook/opt-6.7b'
awq_model_dir = 'models/opt-6.7b-awq'

In [3]:
model = AutoAWQForCausalLM.from_pretrained(model_name, trust_remote_code=True)
tokenzier = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)



Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

README.md: 0.00B [00:00, ?B/s]

.gitattributes:   0%|          | 0.00/203 [00:00<?, ?B/s]

LICENSE.md: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
quant_config = {
    'zero_point': True,
    'q_group_size': 128,
    'w_bit': 4,
    'version': 'GEMM'
}

tokenizer = AutoTokenizer.from_pretrained(model_name)
text = "Merry Chrismas! I'm glad to"
model.quantize(tokenizer, quant_config)

Downloading readme:   0%|          | 0.00/167 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating validation split: 0 examples [00:00, ? examples/s]

AWQ: 100%|██████████| 32/32 [09:50<00:00, 18.44s/it]


In [6]:
awq_config = AwqConfig(
    bits=quant_config['w_bit'],
    group_size=quant_config['q_group_size'],
    zero_point=quant_config['zero_point'],
    version=quant_config['version'].lower(),
    backend='autoawq'
)

model.model.config.quantization_config = awq_config

In [14]:
model.save_quantized(awq_model_dir)
tokenzier.save_pretrained(awq_model_dir)

('models/opt-6.7b-awq/tokenizer_config.json',
 'models/opt-6.7b-awq/special_tokens_map.json',
 'models/opt-6.7b-awq/vocab.json',
 'models/opt-6.7b-awq/merges.txt',
 'models/opt-6.7b-awq/added_tokens.json',
 'models/opt-6.7b-awq/tokenizer.json')

In [8]:
model.eval()

OptAWQForCausalLM(
  (model): OPTForCausalLM(
    (model): OPTModel(
      (decoder): OPTDecoder(
        (embed_tokens): Embedding(50272, 4096, padding_idx=1)
        (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)
        (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (layers): ModuleList(
          (0-31): 32 x OPTDecoderLayer(
            (self_attn): OPTAttention(
              (k_proj): WQLinear_GEMM(in_features=4096, out_features=4096, bias=True, w_bit=4, group_size=128)
              (v_proj): WQLinear_GEMM(in_features=4096, out_features=4096, bias=True, w_bit=4, group_size=128)
              (q_proj): WQLinear_GEMM(in_features=4096, out_features=4096, bias=True, w_bit=4, group_size=128)
              (out_proj): WQLinear_GEMM(in_features=4096, out_features=4096, bias=True, w_bit=4, group_size=128)
            )
            (activation_fn): ReLU()
            (self_attn_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affin

In [15]:
awq_tokenizer = AutoTokenizer.from_pretrained(awq_model_dir)
awq_model = AutoModelForCausalLM.from_pretrained(awq_model_dir, device_map='cuda').to(0)

In [16]:
def generate_text(text, top_n=1):
    inputs = awq_tokenizer(text, return_tensors='pt').to(0)
    out = awq_model.generate(**inputs, max_new_tokens=64)
    return awq_tokenizer.batch_decode(out[:top_n], skip_special_tokens=True)

In [17]:
generate_text(text, 3)

["Merry Chrismas! I'm glad to hear you guys are still hanging out, hope it continues to be fun and positive. Keep us updated!\n>I'm glad to hear you guys are still hanging out  Oh yeah, I don't care at all. I get paid either way."]

In [18]:
generate_text('The woman worked as a', 3)

["The woman worked as a model scout in New York and the East Coast\n\nThe mother of the late rock icon Keith Richards' child has been given three years in prison for illegally using his name in a scam to sell luxury homes and antiques.\n\nJaclyn McElrath, who was sentenced in Manhattan Federal Court from Monday,"]