## 第八章作业

#### 1. 使用 GPTQ 量化 OPT-6.7B 模型。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/quantization/AutoGPTQ_opt-2.7b.ipynb ）

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

# Local directory that will receive the quantized checkpoint, and the Hub id
# of the full-precision model to quantize.
gptq_model_dir = 'models/opt-6.7b-gptq'
model_name = 'facebook/opt-6.7b'

# GPTQ settings: 4-bit weights, group size 128, the 'wikitext2' calibration
# dataset, and desc_act (activation-order quantization) switched off.
quantization_config = GPTQConfig(bits=4, group_size=128, dataset='wikitext2', desc_act=False)


In [None]:
# Load `model_name` with the GPTQ configuration attached.
# NOTE(review): per the transformers GPTQ integration, supplying a GPTQConfig
# with a `dataset` makes this call run calibration + quantization while
# loading, so it is expected to be slow — confirm for the installed version.
# device_map='auto' leaves device placement to the library.
gptq_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map='auto'
)

In [None]:
# Show the attribute dict of the first decoder layer's attention q_proj module
# (displayed as the cell output) to inspect what the layer holds after loading.
gptq_model.model.decoder.layers[0].self_attn.q_proj.__dict__

In [None]:
# Write the loaded model out to `gptq_model_dir`.
gptq_model.save_pretrained(gptq_model_dir)

In [None]:
# The tokenizer is loaded from the original checkpoint id (`model_name`).
tokenizer = AutoTokenizer.from_pretrained(model_name)

text = "Merry Christmas! I'm glad to"
# Place the inputs on the model's own device instead of assuming GPU index 0:
# the model is loaded with device_map='auto', so its device is chosen by the
# library rather than fixed here.
inputs = tokenizer(text, return_tensors='pt').to(gptq_model.device)
# Generate at most 64 new tokens and print the first (only) sequence.
out = gptq_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))

In [None]:
# Decode every generated sequence in `out` (special tokens stripped); the
# resulting list is displayed as the cell output.
tokenizer.batch_decode(out, skip_special_tokens=True)

#### 2. 使用 AWQ 量化 Facebook OPT-6.7B 模型。Facebook OPT 模型地址： https://huggingface.co/facebook?search_models=opt

课程代码： https://github.com/DjangoPeng/LLM-quickstart/blob/main/quantization/AWQ_opt-2.7b.ipynb

 https://github.com/DjangoPeng/LLM-quickstart/blob/main/quantization/AWQ-opt-125m.ipynb

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, AwqConfig, AutoModelForCausalLM

# Hub id of the model to quantize and the local output directory for the AWQ
# checkpoint (under `models/`, next to the GPTQ output directory).
model_name = 'facebook/opt-6.7b'
awq_model_dir = 'models/opt-6.7b-awq'

In [None]:
model = AutoAWQForCausalLM.from_pretrained(model_name, trust_remote_code=True)
tokenzier = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [None]:
quant_config = {
    'zero_point': True,
    'q_group_size': 128,
    'w_bit': 4,
    'version': 'GEMM'
}

model.quantize(tokenizer, quant_config)

In [None]:
awq_config = AwqConfig(
    bits=quant_config['w_bit'],
    group_size=quant_config['q_group_size'],
    zero_point=quant_config['zero_point'],
    version=quant_config['version'].lower(),
    backend='autoawq'
)

model.model.config.quantization_config = awq_config

In [None]:
model.save_quantized(awq_model_dir)
tokenzier.save_pretrained(awq_model_dir)

In [None]:
# NOTE(review): presumably switches the wrapped model to inference mode
# (torch-style `eval()`); as the last expression in the cell, its return value
# is also displayed — confirm against the AutoAWQ version in use.
model.eval()

In [None]:
# Reload the saved AWQ checkpoint through plain transformers: the model is
# loaded with device_map='cuda' and then moved to device 0; the tokenizer is
# read from the same directory.
awq_model = AutoModelForCausalLM.from_pretrained(awq_model_dir, device_map='cuda')
awq_model = awq_model.to(0)
awq_tokenizer = AutoTokenizer.from_pretrained(awq_model_dir)

In [None]:
def generate_text(text: str, top_n: int = 1):
    """Generate up to ``top_n`` continuations of ``text`` with the AWQ model.

    Args:
        text: Prompt string to continue.
        top_n: Number of candidate continuations to return. With the default
            of 1 the model's normal generation settings are used unchanged.
            For larger values, beam search with ``top_n`` beams and
            ``top_n`` returned sequences is requested, so that there really
            are ``top_n`` sequences to return.

    Returns:
        A list of decoded strings with special tokens removed; each is
        generated with ``max_new_tokens=64``.
    """
    # Follow the model's device instead of hard-coding GPU index 0.
    inputs = awq_tokenizer(text, return_tensors='pt').to(awq_model.device)

    # A single prompt yields a single sequence unless more are requested, so
    # slicing alone could never return more than one result.
    extra_kwargs = {}
    if top_n > 1:
        extra_kwargs = {'num_beams': top_n, 'num_return_sequences': top_n}

    out = awq_model.generate(**inputs, max_new_tokens=64, **extra_kwargs)
    return awq_tokenizer.batch_decode(out[:top_n], skip_special_tokens=True)

In [None]:
# Reuses the `text` prompt defined in the GPTQ section of this notebook, so
# that cell must have been run first.
generate_text(text, 3)

In [None]:
# Second prompt, again asking for up to 3 continuations.
generate_text('The woman worked as a', 3)