<a href="https://colab.research.google.com/github/jevliu/2022-Machine-Learning-Specialization/blob/main/ESB_OPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 加载必要的库

In [None]:
from transformers import OPTForCausalLM, GPT2Tokenizer
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from scipy.optimize import minimize_scalar
import time
import copy

### 定义量化函数

In [None]:
def esb_quantize(weights, b, k, alpha, eps=1e-8):
    """Quantize ``weights`` with a shift-based scheme and return rescaled values.

    Zero entries stay zero. Every non-zero entry is divided by ``alpha``,
    clipped to ``[-C, C]`` (``C`` is the largest member of the value set built
    below), rounded to a multiple of ``2 ** (n - k)`` where
    ``n = ceil(log2(|v| + eps))``, and finally multiplied by ``alpha`` again.

    Args:
        weights: NumPy array of values to quantize.
        b: Total bit width; with ``k`` it fixes the number of shifted segments
            ``N = 2 ** (b - k - 1) - 1``.
        k: Exponent offset used for the value set and for the rounding step.
        alpha: Scale factor applied before and after rounding.
        eps: Small constant added to ``|v|`` before taking ``log2``.

    Returns:
        Array shaped like ``weights`` with the quantized, rescaled values.
    """
    # Number of power-of-two segments that follow the first (linear) segment.
    N = 2 ** (b - k - 1) - 1

    # Shift factor of each segment: 2**-k first, then 2**(i - k - 1) for i = 1..N.
    shifts = [2 ** -k]
    shifts += [2 ** (idx - k - 1) for idx in range(1, N + 1)]

    # Fraction set of each segment: 0..2**k - 1 first, then 2**k..2**(k+1) - 1.
    fractions = [[0, *range(1, 2 ** k)]]
    fractions += [list(range(2 ** k, 2 ** (k + 1))) for _ in range(N)]

    # Non-negative representable values (already multiplied by alpha).
    levels = []
    for seg in range(N + 1):
        for shift in shifts[seg:]:
            for frac in fractions[seg]:
                levels.append(alpha * shift * frac)
    positive_levels = sorted(set(levels))

    # Mirror the set around zero; only its maximum is used below.
    mirrored = [-q for q in positive_levels]
    symmetric_levels = sorted(set(mirrored + [0] + positive_levels))
    C = np.max(symmetric_levels)

    # Only non-zero weights are scaled; zeros pass through untouched.
    nonzero = weights != 0
    scaled = np.zeros_like(weights)
    scaled[nonzero] = weights[nonzero] / alpha

    # NOTE(review): C includes the factor alpha while the clipped values are
    # weights / alpha -- confirm that clipping in this domain is intended.
    clipped = np.clip(scaled, -C, C)

    # Round each non-zero value to a multiple of a power-of-two step chosen
    # from its own magnitude.
    positions = np.nonzero(nonzero)
    v = clipped[positions]
    exponent = np.ceil(np.log2(np.abs(v) + eps))
    step = 2 ** (exponent - k)
    rounded = np.zeros_like(clipped)
    rounded[positions] = np.round(v / step) * step

    # Undo the initial scaling.
    return rounded * alpha

### 定义加载模型和数据集的函数

In [None]:
def load_model_and_tokenizer(model_name):
    """Load the pretrained OPT causal LM and its tokenizer for ``model_name``.

    Args:
        model_name: Identifier passed unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    return (
        OPTForCausalLM.from_pretrained(model_name),
        GPT2Tokenizer.from_pretrained(model_name),
    )

def text_dataset():
    """Return the ``test`` split of the ``wikitext-2-raw-v1`` configuration of ``wikitext``."""
    return load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

def encode_dataset(tokenizer, testdata):
    """Tokenize the whole dataset as one string.

    All entries of ``testdata['text']`` are joined with blank lines and passed
    to ``tokenizer`` with ``return_tensors='pt'``.

    Args:
        tokenizer: Callable accepting the text and a ``return_tensors`` keyword.
        testdata: Mapping-like object whose ``'text'`` entry is an iterable of
            strings.

    Returns:
        Whatever ``tokenizer`` returns for the joined text.
    """
    joined_text = "\n\n".join(testdata['text'])
    return tokenizer(joined_text, return_tensors='pt')


### 定义加权量化误差函数和寻找最优的k和α的函数

In [None]:
def quantization_error_weighted(weights, alpha, b, k, eps=1e-8):
    """Return the magnitude-weighted mean absolute quantization error.

    Each element's absolute error ``|w - q(w)|`` is weighted by ``|w|``; the
    weighted sum is divided by ``sum(|w|) + eps``.

    Args:
        weights: NumPy array of original weights.
        alpha: Scale factor forwarded to ``esb_quantize``.
        b: Bit width forwarded to ``esb_quantize``.
        k: Exponent offset forwarded to ``esb_quantize``.
        eps: Added to the denominator so an all-zero array does not divide by
            zero.

    Returns:
        The weighted error as a NumPy scalar.
    """
    magnitudes = np.abs(weights)
    residual = np.abs(weights - esb_quantize(weights, b, k, alpha))
    return np.sum(residual * magnitudes) / (np.sum(magnitudes) + eps)

def find_optimal_k_alpha(weights, b):
    """Search for the ``(k, alpha)`` pair with the lowest weighted error.

    For every ``k`` in ``range(b)`` the scale ``alpha`` is optimised with
    ``scipy.optimize.minimize_scalar`` (default settings) on
    ``quantization_error_weighted``; the best combination over all ``k`` wins.

    Args:
        weights: NumPy array of weights to be quantized.
        b: Total bit width; candidate ``k`` values are ``0 .. b - 1``.

    Returns:
        ``(optimal_k, optimal_alpha, min_error)``. If no candidate yields an
        error below infinity (e.g. ``b <= 0``), the first two entries are
        ``None`` and ``min_error`` is ``inf``.
    """
    min_error = float("inf")
    optimal_k = None
    optimal_alpha = None

    for k in range(b):
        # k is bound as a default argument so the objective is independent of
        # the loop variable's later values.
        result = minimize_scalar(
            lambda a, k=k: quantization_error_weighted(weights, a, b, k)
        )
        # The optimizer already reports the objective value at result.x, so a
        # second full quantization pass over the weights is unnecessary.
        error = result.fun
        if error < min_error:
            min_error = error
            optimal_k = k
            optimal_alpha = result.x

    return optimal_k, optimal_alpha, min_error

def quantize_gpt2_layer(layer, b):
    """Replace ``layer.weight`` with its quantized version, in place.

    The weight is copied to a NumPy array, the best ``(k, alpha)`` is searched
    with ``find_optimal_k_alpha``, and the result of ``esb_quantize`` is
    installed as a new ``torch.nn.Parameter``. A one-line summary is printed.

    Args:
        layer: Module exposing a ``weight`` parameter.
        b: Target bit width.
    """
    original = layer.weight
    weights = original.detach().cpu().numpy()

    optimal_k, optimal_alpha, min_error = find_optimal_k_alpha(weights, b)

    quantized_weights = esb_quantize(weights, b, optimal_k, optimal_alpha)
    # Keep the original dtype and device: the NumPy result may have been
    # promoted (e.g. to float64 by a NumPy float64 alpha) and always lives on
    # the CPU, which would otherwise break the layer's forward pass.
    new_weight = torch.tensor(
        quantized_weights, dtype=original.dtype, device=original.device
    )
    layer.weight = torch.nn.Parameter(
        new_weight, requires_grad=original.requires_grad
    )

    print(f"最优k: {optimal_k}, 最优α: {optimal_alpha:.4f}, 量化相对误差：{min_error:.8f}")

### 针对模型进行量化的函数（只量化了Block）

In [None]:
def quantize_opt_model(model, b):
    """Quantize the weights of every decoder layer of ``model`` in place.

    For each layer in ``model.model.decoder.layers`` the attention projections
    (k, v, q, out), the two feed-forward linears and the two layer norms are
    passed to ``quantize_gpt2_layer``. Progress and total wall-clock time are
    printed.

    The token/position embeddings, the decoder's final layer norm and
    ``lm_head`` are not quantized here.

    Args:
        model: Model exposing ``model.decoder.layers``.
        b: Target bit width forwarded to ``quantize_gpt2_layer``.
    """
    began = time.time()

    for idx, block in enumerate(model.model.decoder.layers, start=1):
        print(f"\n对第{idx}个Decoder Layer进行量化...")

        print("对注意力层进行量化...")
        attn = block.self_attn
        for proj in (attn.k_proj, attn.v_proj, attn.q_proj, attn.out_proj):
            quantize_gpt2_layer(proj, b)

        print("对前馈神经网络层进行量化...")
        for linear in (block.fc1, block.fc2):
            quantize_gpt2_layer(linear, b)

        print("对Layer Norm层进行量化...")
        for norm in (block.self_attn_layer_norm, block.final_layer_norm):
            quantize_gpt2_layer(norm, b)

    elapsed = time.time() - began
    print(f"量化总时间: {elapsed:.2f} 秒")

### 使用GPTQ中给出的评估困惑度的函数

In [None]:
import torch.nn as nn

@torch.no_grad()
def calculate_perplexity(model, testenc, dev):
    """Compute the perplexity of an OPT model, running one decoder layer at a time.

    Steps:
      1. ``testenc.input_ids`` is cut into consecutive 2048-token windows
         (a trailing remainder is dropped).
      2. The first decoder layer is temporarily wrapped so that its input
         (the embedding output) and the attention mask are recorded and the
         forward pass is aborted.
      3. Each decoder layer is moved to ``dev``, applied to every window and
         moved back to the CPU.
      4. The final layer norm, optional ``project_out`` and ``lm_head``
         produce logits; the cross-entropy of each window is accumulated and
         exponentiated.

    Args:
        model: Model exposing ``config``, ``model.decoder`` and ``lm_head``.
        testenc: Tokenizer output with an ``input_ids`` tensor indexed as
            ``[:, start:end]`` (a batch dimension of 1 is assumed -- TODO confirm).
        dev: Device on which the individual layers are executed.

    Returns:
        The perplexity as a Python float (it is also printed).
    """
    print('Evaluating ...')

    testenc = testenc.input_ids
    nsamples = testenc.numel() // 2048  # number of full 2048-token windows

    # Caching is disabled for the evaluation and restored before returning.
    use_cache = model.config.use_cache
    model.config.use_cache = False
    layers = model.model.decoder.layers

    # Move the embedding-side modules and the first layer to the target device.
    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
        model.model.decoder.project_in = model.model.decoder.project_in.to(dev)
    layers[0] = layers[0].to(dev)

    dtype = next(iter(model.parameters())).dtype
    inps = torch.zeros(
        (nsamples, 2048, model.config.hidden_size), dtype=dtype, device=dev
    )
    cache = {'i': 0, 'attention_mask': None}

    class Catcher(nn.Module):
        """Records the input of the wrapped layer, then aborts the forward pass."""

        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps[cache['i']] = inp
            cache['i'] += 1
            cache['attention_mask'] = kwargs['attention_mask']
            # Deliberately stop here: only the first layer's input is needed.
            raise ValueError

    layers[0] = Catcher(layers[0])
    for i in range(nsamples):
        batch = testenc[:, (i * 2048):((i + 1) * 2048)].to(dev)
        try:
            model(batch)
        except ValueError:
            pass
    layers[0] = layers[0].module

    # Move everything back to the CPU before processing the layers one by one.
    layers[0] = layers[0].cpu()
    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
    torch.cuda.empty_cache()

    outs = torch.zeros_like(inps)
    attention_mask = cache['attention_mask']

    # Run the windows through each decoder layer in turn; the output buffer of
    # one layer becomes the input buffer of the next.
    for i in range(len(layers)):
        print(i)
        layer = layers[i].to(dev)
        for j in range(nsamples):
            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
        layers[i] = layer.cpu()
        del layer
        torch.cuda.empty_cache()
        inps, outs = outs, inps

    if model.model.decoder.final_layer_norm is not None:
        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev)
    if model.model.decoder.project_out is not None:
        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
    model.lm_head = model.lm_head.to(dev)

    testenc = testenc.to(dev)
    nlls = []
    for i in range(nsamples):
        hidden_states = inps[i].unsqueeze(0)
        if model.model.decoder.final_layer_norm is not None:
            hidden_states = model.model.decoder.final_layer_norm(hidden_states)
        if model.model.decoder.project_out is not None:
            hidden_states = model.model.decoder.project_out(hidden_states)
        lm_logits = model.lm_head(hidden_states)
        # Logits at position t are scored against the token at position t + 1.
        shift_logits = lm_logits[:, :-1, :].contiguous()
        shift_labels = testenc[
            :, (i * 2048):((i + 1) * 2048)
        ][:, 1:]
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        neg_log_likelihood = loss.float() * 2048
        nlls.append(neg_log_likelihood)
    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * 2048))
    print(ppl.item())

    # Restore the caller's caching setting that was overridden above.
    model.config.use_cache = use_cache
    return ppl.item()


### 加载模型和测试数据

In [None]:
# Checkpoint to evaluate; the name is passed to both the model and tokenizer loaders.
model_name = "facebook/opt-1.3b"  # Options: "facebook/opt-125m", "facebook/opt-1.3b", "facebook/opt-2.7b", "facebook/opt-6.7b", "facebook/opt-13b"

# Load the model and tokenizer, then the test split, and tokenize the whole
# split as one sequence.
model_13b, tokenizer = load_model_and_tokenizer(model_name)
test_dataset = text_dataset()
encodings = encode_dataset(tokenizer, test_dataset)

### 指定量化位宽b并进行量化

In [None]:
# Quantize a deep copy so the loaded model itself is left unmodified.
copy_model_4bit_13b = copy.deepcopy(model_13b)
b = 4  # quantization bit width
quantize_opt_model(copy_model_4bit_13b, b)

### 评估量化后的困惑度

In [None]:
# Use CUDA when available, otherwise fall back to the CPU.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Evaluate the quantized copy on the tokenized test split and report the result.
ppl = calculate_perplexity(copy_model_4bit_13b, encodings, dev)
print(f"{model_name}量化到{b}比特的困惑度(PPL): {ppl}")