<a href="https://colab.research.google.com/github/jevliu/2022-Machine-Learning-Specialization/blob/main/ESB_GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 针对GPT-2模型进行ESB量化

### 加载必要的库

In [None]:
import copy
import math
import time

import numpy as np
import torch
from scipy.optimize import minimize_scalar
from scipy.stats import laplace
from transformers import GPT2LMHeadModel
from transformers import GPT2Tokenizer

### 定义量化函数

In [None]:
def esb_quantize(weights, b, k, alpha, eps=1e-8):
    """Quantize ``weights`` with an ESB-style scale, clip and shift-round scheme.

    Non-zero entries are divided by ``alpha``, clipped to the largest
    magnitude of the unscaled value set for ``(b, k)``, rounded to a multiple
    of a per-element power-of-two step, and multiplied by ``alpha`` again.
    Entries that are exactly zero stay zero.

    Args:
        weights: NumPy array of weights. It is only read, never modified.
        b: Total bit width.
        k: Number of fraction bits; must satisfy ``0 <= k < b``.
        alpha: Scale factor applied before clipping and after rounding.
        eps: Small constant added inside ``log2`` so its argument is positive.

    Returns:
        A new array with the same shape as ``weights`` holding the
        de-quantized values.

    Raises:
        ValueError: If ``k`` is outside ``[0, b)``.
    """
    if not 0 <= k < b:
        raise ValueError(f"k must satisfy 0 <= k < b, got k={k}, b={b}")

    # Number of shifted ranges that sit above the linear range around zero.
    num_ranges = 2 ** (b - k - 1) - 1

    # Largest magnitude of the value set: largest shift factor times largest
    # fraction. The limit is intentionally NOT multiplied by alpha, because
    # it is applied to weights that have already been divided by alpha.
    if num_ranges >= 1:
        clip_limit = 2.0 ** (num_ranges - k - 1) * (2 ** (k + 1) - 1)
    else:
        # Only the linear range exists: fractions 0 .. 2**k - 1, shift 2**-k.
        clip_limit = 2.0 ** -k * (2 ** k - 1)

    nonzero = weights != 0

    # Scale and clip; zero entries are left untouched.
    scaled = np.zeros_like(weights)
    scaled[nonzero] = weights[nonzero] / alpha
    clipped = np.clip(scaled, -clip_limit, clip_limit)

    # Each value is rounded to the nearest multiple of 2**(n - k), where
    # n = ceil(log2(|v| + eps)).
    # NOTE(review): this step looks one bit coarser than the value set the
    # clip limit is derived from; confirm against the ESB definition.
    v = clipped[nonzero]
    n = np.ceil(np.log2(np.abs(v) + eps))
    step = 2 ** (n - k)
    quantized = np.zeros_like(clipped)
    quantized[nonzero] = np.round(v / step) * step

    # Undo the scaling.
    return quantized * alpha

### 定义加载模型和数据集的函数

In [None]:
def load_model_and_tokenizer(model_name):
    """Load a pretrained GPT-2 language model and its tokenizer.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            model and the tokenizer (e.g. ``"gpt2"``).

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # GPT2Tokenizer must be imported from transformers at the top of the file.
    return (
        GPT2LMHeadModel.from_pretrained(model_name),
        GPT2Tokenizer.from_pretrained(model_name),
    )

def text_dataset():
    """Return the ``test`` split of the ``wikitext-2-raw-v1`` configuration.

    NOTE(review): ``load_dataset`` is not imported anywhere in the visible
    file; presumably it comes from the Hugging Face ``datasets`` package.
    Confirm and add the import.
    """
    return load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

def encode_dataset(tokenizer, testdata):
    """Tokenize the whole dataset as a single document.

    The entries of ``testdata['text']`` are joined with blank lines and the
    resulting string is passed to ``tokenizer`` with ``return_tensors='pt'``.

    Returns:
        Whatever ``tokenizer`` returns for the joined text.
    """
    corpus = "\n\n".join(testdata['text'])
    return tokenizer(corpus, return_tensors='pt')


### 定义加权量化误差函数、寻找最优k和α的函数，以及单层量化函数

In [None]:
def quantization_error_weighted(weights, alpha, b, k, eps=1e-8):
    """Return the magnitude-weighted mean absolute quantization error.

    Each element's absolute error ``|w - q(w)|`` is weighted by ``|w|`` and
    the weighted sum is divided by ``sum(|w|) + eps``.

    Args:
        weights: NumPy array of original weights.
        alpha: Scale factor forwarded to ``esb_quantize``.
        b: Bit width forwarded to ``esb_quantize``.
        k: Fraction-bit count forwarded to ``esb_quantize``.
        eps: Added to the denominator so an all-zero input does not divide
            by zero.

    Returns:
        The weighted error as a scalar.
    """
    magnitudes = np.abs(weights)
    deviation = np.abs(weights - esb_quantize(weights, b, k, alpha))
    numerator = np.sum(deviation * magnitudes)
    denominator = np.sum(magnitudes) + eps
    return numerator / denominator

def find_optimal_k_alpha(weights, b):
    """Search for the ``(k, alpha)`` pair with the lowest weighted error.

    For every ``k`` in ``range(b)`` the scale ``alpha`` is tuned with
    ``scipy.optimize.minimize_scalar`` (default settings) on
    ``quantization_error_weighted``; the best combination over all ``k`` is
    returned.

    Args:
        weights: NumPy array of weights to quantize.
        b: Bit width.

    Returns:
        A tuple ``(optimal_k, optimal_alpha, min_error)``. If no candidate
        yields an error below infinity (for example when ``b <= 0``), the
        tuple is ``(None, None, inf)``.
    """
    best_k = None
    best_alpha = None
    best_error = float("inf")

    for k in range(b):
        result = minimize_scalar(
            lambda a, k=k: quantization_error_weighted(weights, a, b, k)
        )
        # result.fun is the objective value at result.x, so the error does
        # not need to be recomputed with another pass over the weights.
        if result.fun < best_error:
            best_k, best_alpha, best_error = k, result.x, result.fun

    return best_k, best_alpha, best_error

def quantize_gpt2_layer(layer, b):
    """Quantize ``layer.weight`` in place and print the chosen settings.

    The weight is copied to a NumPy array, the best ``(k, alpha)`` is found
    with ``find_optimal_k_alpha``, and ``layer.weight`` is replaced by a new
    ``torch.nn.Parameter`` holding the quantized values.

    Args:
        layer: Module exposing a ``weight`` parameter.
        b: Bit width to quantize to.
    """
    original = layer.weight
    weights = original.cpu().detach().numpy()

    optimal_k, optimal_alpha, min_error = find_optimal_k_alpha(weights, b)
    quantized_weights = esb_quantize(weights, b, optimal_k, optimal_alpha)

    # Keep the original dtype and device. The NumPy round trip always lands
    # on the CPU, and multiplying by a NumPy float64 alpha can widen the
    # array to float64, which would no longer match the rest of the model.
    layer.weight = torch.nn.Parameter(
        torch.tensor(
            quantized_weights, dtype=original.dtype, device=original.device
        )
    )

    print(f"最优k: {optimal_k}, 最优α: {optimal_alpha:.4f}, 量化相对误差：{min_error:.8f}")

### 针对模型进行量化的函数（只量化了Block）

In [None]:
def quantize_gpt_model(model, b):
    """Quantize every transformer block of a GPT-2 model in place.

    For each block in ``model.transformer.h`` the attention projections
    (``attn.c_attn``, ``attn.c_proj``), the feed-forward projections
    (``mlp.c_fc``, ``mlp.c_proj``) and both LayerNorms (``ln_1``, ``ln_2``)
    are quantized with ``quantize_gpt2_layer``. The total wall-clock time is
    printed at the end.

    Args:
        model: Model exposing its blocks as ``model.transformer.h``.
        b: Bit width to quantize to.
    """
    start_time = time.time()

    # Only the blocks are quantized: the token/position embeddings, the
    # final LayerNorm and the language-model head are left untouched.
    for i, block in enumerate(model.transformer.h):
        print(f"\n对第{i+1}个Block进行量化...")

        print("对注意力层进行量化...")
        quantize_gpt2_layer(block.attn.c_attn, b)
        quantize_gpt2_layer(block.attn.c_proj, b)

        print("对前馈层进行量化...")
        quantize_gpt2_layer(block.mlp.c_fc, b)
        quantize_gpt2_layer(block.mlp.c_proj, b)

        print("对LayerNorm层进行量化...")
        quantize_gpt2_layer(block.ln_1, b)
        quantize_gpt2_layer(block.ln_2, b)

    end_time = time.time()
    print(f"量化总时间: {end_time - start_time:.2f} 秒")

### 定义计算困惑度的函数

In [None]:
# 计算困惑度的函数
def compute_perplexity(model, data, device, stride=512):
    """Compute perplexity over ``data.input_ids`` with a sliding window.

    The token sequence is walked in steps of ``stride``. Each window ends at
    the current step and reaches back at most ``model.config.n_positions``
    tokens; only the tokens from the step's start to the window end are
    scored, the earlier ones are masked with label ``-100``. The total time
    spent in the forward passes is printed.

    Args:
        model: Callable model with ``to(device)`` and
            ``config.n_positions``; the first element of its output is used
            as the loss of the window.
        data: Object with an ``input_ids`` tensor indexed as
            ``[batch, position]`` and a ``to(device)`` method.
        device: Device the model and the inputs are moved to.
        stride: Number of new tokens scored per window. Must be positive and
            no larger than ``model.config.n_positions``.

    Returns:
        The perplexity as a Python float.

    Raises:
        ValueError: If ``data.input_ids`` has no tokens or ``stride`` is out
            of range.
    """
    model.to(device)
    data.to(device)

    max_length = model.config.n_positions
    seq_len = data.input_ids.size(1)
    if seq_len == 0:
        raise ValueError("data.input_ids is empty; cannot compute perplexity")
    if not 0 < stride <= max_length:
        # A larger stride would start windows after position i and silently
        # skip tokens.
        raise ValueError(
            f"stride must be in (0, {max_length}], got {stride}"
        )

    nlls = []
    total_time = 0
    for i in range(0, seq_len, stride):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, seq_len)
        trg_len = end_loc - i  # tokens newly scored in this window
        input_ids = data.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # Context-only tokens are excluded from the loss.
        target_ids[:, :-trg_len] = -100

        # NOTE(review): no device synchronisation happens here, so on an
        # asynchronous device this may under-report the forward time.
        start_time = time.time()
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            # Scale the window loss by the number of scored tokens.
            nll = outputs[0] * trg_len
        total_time += time.time() - start_time

        nlls.append(nll)

    ppl = torch.exp(torch.stack(nlls).sum() / seq_len)
    print(f"总前向传播时间: {total_time:.2f} 秒")
    return ppl.item()

### 加载模型和测试数据

In [None]:
model_name = "gpt2"  # Options: "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"

# Load the pretrained model and tokenizer, then fetch and tokenize the test set.
model_125m, tokenizer = load_model_and_tokenizer(model_name)
test_dataset = text_dataset()
encodings = encode_dataset(tokenizer, test_dataset)

### 指定量化位宽b并进行量化

In [None]:
# Quantize a deep copy so the full-precision model_125m stays available.
copy_model_4bit_125m = copy.deepcopy(model_125m)
b = 4  # target bit width
quantize_gpt_model(copy_model_4bit_125m, b)

### 评估量化后的困惑度

In [None]:
# Evaluate on the GPU when one is available, otherwise on the CPU.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ppl = compute_perplexity(copy_model_4bit_125m, encodings, dev)
print(f"{model_name}量化到{b}比特的困惑度(PPL): {ppl}")