download_and_load_gpt2 是作者准备的下载gpt2官方weight的脚本（需要tensorflow支持）

该方法返回两个值：模型的 settings（配置），以及一个 params 字典——key 是层（参数）的名称，value 是以 numpy 数组保存的该层参数

In [1]:
from gpt_download import download_and_load_gpt2
# Load the official 124M GPT-2 weights from the local "gpt2" directory
# (the helper reports files that already exist instead of re-downloading).
# `settings` is the model configuration; `params` is a dict of the weights.
settings, params = download_and_load_gpt2(
    model_size="124M", models_dir="gpt2"
)

File already exists and is up-to-date: gpt2\124M\checkpoint
File already exists and is up-to-date: gpt2\124M\encoder.json
File already exists and is up-to-date: gpt2\124M\hparams.json
File already exists and is up-to-date: gpt2\124M\model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2\124M\model.ckpt.index
File already exists and is up-to-date: gpt2\124M\model.ckpt.meta
File already exists and is up-to-date: gpt2\124M\vocab.bpe


官方的GPT2配置

In [3]:
# Official GPT-2 configurations, keyed by a human-readable model name.
# Each entry is the config dict passed to the model constructor.
model_configs = {
    "gpt2-small (124M)": {
        "vocab_size": 50257,
        "context_length": 1024,
        "emb_dim": 768,
        "n_heads": 12,
        "n_layers": 12,
        "drop_rate": 0.1,
        "qkv_bias": True
    }
}

In [4]:
import torch
import numpy as np

def assign(left, right):
    """Return ``right`` as a ``torch.nn.Parameter`` after a shape check.

    Args:
        left: Existing tensor/parameter whose shape is the expected one.
        right: Array-like holding the new values (e.g. a NumPy array).

    Returns:
        A new ``torch.nn.Parameter`` built from ``right``.

    Raises:
        ValueError: If ``left.shape`` and ``right.shape`` differ.
    """
    if left.shape != right.shape:
        # Both halves of the message must be interpolated; previously the
        # second half was a plain string and printed "{right.shape}" verbatim.
        raise ValueError(
            f"Shape mismatch. Left: {left.shape}, Right: {right.shape}"
        )
    return torch.nn.Parameter(torch.tensor(right))


def load_weights_into_gpt(gpt, params):
    """Copy the downloaded GPT-2 weights into the PyTorch model ``gpt``.

    Args:
        gpt: The PyTorch GPT model whose parameters are replaced in place.
        params: Dict of the officially downloaded weights; keys identify the
            layers and values hold the corresponding parameters.
    """
    # Positional and token embedding tables.
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])

    for layer_idx in range(len(params["blocks"])):
        src = params["blocks"][layer_idx]
        block = gpt.trf_blocks[layer_idx]
        attention = block.mha
        qkv_modules = (attention.q_proj, attention.k_proj, attention.v_proj)

        # "c_attn" stores query, key and value fused along the last axis;
        # split it into three equal parts. Weights are transposed on load.
        qkv_weights = np.split(src["attn"]["c_attn"]["w"], 3, axis=-1)
        for module, weight in zip(qkv_modules, qkv_weights):
            module.weight = assign(module.weight, weight.T)

        qkv_biases = np.split(src["attn"]["c_attn"]["b"], 3, axis=-1)
        for module, bias in zip(qkv_modules, qkv_biases):
            module.bias = assign(module.bias, bias)

        # Attention output projection.
        out_proj = attention.proj
        out_proj.weight = assign(out_proj.weight, src["attn"]["c_proj"]["w"].T)
        out_proj.bias = assign(out_proj.bias, src["attn"]["c_proj"]["b"])

        # The feed-forward module is sequential: entries 0 and 2 are the
        # Linear layers, entry 1 is the GELU activation (no parameters).
        ff_layers = block.feed_forward.layer
        mlp = src["mlp"]
        ff_layers[0].weight = assign(ff_layers[0].weight, mlp["c_fc"]["w"].T)
        ff_layers[0].bias = assign(ff_layers[0].bias, mlp["c_fc"]["b"])
        ff_layers[2].weight = assign(ff_layers[2].weight, mlp["c_proj"]["w"].T)
        ff_layers[2].bias = assign(ff_layers[2].bias, mlp["c_proj"]["b"])

        # Layer norms: "g" feeds the scale, "b" feeds the shift.
        for norm, key in ((block.layer_norm1, "ln_1"), (block.layer_norm2, "ln_2")):
            norm.scale = assign(norm.scale, src[key]["g"])
            norm.shift = assign(norm.shift, src[key]["b"])

    # Final layer norm, then the output head, which is loaded from the
    # token-embedding matrix ("wte").
    final_norm = gpt.final_norm
    final_norm.scale = assign(final_norm.scale, params["g"])
    final_norm.shift = assign(final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])
    

In [5]:
from MyGPT2.gpt2_model import GPT2Model

# Build the model from the 124M config, then replace its parameters with
# the downloaded weights.
model = GPT2Model(model_configs["gpt2-small (124M)"])
load_weights_into_gpt(model, params)

In [10]:
import tiktoken
from MyGPT2.text_utils import text_to_ids
from MyGPT2.text_utils import generate_text_simple
from MyGPT2.text_utils import ids_to_text

# Prompt to continue, and the GPT-2 tokenizer from tiktoken.
start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

# Encode the prompt and ask for 50 new tokens; context_size is taken from
# the configured context_length.
output_ids = generate_text_simple(
    model=model,
    idx = text_to_ids(start_context, tokenizer=tokenizer),
    max_new_tokens=50,
    context_size=model_configs["gpt2-small (124M)"]["context_length"]
)

# Decode the generated ids back to text for display.
print("Output text:", ids_to_text(output_ids, tokenizer=tokenizer))

Output text: Every effort moves you to the next step.

The first step is to get your life back on track.

The second step is to get your life back on track.

The third step is to get your life back on track.

The
