# GPT2 模型

# 1 训练数据加工
# 2 模型
# 3 损失函数
# 4 模型训练
# 5 模型推理

# 1 训练数据加工
## 数据来源：https://modelscope.cn/datasets/mapjack/openwebtextSample

In [6]:
import os
import torch
from datasets import load_dataset
from tqdm import tqdm
import tiktoken
from torch.utils.data import DataLoader,Dataset
# Run-wide settings
num_proc = 14  # handed to the dataset wrapper further down

# NOTE(review): absolute, machine-specific path — point it at the folder holding the .arrow shards.
local_data_path="/Users/wangaijun/pythoncode/github/model/openwebtext"

block_size = 128  # tokens per training sample; choose a value that suits your model
batch_size = 32  # samples per batch; choose a value that suits your hardware
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_type)

# tiktoken's "gpt2" encoding is the tokenizer used for both encoding and decoding.
enc = tiktoken.get_encoding("gpt2")

def process(example):
    """Tokenize one record's ``'text'`` field and terminate it with the end-of-text token.

    Returns a dict with the token id list under ``'ids'`` and the number of
    ids under ``'len'``.
    """
    token_ids = enc.encode_ordinary(example['text'])
    token_ids += [enc.eot_token]  # marks where this document ends
    return {'ids': token_ids, 'len': len(token_ids)}

class StreamingParquetDataset(torch.utils.data.IterableDataset):
    """Iterable dataset that streams tokenized text as fixed-length (input, target) pairs.

    The files are opened with the ``"arrow"`` loader in streaming mode and every
    record is tokenized lazily through ``process``.
    """

    def __init__(self, data_files, split, block_size, num_proc=14):
        """Set up the streaming pipeline.

        Args:
            data_files: path(s) of the files to stream.
            split: name under which ``data_files`` is registered and read back.
            block_size: number of tokens in every yielded sample.
            num_proc: stored on the instance; nothing in this class reads it.
        """
        self.data_files, self.split = data_files, split
        self.block_size, self.num_proc = block_size, num_proc
        self.dataset = load_dataset("arrow", data_files={split: data_files}, streaming=True)
        raw_split = self.dataset[split]
        self.tokenized = raw_split.map(process, remove_columns=['text'])

    def __iter__(self):
        """Yield ``(x, y)`` int64 tensors, where ``y`` is ``x`` shifted one token ahead.

        Each record is cut into non-overlapping windows of ``block_size`` tokens.
        A trailing piece that cannot supply a full shifted target is dropped.
        """
        width = self.block_size
        for record in self.tokenized:
            tokens = record['ids']
            # Stop early enough that the one-step-ahead target window is always full.
            for start in range(0, len(tokens) - width, width):
                window = tokens[start:start + width + 1]
                yield (
                    torch.tensor(window[:-1], dtype=torch.int64),
                    torch.tensor(window[1:], dtype=torch.int64),
                )

# Locate the data shards.
# os.listdir returns entries in arbitrary order, so sort to make the shard that
# is held out for validation the same on every run.
arrow_files = sorted(
    os.path.join(local_data_path, f) for f in os.listdir(local_data_path) if f.endswith('.arrow')
)
print(f"Found {len(arrow_files)} arrow files.")
if not arrow_files:
    raise FileNotFoundError(f"No .arrow files found in {local_data_path}")

# Build the datasets.  The last shard is reserved for validation and kept out
# of the training list so validation numbers are not measured on training data.
val_files = arrow_files[-1:]
train_files = arrow_files[:-1] or arrow_files  # with a single shard it has to serve both roles
train_dataset = StreamingParquetDataset(train_files, 'train', block_size, num_proc)
val_dataset = StreamingParquetDataset(val_files, 'val', block_size, num_proc)

# Build the data loaders.  Page-locked host memory only helps host-to-GPU
# copies, so request it only when running on CUDA.
use_pinned_memory = device_type == 'cuda'
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=use_pinned_memory)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=use_pinned_memory)

# Helper: stream batches from a loader, moved onto the target device
def get_batch(loader, device):
    """Yield every ``(x, y)`` batch of ``loader`` after moving it to ``device``.

    Args:
        loader: iterable producing ``(x, y)`` tensor pairs (e.g. a DataLoader).
        device: destination, given as a ``torch.device`` or a device string.

    Yields:
        The ``(x, y)`` pair, both tensors residing on ``device``.
    """
    # Decide from the argument itself rather than from a module-level flag, so
    # the copy mode always matches the device the caller actually asked for.
    use_async_copy = torch.device(device).type == 'cuda'
    for x, y in loader:
        # Non-blocking copies are only requested for CUDA destinations.
        x = x.to(device, non_blocking=use_async_copy)
        y = y.to(device, non_blocking=use_async_copy)
        yield x, y

# Usage example: pull one batch and show its shape
for x, y in get_batch(train_loader, device):
    # model training would happen here
    print(x.shape, y.shape)
    break  # stop after the first batch

Found 2 arrow files.
torch.Size([32, 128]) torch.Size([32, 128])


In [4]:
# Display the input/target tensors of the batch fetched by the loop above
x,y

(tensor([[ 4303,    40,  7156,  ...,   356,   423,  1760],
         [  523,  1290,  5818,  ...,   661,   423,  3724],
         [   13,  1148,   612,  ..., 31309,   262, 10768],
         ...,
         [   13,   198,   198,  ...,    11,  5434, 30013],
         [27425,   290, 22771,  ...,  3003,   422,  4955],
         [ 6718,  1010,   284,  ...,    87,   737,  1629]]),
 tensor([[   40,  7156,  3698,  ...,   423,  1760,   523],
         [ 1290,  5818,   470,  ...,   423,  3724,    13],
         [ 1148,   612,  1223,  ...,   262, 10768,  5471],
         ...,
         [  198,   198,    37,  ...,  5434, 30013, 27425],
         [  290, 22771,   737,  ...,   422,  4955,  6718],
         [ 1010,   284, 13586,  ...,   737,  1629, 24897]]))

# 2 模型

<div style="text-align: center;">
  <img src="images/transformer.png" alt="Image" style="width:600px;">
</div>

In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
class config:
    """Hyperparameters shared by the model components in this notebook."""

    # table sizes
    block_size, vocab_size = 1024, 50304  # position-embedding rows, token-embedding rows
    # network shape: layer count, attention heads, embedding width
    n_layer, n_head, n_embd = 12, 12, 768
    # regularisation and whether Linear/LayerNorm layers carry a bias term
    dropout, bias = 0.0, False

## 2.1 Embedding和位置编码

In [4]:
# Standalone copies of the embedding-stage layers, used for the shape experiments below
wte = nn.Embedding(config.vocab_size, config.n_embd)  # token id -> vector
wpe = nn.Embedding(config.block_size, config.n_embd)  # position index -> vector
drop = nn.Dropout(config.dropout)
ln_f = nn.LayerNorm(config.n_embd, bias=config.bias)

In [5]:
# Embed the token ids of batch x: one n_embd-sized vector per token
x_embd=wte(x)
x_embd.shape

torch.Size([32, 128, 768])

In [6]:
# Layer-normalize the embeddings; the shape stays the same
x_embd_ln=ln_f(x_embd)
x_embd_ln.shape

torch.Size([32, 128, 768])

## 2.2 前馈网络（MLP）- 输入输出形状保持不变

In [7]:
class MLP(nn.Module):
    """Feed-forward sub-layer: widen to 4x n_embd, GELU, project back, then dropout."""

    def __init__(self, config):
        """Create the layers from ``config.n_embd``, ``config.bias`` and ``config.dropout``."""
        super().__init__()
        width = config.n_embd
        hidden_width = 4 * width
        self.c_fc    = nn.Linear(width, hidden_width, bias=config.bias)
        self.gelu    = nn.GELU()
        self.c_proj  = nn.Linear(hidden_width, width, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Apply the feed-forward stack; the last dimension of ``x`` keeps its size."""
        activated = self.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(activated))


In [8]:
# Apply a freshly initialised MLP to the normalized embeddings.
# Note: `mlp` ends up holding the output tensor, not the module.
mlp=MLP(config)(x_embd_ln)
mlp.shape

torch.Size([32, 128, 768])

## 2.3 因果自注意力

<div style="text-align: center;">
  <img src="images/multiheadattention.png" alt="Image" style="width:500px;">
</div>

In [9]:
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention where each position attends only to itself and earlier positions."""

    def __init__(self, config):
        """Create the projections and dropouts; pick the fused kernel when PyTorch offers it.

        Reads ``n_embd``, ``n_head``, ``bias``, ``dropout`` and ``block_size``
        from ``config``.  ``n_embd`` must be divisible by ``n_head``.
        """
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # a single Linear produces query, key and value for all heads at once
        self.c_attn = nn.Linear(self.n_embd, 3 * self.n_embd, bias=config.bias)
        # projection applied after the heads are merged again
        self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=config.bias)
        # dropout on the attention weights and on the block output
        self.attn_dropout = nn.Dropout(self.dropout)
        self.resid_dropout = nn.Dropout(self.dropout)
        # the fused attention function only exists in PyTorch >= 2.0
        self.flash = hasattr(F, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # lower-triangular mask: position i may look at positions 0..i only
            size = config.block_size
            causal = torch.tril(torch.ones(size, size)).view(1, 1, size, size)
            self.register_buffer("bias", causal)

    def forward(self, x):
        """Attend over ``x`` of shape (batch, time, n_embd) and return a tensor of the same shape."""
        batch, steps, channels = x.size()
        head_dim = channels // self.n_head

        def to_heads(t):
            # (B, T, C) -> (B, nh, T, hs): the head axis moves next to the batch axis
            return t.view(batch, steps, self.n_head, head_dim).transpose(1, 2)

        q, k, v = map(to_heads, self.c_attn(x).split(self.n_embd, dim=2))

        if self.flash:
            # fused kernel; attention dropout is active in training mode only
            drop_p = self.dropout if self.training else 0
            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=drop_p, is_causal=True)
        else:
            # explicit path: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
            scale = 1.0 / math.sqrt(head_dim)
            scores = (q @ k.transpose(-2, -1)) * scale
            blocked = self.bias[:, :, :steps, :steps] == 0
            scores = scores.masked_fill(blocked, float('-inf'))
            weights = self.attn_dropout(F.softmax(scores, dim=-1))
            y = weights @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

        # put the heads back side by side: (B, nh, T, hs) -> (B, T, C)
        merged = y.transpose(1, 2).contiguous().view(batch, steps, channels)
        return self.resid_dropout(self.c_proj(merged))

#### 构造因果mask

In [10]:
# Lower-triangular 4x4 matrix of ones, viewed as (1, 1, 4, 4): row i keeps columns 0..i
mask=torch.tril(torch.ones(4, 4)) .view(1, 1, 4, 4)
print("mask matric:")
print(mask)
# Toy query/key/value: all three share one random (1, 4, 10) tensor
q=k=v=torch.rand(1,4,10)
# Scaled dot-product scores: q @ k^T scaled by 1/sqrt(size of the last dimension)
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
print("att score ")
print(att)
# Write -inf wherever the mask is 0
mask_att = att.masked_fill(mask[:,:,:4,:4] == 0, float('-inf'))
print("mask  att ")
print(mask_att)


mask matric:
tensor([[[[1., 0., 0., 0.],
          [1., 1., 0., 0.],
          [1., 1., 1., 0.],
          [1., 1., 1., 1.]]]])
att score 
tensor([[[1.2386, 0.9562, 0.8730, 0.9235],
         [0.9562, 1.2590, 1.0642, 1.0390],
         [0.8730, 1.0642, 1.2794, 1.1148],
         [0.9235, 1.0390, 1.1148, 1.2704]]])
mask  att 
tensor([[[[1.2386,   -inf,   -inf,   -inf],
          [0.9562, 1.2590,   -inf,   -inf],
          [0.8730, 1.0642, 1.2794,   -inf],
          [0.9235, 1.0390, 1.1148, 1.2704]]]])


In [11]:
# Smoke test: send a random (1, 4, 768) input through a fresh attention layer
CausalSelfAttention(config)(torch.rand(1,4,768)).shape

torch.Size([1, 4, 768])

In [12]:
# Send the MLP output computed earlier through a fresh attention layer
atten=CausalSelfAttention(config)(mlp)

In [13]:
class Block(nn.Module):
    """One transformer layer: LayerNorm + attention, then LayerNorm + MLP, each added back residually."""

    def __init__(self, config):
        """Build both sub-layers and their LayerNorms from ``config``."""
        super().__init__()
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        # normalisation happens before each sub-layer; the skip path stays untouched
        after_attention = x + self.attn(self.ln_1(x))
        return after_attention + self.mlp(self.ln_2(after_attention))

In [14]:
# Push the attention output through one freshly initialised Block.
# Note: `block` holds the output tensor here, not the module.
block=Block(config)(atten)
block.shape

torch.Size([32, 128, 768])

In [15]:
# A stack of n_layer Blocks, plus the output head mapping n_embd features to vocab_size logits
h = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

In [7]:
# Weight tying: the token embedding now uses the very same Parameter as the output head
wte.weight = lm_head.weight

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


NameError: name 'lm_head' is not defined

In [17]:
# Show the token-embedding weights
wte.weight 

Parameter containing:
tensor([[ 0.0356, -0.0050,  0.0183,  ..., -0.0172, -0.0040, -0.0155],
        [ 0.0327, -0.0345, -0.0253,  ..., -0.0176,  0.0301, -0.0296],
        [ 0.0280, -0.0002,  0.0315,  ...,  0.0187, -0.0221,  0.0136],
        ...,
        [ 0.0083, -0.0109,  0.0299,  ..., -0.0106,  0.0268,  0.0111],
        [-0.0019,  0.0262,  0.0063,  ...,  0.0139, -0.0037,  0.0250],
        [ 0.0244,  0.0199,  0.0329,  ..., -0.0263,  0.0264,  0.0120]],
       requires_grad=True)

In [18]:
# Show the output-head weights for comparison with the embedding weights
lm_head.weight

Parameter containing:
tensor([[ 0.0356, -0.0050,  0.0183,  ..., -0.0172, -0.0040, -0.0155],
        [ 0.0327, -0.0345, -0.0253,  ..., -0.0176,  0.0301, -0.0296],
        [ 0.0280, -0.0002,  0.0315,  ...,  0.0187, -0.0221,  0.0136],
        ...,
        [ 0.0083, -0.0109,  0.0299,  ..., -0.0106,  0.0268,  0.0111],
        [-0.0019,  0.0262,  0.0063,  ...,  0.0139, -0.0037,  0.0250],
        [ 0.0244,  0.0199,  0.0329,  ..., -0.0263,  0.0264,  0.0120]],
       requires_grad=True)

### 模型综合应用

In [19]:
# Assemble the embedding layers, the Block stack and the final LayerNorm in one ModuleDict.
# NOTE(review): the stack's key is spelled "bloks" (presumably "blocks" was meant);
# any code that iterates the layers has to use this exact spelling.
transformer=nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            bloks = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd, bias=config.bias),
        ))

In [20]:
# Parameter count of the position-embedding table.
# Its value is not the cell's final expression, so the notebook does not display it.
transformer.wpe.weight.numel()# 1024*768
torch.arange(10)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [21]:
# Token embeddings for the batch, and embeddings for positions 0..T-1 (T = x.shape[1])
x_emb=transformer.wte(x)
x_pos=transformer.wpe(torch.arange(x.shape[1]))
print("x_emb shape:",x_emb.shape,"x_pos",x_pos.shape)
# x_pos has no batch dimension; the addition broadcasts it across the batch
x_emb_pos=transformer.drop(x_pos+x_emb)
x_emb_pos.shape


x_emb shape: torch.Size([32, 128, 768]) x_pos torch.Size([128, 768])


torch.Size([32, 128, 768])

In [22]:
# Run the embedded batch through every Block in order
block_x=x_emb_pos
for block in transformer.bloks:
    block_x=block(block_x)
block_x.shape   

torch.Size([32, 128, 768])

In [23]:
# Final LayerNorm applied to the output of the Block stack
block_x_ln=transformer.ln_f(block_x)
block_x_ln.shape

torch.Size([32, 128, 768])

# 3 损失函数

In [24]:
# Project the normalized hidden states to one logit per vocabulary entry
logits = lm_head(block_x_ln)
logits.shape

torch.Size([32, 128, 50304])

In [25]:
# Flatten logits to (N, vocab) and targets to (N,), then take the cross-entropy;
# target entries equal to -1 are ignored.
targets=y
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
print(loss)

tensor(10.9796, grad_fn=<NllLossBackward0>)


#### 补充 cross_entropy和NLLLoss计算损失

#### F.cross_entropy 是 PyTorch 中用于计算交叉熵损失的函数，它通常被用来评估分类模型的性能。这个函数内部已经包含了对预测值进行 log_softmax 的步骤，所以你不需要在输入到 F.cross_entropy 之前手动对预测值进行 softmax 或 log_softmax 处理。

In [26]:
# Toy setup: 3 samples, 5 classes, random scores and random integer labels
length=3
number_of_label=5
predict=torch.rand(length,number_of_label)
y_fake=torch.randint(0,number_of_label,(length,))

# cross_entropy is given the raw scores directly
en_loss=F.cross_entropy(predict,y_fake, ignore_index=-1)
print("predict")
print(predict)
print("y")
print(y_fake)
print("en_loss:", en_loss.item())


predict
tensor([[0.2956, 0.2771, 0.1637, 0.2688, 0.2839],
        [0.3973, 0.9177, 0.9226, 0.7005, 0.3502],
        [0.9846, 0.2220, 0.2206, 0.4623, 0.8314]])
y
tensor([4, 0, 3])
en_loss: 1.7416986227035522


In [27]:
# Apply log_softmax over the class dimension
log_probs = F.log_softmax(predict, dim=1)
# Create an NLLLoss instance
nll_loss = nn.NLLLoss()
# Compute the negative log-likelihood loss on the same scores and labels as above
loss = nll_loss(log_probs, y_fake)
print("Loss:", loss.item())

Loss: 1.7416986227035522


# 4 模型训练

In [28]:
class GPT(nn.Module):
    """Decoder-only language model: token + position embeddings, a stack of Blocks,
    a final LayerNorm and a linear head whose weight is shared with the token embedding.
    """

    def __init__(self, config):
        """Build all layers from ``config``, tie the embedding/head weights and initialise them.

        Reads ``vocab_size``, ``block_size``, ``n_embd``, ``n_layer``, ``dropout``
        and ``bias`` from ``config``; prints the parameter count when done.
        """
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd, bias=config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # with weight tying when using torch.compile() some warnings get generated:
        # "UserWarning: functional_call was passed multiple values for tied weights.
        # This behavior is deprecated and will be an error in future versions"
        # not 100% sure what this is, so far seems to be harmless. TODO investigate
        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters
        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        """Draw Linear and Embedding weights from N(0, 0.02) and zero any Linear bias."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Run the model on token ids ``idx`` of shape (b, t), with t <= block_size.

        Returns ``(logits, loss)``.  With ``targets`` the logits cover every
        position and ``loss`` is the cross-entropy against ``targets`` (entries
        equal to -1 are ignored).  Without ``targets`` only the last position is
        projected, so the logits have a time dimension of 1, and ``loss`` is None.
        """
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        
        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss


In [29]:
# Instantiate the model (the constructor prints its parameter count) and show its module tree
gpt2_model=GPT(config)
gpt2_model

number of parameters: 123.59M


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=False)
          (c_proj): Linear(in_features=768, out_features=768, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=3072, out_features=768, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50304, bias=False)
)

In [30]:
# AdamW optimizer settings
learning_rate = 6e-4 # max learning rate
max_iters = 600000 # total number of training iterations
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
# weight_decay must be passed explicitly; otherwise AdamW silently uses its own
# default and the value configured above never takes effect.
# NOTE(review): the model stays on the device it was constructed on, while
# get_batch moves batches to `device`; on a CUDA host the model would need to
# be moved there before training — confirm.
optimizer = torch.optim.AdamW(
    gpt2_model.parameters(),
    lr=learning_rate,
    betas=(beta1, beta2),
    weight_decay=weight_decay,
)

# Peek at a corner of the token-embedding matrix before any update
gpt2_model.transformer.wte.weight[:3,:5]

tensor([[ 0.0233,  0.0126, -0.0055, -0.0283,  0.0163],
        [-0.0210,  0.0059, -0.0161,  0.0175,  0.0436],
        [-0.0007,  0.0309,  0.0204,  0.0337, -0.0349]],
       grad_fn=<SliceBackward0>)

In [31]:
# Usage example: run four optimisation steps and watch the tied embedding weights move
last_step = 3
for i, (x, y) in enumerate(get_batch(train_loader, device)):
    optimizer.zero_grad()  # clear gradients left over from the previous step
    print(x.shape, y.shape)
    print(f"embedding weight step {i}",gpt2_model.transformer.wte.weight[:3,:5])
    logits, loss = gpt2_model(x, y)
    loss.backward()
    # Apply the configured gradient-norm limit; a value of 0.0 disables clipping
    if grad_clip != 0.0:
        torch.nn.utils.clip_grad_norm_(gpt2_model.parameters(), grad_clip)
    optimizer.step()
    if i == last_step:
        break  # stop once steps 0..3 have run

torch.Size([32, 128]) torch.Size([32, 128])
embedding weight step 0 tensor([[ 0.0233,  0.0126, -0.0055, -0.0283,  0.0163],
        [-0.0210,  0.0059, -0.0161,  0.0175,  0.0436],
        [-0.0007,  0.0309,  0.0204,  0.0337, -0.0349]],
       grad_fn=<SliceBackward0>)
torch.Size([32, 128]) torch.Size([32, 128])
embedding weight step 1 tensor([[ 0.0227,  0.0120, -0.0061, -0.0289,  0.0169],
        [-0.0215,  0.0053, -0.0167,  0.0169,  0.0442],
        [-0.0013,  0.0303,  0.0198,  0.0331, -0.0343]],
       grad_fn=<SliceBackward0>)
torch.Size([32, 128]) torch.Size([32, 128])
embedding weight step 2 tensor([[ 0.0229,  0.0114, -0.0067, -0.0294,  0.0173],
        [-0.0213,  0.0047, -0.0173,  0.0164,  0.0446],
        [-0.0016,  0.0297,  0.0192,  0.0326, -0.0338]],
       grad_fn=<SliceBackward0>)
torch.Size([32, 128]) torch.Size([32, 128])
embedding weight step 3 tensor([[ 0.0225,  0.0117, -0.0063, -0.0290,  0.0170],
        [-0.0216,  0.0048, -0.0169,  0.0168,  0.0443],
        [-0.0014,  0.

# 5 模型推理

In [32]:
# Sampling settings and a random 2x4 prompt of token ids below 10000
max_new_tokens=30
temperature=1.0
top_k=41
idx=torch.randint(0,10000,(2,4))
idx

tensor([[3448, 6908, 2513, 6145],
        [ 617, 7403, 9165, 8417]])

In [33]:
# Without targets the model returns logits for the last position only
logits, _ = gpt2_model(idx)
print("logits",logits.shape)
# Drop the singleton time dimension and apply the temperature
logits = logits[:, -1, :] / temperature
print("logits with temperature",logits.shape)

# Try out top-k filtering on small fake logits
logits=torch.randint(0,10,(2,8),dtype=torch.float32)
print("fake data",logits)
if top_k is not None:
    # v[:, [-1]] is the k-th largest value of each row; anything smaller is set to -inf.
    # NOTE: k is capped at the row width (8 here), so a top_k above 8 masks nothing.
    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
    logits[logits < v[:, [-1]]] = -float('Inf')
logits

logits torch.Size([2, 1, 50304])
logits with temperature torch.Size([2, 50304])
fake data tensor([[0., 2., 3., 5., 7., 7., 6., 3.],
        [5., 5., 5., 1., 2., 6., 2., 1.]])


tensor([[-inf, -inf, -inf, 5., 7., 7., 6., -inf],
        [5., 5., 5., -inf, -inf, 6., -inf, -inf]])

In [34]:
# Autoregressive sampling: extend idx by max_new_tokens tokens, one token per iteration
max_context = gpt2_model.config.block_size
with torch.no_grad():
    for _ in range(max_new_tokens):
        # The model rejects inputs longer than block_size, so once the running
        # sequence outgrows it only the most recent block_size tokens are fed in.
        idx_cond = idx if idx.size(1) <= max_context else idx[:, -max_context:]
        logits, _ = gpt2_model(idx_cond)
        logits = logits[:, -1, :] / temperature
        # optionally crop the logits to only the top k options
        if top_k is not None:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = -float('Inf')
        # apply softmax to convert logits to (normalized) probabilities
        probs = F.softmax(logits, dim=-1)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1)
        # append sampled index to the running sequence and continue
        idx = torch.cat((idx, idx_next), dim=1)

idx

tensor([[3448, 6908, 2513, 6145,  262,  262,  262,  262,  262,   11,   11,  262,
          262,   13,  262,  262,  262,   13,   13,   13,   13,  262,   11,  262,
           13,  262,   13,  262,   11,  262,   11,  262,  262,   13],
        [ 617, 7403, 9165, 8417,   13,   11,   11,  262,  262,   11,   13,  262,
          262,  262,  262,  262,  262,  262,  262,  262,  262,  262,   11,   11,
           13,   11,   13,  262,   13,  198,  198,  262,   13,  262]])

In [35]:
def decode(l):
    """Turn a list of token ids back into text using the tokenizer ``enc``."""
    return enc.decode(l)

In [36]:
# Decode the first generated sequence back to text
decode(idx[0].tolist())

'pri trick walk marg the the the the the,, the the. the the the.... the, the. the. the, the, the the.'