### 完全从零写一个小模型
Build a miniGPT from scratch
- 从零开始构建 SLM
- 运行代码完全手写，一行一行带着运行


In [1]:
# part 1: 导入相关的 package
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

# Fix PyTorch's global RNG seed so random draws (e.g. weight init, dropout)
# are repeatable from run to run.
torch.manual_seed(1024)

<torch._C.Generator at 0x23b85613310>

In [2]:
@dataclass
class GPTConfig:
    """Hyper-parameters shared by the miniGPT building blocks.

    ``head_size`` is derived from ``n_embd // n_head`` when it is not passed
    explicitly, so overriding ``n_embd`` or ``n_head`` keeps the per-head
    width consistent instead of silently reusing the value computed from the
    class defaults.
    """

    # Maximum sequence length (max_seq_len); also the side of the causal mask.
    block_size: int = 512
    batch_size: int = 12
    n_layer: int = 6
    n_head: int = 12
    # Embedding width, also called hidden_dim / hidden_size; the same value is
    # used for both here.
    n_embd: int = 768
    # Width of one attention head; None means "derive as n_embd // n_head".
    head_size: Optional[int] = None
    dropout: float = 0.1
    # tiktoken uses the GPT-2 vocabulary, which has about 50257 tokens.
    vocab_size: int = 50257

    def __post_init__(self):
        """Fill in ``head_size`` from the instance's own ``n_embd``/``n_head``."""
        if self.head_size is None:
            self.head_size = self.n_embd // self.n_head

### 模型结构

In [None]:
class SingleHeadAttention(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Args:
        config: object exposing ``n_embd``, ``head_size``, ``block_size`` and
            ``dropout``.

    Forward input is a 3-D tensor ``(batch, seq_len, n_embd)``; the output is
    ``(batch, seq_len, head_size)``. ``seq_len`` must not exceed
    ``config.block_size`` because the causal mask is pre-built at that size.
    """

    def __init__(self, config):
        super().__init__()
        self.key = nn.Linear(config.n_embd, config.head_size)
        self.value = nn.Linear(config.n_embd, config.head_size)
        self.query = nn.Linear(config.n_embd, config.head_size)

        # Lower-triangular causal mask, registered as a buffer rather than a
        # parameter: it is not trained (no gradient is kept for it) but it
        # still follows the module across devices.
        self.register_buffer(
            'attention_mask',
            torch.tril(
                torch.ones(config.block_size, config.block_size)
            )
        )
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Return the attention output for ``x`` of shape (batch, seq_len, n_embd)."""
        _, seq_len, _ = x.size()
        k = self.key(x)
        v = self.value(x)
        q = self.query(x)

        # Scale the scores by 1/sqrt(head_size) so their variance does not grow
        # with the head width and saturate the softmax.
        weight = (q @ k.transpose(-2, -1)) / (k.size(-1) ** 0.5)
        # Block attention to future positions: masked entries become -inf and
        # therefore get zero probability after the softmax.
        weight = weight.masked_fill(
            self.attention_mask[:seq_len, :seq_len] == 0,
            float('-inf')
        )
        weight = F.softmax(weight, dim=-1)
        weight = self.dropout(weight)
        return weight @ v
    
class MultiHeadAttention(nn.Module):
    """Run ``config.n_head`` independent attention heads and merge the results.

    Each head is a ``SingleHeadAttention``; their outputs are concatenated on
    the last dimension, passed through a linear projection of width
    ``config.n_embd`` and then through dropout.
    """

    def __init__(self, config):
        super().__init__()
        # Heads are created before the projection, one after another.
        head_modules = []
        for _ in range(config.n_head):
            head_modules.append(SingleHeadAttention(config))
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(config.n_embd, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Apply every head to ``x``, concatenate, project and apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)
    
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4 * n_embd, GELU, project back to n_embd.

    Args:
        config: object exposing ``n_embd`` and ``dropout``.

    The output has the same last-dimension width as the input (``n_embd``).
    """

    def __init__(self, config):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            # Project back to n_embd (not n_head) so the output width matches
            # the input width.
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.dropout)
        )

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)