In [None]:
# import the packages
import math
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Seed PyTorch's random number generator so re-running the notebook is repeatable.
torch.manual_seed(1024)

In [None]:
@dataclass
class GPTConfig:
    """Hyperparameters shared by the model components in this notebook.

    ``head_size`` defaults to ``n_embd // n_head`` computed from the values
    of *this instance*. Previously the default was evaluated once in the
    class body (64 // 4 == 16) and silently stayed 16 even when ``n_embd``
    or ``n_head`` were overridden. Pass ``head_size`` explicitly to opt out
    of the derivation.
    """

    block_size: int = 32  # max_seq_len: maximum context length for predictions
    batch_size: int = 16  # number of independent sequences processed in parallel
    n_layer: int = 4
    n_head: int = 4  # number of attention heads (multi-head)
    # n_embd (hidden_dim / hidden_size): dimensionality of the token
    # embeddings, position embeddings and internal feature vectors.
    n_embd: int = 64
    # Per-head width; None means "derive from n_embd // n_head" (see __post_init__).
    head_size: Optional[int] = None
    dropout: float = 0.1
    vocab_size: int = 50257  # tiktoken uses the GPT-2 vocabulary, about 50257 tokens

    def __post_init__(self):
        """Fill in ``head_size`` from this instance's ``n_embd`` and ``n_head``."""
        if self.head_size is None:
            self.head_size = self.n_embd // self.n_head

In [None]:
class singleHeadAttention(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Args:
        config: configuration object providing ``n_embd``, ``head_size``,
            ``block_size`` and ``dropout`` (e.g. a ``GPTConfig``).
    """

    def __init__(self, config: "GPTConfig"):
        super().__init__()
        # Key / value / query projections for this head: n_embd -> head_size.
        self.key = nn.Linear(config.n_embd, config.head_size)
        self.value = nn.Linear(config.n_embd, config.head_size)
        self.query = nn.Linear(config.n_embd, config.head_size)
        self.head_size = config.head_size

        # Lower-triangular (block_size x block_size) causal mask, so each
        # position attends only to itself and positions to its left.
        # Stored with register_buffer, i.e. as module state rather than as a
        # trainable parameter.
        self.register_buffer(
            'attention_mask',
            torch.tril(torch.ones(config.block_size, config.block_size))
        )
        # Dropout is a regulariser: it randomly zeroes part of the attention
        # weights during training to reduce overfitting.
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: tensor of shape (batch_size, seq_len, n_embd).

        Returns:
            Tensor of shape (batch_size, seq_len, head_size).

        Raises:
            ValueError: if ``seq_len`` exceeds the ``block_size`` the causal
                mask was built for.
        """
        _, seq_len, _ = x.size()
        max_len = self.attention_mask.size(0)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds block_size {max_len}"
            )

        k = self.key(x)    # (batch_size, seq_len, head_size)
        v = self.value(x)  # (batch_size, seq_len, head_size)
        q = self.query(x)  # (batch_size, seq_len, head_size)

        # Raw attention scores: swap the last two dims of k so that
        # q @ k^T has shape (batch_size, seq_len, seq_len), then scale by
        # sqrt(head_size) before the softmax.
        weight = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_size)
        # Future positions get -inf so the softmax assigns them zero weight.
        weight = weight.masked_fill(
            self.attention_mask[:seq_len, :seq_len] == 0,
            float('-inf')
        )
        weight = F.softmax(weight, dim=-1)
        weight = self.dropout(weight)
        return weight @ v


# PascalCase alias: MultiHeadAttention in this file refers to the class by
# this spelling. The original name stays valid for existing callers.
SingleHeadAttention = singleHeadAttention
  
        

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head causal self-attention.

    Runs ``config.n_head`` independent attention heads on the same input,
    concatenates their outputs along the last dimension, then applies an
    output projection followed by dropout.

    Args:
        config: configuration object providing ``n_head``, ``head_size``,
            ``n_embd`` and ``dropout``, plus whatever the per-head module
            reads from it.
    """

    def __init__(self, config):
        super().__init__()
        # The per-head class is defined in this file as `singleHeadAttention`;
        # the previous spelling `SingleHeadAttention` was not defined here and
        # raised NameError on construction.
        self.heads = nn.ModuleList(
            [singleHeadAttention(config) for _ in range(config.n_head)]
        )
        # The concatenated head outputs are n_head * head_size wide. That
        # equals n_embd for the default config, but is computed here so a
        # custom head_size still yields a matching projection.
        self.proj = nn.Linear(config.n_head * config.head_size, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Run every head on ``x``, concatenate, project and apply dropout.

        Args:
            x: input tensor passed unchanged to each head.

        Returns:
            Tensor whose last dimension is ``n_embd``.
        """
        output = torch.cat([h(x) for h in self.heads], dim=-1)
        output = self.proj(output)
        output = self.dropout(output)
        return output


class FeedForward(nn.Module):
    """Feed-forward sub-layer; effectively a plain MLP.

    Expands ``n_embd`` to ``4 * n_embd``, applies GELU, projects back to
    ``n_embd`` and finishes with dropout.

    Args:
        config: configuration object providing ``n_embd`` and ``dropout``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        inner = 4 * width
        # Layer order matters: it fixes the `net.<index>` names in state_dict.
        stages = [
            nn.Linear(width, inner),
            nn.GELU(),
            nn.Linear(inner, width),
            nn.Dropout(config.dropout),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the MLP to ``x`` and return the result."""
        y = self.net(x)
        return y