In [1]:
import math
import os

import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
from torch.distributed.optim import ZeroRedundancyOptimizer
import torch.distributed as dist

  from torch.distributed.optim import ZeroRedundancyOptimizer


In [2]:
class NewGELU(nn.Module):
    """Tanh-based approximation of the GELU activation.

    Computes 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) element-wise.
    The original author notes this is the variant used by OpenAI.
    """
    def forward(self, input):
        # Build the expression inside-out; the arithmetic order matches the one-line form.
        cubic_term = 0.044715 * torch.pow(input, 3.0)
        tanh_arg = math.sqrt(2.0/math.pi) * (input + cubic_term)
        return 0.5 * input * (1.0 + torch.tanh(tanh_arg))

In [3]:
import math
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention.

    Reads `n_embd`, `n_head` and `block_size` from `config`. The forward pass
    consults the module-level flag `FLASH` to choose between
    `F.scaled_dot_product_attention` and an explicit masked-softmax implementation.
    """
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # one fused projection produces queries, keys and values
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection; the flag is read by GPT._init_weights to shrink its init std
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.LLMC_RESIDUAL_SCALE_FLAG = 1
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # lower-triangular mask, registered under the name 'bias' (OpenAI/HF naming)
        lower_triangle = torch.tril(torch.ones(config.block_size, config.block_size))
        self.register_buffer('bias', lower_triangle.view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        """Map x of shape (B, T, C) to an attended tensor of the same shape."""
        B, T, C = x.size()
        head_size = C // self.n_head
        # split the fused projection, then reshape each part to (B, nh, T, hs)
        q, k, v = (
            part.view(B, T, self.n_head, head_size).transpose(1, 2)
            for part in self.c_attn(x).split(self.n_embd, dim=2)
        )
        if FLASH:
            y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        else:
            # explicit attention: materialize the (T, T) score matrix per head
            scale = 1.0 / math.sqrt(k.size(-1))
            scores = (q @ k.transpose(-2, -1)) * scale  # (B, nh, T, T)
            future_positions = self.bias[:, :, :T, :T] == 0
            scores = scores.masked_fill(future_positions, float('-inf'))
            weights = F.softmax(scores, dim=-1)
            y = weights @ v  # (B, nh, T, hs)
        # merge heads back to (B, T, C) and apply the output projection
        merged = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(merged)
        

In [4]:
class MLP(nn.Module):
    """Position-wise feed-forward network: expand to 4*n_embd, NewGELU, project back."""
    def __init__(self, config):
        super().__init__()
        hidden_width = 4*config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden_width)
        self.gelu = NewGELU()
        self.c_proj = nn.Linear(hidden_width, config.n_embd)
        # read by GPT._init_weights to pick a smaller init std for this projection
        self.c_proj.LLMC_RESIDUAL_SCALE_FLAG = 1

    def forward(self, x):
        """Apply c_fc, the activation and c_proj in sequence."""
        return self.c_proj(self.gelu(self.c_fc(x)))
        

In [5]:
class Block(nn.Module):
    """One transformer layer: pre-LayerNorm attention and MLP, each wrapped in a residual add."""
    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return x after the attention residual branch followed by the MLP residual branch."""
        attn_branch = self.attn(self.ln_1(x))
        x = x + attn_branch
        mlp_branch = self.mlp(self.ln_2(x))
        return x + mlp_branch

In [6]:
from dataclasses import dataclass

@dataclass
class GPTConfig:
    """Hyperparameters consumed by GPT and its sub-modules (Block, CausalSelfAttention, MLP)."""
    # Maximum sequence length: sizes the position embedding table and the causal mask,
    # and GPT.forward asserts that inputs are no longer than this.
    block_size: int = 1024
    # Rows of the token embedding / columns of the tied lm_head.
    # NOTE(review): 50527 looks like a digit transposition of 50257, the value this
    # notebook passes explicitly and that len(tokenizer) reports — TODO confirm.
    vocab_size: int = 50527
    # Number of Block layers stacked in GPT.transformer.h.
    n_layer: int = 12
    # Attention heads per layer; n_embd must be divisible by this (asserted in CausalSelfAttention).
    n_head: int = 12
    # Width of the embeddings and of every hidden state.
    n_embd: int = 768

In [7]:
class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.lm_head.LLMC_SKIP_INIT = 1 # don't init this one, we will tie weights
        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying

        # init all weights, use a torch rng object to be very careful
        self.init_rng = torch.Generator()
        self.init_rng.manual_seed(42)
        self.apply(self._init_weights)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02 if not hasattr(module, 'LLMC_RESIDUAL_SCALE_FLAG') else 0.02/math.sqrt(2*self.config.n_layer)
            # we want to skip initializing lm_head, which shares parameters with wte
            # and wte was already initialized down below during the embedding init
            if not hasattr(module, 'LLMC_SKIP_INIT'):
                torch.nn.init.normal_(module.weight, mean=0.0, std=std, generator=self.init_rng)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02, generator=self.init_rng)

    def forward(self, idx, targets=None, return_logits=True):
        device = idx.device
        b, t = idx.size() # batch, seq_len
        assert t<=self.config.block_size, f"Cannot foward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device) 

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            logits = self.lm_head(x[:, [-1], :])
            loss = None

        if not return_logits:
            logits = None
        return logits, loss


    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond) # B, T, vocab_size
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('-inf')

            # apply softmax to convert logits to normalized probabilities
            probs = F.softmax(logits, dim=-1) # B, vocab_size
            idx_next = torch.multinomial(probs, num_samples=1)

            idx = torch.cat((idx, idx_next), dim=-1)
            return idx
        

In [8]:
# class GPTConfig:
#     block_size: int = 1024
#     vocab_size: int = 50527
#     n_layer: int = 12
#     n_head: int = 12
#     n_embd: int = 768
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from itertools import chain

# Use the GPU when one is available; later cells move tensors and the model with `.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Context length, passed to GPTConfig as block_size below.
n_block = 1024
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token for padding so encode() can pad with padding='max_length'.
tokenizer.pad_token = tokenizer.eos_token
# NOTE: this rebinds `config`, which the first cell bound to torch._inductor.config.
config = GPTConfig(block_size=n_block, vocab_size=50257, n_layer=6, n_head=12, n_embd=768)
model = GPT(config).to(device)
# Cell output: tokenizer size, to compare against the vocab_size passed above.
len(tokenizer)

50257

In [9]:
def print_parameters(model):
    """Print the number of trainable parameters of `model`, expressed in millions."""
    trainable_sizes = (p.numel() for p in model.parameters() if p.requires_grad)
    num_param = sum(trainable_sizes)
    print(f'total param {num_param/1000/1000}m')

print_parameters(model)

total param 81.912576m


In [10]:
import torch
from datasets import load_dataset
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from itertools import chain

# streaming=True requests a lazily iterated dataset rather than a fully materialised one.
ds = load_dataset("p208p2002/wudao",streaming=True, split="train")

def encode(examples):
    """Tokenize a batch as (title, content) text pairs, truncated and padded to max length.

    Uses the module-level `tokenizer`; `examples` must provide 'title' and 'content'.
    """
    titles = examples['title']
    contents = examples['content']
    return tokenizer(titles, contents, truncation=True, padding='max_length')

def collate_fn(examples, n_block, pad_token_id):
    """Build next-token-prediction tensors from tokenized examples.

    Args:
        examples: sequence of mappings, each with an 'input_ids' list; all lists must
            have the same length so they stack into one tensor.
        n_block: accepted for interface compatibility; not used here.
        pad_token_id: id appended as the target of the final position.

    Returns:
        (x, y) LongTensors of identical shape, where y is x shifted left by one token
        with `pad_token_id` filling the last column.

    NOTE(review): targets at padded positions are ordinary token ids, so they still
    contribute to a loss that only ignores index -1 — confirm whether that is intended.
    """
    token_rows = [example['input_ids'] for example in examples]
    x = torch.tensor(token_rows, dtype=torch.long)
    # use the caller-supplied id instead of reaching for a global tokenizer
    y = torch.tensor([row[1:] + [pad_token_id] for row in token_rows], dtype=torch.long)
    return x, y

# Tokenize lazily, batch by batch, as the stream is consumed.
ds = ds.map(encode, batched=True)
# The lambda binds the extra collate arguments; the end-of-sequence id doubles as the pad id.
train_loader = DataLoader(ds, batch_size=12, collate_fn=lambda x: collate_fn(x,n_block, tokenizer.eos_token_id))
# Smoke test: pull one batch and show the (inputs, targets) pair.
item = next(iter(train_loader))
print(item)

Resolving data files:   0%|          | 0/366 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/366 [00:00<?, ?it/s]

(tensor([[28156,   243,   163,  ...,   239, 44165,   247],
        [39355,   225, 44165,  ..., 50256, 50256, 50256],
        [  325,    78, 42468,  ...,   114, 29785,   112],
        ...,
        [20015,   236, 22522,  ...,   100, 26344,   114],
        [36685,   224, 19526,  ...,    95,   252, 27950],
        [44293,   119,   163,  ..., 38519,   163,   122]]), tensor([[  243,   163,   121,  ..., 44165,   247, 50256],
        [  225, 44165,   247,  ..., 50256, 50256, 50256],
        [   78, 42468, 20015,  ..., 29785,   112, 50256],
        ...,
        [  236, 22522,   252,  ..., 26344,   114, 50256],
        [  224, 19526,   243,  ...,   252, 27950, 50256],
        [  119,   163,   244,  ...,   163,   122, 50256]]))


In [11]:
# Read by CausalSelfAttention.forward: a falsy value selects the manual attention
# path instead of F.scaled_dot_product_attention.
FLASH = 0
def sample(model, query, max_new_tokens=128):
    """Encode `query`, let `model.generate` extend it, and decode the result to text.

    Relies on the module-level `tokenizer` and `device`. The generated ids are
    flattened before decoding.
    """
    prompt_ids = tokenizer.encode(query)
    # add a leading batch dimension of size 1
    tokens = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0)
    outputs = model.generate(tokens.to(device), max_new_tokens)
    flat_ids = outputs.view(-1).cpu().numpy()
    return tokenizer.decode(flat_ids)

print(sample(model, "中国首都是哪?"))

中国首都是哪? upward


In [12]:
from torch.cuda.amp import autocast, GradScaler
import wandb
scaler = GradScaler()  # create the GradScaler object used by train()
wandb.init()

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, betas=(0.9,0.95))
# NOTE(review): with T_max=1000 and one scheduler step per batch, the logged learning
# rate reaches 0 at step 1000 and then rises again — confirm this cycle is intended.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=1000)

def eval(model, val_loader):
    """Return the summed (not averaged) loss of `model` over every batch in `val_loader`.

    Switches the model to eval mode and leaves it there; gradients are disabled for
    each forward pass. Batches are moved to the module-level `device`.
    """
    model.eval()
    running_loss = 0.0
    for inputs, labels in val_loader:
        with torch.no_grad():
            inputs = inputs.to(device)
            labels = labels.to(device)
            _, batch_loss = model(inputs, targets=labels)
            running_loss += batch_loss.item()
    return running_loss
            
def train(model, optimizer, scheduler, train_loader, grad_clip=1.0, epoch=None):
    """Run one pass over `train_loader` with bfloat16 autocast and gradient clipping.

    Relies on module-level `device`, `scaler` (a GradScaler), `wandb` and `sample`.

    Args:
        model: module whose forward accepts `(x, targets=y)` and returns `(logits, loss)`.
        optimizer: optimizer stepped through `scaler.step`.
        scheduler: learning-rate scheduler, stepped once per batch.
        train_loader: iterable yielding `(x, y)` tensor pairs.
        grad_clip: maximum total gradient norm passed to `clip_grad_norm_`.
        epoch: epoch number used only in the progress line. When omitted, a
            module-level `epoch` variable is used if one exists, otherwise 0.

    Returns:
        The sum of the per-batch loss values as a Python float.

    Side effects: logs to wandb every 100 steps, prints a generated sample every
    1000 steps, and writes a checkpoint every 5000 steps (including step 0).
    """
    model.train()
    total_loss = 0.0
    grad_norm = -1.0
    # keep the old behaviour of reading a notebook-level `epoch`, but do not crash without it
    epoch_label = globals().get('epoch', 0) if epoch is None else epoch
    # autocast needs the bare device type ('cuda' / 'cpu'), not e.g. 'cuda:0'
    device_type = torch.device(device).type
    for idx, (x, y) in enumerate(train_loader):
        optimizer.zero_grad()
        x, y = x.to(device), y.to(device)
        # torch.autocast accepts device_type; torch.cuda.amp.autocast does not
        with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
            logits, loss = model(x, targets=y)

        scaler.scale(loss).backward()
        # gradients are still multiplied by the loss scale here; unscale them so that
        # max_norm is compared against their true magnitude
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip)
        scaler.step(optimizer)
        scaler.update()
        # adjust lr
        scheduler.step()

        step_loss = loss.item()
        total_loss += step_loss

        if idx % 100 == 0:
            lr = optimizer.param_groups[0]["lr"]
            print(f'Epoch {epoch_label}, Step: {idx} Learing: {lr:.10f} Loss: {step_loss:.4f} Grad Norm: {grad_norm:.4f}')
            wandb.log({'step':idx, 'train/loss':step_loss, 'learning_rate': lr, 'grad_norm': grad_norm})
        if idx % 1000 == 0:
            print(sample(model, "中国首都是哪?"))
        if idx % 5000 == 0:
            checkpoint_path = f'ouputs/nanogpt/checkpoint-{idx}/model_weights.pth'
            # torch.save does not create missing parent directories
            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
            torch.save(model.state_dict(), checkpoint_path)

    return total_loss

In [None]:
from datetime import datetime

def set_seed(seed:int):
    """Seed torch's RNG, and the current CUDA device's RNG when CUDA is available."""
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)

set_seed(42)
for epoch in range(1):
    train_loss = train(model, optimizer, scheduler, train_loader)
    val_loss = 0.0  # placeholder: no validation pass is run in this cell
    # A loader over a streaming dataset has no length, so len() raises TypeError;
    # fall back to reporting the summed loss instead of crashing after the epoch.
    try:
        num_batches = len(train_loader)
    except TypeError:
        num_batches = None
    if num_batches:
        print(f'Epoch={epoch} Train Loss={train_loss/num_batches:.4f} Val Loss={val_loss}')
    else:
        print(f'Epoch={epoch} Train Loss (summed over steps)={train_loss:.4f} Val Loss={val_loss}')


Epoch 0, Step: 0 Learing: 0.0000100000 Loss: 10.8660 Grad Norm: 12.7340
中国首都是哪? pitching
Epoch 0, Step: 100 Learing: 0.0000097504 Loss: 7.7208 Grad Norm: 3.7821
Epoch 0, Step: 200 Learing: 0.0000090358 Loss: 6.3536 Grad Norm: 3.1613
Epoch 0, Step: 300 Learing: 0.0000079262 Loss: 5.4538 Grad Norm: 2.6412
Epoch 0, Step: 400 Learing: 0.0000065301 Loss: 4.8298 Grad Norm: 2.2347
Epoch 0, Step: 500 Learing: 0.0000049843 Loss: 4.3690 Grad Norm: 2.2557
Epoch 0, Step: 600 Learing: 0.0000034400 Loss: 4.3995 Grad Norm: 2.5969
Epoch 0, Step: 700 Learing: 0.0000020484 Loss: 4.2486 Grad Norm: 3.2801
Epoch 0, Step: 800 Learing: 0.0000009457 Loss: 4.1137 Grad Norm: 1.8781
Epoch 0, Step: 900 Learing: 0.0000002399 Loss: 4.0051 Grad Norm: 1.5855
Epoch 0, Step: 1000 Learing: 0.0000000000 Loss: 4.1859 Grad Norm: 1.9173
中国首都是哪?�
Epoch 0, Step: 1100 Learing: 0.0000002496 Loss: 4.1343 Grad Norm: 1.7015
Epoch 0, Step: 1200 Learing: 0.0000009642 Loss: 4.1070 Grad Norm: 1.6356
Epoch 0, Step: 1300 Learing: 0.0000

In [None]:
from datetime import datetime

dt = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# model.save_pretrained(f'outputs/nanogpt-{dt}', safe_serialization=False)
final_weights_path = f'ouputs/nanogpt/model_weights.pth'
# torch.save does not create missing parent directories, so make sure the folder exists
os.makedirs(os.path.dirname(final_weights_path), exist_ok=True)
torch.save(model.state_dict(), final_weights_path)