In [1]:
"""
Full definition of a GPT Language Model, all of it in this single file.
References:
1) the official GPT-2 TensorFlow implementation released by OpenAI:
https://github.com/openai/gpt-2/blob/master/src/model.py
2) huggingface/transformers PyTorch implementation:
https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
"""

'\nFull definition of a GPT Language Model, all of it in this single file.\nReferences:\n1) the official GPT-2 TensorFlow implementation released by OpenAI:\nhttps://github.com/openai/gpt-2/blob/master/src/model.py\n2) huggingface/transformers PyTorch implementation:\nhttps://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py\n'

In [2]:
import time
import math
from dataclasses import dataclass
import inspect
import os
import pickle
from contextlib import nullcontext
import numpy as np

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [3]:
# read the whole training corpus from input.txt into memory as a single string
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
text[:8]  # preview the first 8 characters

'First Ci'

In [4]:
len(text)  # corpus length in characters

1115394

In [5]:
# character-level tokenizer: every distinct character of the corpus is one token,
# ids are assigned in sorted character order
chars = sorted(set(text))
itos = dict(enumerate(chars))                  # token id -> character
stoi = {ch: idx for idx, ch in itos.items()}   # character -> token id


def encode(s):
    """Encoder: take a string, output a list of integers (KeyError on unseen characters)."""
    return [stoi[ch] for ch in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join(itos[idx] for idx in l)


vocab_size = len(chars)
print(itos)
print(vocab_size)

{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46: 'h', 47: 'i', 48: 'j', 49: 'k', 50: 'l', 51: 'm', 52: 'n', 53: 'o', 54: 'p', 55: 'q', 56: 'r', 57: 's', 58: 't', 59: 'u', 60: 'v', 61: 'w', 62: 'x', 63: 'y', 64: 'z'}
65


In [6]:
# train / eval split: encode the whole corpus once, then cut it 90% / 10% by position
data = torch.tensor(encode(text), dtype=torch.long)
n_train = int(0.9*len(data))

train_data = data[:n_train]       # 90%
eval_data = data[n_train:]         # 10%
data[:100]  # preview the first 100 token ids

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

In [7]:
# hyperparameters
# block_size = 32
# batch_size = 16 # how many independent sequences will we process in parallel?
# epochs = 1000
# eval_interval = 100
# learning_rate = 1e-3
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# eval_iters = 200

In [8]:
torch.manual_seed(1337)  # seed torch's global RNG

<torch._C.Generator at 0x131afafcf30>

In [29]:
def get_batch(split):
    """Sample a random mini-batch of (input, target) windows from one data split.

    Args:
        split: 'train' selects `train_data`; any other value selects `eval_data`.

    Returns:
        (X, y): two int64 tensors of shape (batch_size, block_size) moved to
        `device`. `y` holds the same windows as `X` shifted one position to the
        right, i.e. the next-token targets.
    """
    data = train_data if split == 'train' else eval_data
    # random window starts; the upper bound leaves room for the one-token shift of y
    starts = torch.randint(len(data) - block_size, (batch_size,)).tolist()

    def _window(start):
        # The split may be a torch tensor (as built in this notebook) or a numpy
        # array / memmap. torch tensors have no `.astype`, so branch on the type.
        chunk = data[start:start + block_size]
        if isinstance(chunk, torch.Tensor):
            return chunk.to(torch.int64)
        return torch.from_numpy(np.asarray(chunk).astype(np.int64))

    X = torch.stack([_window(s) for s in starts])
    y = torch.stack([_window(s + 1) for s in starts])
    if device_type == 'cuda':
        # pin arrays X, y, which allows us to move them to GPU asynchronously (non_blocking=True)
        X, y = X.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        X, y = X.to(device), y.to(device)
    return X, y

In [10]:
# X, y = get_batch('eval')
# X[:2], y[:2]

In [11]:
@torch.no_grad()
def estimate_loss():
    """Average the model loss over `eval_iters` random batches of each split.

    The global `model` is switched to eval mode for the measurement and back
    to train mode before returning.

    Returns:
        dict with keys 'train' and 'eval'; each value is a 0-dim tensor holding
        the mean loss over the sampled batches of that split.
    """
    model.eval()
    mean_loss = {}
    for split_name in ('train', 'eval'):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split_name)
            _, batch_loss = model(inputs, targets)
            batch_losses[step] = batch_loss.item()
        mean_loss[split_name] = batch_losses.mean()
    model.train()
    return mean_loss

# learning rate schedule: linear warmup followed by cosine decay to a floor
def get_lr(it):
    """Return the learning rate for iteration `it`.

    Reads the globals `warmup_iters`, `lr_decay_iters`, `learning_rate` and
    `min_lr`. Below `warmup_iters` the rate grows linearly from 0; above
    `lr_decay_iters` it is `min_lr`; in between it follows a half cosine from
    `learning_rate` down to `min_lr`.
    """
    if it < warmup_iters:
        # phase 1: linear ramp
        lr_now = learning_rate * it / warmup_iters
    elif it > lr_decay_iters:
        # phase 3: schedule finished, stay at the floor
        lr_now = min_lr
    else:
        # phase 2: cosine interpolation between the peak and the floor
        progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
        assert 0 <= progress <= 1
        cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress)) # ranges 0..1
        lr_now = min_lr + cosine_weight * (learning_rate - min_lr)
    return lr_now

In [12]:
# -----------------------------------------------------------------------------
# Training configuration. The commented-out values are the upstream defaults for
# training a gpt2 (124M) on OpenWebText; the active values describe a much
# smaller model.
# I/O
out_dir = 'out'
#eval_interval = 2000
eval_interval = 100
log_interval = 100
eval_iters = 200
eval_only = False # if True, script exits right after the first eval
always_save_checkpoint = False # if True, always save a checkpoint after each eval
init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2'
# wandb logging
wandb_log = False # disabled by default
wandb_project = 'owt'
wandb_run_name = 'gpt2' # 'run' + str(time.time())
# data
#dataset = 'openwebtext'
#gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
gradient_accumulation_steps = 3 # used to simulate larger batch sizes
batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
#block_size = 1024
block_size = 32
# model
# n_layer = 12
# n_head = 12
# n_embd = 768
n_layer = 4
n_head = 4
n_embd = 64
dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
bias = False # do we use bias inside LayerNorm and Linear layers?
# adamw optimizer
#learning_rate = 6e-4 # max learning rate
#epochs = 600000 # total number of training iterations
learning_rate = 1e-3 # max learning rate
epochs = 1000 # total number of training iterations
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
# learning rate decay settings
decay_lr = True # whether to decay the learning rate
#warmup_iters = 2000 # how many steps to warm up for
#lr_decay_iters = 600000 # should be ~= epochs per Chinchilla
warmup_iters = 100 # how many steps to warm up for
lr_decay_iters = epochs # should be ~= epochs per Chinchilla
min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
# DDP settings
backend = 'nccl' # 'nccl', 'gloo', etc.
# system
device = 'cuda' if torch.cuda.is_available() else 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16' or 'float16', the latter will auto implement a GradScaler
compile = False # use PyTorch 2.0 to compile the model to be faster
#--------------------------------------------------------------------------------------------------
# Snapshot of the settings above, used for logging and stored in checkpoints.
# The keys are listed explicitly rather than collected by scanning globals():
# in a notebook the global namespace also holds unrelated scalars and strings
# (for example the whole corpus in `text`), which would otherwise end up in
# every checkpoint and in the wandb run config.
config_keys = [
    'out_dir', 'eval_interval', 'log_interval', 'eval_iters', 'eval_only',
    'always_save_checkpoint', 'init_from',
    'wandb_log', 'wandb_project', 'wandb_run_name',
    'gradient_accumulation_steps', 'batch_size', 'block_size',
    'n_layer', 'n_head', 'n_embd', 'dropout', 'bias',
    'learning_rate', 'epochs', 'weight_decay', 'beta1', 'beta2', 'grad_clip',
    'decay_lr', 'warmup_iters', 'lr_decay_iters', 'min_lr',
    'backend', 'device', 'dtype', 'compile',
]
#exec(open('configurator.py').read()) # overrides from command line or config file
config = {k: globals()[k] for k in config_keys} # will be useful for logging
#--------------------------------------------------------------------------------------------------

In [13]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and an optional learnable bias."""

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            # no shift term at all; F.layer_norm accepts bias=None
            self.bias = None

    def forward(self, X):
        return F.layer_norm(
            X,
            normalized_shape=self.weight.shape,
            weight=self.weight,
            bias=self.bias,
            eps=1e-5,
        )


@dataclass
class GPTConfig:
    """Architecture hyperparameters consumed by the model classes in this file."""
    # longest sequence the model accepts (size of the position embedding table)
    block_size: int = 1024
    # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    vocab_size: int = 50304
    # number of transformer blocks
    n_layer: int = 12
    # attention heads per block; must divide n_embd
    n_head: int = 12
    # embedding / residual stream width
    n_embd: int = 768
    # dropout probability used by every Dropout layer
    dropout: float = 0.0
    # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
    bias: bool = True


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which every position attends only to itself and earlier positions."""

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # a single fused projection yields query, key and value for all heads (hence 3x)
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # projection applied to the concatenated head outputs
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # use the fused kernel when this PyTorch build provides it (PyTorch >= 2.0)
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # lower-triangular mask: position t may look at positions <= t only
            causal_mask = torch.tril(torch.ones(config.block_size, config.block_size))
            self.register_buffer('bias', causal_mask.view(1, 1, config.block_size, config.block_size))

    def forward(self, X):
        B, T, C = X.size() # batch size, sequence length, embedding dimensionality (n_embd)
        head_size = C // self.n_head

        def split_heads(t):
            # (B, T, C) -> (B, nh, T, hs): the head axis moves next to the batch axis
            return t.view(B, T, self.n_head, head_size).transpose(1, 2)

        q, k, v = (split_heads(t) for t in self.c_attn(X).split(self.n_embd, dim=2))

        if self.flash:
            # fused causal attention kernel; attention dropout is active only in training mode
            drop_p = self.dropout if self.training else 0
            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=drop_p, is_causal=True)
        else:
            # explicit attention: (B, nh, T, hs) @ (B, nh, hs, T) -> (B, nh, T, T)
            scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            scores = scores.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            weights = self.attn_dropout(F.softmax(scores, dim=-1))
            y = weights @ v # (B, nh, T, T) @ (B, nh, T, hs) -> (B, nh, T, hs)

        # put the heads back side by side: (B, nh, T, hs) -> (B, T, C)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        # output projection followed by residual dropout
        return self.resid_dropout(self.c_proj(y))


class MLP(nn.Module):
    """Position-wise feed-forward network: widen to 4 * n_embd, GELU, project back, dropout."""

    def __init__(self, config):
        super().__init__()
        # the hidden layer is 4x the model width (512 -> 2048 in the transformer paper)
        hidden_dim = 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden_dim, bias=config.bias)
        self.gelu = nn.GELU()
        # bring the widened activations back to the model width
        self.c_proj = nn.Linear(hidden_dim, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, X):
        hidden = self.gelu(self.c_fc(X))
        return self.dropout(self.c_proj(hidden))


class Block(nn.Module):
    """One transformer layer: pre-norm self-attention, then a pre-norm MLP, each added back to its input."""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, X):
        # residual connection around the attention sub-layer
        attn_out = self.attn(self.ln_1(X))
        X = X + attn_out
        # residual connection around the feed-forward sub-layer
        mlp_out = self.mlp(self.ln_2(X))
        return X + mlp_out


class NanoGPTModel(nn.Module):
    """Decoder-only GPT language model.

    Token and position embeddings feed a stack of `Block`s followed by a final
    LayerNorm and a linear head that shares its weight with the token embedding.
    """

    # architecture of each released GPT-2 checkpoint that from_pretrained can load
    _PRETRAINED_CONFIGS = {
        'gpt2':        dict(n_layer=12, n_head=12, n_embd=768),  # 124M parameters
        'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M parameters
        'gpt2-large':  dict(n_layer=36, n_head=20, n_embd=1280), # 774M parameters
        'gpt2-xl':     dict(n_layer=48, n_head=25, n_embd=1600), # 1558M parameters
    }
    # Hugging Face GPT-2 state-dict prefixes -> the module names used by this class
    _HF_KEY_PREFIXES = (
        ('transformer.wte.', 'transformer.token_embedding.'),
        ('transformer.wpe.', 'transformer.position_embedding.'),
        ('transformer.h.', 'transformer.blocks.'),
    )

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            token_embedding = nn.Embedding(config.vocab_size, config.n_embd), # (vocab_size, C)
            position_embedding = nn.Embedding(config.block_size, config.n_embd), # (T, C)
            dropout = nn.Dropout(config.dropout),
            blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd, bias=config.bias), # final layer norm
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # with weight tying when using torch.compile() some warnings get generated:
        # "UserWarning: functional_call was passed multiple values for tied weights.
        # This behavior is deprecated and will be an error in future versions"
        # not 100% sure what this is, so far seems to be harmless. TODO investigate
        self.transformer.token_embedding.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters
        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.position_embedding.weight.numel()
        return n_params

    def _init_weights(self, module):
        """Initialise Linear and Embedding weights from N(0, 0.02) and zero any Linear bias."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, X, y=None):
        """Run the model on a batch of token ids.

        Args:
            X: (B, T) integer tensor of token ids, with T <= config.block_size.
            y: optional (B, T) integer tensor of targets; entries equal to -1
               are ignored by the loss.

        Returns:
            (logits, loss). With targets, logits is (B, T, vocab_size) and loss is
            the cross-entropy over all positions. Without targets, logits is
            (B, 1, vocab_size) for the last position only and loss is None.
        """
        device = X.device
        # X and y are both (B, T) tensor integers, B = batch_size, T = block_size
        B, T = X.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        pos = torch.arange(0, T, dtype=torch.long, device=device) # shape (T)

        # forward the GPT model itself
        tok_emb = self.transformer.token_embedding(X) # (B, T, C)
        pos_emb = self.transformer.position_embedding(pos) # (T, C)
        X = self.transformer.dropout(tok_emb + pos_emb)
        for block in self.transformer.blocks:
            X = block(X) # (B, T, C)
        X = self.transformer.ln_f(X)   # (B, T, C)

        if y is not None:
            # if we are given some desired y also calculate the loss
            logits = self.lm_head(X) # (B, T, vocab_size)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1), ignore_index=-1)
        else:
            # inference time mini optimization: only forward the lm_head on the very last position
            logits = self.lm_head(X[:, [-1], :]) # note: using list[-1] to preserve the time dim
            loss = None
        return logits, loss

    def crop_block_size(self, block_size):
        """Shrink the maximum sequence length in place by truncating the position table and masks."""
        # model surgery to decrease the block size if necessary
        # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
        # but want to use a smaller block size for some smaller, simpler model
        assert block_size <= self.config.block_size
        self.config.block_size = block_size
        self.transformer.position_embedding.weight = nn.Parameter(self.transformer.position_embedding.weight[:block_size])
        for block in self.transformer.blocks:
            # the causal-mask buffer exists only when the non-flash attention path is used
            if hasattr(block.attn, 'bias'):
                block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]

    @classmethod
    def _hf_to_local_key(cls, hf_key):
        """Translate a Hugging Face GPT-2 state-dict key into this class's parameter name."""
        for hf_prefix, local_prefix in cls._HF_KEY_PREFIXES:
            if hf_key.startswith(hf_prefix):
                return local_prefix + hf_key[len(hf_prefix):]
        # ln_f.* and lm_head.* carry the same names on both sides
        return hf_key

    @classmethod
    def from_pretrained(cls, model_type, override_args=None):
        """Build a model and load the OpenAI GPT-2 weights published on Hugging Face.

        Args:
            model_type: one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.
            override_args: optional dict; only the key 'dropout' is accepted.

        Returns:
            A model of this class whose parameters were copied from the checkpoint.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        override_args = override_args or {} # default to empty dict
        # only dropout can be overridden see more notes below
        assert all(k == 'dropout' for k in override_args)
        from transformers import GPT2LMHeadModel
        print("loading weights from pertrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        # (copied so the shared class-level table is never mutated)
        config_args = dict(cls._PRETRAINED_CONFIGS[model_type])
        print("forcing vocab_size=50257, block_size=1024, bias=True")
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        config_args['bias'] = True # always True for GPT model checkpoints
        # we can override the dropout rate, if desired
        if 'dropout' in override_args:
            print(f"overriding dropout rate to {override_args['dropout']}")
            config_args['dropout'] = override_args['dropout']
        # create a from-scratch initialized model of this class
        config = GPTConfig(**config_args)
        model = cls(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            # the checkpoint names its modules wte / wpe / h; map them onto ours
            local_k = cls._hf_to_local_key(k)
            assert local_k in sd, f"checkpoint key {k!r} has no counterpart {local_k!r} in the model"
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[local_k].shape
                with torch.no_grad():
                    sd[local_k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[local_k].shape
                with torch.no_grad():
                    sd[local_k].copy_(sd_hf[k])
        return model

    def configure_optimizer(self, weight_decay, learning_rate, betas, device_type):
        """Create an AdamW optimizer with weight decay applied only to tensors of 2 or more dimensions.

        Args:
            weight_decay: decay coefficient for the decayed parameter group.
            learning_rate: initial learning rate.
            betas: (beta1, beta2) tuple passed to AdamW.
            device_type: the fused AdamW implementation is requested only for 'cuda'.

        Returns:
            The configured torch.optim.AdamW instance.
        """
        # start with all the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameter that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorm don't
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0},
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")
        return optimizer

    def estimate_mfu(self, fwdbwd_per_iter, dt):
        """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
        # first estimate the number of flops we do per iteration
        # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
        N = self.get_num_params()
        cfg = self.config
        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd // cfg.n_head, cfg.block_size
        flops_per_token = 6*N + 12*L*H*Q*T
        flops_per_fwdbwd = flops_per_token * T
        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
        # express our flops throughput as ratio of A100 bfloat16 peak flops
        flops_achieved = flops_per_iter * (1.0/dt) # per second
        flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
        mfu = flops_achieved / flops_promised
        return mfu

    @torch.no_grad()
    def generate(self, X, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices X (LongTensor of shape (B, T)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        # X is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            # crop X to the last block_size tokens
            X_cond = X if X.size(1) <= self.config.block_size else X[:, -self.config.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, loss = self(X_cond)
            # focus only on the last time step, pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature # becomes (B, C)
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_nxt = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            X = torch.cat((X, idx_nxt), dim=1) # (B, T+1)
        return X
        

In [14]:
# model = NanoGPTModel(GPTConfig())
# model = model.to(device)

In [15]:
# print('total parameters number:', sum(p.nelement() for p in model.parameters()))

In [16]:
# # train model
# optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# for iter in range(epochs):
#     # every once in a while evaluate the loss on train and eval sets
#     if iter % eval_interval == 0:
#         losses = estimate_loss()
#         print(f"step {iter}: train loss {losses['train']:.4f}, eval loss {losses['eval']:.4f}")
    
#     # sample a batch of data
#     Xb, yb = get_batch('train')
    
#     # evaluate the loss
#     logits, loss = model(Xb, yb)
#     optimizer.zero_grad(set_to_none=True)
#     loss.backward()
#     optimizer.step()

In [17]:
# various inits, derived attributes, I/O setup
# A distributed (DDP) run is detected by the RANK environment variable being set.
ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run?
if ddp:
    init_process_group(backend=backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    # each process is bound to the GPU whose index equals its local rank
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
    seed_offset = ddp_rank # each process gets a different seed
    # world_size number of processes will be training simultaneously, so we can scale
    # down the desired gradient accumulation iterations per process proportionally
    # NOTE(review): the division below rewrites the global in place, so re-running
    # this cell during a DDP run divides it again
    assert gradient_accumulation_steps % ddp_world_size == 0
    gradient_accumulation_steps //= ddp_world_size
else:
    # if not ddp, we are running on a single gpu, and one process
    master_process = True
    seed_offset = 0
    ddp_world_size = 1
# tokens consumed by one optimizer step, summed over all processes and micro-batches
tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
print(f"tokens per iteration will be {tokens_per_iter:,}")

tokens per iteration will be 1,152


In [18]:
# only the master process creates the output directory
if master_process:
    os.makedirs(out_dir, exist_ok=True)
# re-seed torch's RNG; seed_offset makes the seed differ per process
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
# note: float16 data type will automatically use a GradScaler
# map the dtype name chosen in the config to the corresponding torch dtype
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
# context manager for forward passes: a no-op on CPU, autocast to ptdtype otherwise
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

In [19]:
# init these up here, can override if init_from='resume' (i.e. from a checkpoint)
iter_num = 0          # number of training iterations completed so far
best_val_loss = 1e9   # sentinel: larger than any loss we expect to measure

In [20]:
# model init: build the model from scratch, from a saved checkpoint, or from
# the released GPT-2 weights, depending on init_from
model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
                 bias=bias, vocab_size=vocab_size, dropout=dropout) # start with model_args from the config cell
if init_from == 'scratch':
    # init a new model from scratch
    print("Initializing a new model from scratch")
    # vocab_size comes from the tokenizer built earlier in the notebook
    #print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)")
    model = NanoGPTModel(GPTConfig(**model_args))
elif init_from == 'resume':
    # f-string so that the directory name is actually interpolated into the message
    print(f"Resuming training from {out_dir}")
    # resume training from a checkpoint
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint['model_args']
    # force these config attributes to be equal otherwise we can't even resume training
    # the rest of the attributes (e.g. dropout) can stay as desired from the config cell
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = checkpoint_model_args[k]
    # create the model
    model = NanoGPTModel(GPTConfig(**model_args))
    state_dict = checkpoint['model']
    # fix the keys of the state dictionary
    # honestly no idea how checkpoints sometimes get this prefix. have to debug more
    unwanted_prefix = '_orig_mod.'
    for k,v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
elif init_from.startswith('gpt2'):
    print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
    # initialize from OpenAI GPT-2 weights
    override_args = dict(dropout=dropout)
    model = NanoGPTModel.from_pretrained(init_from, override_args)
    # read off the created config params, so we can store them into checkpoint correctly
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = getattr(model.config, k)
# crop down the model block size if desired, using model surgery
if block_size < model.config.block_size:
    model.crop_block_size(block_size)
    model_args['block_size'] = block_size
model.to(device)

Initializing a new model from scratch
number of parameters: 0.20M


NanoGPTModel(
  (transformer): ModuleDict(
    (token_embedding): Embedding(65, 64)
    (position_embedding): Embedding(32, 64)
    (dropout): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0): Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=64, out_features=192, bias=False)
          (c_proj): Linear(in_features=64, out_features=64, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=64, out_features=256, bias=False)
          (gelu): GELU(approximate=none)
          (c_proj): Linear(in_features=256, out_features=64, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=64, out_features=192, bias

In [21]:
model.config  # display the configuration object stored on the model

GPTConfig(block_size=32, vocab_size=65, n_layer=4, n_head=4, n_embd=64, dropout=0.0, bias=False)

In [22]:
# initialize a GradScaler. If enabled=False scaler is a no-op
# (it is enabled only when the configured dtype is 'float16')
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))



In [23]:
# optimizer: AdamW built by the model, with weight decay on tensors of 2+ dimensions only
optimizer = model.configure_optimizer(weight_decay, learning_rate, (beta1, beta2), device_type)
if init_from == 'resume':
    # restore the optimizer state saved alongside the model weights
    optimizer.load_state_dict(checkpoint['optimizer'])
checkpoint = None # free up memory

num decayed parameter tensors: 18, with 202,816 parameters
num non-decayed parameter tensors: 9, with 576 parameters
using fused AdamW: False


In [24]:
# Optionally hand the model to torch.compile (requires PyTorch >= 2.0).
# The original, uncompiled module stays reachable as `unoptimized_model`.
if compile:
    print("compiling the model... (take a ~minute)")
    unoptimized_model, model = model, torch.compile(model)

In [25]:
# For distributed runs, wrap the model in a DistributedDataParallel container
# bound to this process's local rank.
if ddp:
    model = DDP(
        model,
        device_ids=[ddp_local_rank],
    )

In [26]:
# Weights & Biases logging: started only on the master process, and only
# when wandb logging has been switched on in the configuration.
if master_process and wandb_log:
    import wandb
    wandb.init(
        config=config,
        name=wandb_run_name,
        project=wandb_project,
    )

In [27]:
# training loop
# Every pass of the `while` loop below performs one optimizer step:
#   1. set the learning rate for this iteration,
#   2. every `eval_interval` iterations (master process only) estimate the
#      train/eval losses, optionally log them to wandb, and write a checkpoint,
#   3. accumulate gradients over `gradient_accumulation_steps` micro-batches,
#   4. optionally clip gradients, step the optimizer through the GradScaler,
#      and clear the gradients,
#   5. every `log_interval` iterations print loss, step time and running MFU.
# The loop stops after the first evaluation when `eval_only` is set, or once
# `iter_num` exceeds `epochs`.
X, y = get_batch('train') # fetch the very first batch
t0 = time.time()
local_iter_num = 0 # number of iterations in the lifetime of the process
raw_model = model.module if ddp else model # unwrap DDP container if needed
running_mfu = -1.0 # sentinel: no MFU estimate has been recorded yet
while True:
    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # evaluate the loss on train/eval sets and write checkpoints
    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, eval loss {losses['eval']:.4f}")
        if wandb_log:
            wandb.log({
                # key is "iter" (no trailing colon) so it matches the naming of the other metrics
                "iter": iter_num,
                "train/loss": losses['train'],
                "eval/loss": losses['eval'],
                "lr": lr,
                "mfu": running_mfu*100, # convert to percentage
            })
        if losses['eval'] < best_val_loss or always_save_checkpoint:
            # note: with always_save_checkpoint set, this also overwrites
            # best_val_loss when the new eval loss is not an improvement
            best_val_loss = losses['eval']
            if iter_num > 0: # nothing worth saving before the first update
                checkpoint = {
                    'model': raw_model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'config': config,
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))
    if iter_num == 0 and eval_only:
        break

    # forward backward update. with optional gradient accumulation to simulate larger batch size
    # and using the GradScaler if data type is float16
    for micro_step in range(gradient_accumulation_steps):
        if ddp:
            # in DDP training we only need to sync gradient at the last micro step.
            # the official way to do this is with model.no_sync() context manager, but
            # I really dislike that this bloats the code and forces us to repeat code
            # looking at the source of that context manager, it just toggles this variable
            model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
        with ctx:
            logits, loss = model(X, y)
            loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation
        # immediately async prefetch next batch while model is doing the forward pass on the GPU
        X, y = get_batch('train')
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()
    # clip the gradient
    if grad_clip != 0.0:
        # gradients must be unscaled first so the clip threshold applies to their true magnitude
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    # step the optimizer and scaler if training in fp16
    scaler.step(optimizer)
    scaler.update()
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0 and master_process:
        # get loss as float. note: this is a CPU-GPU sync point
        # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
        lossf = loss.item() * gradient_accumulation_steps
        if local_iter_num >= 5: # let the training loop settle a bit
            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
            # exponential moving average of MFU; the first measurement replaces the -1.0 sentinel
            running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
    iter_num += 1
    local_iter_num += 1

    # termination condition
    if iter_num > epochs:
        break

# tear down the distributed process group once training is finished
if ddp:
    destroy_process_group()
    

step 0: train loss 4.1928, eval loss 4.1889
iter 0: loss 4.1919, time 34847.90ms, mfu -100.00%
step 100: train loss 2.8043, eval loss 2.8131
saving checkpoint to out
iter 100: loss 2.8130, time 35714.68ms, mfu 0.00%
step 200: train loss 2.4909, eval loss 2.4926
saving checkpoint to out
iter 200: loss 2.5632, time 36679.51ms, mfu 0.00%
step 300: train loss 2.4079, eval loss 2.4145
saving checkpoint to out
iter 300: loss 2.3739, time 37334.45ms, mfu 0.00%
step 400: train loss 2.3489, eval loss 2.3415
saving checkpoint to out
iter 400: loss 2.3289, time 35853.41ms, mfu 0.00%
step 500: train loss 2.2840, eval loss 2.2981
saving checkpoint to out
iter 500: loss 2.3265, time 36980.11ms, mfu 0.00%
step 600: train loss 2.2383, eval loss 2.2547
saving checkpoint to out
iter 600: loss 2.3080, time 37726.17ms, mfu 0.00%
step 700: train loss 2.2061, eval loss 2.2128
saving checkpoint to out
iter 700: loss 2.2699, time 37347.96ms, mfu 0.00%
step 800: train loss 2.1737, eval loss 2.1859
saving check

In [28]:
# Sample text from the model. The prompt is a single token with id 0,
# shaped (1, 1) and placed on the configured device.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=2000, temperature=1.0, top_k=10)
# take the first (only) sequence of the batch and decode its ids back to characters
print(decode(generated[0].tolist()))


CARE:
And I thy wathe thak'l wid that onds of and the friore; ith held thre mour weart brangesert to whall bede bows me that my ban hord
Ang is berosoutiminn, to art o my bladde,
Sough douene wertr, mord ton you,
And fard thearest wol blotssen, you,
An me them broth marknseseeles is him ar fror ies to thes.

WOFongs, mord, she II serim tho he sount,
Start amy thand mall or shy we houghalle this dou, surest you
Th to dime tint and hend fearsse miort
Bund her osof ba my ach brave imert anne ict brie, in forst a my.

WOLORGLER:
Hads trein thy, heils heave illlf han an otees if hernctesie for har the and by in sour, wigh should. I dover bookes wices and thet ind's mest o angeang fors, well mims on sticck,
And thesurt st hould thou if tar sat, dases.

TUTRIUSS:
INo my sim whay, hilll we of hert.

ACEERIAND-
AOLOO:
T:
Therit iler if sor the meses sus savar ast ame, to must o his, shell won mat.

ANESS:
Bad I as thy livel.

ARK:
Nos here, and divinng tho as this for shate stas,
Hat having:
T

In [32]:
# Read the saved checkpoint file 'ckpt.pt' back from out_dir, mapping its
# tensors onto `device`.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
checkpoint = torch.load(
    os.path.join(out_dir, 'ckpt.pt'),
    map_location=device,
)

In [34]:
# model hyperparameters stored in the checkpoint under 'model_args'
checkpoint['model_args']

{'n_layer': 4,
 'n_head': 4,
 'n_embd': 64,
 'block_size': 32,
 'bias': False,
 'vocab_size': 65,
 'dropout': 0.0}

In [36]:
# eval loss recorded in the checkpoint under 'best_val_loss' (displayed as a tensor)
checkpoint['best_val_loss']

tensor(2.1618)

In [38]:
# training iteration at which this checkpoint was written
checkpoint['iter_num']

1000

In [39]:
# saved optimizer state dict; NOTE(review): this prints every state tensor, so the output is very long
checkpoint['optimizer']

{'state': {0: {'step': tensor(1000.),
   'exp_avg': tensor([[-7.7831e-04, -1.4290e-03, -1.2733e-03,  ...,  1.3082e-03,
             5.9090e-04,  6.4573e-04],
           [ 3.0794e-03,  3.8900e-03,  1.8857e-03,  ..., -1.9322e-02,
            -7.4273e-03,  4.6929e-03],
           [ 2.8753e-04,  1.0117e-03,  7.6174e-04,  ..., -5.8909e-04,
            -5.1775e-04, -5.7067e-04],
           ...,
           [-1.0056e-05, -2.3573e-04,  6.0078e-05,  ...,  3.3780e-04,
             8.3510e-05, -1.7949e-04],
           [ 1.6744e-03,  4.6966e-05, -1.6250e-03,  ...,  2.1410e-03,
            -9.3943e-04,  2.0704e-05],
           [-7.4394e-05,  3.5052e-04, -4.1024e-05,  ...,  4.4126e-05,
             1.7401e-05,  2.5365e-04]]),
   'exp_avg_sq': tensor([[3.8345e-05, 5.3719e-05, 2.9915e-05,  ..., 3.8629e-05, 7.4341e-05,
            3.3832e-05],
           [1.7261e-04, 3.3286e-04, 1.3393e-04,  ..., 3.1803e-03, 4.3525e-04,
            3.7568e-04],
           [8.0293e-06, 1.0669e-05, 1.5047e-05,  ..., 1.890

In [43]:
# Rebuild a model from the loaded checkpoint and restore its weights.
checkpoint_model_args = checkpoint['model_args']
# force these config attributes to be equal otherwise we can't even resume training
# the rest of the attributes (e.g. dropout) can stay as desired from command line
model_args = {
    k: checkpoint_model_args[k]
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']
}
# create the model
model = NanoGPTModel(GPTConfig(**model_args))
state_dict = checkpoint['model']
# fix the keys of the state dictionary
# when the model was wrapped by torch.compile, the checkpoint is saved from the
# compiled wrapper, whose parameters live under the '_orig_mod.' prefix; strip
# that prefix so the keys line up with a freshly constructed, uncompiled model
unwanted_prefix = '_orig_mod.'
for k in list(state_dict.keys()): # iterate over a snapshot since the dict is mutated below
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
# place the model on the same device the sampling inputs are created on;
# without this, generating with a non-CPU `device` fails with a device mismatch
model.to(device)
# sampling should run in inference mode; call model.train() again before any further training
model.eval()
# kept as the last expression so the cell still displays the key-matching report
model.load_state_dict(state_dict)

number of parameters: 0.20M


<All keys matched successfully>

In [44]:
# Sample text from the model restored from the checkpoint. As before, the
# prompt is a single token with id 0 in a (1, 1) tensor on `device`.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled = model.generate(context, max_new_tokens=2000, temperature=1.0, top_k=10)
# decode the first (only) sequence of the batch back into text
print(decode(sampled[0].tolist()))



CAnde some:
I not thich and furre anck, your whis of basted mored was borted brick suit mars indes,
To a an day sthir, sthu
Of orertst ham be the and warth depet.

CENGARINIUS:
WAy londe thens the berakest im so tham heath to fitht micke
By of be homin, the har doott of
An and base, o shive haven hand andel anste,
Now sheal hared heam and by momere
To a swarerting mare tour mute hy angond'ss sayse an song ton hivong.

HANCEORD
ARCK:
TINGo sheas trie sthear, hepps, andears;
To be sold you wou wher mate mirt.

WICO:
MAfark'Sd dind I to to the weell to han my by ankes
dot mad, thou hild woou goment she theint
Whe arrin atink dead.

WINDUCES:
Wird thet witth shin:
Hou thish and ith is nour bue ot be hatt mert.

INENTINTHIIUS:
An ty surer thens beim may wavang,
I hincer bimy sot atlle in as, ben thirence that the ave bof toou st angadse your soo ome donsh, sis owill
An freaver, win atis theer mands thathild
Is am ble have, a my so wesis:
Burt hof hare the stim tho bate
Wise of hatelly ash