In [97]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.loggers.wandb import WandbLogger
from pytorch_lightning.callbacks import LearningRateMonitor

import wandb

# Model

In [98]:
class NewGELU(nn.Module):
    """
    Tanh approximation of the GELU activation (the variant used in the Google BERT and OpenAI GPT code).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """
    def forward(self, x):
        # Element-wise: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
        cubic_term = 0.044715 * torch.pow(x, 3.0)
        tanh_arg = math.sqrt(2.0 / math.pi) * (x + cubic_term)
        return 0.5 * x * (1.0 + torch.tanh(tanh_arg))

class GptAttention(nn.Module):
    """
    Multi-head causal self-attention.

    Queries, keys and values are all projected from the same input `x` by a
    single fused linear layer. A lower-triangular mask (the `bias` buffer)
    restricts every position to attend only to itself and earlier positions.

    Config keys read: "d_model", "heads", "window", "attn_pdrop", "resid_pdrop".
    "d_model" must be divisible by "heads".
    """
    def __init__(self, config):
        super(GptAttention, self).__init__()
        self.config = config

        assert self.config["d_model"] % self.config["heads"] == 0
        self.heads = self.config["heads"]

        # One fused projection producing three d_model-sized slices per token.
        self.w_attn = nn.Linear(self.config["d_model"], 3*self.config["d_model"])
        # Output projection applied after the heads are merged back together.
        self.head = nn.Linear(self.config["d_model"], self.config["d_model"])

        self.attn_dropout = nn.Dropout(config["attn_pdrop"])
        self.resid_dropout = nn.Dropout(config["resid_pdrop"])

        # causal mask to ensure that attention is only applied to the left in the input sequence
        causal = torch.tril(torch.ones(self.config["window"], self.config["window"]))
        self.register_buffer(
            "bias",
            causal.view(1, 1, self.config["window"], self.config["window"])
        )

    def forward(self, x):
        """
        Apply masked self-attention.

        Args:
            x: tensor of shape (B, window, embs); `window` may be shorter than
               config["window"] but not longer (the mask is sliced to fit).

        Returns:
            Tensor with the same shape as `x`.
        """
        B, window, embs = x.shape
        n_heads = self.config["heads"]
        head_dim = embs // n_heads

        # The slice order (q, v, k) is kept as-is on purpose: changing it would
        # re-assign which rows of w_attn act as keys/values and would break
        # checkpoints trained with this layout.
        q, v, k = self.w_attn(x).split(self.config["d_model"], dim=2)

        def split_heads(t):
            # (B, window, embs) -> (B, heads, window, embs // heads)
            return t.view(B, window, n_heads, head_dim).transpose(1, 2)

        q, k, v = split_heads(q), split_heads(k), split_heads(v)

        # Self-attend: (B, heads, window, hd) x (B, heads, hd, window) -> (B, heads, window, window)
        scores = q @ k.transpose(-2, -1) / math.sqrt(k.size(-1))
        scores = scores.masked_fill(self.bias[:, :, :window, :window] == 0, float('-inf'))
        probs = F.softmax(scores, dim=-1)
        # Dropout on the attention weights must feed the weighted sum; the
        # dropped-out tensor was previously computed and then discarded.
        probs = self.attn_dropout(probs)
        attn = probs @ v
        # Merge heads back: (B, heads, window, hd) -> (B, window, embs)
        attn = attn.transpose(1, 2).contiguous().view(B, window, embs)

        return self.resid_dropout(self.head(attn))

class FeedForward(nn.Module):
    """
    Position-wise MLP: expand to 4 * d_model, apply NewGELU, project back to
    d_model, then apply dropout.

    Config keys read: "d_model", "resid_pdrop".
    """
    def __init__(self, config):
        super().__init__()
        width = config["d_model"]
        self.l1 = nn.Linear(width, 4 * width)
        self.l2 = nn.Linear(4 * width, width)
        self.dropout = nn.Dropout(config["resid_pdrop"])

    def forward(self, x):
        hidden = self.l1(x)
        activated = NewGELU()(hidden)
        projected = self.l2(activated)
        return self.dropout(projected)

class Block(nn.Module):
    """
    One transformer layer: self-attention followed by a feed-forward network,
    each wrapped in a residual connection with LayerNorm applied *after* the
    residual sum.
    """
    def __init__(self, config):
        super().__init__()
        self.attn = GptAttention(config)
        self.norm1 = nn.LayerNorm(config["d_model"])
        self.ff = FeedForward(config)
        self.norm2 = nn.LayerNorm(config["d_model"])

    def forward(self, x):
        # TODO: these are re-ordered in andrej's code
        after_attn = self.norm1(x + self.attn(x))
        after_ff = self.norm2(after_attn + self.ff(after_attn))
        return after_ff

# gpt_attn = GptAttention(heads, d_model)
# out = gpt_attn(enc_prompt)
# print(out.shape)

# b = Block(d_model, 3)
# out = b(emb_prompts)
# print(out.shape)

In [99]:
class GPT(nn.Module):
    """
    Token-level language model: token + learned position embeddings, a stack
    of `Block` layers, a final LayerNorm and a linear head producing logits
    over the vocabulary.

    Config keys read: "vocab", "window", "d_model", "blocks", "embd_pdrop",
    and (in `configure_opt`) "weight_decay", "lr", "b1", "b2".
    """
    def __init__(self, config):
        super(GPT, self).__init__()
        self.config = config

        self.vocab_emb = nn.Embedding(self.config["vocab"], self.config["d_model"])
        self.pos_emb = nn.Embedding(self.config["window"], self.config["d_model"])
        self.emb_dropout = nn.Dropout(config["embd_pdrop"])

        self.blocks = nn.ModuleList([Block(self.config) for _ in range(self.config["blocks"])])
        self.head_layer_norm = nn.LayerNorm(config["d_model"])
        self.head = nn.Linear(self.config["d_model"], self.config["vocab"])

    def forward(self, x):
        """
        Args:
            x: integer token ids of shape (B, T). T may not exceed
               config["window"], since the position table has that many rows.

        Returns:
            Logits of shape (B, T, vocab).
        """
        vocab_emb = self.vocab_emb(x)
        # Positions 0..T-1, created on the same device as the input ids.
        positions = torch.arange(0, x.shape[1], dtype=torch.long, device=x.device)
        pos_emb = self.pos_emb(positions)

        x = self.emb_dropout(vocab_emb + pos_emb)

        for b in self.blocks:
            x = b(x)

        x = self.head_layer_norm(x)
        x = self.head(x)

        return x

    def configure_opt(self):
        """
        Build an AdamW optimizer with two parameter groups: Linear weights are
        weight-decayed with config["weight_decay"]; all biases and the weights
        of LayerNorm / Embedding modules are not decayed.

        Raises:
            AssertionError: if a parameter lands in both groups or in neither.
        """
        p_decay = set()
        p_no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                # named_modules and named_parameters are both recursive, so the
                # same tensor is visited several times; only the visit where
                # `m` is its direct owner matches one of the isinstance checks.
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    p_no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    p_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    p_no_decay.add(fpn)

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = p_decay & p_no_decay
        union_params = p_decay | p_no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object (names sorted for a stable group order)
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(p_decay))], "weight_decay": self.config["weight_decay"]},
            {"params": [param_dict[pn] for pn in sorted(list(p_no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(
            optim_groups,
            lr=self.config["lr"],
            betas=(self.config["b1"], self.config["b2"])
        )
        return optimizer

    def sample_char(self, x):
        """
        Sample the next token id from the distribution predicted at the last
        position of `x`.

        Args:
            x: integer token ids of shape (1, T); `.item()` requires a single
               sampled value, so batch size must be 1.

        Returns:
            The sampled token id as a Python int.

        Note: the module's train/eval mode is left untouched; call `.eval()`
        first if dropout should be disabled while sampling.
        """
        # Sampling never back-propagates, so skip building the autograd graph.
        with torch.no_grad():
            logits = self(x)
            probs = F.softmax(logits[:, -1, :], dim=1)
            return torch.multinomial(probs, num_samples=1).item()


# Data Loader for names

In [100]:
import copy
import random
from torch.utils.data import DataLoader

class NameDataLoader():
    """
    Map-style dataset of (context, target) windows built from a list of words.

    Despite the name this is a dataset, not a `torch.utils.data.DataLoader`:
    it implements `__getitem__` / `__len__` and is meant to be wrapped by one.

    For every character of every word one sample is produced: `X[i]` is the
    window of token ids *before* the character is consumed (left-padded with
    0) and `Y[i]` is the same window shifted by one, ending in that character.

    Args:
        words: iterable of strings.
        window: context length; every row of X and Y has this many ids.
        stoi: mapping from character to integer id.
    """
    def __init__(self, words, window, stoi):
        # Inverse table kept on the instance so debug_print does not depend on
        # a module-level `itos` existing.
        self.itos = {i: s for s, i in stoi.items()}
        self.X, self.Y = self._build_dataset(words, window, stoi)

    def __getitem__(self, index: int):
        return self.X[index], self.Y[index]

    def __len__(self) -> int:
        return len(self.Y)

    def _build_dataset(self, words, window, stoi):
        """Return (X, Y) as int64 tensors of shape (num_samples, window)."""
        x, y = [], []

        for name in words:
            ctx = [0] * window
            # NOTE(review): only the characters of `name` are iterated; no
            # end-of-name token is appended here, so id 0 never shows up as a
            # newly predicted character. Confirm this is intended.
            for c in name:
                x.append(ctx)
                # Build a fresh list instead of mutating, so the row stored in
                # `x` stays untouched and no defensive copy is needed.
                ctx = ctx[1:] + [stoi[c]]
                y.append(ctx)

        # Explicit dtype/shape keep empty input well-formed: (0, window) int64
        # rather than the float tensor of shape (0,) that torch.tensor([]) gives.
        shape = (len(x), window)
        return (
            torch.tensor(x, dtype=torch.long).reshape(shape),
            torch.tensor(y, dtype=torch.long).reshape(shape),
        )

    def debug_print(self, i_start, i_end):
        """Print samples i_start..i_end-1 as 'context --> target' strings."""
        for i in range(i_start, i_end):
            src = "".join([self.itos[c.item()] for c in self.X[i]])
            dst = "".join([self.itos[c.item()] for c in self.Y[i]])
            print(src + " --> " + dst)


class NameData():
    """
    Loads a newline-separated list of names, builds the character vocabulary
    and splits the (shuffled) names 80/10/10 into train/dev/test datasets.

    Attributes:
        window: context length passed to each dataset.
        names:  the shuffled list of names.
        stoi:   char -> id table (id 0 is reserved for '.').
        itos:   id -> char table.
        train, dev, test: `NameDataLoader` datasets for the three splits.

    Args:
        name_txt_path: path to a text file with one name per line.
        window: context length.
    """

    class _Lookup(dict):
        """A dict that can also be called: `table(key)` is `table[key]`.

        The instance attributes `stoi` / `itos` shadow the methods of the same
        name, so `data.stoi('a')` would otherwise fail with "dict is not
        callable". Making the tables callable lets both spellings work.
        """
        def __call__(self, key):
            return self[key]

    def __init__(self, name_txt_path, window):

        self.window = window
        # Context manager so the file handle is closed deterministically.
        with open(name_txt_path, 'r') as names_file:
            self.names = names_file.read().splitlines()

        self.stoi, self.itos = self._make_stoi_and_itos(self.names)

        # Fixed seed so the split is the same on every run.
        random.seed(42)
        random.shuffle(self.names)
        n1 = int(0.8*len(self.names))
        n2 = int(0.9*len(self.names))

        self.train = NameDataLoader(self.names[:n1], window, self.stoi)
        self.dev = NameDataLoader(self.names[n1:n2], window, self.stoi)
        self.test = NameDataLoader(self.names[n2:], window, self.stoi)

    def _make_stoi_and_itos(self, names):
        """Build the char->id and id->char tables from all characters in `names`."""
        chars = sorted(list(set(''.join(names))))
        stoi = self._Lookup({s: i+1 for i, s in enumerate(chars)})

        # . is both "before start" in X, and "im done" for Y
        stoi['.'] = 0
        itos = self._Lookup({i: s for s, i in stoi.items()})

        return stoi, itos

    def stoi(self, char):
        """Return the id of `char`.

        On instances this method is shadowed by the `stoi` table assigned in
        `__init__`; that table is callable, so `data.stoi(char)` still works.
        """
        return self.stoi[char]

    def itos(self, i):
        """Return the character for id `i`.

        On instances this method is shadowed by the `itos` table assigned in
        `__init__`; that table is callable, so `data.itos(i)` still works.
        """
        return self.itos[i]

    def vocab(self):
        """Return the vocabulary size (number of entries in `stoi`, including '.')."""
        return len(self.stoi)

    def train_data_loader(self):
        """Return the training dataset."""
        return self.train

    def test_data_loader(self):
        """Return the test dataset."""
        return self.test

    def val_data_loader(self):
        """Return the validation (dev) dataset."""
        return self.dev


# data = NameData('compiled_names.txt', 5)
# data.train.debug_print(4, 55)


# Hyperparameters

In [101]:
# Single dict holding all model, optimizer and training hyperparameters.
# Exactly one size preset ("blocks", "heads", "d_model") should be left
# uncommented; "d_model" must be divisible by "heads".
config = {
    # Model family tag; not read by any code in the cells shown here.
    "model_type": 'gpt',

    # Context length in tokens. Window must remain the same for the losses to make sense!!
    "window": 32,

    ## Tiny network, for smoke testing
    # "blocks": 3,
    # "heads": 1,
    # "d_model":  4,

    ## Pico network
    # "blocks": 6,
    # "heads": 4,
    # "d_model": 8,

    ## Nano network
    # "blocks": 4,
    # "heads": 4,
    # "d_model": 64,

    ## Micro
    # "blocks": 6,
    # "heads": 4,
    # "d_model": 128,

    ## Mini
    "blocks": 6,
    "heads": 6,
    "d_model": 192,

    ## gpt
    # "blocks": 12,
    # "heads": 12,
    # "d_model":768,

    # AdamW settings: weight decay (applied to Linear weights only), learning
    # rate and the two beta coefficients.
    "weight_decay": 0.1,
    "lr": 3e-4,
    # "lr": 5e-4,
    "b1": 0.9,
    "b2": 0.95,

    # Filled in at runtime: LitSurnames.__init__ overwrites this with the
    # vocabulary size derived from the data.
    "vocab": None,

    # Dropout hyperparameters
    "embd_pdrop": 0.1,
    "resid_pdrop": 0.1,
    "attn_pdrop": 0.1,

    # Training parameters
    "batch_size": 64,
    # Currently unused: the num_workers argument in train_dataloader is commented out.
    "num_workers": 4,
    "epochs": 5,
}


# Lightning framework

In [103]:
class LitSurnames(LightningModule):
    """
    Lightning wrapper that trains a `GPT` model on the names in
    'compiled_names.txt'.

    Args:
        config: hyperparameter dict. It is mutated in place: "vocab" is
            overwritten with the vocabulary size derived from the data before
            the model is built.
    """
    def __init__(self, config):
        super().__init__()

        self.config = config

        self.data = NameData('compiled_names.txt', self.config["window"])
        # The model needs the vocabulary size, which is only known once the
        # data has been read.
        self.config["vocab"] = self.data.vocab()

        self.model = GPT(config)

    def forward(self, x):
        return self.model(x)

    def loss(self, batch):
        """Cross-entropy between the logits at every position and the shifted targets."""
        x, y = batch
        logits = self(x)
        # Flatten (B, T, vocab) -> (B*T, vocab) and (B, T) -> (B*T,).
        return F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1), ignore_index=-1)

    def training_step(self, batch, batch_idx):
        loss = self.loss(batch)
        self.log('tr_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        loss = self.loss(batch)
        self.log('test_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.loss(batch)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        return self.model.configure_opt()

    ####################
    # DATA RELATED HOOKS
    ####################

    def prepare_data(self):
        pass

    def setup(self, stage=None):
        pass

    def train_dataloader(self):
        return DataLoader(
            self.data.train_data_loader(),
            batch_size=self.config["batch_size"],
            # Reshuffle every epoch: consecutive samples come from the same
            # name, so unshuffled batches are highly correlated and every
            # epoch would replay the exact same order.
            shuffle=True,
            # num_workers = config["num_workers"]
        )

    def test_dataloader(self):
        return DataLoader(self.data.test_data_loader(), batch_size=self.config["batch_size"])

    def val_dataloader(self):
        return DataLoader(self.data.val_data_loader(), batch_size=self.config["batch_size"])

# Choose the experiment logger: Weights & Biases when `use_wandb` is set,
# otherwise a local TensorBoard log directory.
use_wandb = True
if use_wandb:
    run = wandb.init(project="surnamerator", reinit=True)
    logger = WandbLogger()
else:
    import os
    logger = TensorBoardLogger(save_dir=os.getcwd(), version=1, name="lightning_logs")

try:
    lit_surname = LitSurnames(config)
    lr_monitor = LearningRateMonitor(logging_interval='step')

    trainer = Trainer(
        accelerator="cpu",
        max_epochs=config["epochs"],
        logger=logger,
        callbacks=[lr_monitor]
    )

    trainer.fit(lit_surname)
finally:
    # Close the W&B run even when building the model or training raises, so
    # the run is not left open.
    if use_wandb:
        run.finish()


  rank_zero_warn(
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type | Params
-------------------------------
0 | model | GPT  | 2.7 M 
-------------------------------
2.7 M     Trainable params
0         Non-trainable params
2.7 M     Total params
10.749    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]