In [1]:
%load_ext autoreload
%autoreload 2

In [342]:
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_, clip_grad_value_
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from htools import *
from incendio.callbacks import *
from incendio.core import *
from incendio.optimizers import *
from incendio.utils import *
from stormlight import *

In [3]:
cd_root()

Current directory: /Users/hmamin/stormlight


In [4]:
bs = 4
shuffle = False

In [5]:
tok, model = load_pretrained_gpt2()

In [6]:
ds = LMDataset.from_pickle('data/datasets/gpt2_lm_tokens_tiny.pkl', tok)
dl = DataLoader(ds, bs, shuffle=shuffle)
len(ds), ds.tokens.shape

Object loaded from data/datasets/gpt2_lm_tokens_tiny.pkl.


(1506, (24096,))

Even with return_tuple=True, won't work with current incendio setup. Options:

- Have dataset return (x, x). Less memory-efficient though. Doesn't solve the issue of how to pass kwargs into model (e.g. labels=x).
- Write model wrapper that lets us pass in x and automatically calls GPT2(x, labels=x). May need to write a faux loss function in this case since gpt2 "forward" returns loss.
- Update incendio loop to handle 1 item getitems, kwargs, etc.

In [330]:
# Note: can't separate transformer and LM head into 2 groups because weights are shared w/ input embeddings.
class ModelWrapper(BaseModel):
    
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        return self.model(x, labels=x)[:2]
    

class ModularGPT2LM(BaseModel):
    """Assemble a GPT2 language model into an Incendio-ready model with layer
    groups, unfreezing, etc.
    """
    
    def __init__(self, model):
        super().__init__()
        gpt2 = model.transformer
        self.config = model.config
        self.drop = gpt2.drop
        self.lm_head = model.lm_head
        # This just affects layer groups in the optimizer. Dropout doesn't
        # need to be trained and lm_head's weights are the same as wte.
        self.groups = nn.ModuleList([gpt2.wte, 
                                     gpt2.wpe,
                                     gpt2.h, 
                                     gpt2.ln_f])
        
    def forward(self, x, use_cache=True, past=None):
        """Heavily based on the Huggingface implementation with minor 
        simplifications for unused options.
        """
        if not past:
            past_len, past = 0, [None] * len(self.groups[-2])
        else:
            past_len = past[0][0].shape[-2]
        x_shape = x.shape
        x_ids = x.view(-1, x_shape[-1])
        pos_ids = torch.arange(past_len, x_shape[-1] + past_len, 
                               dtype=torch.long, device=x.device).unsqueeze(0)
        
        # Token embeddings and positional encodings.
        hidden = self.groups[0](x_ids) + self.groups[1](pos_ids)
        hidden = self.drop(hidden)
        
        # Layer Norm, Attention blocks, more Layer Norm.
        presents = []
        for block, prev in zip(self.groups[-2], past):
            outputs = block(hidden, layer_past=prev, use_cache=use_cache)
            hidden, present = outputs[:2]
            if use_cache: presents.append(present)
        hidden = self.groups[-1](hidden)
        
        # logits: shape (bs, seq_len, vocab_size)
        # presents (12 tensors - keys and values from attention blocks):
        #     shape: (2, bs, num_heads, sequence_length, embed_size_per_head)
        logits = self.lm_head(hidden)
        return logits, tuple(presents)

In [331]:
mod = ModularGPT2LM(model)
mod.eval()
res = mod(x)

In [332]:
lm_loss_wrapper(res[0], x)

torch.Size([4, 16, 50258]) torch.Size([4, 16])


tensor(8.0986, grad_fn=<NllLossBackward>)

### Issue: LMhead model has use_cache=True by default, but in examples past=None so are we actually using this somewhere? Also, when I pass in `past`, results change. This is consistent with GPT2LMHead behavior, though.

In [333]:
res2 = wrap(x, past=res[-1])

In [334]:
lm_loss_wrapper(res2[0], x)

torch.Size([4, 16, 50258]) torch.Size([4, 16])


tensor(7.8249, grad_fn=<NllLossBackward>)

In [335]:
model.eval()
res1 = model(x)
lm_loss_wrapper(res1[0], x)

torch.Size([4, 16, 50258]) torch.Size([4, 16])


tensor(8.0986, grad_fn=<NllLossBackward>)

In [337]:
res3 = model(x, past=res1[-1])
lm_loss_wrapper(res3[0], x)

torch.Size([4, 16, 50258]) torch.Size([4, 16])


tensor(7.8249, grad_fn=<NllLossBackward>)

In [336]:
opt = variable_lr_optimizer(mod, [3e-3, 3e-3, 6e-3, 9e-3])
opt

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 0.001
    lr: 0.003
    weight_decay: 0

Parameter Group 1
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 0.001
    lr: 0.003
    weight_decay: 0

Parameter Group 2
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 0.001
    lr: 0.006
    weight_decay: 0

Parameter Group 3
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 0.001
    lr: 0.009
    weight_decay: 0
)

In [324]:
[len(group['params']) for group in opt.param_groups]

[1, 1, 144, 2]

In [329]:
{k: v.shape for k, v in wrap.groups[-1].named_parameters()}

{'weight': torch.Size([768]), 'bias': torch.Size([768])}

In [167]:
wrap_mod = ModelWrapper(model)

In [10]:
x = item(dl)
x.shape

torch.Size([4, 16])

In [11]:
# Incendio validation method still won't work though.
def lm_loss_wrapper(y_pred, y_true):
#     print(y_pred)
    print(y_pred.shape, y_true.shape)
    # Discard last logit and first language model "label".
    logits = y_pred[:, :-1, :].contiguous()
    labels = y_true[:, 1:].flatten()
    # shapes: (bs*(seq_len-1), vocab_size), (bs*(seq_len-1))
    return F.cross_entropy(logits.view(-1, logits.shape[-1]), labels)

cross_entropy                     y_shape: (bs,)    yhat_shape: (bs, k)

In [12]:
print(wrap_mod.training)
wrap_mod.eval()
print(wrap_mod.training)
with torch.no_grad():
    res = wrap_mod(x)
    
print(wrap_mod.training)
wrap_mod.train()
print(wrap_mod.training)
with torch.no_grad():
    res2 = wrap_mod(x) 

True
False
False
True


In [13]:
len(res)

2

In [14]:
lm_loss_wrapper(res[1], x), res[0]

torch.Size([4, 16, 50258]) torch.Size([4, 16])


(tensor(95.8073), tensor(95.8073))

In [15]:
lm_loss_wrapper(res2[1], x), res2[0]

torch.Size([4, 16, 50258]) torch.Size([4, 16])


(tensor(89.9666), tensor(89.9666))

# Guessing

lm_head is excluded from named_parameters bc it's just pointing to wte. So a group consisting of [wte, wpe] will still include lm_head (I think)

In [57]:
emb = {k: v for k, v in model.named_parameters() 
       if any(term in k for term in ('wte', 'wpe'))}
other = {k: v for k, v in model.named_parameters() if k not in emb.keys()}

In [58]:
len(emb), len(other)

(2, 146)

In [46]:
model.transformer.ln_f

LayerNorm((768,), eps=1e-05, elementwise_affine=True)

In [45]:
model.lm_head

Linear(in_features=768, out_features=50258, bias=False)

In [351]:
class TransformerTrainer(Trainer):
    """Patched version of incendio.core.Trainer that works with Hugginface
    Transformers. Eventually, Incendio's version should be flexible enough to
    support them natively but for now it will be faster to do it this way.
    """
    
    @handle_interrupt
    def fit(self, epochs, lrs=3e-3, lr_mult=1.0, **kwargs):
        """Train the model.
        
        Parameters
        ----------
        epochs: int
            Number of epochs to train for.
        lrs: float or Iterable(float)
            Pass in one or more learning rates. If lr_mult < 1, these
            will be the max LR(s). If the number of values matches the number
            of layer groups in the model, they will be matched accordingly,
            with the first layer is assigned the first LR. If 1 LR is passed
            in and lr_mult < 1, the multiplier will be used to create an
            appropriate number of LRs. Example: for a network with 3 groups,
            lrs=3e-3 and lr_mult=0.1 will produce LRs of [3e-5, 3e-4, 3e-3].
        lr_mult: float
            Multiplier used to compute additional learning rates if needed.
            See `update_optimizer()` for details.
        kwargs: any
            Pass in clean=True to remove existing files in out_dir.
        """
        stats = defaultdict(list)
        sum_i = 0
        _ = self.decide_stop('on_train_begin', epochs, lrs, lr_mult, **kwargs)
        for e in range(epochs):
            _ = self.decide_stop('on_epoch_begin', e, stats, None)
            for i, xb in enumerate(self.pbar):
                sum_i += 1
                xb = xb.to(self.device)
                self.optim.zero_grad()
                _ = self.decide_stop('on_batch_begin', i, sum_i, stats)

                # Forward and backward passes.
                y_score, _ = self.net(xb)
                loss = self.criterion(y_score, xb)
                loss.backward()
                if self.decide_stop('after_backward', e, i, sum_i, stats): break
                self.optim.step()

                # Separate because callbacks are only applied during training.
                self._update_stats(stats, loss, xb[1:], y_score[:, :-1, :])
                if self.decide_stop('on_batch_end', i, sum_i, stats): break

            # If on_batch_end callback halts training, else block is skipped.
            else:
                val_stats = self.validate()
                if self.decide_stop('on_epoch_end', e, stats, val_stats): break
                continue
            break
        _ = self.decide_stop('on_train_end', e, stats, val_stats)
        
    def validate(self, dl_val=None):
        """Evaluate the model on a validation set.
        
        Parameters
        ----------
        dl_val: torch.utils.data.DataLoader
            Accepting an optional dataloader allows the user to pass in
            different loaders after training for evaluation. If None is
            passed in, self.dl_val is used.
        """
        dl_val = self.dl_val or dl_val
        val_stats = defaultdict(list)
        self.net.eval()
        with torch.no_grad():
            for xb in tqdm(dl_val):
                xb = xb.to(self.device)
                loss, y_score = self.net(xb, labels=xb)
                self._update_stats(val_stats, loss, xb[1:], y_score[:, :-1, :])
        return val_stats

In [352]:
class GradientClipper(TorchCallback):
    
    @valuecheck
    def __init__(self, mode:('norm', 'value')='norm', max_val=None):
        self.clip_fn_ = getattr(nn.utils, f'clip_grad_{mode}_')
        self.max_val = max_val or 1.0
        
    def after_backward(self, trainer, *args, **kwargs):
        self.clip_fn_(trainer.model.parameters(), self.max_val)

In [353]:
# Some default hypers from hugginface script.
hypers = DotDict(lr=5e-5, max_grad_norm=1.0)

In [354]:
t = TransformerTrainer(mod, ds, ds, dl, dl, lm_loss_wrapper, 
                       mode='multiclass', out_dir='data/models/v1')
print(str(t)[:122])

Trainer(criterion='lm_loss_wrapper', out_dir='data/models/v1')

Datasets: 1506 train rows, 1506 val rows

Optimizer: None



In [355]:
out = t.fit(1, hypers.lr, 0.5)

2020-06-16 22:35:31,301 [INFO]: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 0.001
    lr: 6.25e-06
    weight_decay: 0

Parameter Group 1
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 0.001
    lr: 1.25e-05
    weight_decay: 0

Parameter Group 2
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 0.001
    lr: 2.5e-05
    weight_decay: 0

Parameter Group 3
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 0.001
    lr: 5e-05
    weight_decay: 0
)


HBox(children=(FloatProgress(value=0.0, max=377.0), HTML(value='')))

torch.Size([4, 16, 50258]) torch.Size([4, 16])


AttributeError: 'BasicConfig' object has no attribute 'after_backward'

In [240]:
type(out)

tuple

In [241]:
out[0]

tensor(104.2188, grad_fn=<NllLossBackward>)

## Scratch

In [357]:
edge = load_book('edge')

In [361]:
lines = [line for line in edge.splitlines()[:50] if line]
lines[:5]

['xxbrxxPROLOGUE',
 'Lift had never robbed a palace before. Seemed like a dangerous thing to try.',
 "Not because she might get caught, but because once you robbed a starvin'",
 'palace, where did you go next?',
 'She climbed up onto the outer wall and looked in at the grounds.']

In [364]:
res = tok.batch_encode_plus(lines, max_length=512)

In [368]:
max(len(x) for x in res['input_ids'])

23

In [372]:
lines[25:35]

['"Grounds at this end look empty, as my informant indicated would be the',
 'case," Huqin said. He was in charge of the lot of them. Had a nose like',
 'someone had taken hold of it when he was a kid and pulled real, real hard.',
 "Lift was surprised he didn't smack people in the face with it when he turned.",
 '"Everyone\'s focused on choosing the new Prime Aqasix," said Maxin.',
 '"We could really do this. Rob the Bronze Palace itself, and right under the',
 'nose of the vizierate."',
 '"Is it ... um ... safe?" asked Huqin\'s nephew. He was in his teens, and',
 "puberty hadn't been kind to him. Not with that face, that voice, and those",
 'spindly legs.']