In [1]:
import numpy as np
import pickle
import torch
import torch.nn.functional as F

import torchexplorer as te

import os
import glob
import uuid
from pathlib import Path

from tqdm import tqdm

from tokenizers import Tokenizer
from tokenizers import decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

from src.model.LM import SimpleBigramModel, AttentionLM
from src.dataloaders import build_loaders
from hparams import Hparams

In [2]:
# Hyper-parameter container shared by the data loaders, the model and the
# optimizer in the cells below (it is passed to build_loaders / AttentionLM and
# read for lr, weight_decay and epochs).
hparams = Hparams()

### Tokenization experiments

In [3]:
# build_loaders returns four values; only the tokenizer and the training loader
# are used in this notebook. The last two are discarded
# (presumably further data loaders -- confirm in src/dataloaders).
tokenizer, train_loader, _, _ = build_loaders(hparams)

In [4]:
# Build the attention language model with an output vocabulary sized to the
# tokenizer, move it to the GPU, then call its compile() method.
model = AttentionLM(hparams, vocab_size=tokenizer.get_vocab_size())
model = model.to("cuda")
model.compile()

# Optional torchexplorer instrumentation, currently disabled:
# te.watch(model, log_freq=1, log=['io',  'io_grad', 'params', 'params_grad'], backend='standalone')

In [5]:
# Print the module tree (layer types and sizes) as a sanity check of the architecture.
print(model)

AttentionLM(
  (embed): Embedding(1965, 512)
  (embed_pos): Embedding(128, 512)
  (attention): ModuleList(
    (0-3): 4 x AttentionBlock(
      (attention_heads): ModuleList(
        (0-2): 3 x FullAttention(
          (q): Linear(in_features=512, out_features=256, bias=False)
          (k): Linear(in_features=512, out_features=256, bias=False)
          (v): Linear(in_features=512, out_features=256, bias=False)
        )
      )
      (ff): FeedForward(
        (ff): Sequential(
          (0): Linear(in_features=768, out_features=256, bias=True)
          (1): ReLU()
          (2): Linear(in_features=256, out_features=512, bias=True)
          (3): Dropout(p=0.08, inplace=False)
        )
      )
    )
  )
  (linear): Linear(in_features=512, out_features=1965, bias=True)
)


In [6]:
# Total number of trainable scalars: element count of every parameter that
# has requires_grad set.
sum(p.numel() for p in model.parameters() if p.requires_grad)

8112045

In [7]:
# AdamW over all model parameters; learning rate and weight decay come from hparams.
optim = torch.optim.AdamW(
    model.parameters(),
    lr=hparams.lr,
    weight_decay=hparams.weight_decay,
)

# Cross-entropy between predicted logits and target token ids.
loss_fn = torch.nn.CrossEntropyLoss()

In [8]:
# Training loop.
#
# Each run gets its own directory under checkpoints/ (named by a random 8-char
# id) holding the hyper-parameters and the latest model weights.
loss_buffer = 100          # number of most recent steps averaged for the progress-bar loss
checkpoint_every = 10000   # optimizer steps between periodic checkpoints
save_pth = f"checkpoints/{str(uuid.uuid4())[:8]}"
Path(save_pth).mkdir(exist_ok=True, parents=True)
hparams.save_to_file(f"{save_pth}/hparams.json")


def _save_checkpoint():
    """Overwrite this run's checkpoint file with the current model weights."""
    torch.save(model.state_dict(), f"{save_pth}/model.pth")


# Send batches to the device the model already lives on instead of assuming 'cuda'.
device = next(model.parameters()).device

model.train()
try:
    for epoch in range(hparams.epochs):
        # Ring buffer of the latest per-step losses; reset at the start of every epoch.
        windowed_loss = np.zeros(loss_buffer, dtype=np.float32)

        # Context manager guarantees the bar is closed even if the loop is interrupted.
        with tqdm(total=len(train_loader)) as bar:
            for idx, (x, y) in enumerate(train_loader):
                optim.zero_grad()

                x = x.to(device)
                y = y.to(device)

                logits = model(x)
                B, T, C = logits.shape
                # Flatten the first two dimensions so the loss sees one row of
                # logits per target id. reshape (unlike view) also accepts
                # non-contiguous tensors.
                loss = loss_fn(logits.reshape(B * T, C), y.reshape(B * T))

                loss.backward()
                optim.step()

                # Store a plain Python float rather than a graph-attached tensor.
                windowed_loss[idx % loss_buffer] = loss.item()

                # Average only the slots written so far; averaging the whole
                # zero-initialised buffer under-reports the loss during the
                # first `loss_buffer` steps of an epoch.
                filled = min(idx + 1, loss_buffer)
                bar.set_description(
                    f"Loss: {windowed_loss[:filled].mean():.5f}", refresh=False
                )
                bar.update()

                # Periodic checkpoint after every `checkpoint_every` completed
                # steps (not at step 0, where the model is still untrained).
                if (idx + 1) % checkpoint_every == 0:
                    _save_checkpoint()

        # Always keep the weights reached at the end of an epoch.
        _save_checkpoint()
except KeyboardInterrupt:
    # A manual stop should not throw away the progress made since the last
    # periodic checkpoint; save, then let the interrupt propagate as before.
    _save_checkpoint()
    raise



Loss: 3.67247:   1%|          | 6537/1228004 [35:18<105:20:27,  3.22it/s]     

KeyboardInterrupt: 

In [None]:
# Set the decoder that tokenizer.decode() uses to turn token ids back into text.
# NOTE(review): this notebook imports BPE classes from `tokenizers`; confirm that
# the tokenizer built by build_loaders uses the sub-word prefix convention that
# the WordPiece decoder expects.
tokenizer.decoder = decoders.WordPiece()

In [1]:
def inference(input:str, model, tokenizer, out_len:int, determenistic=False, device=None):
    """Generate continuations of a text prompt and decode them to strings.

    Args:
        input: Prompt text; it is encoded with ``tokenizer.encode(input).ids``.
        model: Object exposing ``eval()`` and
            ``generate_batch(ids, out_len, deterministic=...)``.
        tokenizer: Object exposing ``encode`` (whose result has ``.ids``) and
            ``decode``.
        out_len: Forwarded unchanged to ``model.generate_batch``.
        determenistic: Forwarded as the ``deterministic`` keyword of
            ``model.generate_batch``. The misspelled name is kept so existing
            keyword callers keep working.
        device: Device for the prompt tensor. ``None`` (default) uses the
            device of the model's first parameter, falling back to ``'cuda'``
            when the model exposes no parameters.

    Returns:
        A list with one decoded string per item obtained by iterating over the
        result of ``model.generate_batch``.

    Note:
        The model is switched to eval mode and is left in that mode.
    """
    if device is None:
        try:
            # Follow the model instead of assuming it lives on the GPU.
            device = next(model.parameters()).device
        except (AttributeError, StopIteration):
            device = 'cuda'

    model.eval()
    with torch.inference_mode():
        prompt_ids = torch.tensor(
            tokenizer.encode(input).ids, dtype=torch.long, device=device
        )
        generated = model.generate_batch(prompt_ids, out_len, deterministic=determenistic)
        return [tokenizer.decode(list(seq)) for seq in generated]

In [2]:
# Sample 100 steps of continuation for the prompt 'Hello'.
# Requires `model` and `tokenizer` from the cells above to exist in the current
# kernel; the NameError recorded below comes from a run where `model` was not defined.
inference('Hello', model, tokenizer, out_len=100)

NameError: name 'model' is not defined

## Self-Attention Experiments

### Notes:
- Query comes from other sequence (or the sequence itself in self attention) and is a value that converges to represent 