In [1]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


class LexicapDataset(Dataset):
    """Character-level next-token dataset built from transcript files.

    Every file in ``path`` whose name contains ``"large"`` is read, every
    third line (lines 3, 6, 9, ...) is kept, and the characters of those
    lines are encoded and concatenated into one long id sequence
    (``self.emb``).  Item ``idx`` is the window of ``lags`` ids starting at
    ``idx`` together with the id that immediately follows that window.

    Args:
        path: Directory holding the transcript files.
        lags: Number of characters in each input window.
    """

    def __init__(self, path: str, lags=5):
        chars = [chr(i) for i in range(65, 91)]  # A-Z
        chars.extend([chr(i) for i in range(97, 123)])  # a-z
        # "#" is reserved as a start token and "_" as padding.
        chars.extend([" ", ",", ".", "'", "_", "#"])

        self.itos = dict(enumerate(chars))
        self.stoi = {x: i for i, x in enumerate(chars)}
        # One extra id (the last one) stands for any character outside `chars`.
        self.vocab_size = len(chars) + 1

        self.lags = lags
        # Explicit integer dtype so an empty corpus does not become float64.
        self.emb = np.array(self.get_texts(path), dtype=np.int64)

    def encode_char(self, c):
        """Return the id of ``c``; unknown characters map to ``vocab_size - 1``."""
        return self.stoi.get(c, self.vocab_size - 1)

    def decode_char(self, oc):
        """Return the character for id ``oc``, or ``"?"`` for an unmapped id."""
        return self.itos.get(oc, "?")

    def decode_sentence(self, line):
        """Turn an iterable of integer ids back into a string."""
        return "".join(self.decode_char(c) for c in line)

    def encode_sentence(self, line: str):
        """Turn a string into a list of integer ids, one per character."""
        return [self.encode_char(c) for c in line]

    def get_texts(self, path: str):
        """Read the transcripts under ``path`` and return one flat list of ids.

        Files are visited in sorted name order so the resulting sequence is
        the same on every run and platform (``os.listdir`` order is arbitrary).
        """
        emb = []
        for name in sorted(os.listdir(path)):
            if "large" not in name:
                continue
            # Context manager closes the handle; undecodable bytes become
            # U+FFFD, which then encodes to the unknown-character id.
            with open(os.path.join(path, name), encoding="utf-8", errors="replace") as fh:
                lines = fh.read().splitlines()
            # Keep lines 3, 6, 9, ... -- presumably the caption text lines of
            # the transcript layout (TODO confirm against the data files).
            for line in lines[3::3]:
                emb.extend(self.encode_sentence(line.strip()))

        return emb

    def __len__(self):
        # Clamp at zero: len() must never be negative, even when the corpus
        # is shorter than one window.
        return max(len(self.emb) - self.lags, 0)

    def __getitem__(self, idx):
        """Return ``(x, y)``: ``lags`` input ids and the single id that follows.

        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is out of range.  Plain ``for`` loops over
                the dataset rely on this to terminate.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index {idx} out of range for dataset of length {size}")

        x = torch.tensor(self.emb[idx:idx + self.lags], dtype=torch.long)
        y = torch.tensor([self.emb[idx + self.lags]], dtype=torch.long)

        return x, y


# Build the two splits with the same 32-character window.
train_lexicap = LexicapDataset(path="../data/vtt/train", lags=32)
# The test directory holds episodes 300+.
test_lexicap = LexicapDataset(path="../data/vtt/test", lags=32)


In [2]:
# Show the first six (window, next-character) pairs, separated by "---".
for i, (x, y) in enumerate(train_lexicap):
    print(x.tolist(), y.tolist(), sep="\n")
    if i != 5:
        print("---")
        continue
    break

[19, 33, 30, 52, 31, 40, 37, 37, 40, 48, 34, 39, 32, 52, 34, 44, 52, 26, 52, 28, 40, 39, 47, 30, 43, 44, 26, 45, 34, 40, 39, 52]
[48]
---
[33, 30, 52, 31, 40, 37, 37, 40, 48, 34, 39, 32, 52, 34, 44, 52, 26, 52, 28, 40, 39, 47, 30, 43, 44, 26, 45, 34, 40, 39, 52, 48]
[34]
---
[30, 52, 31, 40, 37, 37, 40, 48, 34, 39, 32, 52, 34, 44, 52, 26, 52, 28, 40, 39, 47, 30, 43, 44, 26, 45, 34, 40, 39, 52, 48, 34]
[45]
---
[52, 31, 40, 37, 37, 40, 48, 34, 39, 32, 52, 34, 44, 52, 26, 52, 28, 40, 39, 47, 30, 43, 44, 26, 45, 34, 40, 39, 52, 48, 34, 45]
[33]
---
[31, 40, 37, 37, 40, 48, 34, 39, 32, 52, 34, 44, 52, 26, 52, 28, 40, 39, 47, 30, 43, 44, 26, 45, 34, 40, 39, 52, 48, 34, 45, 33]
[52]
---
[40, 37, 37, 40, 48, 34, 39, 32, 52, 34, 44, 52, 26, 52, 28, 40, 39, 47, 30, 43, 44, 26, 45, 34, 40, 39, 52, 48, 34, 45, 33, 52]
[18]


In [3]:
class LexicapDataLoader:
    """Batch iterator that reads ``bs`` parallel streams from a dataset.

    The dataset is cut into ``bs`` contiguous chunks of ``len(dataset) // bs``
    items.  Batch number ``k`` stacks item ``k`` of every chunk, so row ``r``
    of consecutive batches walks sequentially through chunk ``r``.

    Args:
        dataset: Indexable object with ``__len__`` whose items are ``(x, y)``
            tensor pairs.
        bs: Batch size (number of parallel streams); must be positive.
        device: Device the stacked batches are moved to.
    """

    def __init__(self, dataset, bs, device):
        if bs <= 0:
            raise ValueError(f"bs must be a positive integer, got {bs}")

        self.dataset = dataset
        # Integer division avoids the float round-trip of int(len / bs).
        self.chunk_size = len(dataset) // bs

        # Start offset of each of the `bs` streams.
        self.bsi = [i * self.chunk_size for i in range(bs)]
        self.istep = 0

        self.device = device

    def __len__(self):
        """Number of batches produced by one full pass."""
        return self.chunk_size

    def __iter__(self):
        # Every pass starts from the beginning of each chunk.  Without this
        # reset a second pass (or a pass after a partial preview) would begin
        # where the last one stopped and eventually index past the dataset.
        self.istep = 0
        for step in range(self.chunk_size):
            xs, ys = zip(*(self.dataset[start + step] for start in self.bsi))

            # Kept for callers that inspect progress: batches yielded so far.
            self.istep = step + 1
            yield torch.stack(xs).to(self.device), torch.stack(ys).to(self.device)

# Both splits are batched the same way and moved to the selected device.
bs = 1024
train_lexiloader = LexicapDataLoader(dataset=train_lexicap, bs=bs, device=device)
test_lexiloader = LexicapDataLoader(dataset=test_lexicap, bs=bs, device=device)

In [4]:
# Decode three rows of the first batch to eyeball what the loader produces.
for i, data in enumerate(train_lexiloader):
    batch_inputs = data[0]
    for row in (0, 1, 4):
        print(train_lexicap.decode_sentence(batch_inputs[row].tolist()))
    break

The following is a conversation 
it's a very dynamic system.And g
say you have to ask yourself and


In [5]:
import torch
from torch import nn
from torch.nn import functional as F


class MultiHeadAttention(nn.Module):
    """Multi-head self-attention followed by a residual connection and LayerNorm.

    Each head owns bias-free query/key/value projections from ``emb_size`` to
    ``emb_size // num_heads``.  Head outputs are concatenated, projected back
    to ``emb_size``, added to the input and layer-normalised.

    Args:
        emb_size: Size of the last dimension of the input and output.
        num_heads: Number of attention heads; must divide ``emb_size``.
        context_size: Stored on the module; not used in the computation.
        mask: Optional tensor registered as the ``mask`` buffer.  Positions
            where it equals 0 are excluded from attention.

    Raises:
        ValueError: If ``emb_size`` is not divisible by ``num_heads``.
    """

    def __init__(self, emb_size, num_heads, context_size, mask=None):
        super().__init__()
        if emb_size % num_heads != 0:
            # Otherwise the concatenated heads would not match the input
            # width of `projection` and forward() could never succeed.
            raise ValueError(
                f"emb_size ({emb_size}) must be divisible by num_heads ({num_heads})"
            )

        self.in_size = emb_size
        self.num_heads = num_heads
        self.context_size = context_size
        head_size = emb_size // num_heads

        self.q_layers = nn.ModuleList()
        self.k_layers = nn.ModuleList()
        self.v_layers = nn.ModuleList()

        for _ in range(num_heads):
            self.k_layers.append(nn.Linear(emb_size, head_size, bias=False))
            self.q_layers.append(nn.Linear(emb_size, head_size, bias=False))
            self.v_layers.append(nn.Linear(emb_size, head_size, bias=False))

        self.projection = nn.Linear(emb_size, emb_size)
        self.ln = nn.LayerNorm(emb_size)

        self._init_params()

        self.register_buffer('mask', mask)

    def _init_params(self):
        """Xavier-initialise every q/k/v projection and the output projection."""
        for l in [*self.q_layers, *self.k_layers, *self.v_layers, self.projection]:
            nn.init.xavier_uniform_(l.weight)

    def _sdp_attention(self, q, k, v):
        """Scaled dot-product attention for one head.

        Logits are divided by ``sqrt(head_size)`` so their magnitude does not
        grow with the head width and saturate the softmax.
        """
        att_log = (q @ k.transpose(-2, -1)) * (q.size(-1) ** -0.5)
        if self.mask is not None:
            # Crop the mask so sequences shorter than the mask still work.
            mask = self.mask[..., :q.size(-2), :k.size(-2)]
            att_log = att_log.masked_fill(mask == 0, float("-inf"))
        attention = F.softmax(att_log, dim=-1)
        return attention @ v

    def forward(self, x):
        """Apply attention to ``x`` and return a tensor of the same shape."""
        heads = [
            self._sdp_attention(q_proj(x), k_proj(x), v_proj(x))
            for q_proj, k_proj, v_proj in zip(self.q_layers, self.k_layers, self.v_layers)
        ]
        # Last head first: this keeps the feature layout that `projection`
        # has always received (each head used to be prepended).
        values = torch.cat(heads[::-1], dim=-1)

        out = self.projection(values) + x
        nout = self.ln(out)

        return nout


In [6]:
class FFN(nn.Module):
    """Two-layer ReLU feed-forward network with a residual connection and LayerNorm.

    Args:
        emb_size: Width of the input and of the output.
        hidden_size: Width of the hidden layer between the two linear maps.
    """

    def __init__(self, emb_size, hidden_size):
        super().__init__()
        expand = nn.Linear(emb_size, hidden_size)
        activation = nn.ReLU()
        contract = nn.Linear(hidden_size, emb_size)
        self.ffn = nn.Sequential(expand, activation, contract)
        self.ln = nn.LayerNorm(emb_size)

        # Re-initialise the linear layers in the order they appear.
        for layer in self.ffn:
            self._init_params(layer)

    def _init_params(self, l):
        """Xavier-initialise ``l`` with the ReLU gain if it is exactly ``nn.Linear``."""
        if type(l) is not nn.Linear:
            return
        relu_gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(l.weight, gain=relu_gain)

    def forward(self, x):
        """Return ``LayerNorm(ffn(x) + x)``."""
        residual = x
        hidden = self.ffn(x)
        return self.ln(hidden + residual)

In [7]:
from torch import nn
class TransformerBlock(nn.Module):
    """Two masked self-attention layers followed by a feed-forward layer.

    Args:
        emb_size: Width of the token representations.
        num_heads: Number of heads in each attention layer.
        context_size: Side length of the square attention mask.
    """

    def __init__(self, emb_size, num_heads, context_size):
        super().__init__()
        self.context_size = context_size

        # Lower-triangular ones: entry (i, j) is non-zero only for j <= i, so
        # the attention layers never look at positions after the current one.
        mask_tril = torch.tril(torch.ones(context_size, context_size))

        self.block = nn.Sequential(
            MultiHeadAttention(emb_size, num_heads, context_size, mask_tril),
            # The third argument is the context size; this layer used to be
            # handed emb_size there by mistake.
            MultiHeadAttention(emb_size, num_heads, context_size, mask_tril),
            # Hidden layer is 8x wider than the embedding.
            FFN(emb_size, emb_size * 8),
        )


    def forward(self, x):
        """Run ``x`` through attention, attention, feed-forward in sequence."""
        return self.block(x)


In [8]:
class TransformerModel(nn.Module):
    """Token-level language model: embeddings, transformer blocks, vocabulary head.

    Args:
        emb_size: Width of the token and position embeddings.
        num_blocks: Number of ``TransformerBlock`` layers.
        num_heads: Attention heads per attention layer.
        context_size: Maximum sequence length (size of the position table).
        vocab_size: Number of distinct token ids.
    """

    def __init__(self, emb_size, num_blocks, num_heads, context_size, vocab_size):
        super().__init__()
        self.context_size = context_size

        self.token_embedding = nn.Embedding(vocab_size, emb_size)
        self.pos_embedding = nn.Embedding(context_size, emb_size)

        self.blocks = nn.Sequential(*[
            TransformerBlock(emb_size, num_heads, context_size) for _ in range(num_blocks)
        ])

        self.out = nn.Linear(emb_size, vocab_size)

    def forward(self, x):
        """Return logits over the vocabulary for every position of ``x``.

        ``x`` holds token ids with the sequence on the last dimension.  Its
        length may be anything up to ``context_size``; the position embedding
        is built for the actual length instead of always ``context_size``.

        Raises:
            ValueError: If the sequence is longer than ``context_size``.
        """
        seq_len = x.size(-1)
        if seq_len > self.context_size:
            raise ValueError(
                f"sequence length {seq_len} exceeds context size {self.context_size}"
            )

        te = self.token_embedding(x)
        # Create the positions directly on the right device.
        positions = torch.arange(seq_len, device=te.device)
        inp = te + self.pos_embedding(positions)
        blocks_out = self.blocks(inp)
        return self.out(blocks_out)

    @torch.no_grad()
    def generate(self, x, gen_len, top_k=5, decode=None):
        """Sample ``gen_len`` tokens, sliding the context window one step each time.

        Args:
            x: Batch of token ids; only the first row, ``x[0]``, is used as
                the starting context.
            gen_len: Number of tokens to sample.
            top_k: Sample only among the ``top_k`` highest-scoring tokens
                (clamped to the vocabulary size).
            decode: Callable mapping a token id to its string.  When omitted,
                the notebook-level ``train_lexicap.decode_char`` is used, as
                before.

        Returns:
            Tuple ``(context, text)``: the final 1-D context window and the
            generated string.
        """
        x = x[0]
        pieces = []
        for _ in range(gen_len):
            logits = self(x)[-1, :]
            pv, pi = logits.topk(min(top_k, logits.size(-1)))
            p = F.softmax(pv, dim=-1)
            q = pi[torch.multinomial(p, num_samples=1)]

            if decode is None:
                # Backward-compatible fallback to the global dataset.
                decode = train_lexicap.decode_char
            pieces.append(decode(q.item()))
            # Drop the oldest token and append the sampled one.
            x = torch.cat((x[1:], q), dim=-1)

        return x, "".join(pieces)



In [9]:

ES = 64  # embedding size
NB = 2  # number of transformer blocks
NH = 4  # attention heads per attention layer
# Context window: read from the dataset rather than repeating the literal, so
# the model's position table always matches the window length the data yields.
CS = train_lexicap.lags
VS = train_lexicap.vocab_size  # vocabulary size, including the unknown-char id

# Each head gets ES // NH features, so the split must be exact.
assert ES % NH == 0, "ES must be divisible by NH"


In [None]:
from torch import nn
from torch.optim import Adam

import wandb
run = wandb.init(name="final_version", project="transformers", reinit=True)

model = TransformerModel(ES, NB, NH, CS, VS).to(device)
optim = Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss()

LOG_EVERY = 10  # report the running loss every this many steps

model.train()
tloss = 0
nsteps = 0
with run:
    for i, [x, y] in enumerate(train_lexiloader):
        optim.zero_grad()

        # Only the prediction at the last position is trained on.
        pred = model(x)[:, -1, :]

        # squeeze(-1) drops just the trailing target dimension; a bare
        # squeeze() would also drop the batch dimension when it is 1.
        loss = loss_fn(pred, y.squeeze(-1))
        loss.backward()
        tloss += loss.item()
        nsteps += 1

        optim.step()

        if i % LOG_EVERY == 0 and i > 0:
            # Report the mean per step: the first window spans one more step
            # than the others, so raw sums are not comparable.
            mean_loss = tloss / nsteps
            print(f"[{i}/{len(train_lexiloader)}]: {mean_loss}")
            run.log({"train_loss": mean_loss})

            tloss = 0
            nsteps = 0


In [None]:
model.eval()

# Seed generation with the first training window, with a batch dimension added.
# Use the notebook's `device` (the one the model was moved to) instead of a
# hard-coded .cuda(), which fails on machines without a GPU.
x = train_lexicap[0][0].unsqueeze(0).to(device)
_, gen_text = model.generate(x, 300, top_k=3)

print(gen_text)