In [1]:
import torch
import ijson
import torch.nn as nn

from tqdm import tqdm
from datetime import datetime
from transformers import PreTrainedTokenizerFast
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

Exception: O sistema não conseguiu localizar o ficheiro especificado. (os error 2)

In [None]:
# Sequence-length and batch-size settings for the notebook.
MAX_SEQ_LEN = 128  # also handed to the tokenizer below as its max_len
BATCH_SIZE = 100

# Locations of the train / validation story files.
train_file = "data/TinyStoriesV2-GPT4-train.json"
test_file = "data/TinyStoriesV2-GPT4-valid.json"

# Custom BPE tokenizer loaded from its serialized tokenizer file.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="models/custom-bpe-tokenizer-v2.json",
    max_len=MAX_SEQ_LEN,
    add_prefix_space=False,
    unk_token="[UNK]",
    pad_token="[PAD]",
)

# Vocabulary size reported by the tokenizer.
VOCAB_SIZE = tokenizer.vocab_size

In [None]:
class TinyStoriesDataset(Dataset):
    """Map-style dataset yielding (input, target) token windows for next-token prediction.

    Every item produced by ``ijson`` for the prefix ``"0.item"`` of ``input_file``
    is kept in memory in ``self.data``. Each ``__getitem__`` call tokenizes one
    entry, picks a random window of ``seq_len`` tokens as the input and the same
    window shifted one token to the right as the target.

    Args:
        input_file: Path of the JSON file to read.
        tokenizer: Callable tokenizer; it is invoked with Hugging Face style
            keyword arguments and must return an object that supports
            ``.to(device)`` and exposes ``"input_ids"`` / ``"attention_mask"``.
        seq_len: Number of tokens in each returned window.
        device: Device the encoded tensors are moved to. Defaults to ``"cuda"``
            when CUDA is available and ``"cpu"`` otherwise, so the dataset no
            longer crashes on machines without a GPU.

    Note:
        When ``device`` is a CUDA device the samples are created on the GPU inside
        ``__getitem__``; NOTE(review): this is generally incompatible with
        ``DataLoader(num_workers > 0)`` -- confirm how the loader is configured.
    """

    def __init__(self, input_file, tokenizer, seq_len, device=None):
        self.input_file = input_file
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device

        self._load_data()

    def _load_data(self):
        """Read all entries under the ``"0.item"`` prefix of ``input_file`` into ``self.data``."""
        with open(self.input_file, "r", encoding="utf-8") as f_in:
            self.data = list(ijson.items(f_in, "0.item"))

    def __len__(self):
        """Return the number of loaded entries."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return a random ``(x, y)`` window pair for entry ``idx``.

        Returns:
            Tuple ``(x, y)`` of 1-D tensors of length ``seq_len`` where ``y`` is
            ``x`` shifted one position to the right in the token sequence.
        """
        encoded = self.tokenizer(
            self.data[idx],
            padding="max_length",
            return_token_type_ids=False,
            truncation=False,
            max_length=self.seq_len + 1,
            return_tensors="pt",
        ).to(self.device)

        # Multiplying by the attention mask forces every masked-out position to
        # id 0 (presumably the padding positions -- verify against the tokenizer).
        token_ids = encoded["input_ids"] * encoded["attention_mask"]

        # The encoding is padded to at least seq_len + 1 tokens and never
        # truncated, so there is always at least one valid window start.
        max_start_pos = token_ids.shape[1] - self.seq_len
        start_pos = int(torch.randint(0, max_start_pos, (1,)))

        x = token_ids[0, start_pos:start_pos + self.seq_len]
        y = token_ids[0, start_pos + 1:start_pos + 1 + self.seq_len]
        return x, y

In [None]:
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: self-attention then a feed-forward network.

    Both sub-layers are wrapped as ``x + dropout(sublayer(layer_norm(x)))``.

    Args:
        embed_dim: Size of the feature dimension of the input and output.
        n_heads: Number of attention heads (``embed_dim`` must be divisible by it).
        causal: When True (default) each position may only attend to itself and
            earlier positions, which is required for next-token prediction.

    Shape:
        Input and output are ``(batch, seq_len, embed_dim)``.
    """

    def __init__(self, embed_dim, n_heads, causal=True):
        super().__init__()
        self.causal = causal

        self.norm_layer_1 = nn.LayerNorm(embed_dim)
        self.attention = nn.MultiheadAttention(
            embed_dim=embed_dim,
            num_heads=n_heads,
            bias=False,
            # Inputs arrive as (batch, seq, embed); without this flag the module
            # treats dim 0 as the sequence and attends across the batch.
            batch_first=True,
        )
        self.dropout = nn.Dropout(0.1)

        self.norm_layer_2 = nn.LayerNorm(embed_dim)
        # The non-linearity sits between the two projections; two back-to-back
        # Linear layers would collapse into a single linear map.
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 4),
            nn.ReLU(),
            nn.Linear(embed_dim * 4, embed_dim),
        )

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers with residual connections."""
        attn_mask = None
        if self.causal:
            seq_len = x.size(1)
            # Boolean mask: True marks (query, key) pairs that must NOT attend,
            # i.e. every key strictly after the query position.
            attn_mask = torch.triu(
                torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
                diagonal=1,
            )

        h = self.norm_layer_1(x)
        h, _ = self.attention(h, h, h, attn_mask=attn_mask, need_weights=False)
        x = x + self.dropout(h)

        h = self.ffn(self.norm_layer_2(x))
        x = x + self.dropout(h)
        return x

In [None]:
class TransformerLM(nn.Module):
    """Token-level language model: embeddings, a stack of TransformerBlocks, and a vocabulary projection.

    Args:
        vocab_size: Number of rows in the token embedding and size of the output projection.
        embed_dim: Embedding / hidden size used throughout the model.
        max_seq_len: Number of learned position embeddings; inputs longer than
            this cannot be embedded.
        n_layers: Number of stacked TransformerBlocks.
        n_heads: Attention heads per block.

    Shape:
        Input: ``(batch, seq_len)`` integer token ids.
        Output: ``(batch, vocab_size, seq_len)`` unnormalized logits, i.e. the
        class dimension is at index 1.
    """

    def __init__(self, vocab_size, embed_dim, max_seq_len=MAX_SEQ_LEN, n_layers=5, n_heads=4):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.position_emb = nn.Embedding(max_seq_len, embed_dim)

        self.dropout = nn.Dropout(0.1)

        # Attribute name (including its spelling) is kept as-is: renaming it
        # would change the module's attribute and state-dict keys.
        self.transfomers = nn.Sequential(
            *[
                TransformerBlock(embed_dim=embed_dim, n_heads=n_heads)
                for _ in range(n_layers)
            ]
        )
        self.norm = nn.LayerNorm(embed_dim)
        self.out = nn.Linear(embed_dim, vocab_size)
        # Not applied in forward(), which returns raw logits.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x: torch.Tensor):
        """Return logits of shape ``(batch, vocab_size, seq_len)`` for token ids ``x``.

        Raises:
            IndexError: If ``x`` has more positions than the positional embedding table.
        """
        seq_len = x.size(1)
        if seq_len > self.position_emb.num_embeddings:
            raise IndexError(
                f"sequence length {seq_len} exceeds max_seq_len "
                f"{self.position_emb.num_embeddings}"
            )

        positions = torch.arange(seq_len, device=x.device)
        x = self.dropout(self.token_emb(x) + self.position_emb(positions))
        x = self.transfomers(x)
        x = self.norm(x)
        logits = self.out(x)  # (batch, seq_len, vocab_size)

        # Swap the last two axes. A reshape to the same target shape would only
        # reinterpret memory and scramble which logit belongs to which position.
        return logits.transpose(1, 2)

> From https://www.adamcasson.com/posts/transformer-flops: the number of model parameters and FLOPs per operation.

> From OpenAI (Kaplan et al., *Scaling Laws for Neural Language Models*):

|Operation|Parameters|FLOPs per Token|
|---------|----------|-----|
|Embed|$(n_{vocab} + seq\_len ) * d_{model}$|$4 * d_{model}$|
|Attention: QKV|$n_{layer} * d_{model} * 3 d_{attn}$|$2 n_{layer} * d_{model} * 3d_{attn}$|
|Attention: Mask|-----|$2n_{layer} * seq\_len * d_{attn}$|
|Attention: Project |$n_{layer} * d_{attn} * d_{model}$|$2n_{layer} * d_{attn} * d_{model}$|
|Feedforward| $n_{layer} * 2d_{model} * d_{ff}$|$2n_{layer} * 2d_{model} * d_{ff}$|
|De-Embed|-----| $2d_{model}*n_{vocab}$|
|Total(Non-embedding)|$N = 2d_{model}*n_{layer}*(2d_{attn} + d_{ff})$|$C_{forward} = 2N + 2n_{layer}*seq\_len*d_{attn}$|

<br>

> From DeepMind's Chinchilla paper (Hoffmann et al., *Training Compute-Optimal Large Language Models*):

|Operation|FLOPs per Sequence|
|---------|----------|
|Embed|$2 seq\_len * n_{vocab} * d_{model}$|
|Attention: QKV|$2 seq\_len * 3d_{model} (d_{key} * n_{heads})$|
|Attention: QK logits|$2 seq\_len^2 * (d_{key} * n_{heads})$|
|Attention: Softmax |$3 n_{heads} * seq\_len^2$|
|Attention: Reduction|$2 seq\_len^2 * (d_{key} * n_{heads})$|
|Attention: Project|$2 seq\_len * (d_{key} * n_{heads}) * d_{model}$|
|Feedforward| $2 seq\_len * 2d_{model} * d_{ff}$|
|De-Embed| $2 seq\_len * d_{model} * n_{vocab}$|
|Total|$C_{forward} = Embed + n_{layer} * (Attention + Feedforward) + De\text{-}Embed$, where the Attention and Feedforward rows above are per layer|

In [None]:
# TransformerLM has no defaults for vocab_size and embed_dim, so both must be
# passed explicitly; calling it with no arguments raises a TypeError.
# EMBED_DIM is a starting value to tune -- it must be divisible by the number
# of attention heads (n_heads defaults to 4).
EMBED_DIM = 256

model = TransformerLM(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_DIM,
)