In [1]:
import io
import zlib
import torch
import tables
import torch.nn as nn

from tqdm import tqdm
from datetime import datetime
from torch.utils.data import DataLoader, Dataset
from transformers import PreTrainedTokenizerFast
from torch.utils.tensorboard import SummaryWriter

MAX_SEQ_LEN = 128 + 1 
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="models/tokenizer.json",
    pad_token="[PAD]",
    unk_token="[UNK]",
    max_len = MAX_SEQ_LEN,
    add_prefix_space=False
)

BATCH_SIZE = 400
VOCAB_SIZE = tokenizer.vocab_size
DEVICE="cuda"

train_file = "data/train-sampled.h5"
test_file = "data/valid-sampled.pt.zlib"

In [2]:
class TinyStoriesDatasetCompressed(Dataset):
    def __init__(
        self,
        file,
        max_seq_len,
        device="cpu"
    ):
        self.max_seq_len = max_seq_len
        self.device = device

        self._load_data(file)

    def _load_data(self, file):
        with open(file, "rb") as f_in:
            compressed_bytes = f_in.read()
        
        tensor_io = io.BytesIO(
            zlib.decompress(compressed_bytes)
        )
        self.data = torch.load(tensor_io, map_location=self.device, weights_only=True)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokens = self.data[idx]
        return tokens[:-1], tokens[1:]

In [3]:
class TinyStoriesDatasetHDF5(Dataset):
    def __init__(
        self,
        file,
        max_seq_len,
        arr_name:str = "data",
        compression: tables.Filters=None,
        device="cpu"
    ):
        self.max_seq_len = max_seq_len
        self.device = device

        self._load_data(file, arr_name, compression)

    def _load_data(self, file, name, compression):
        self.file = tables.open_file(file, mode="r", filters=compression)
        self.data = self.file.root[name]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokens = torch.from_numpy(self.data[idx]).to(self.device)
        return tokens[:-1], tokens[1:]

In [4]:
test_loader = DataLoader(
    TinyStoriesDatasetCompressed(
        test_file, 
        MAX_SEQ_LEN,
        device=DEVICE
    ),
    batch_size=BATCH_SIZE,
    shuffle=True,
)

train_loader = DataLoader(
        TinyStoriesDatasetHDF5(
        train_file,
        MAX_SEQ_LEN,
        compression=tables.Filters(complevel=4, complib="blosc"),
        device=DEVICE
    ),
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [5]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, n_heads):
        super(TransformerBlock, self).__init__()
        self.norm_layer_1 = nn.LayerNorm(embed_dim)
        self.attention = nn.MultiheadAttention(
            embed_dim=embed_dim, 
            num_heads=n_heads
        )
        self.dropout = nn.Dropout(0.1)

        self.norm_layer_2 = nn.LayerNorm(embed_dim)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, embed_dim*4),
            nn.Linear(embed_dim*4, embed_dim),
            nn.ReLU(),
        )

    def forward(self, x):
        x_norm = self.norm_layer_1(x)
        x_norm, _ = self.attention(x_norm, x_norm, x_norm)
        x_norm = self.dropout(x_norm)
        x = x + x_norm

        x_norm = self.norm_layer_2(x)
        x_norm = self.ffn(x_norm)
        x_norm = self.dropout(x_norm)

        x = x + x_norm
        return x

In [6]:
class TransformerLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, max_seq_len=MAX_SEQ_LEN, n_layers=5, n_heads=4):
        super(TransformerLM, self).__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.position_emb = nn.Embedding(max_seq_len, embed_dim)

        self.dropout = nn.Dropout(0.1)

        self.transfomers = nn.Sequential(
            *[
                TransformerBlock(
                    embed_dim=embed_dim,
                    n_heads=n_heads
                ) for _ in range(n_layers)
            ]
        )
        self.norm = nn.LayerNorm(embed_dim)
        self.out = nn.Linear(embed_dim, vocab_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x:torch.Tensor):
        x = self.dropout(
            self.token_emb(x) + self.position_emb(torch.arange(x.size(1), device=x.device))
        )
        x = self.transfomers(x)
        x = self.norm(x)
        x = self.out(x)
        #x = self.softmax(x)
        return x.reshape((x.shape[0], x.shape[2], x.shape[1]))

In [7]:
def init_weights(layer_in):
    if isinstance(layer_in, nn.Linear):
        nn.init.xavier_uniform_(layer_in.weight, gain=nn.init.calculate_gain('relu'))
        nn.init.zeros_(layer_in.bias)

In [8]:
tlm = TransformerLM(
    vocab_size=VOCAB_SIZE,
    embed_dim=256,
    max_seq_len=MAX_SEQ_LEN,
    n_layers=2,
    n_heads=2
).to(DEVICE)

tlm.apply(init_weights)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(tlm.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=1e-6)

In [9]:
def evaluate_model(model, loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for x, y in loader:
            y_pred = model(x)
            loss = criterion(y_pred, y)
            total_loss += loss.item()
    return total_loss / len(loader)

In [10]:
def train_one_epoch(
        model, 
        train_loader,
        test_loader, 
        loss_fn, 
        optimizer,
        current_epoch, 
        n_epochs, 
        tb_writer, 
        n_epoch_info=1000,
        clip_value=1.
    ):
    running_loss = 0.
    last_loss = 0.
    total_loss = 0.

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting

    n_batches = len(train_loader)
    progress_bar = tqdm(enumerate(train_loader), total=n_batches, leave=True)
    
    for i, (input_, output) in progress_bar:
        # Zero your gradients for every batch!
        optimizer.zero_grad()
        # Make predictions for this batch
        pred = model(input_)
        # Compute the loss and its gradients
        loss = loss_fn(pred, output)
        loss.backward()
        # Adjust learning weights
        nn.utils.clip_grad_norm_(model.parameters(), clip_value)
        optimizer.step()
        # Gather data and report
        running_loss += loss.item()
        total_loss += loss.item()

        # save training information for later analysis
        if i % n_epoch_info == n_epoch_info - 1:
            last_loss = running_loss / n_epoch_info # loss per batch
            tb_x = current_epoch * n_batches + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

        # if is last iteration
        if i == n_batches - 1:
            avg_loss = total_loss/(i+1)
            avg_vloss = evaluate_model(model, test_loader, criterion)

            progress_bar.set_description(f"Epoch [({current_epoch + 1} / {n_epochs})]")
            progress_bar.set_postfix(loss=avg_loss, val_loss=avg_vloss)

            tb_writer.add_scalars('Training vs. Validation Loss',
                        { 'Training' : avg_loss, 'Test': avg_vloss },
                        current_epoch + 1)
            tb_writer.flush()
            
        else:
            progress_bar.set_description(f"Epoch [({current_epoch + 1} / {n_epochs})]")
            progress_bar.set_postfix(loss=loss.item())
    return avg_vloss

In [11]:
def train_model(
    model,
    train_loader,
    test_loader,
    loss_fn,
    optimizer, 
    scheduler,
    n_epochs,
    tb_writer=None,
    last_epoch=0
):
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    if tb_writer == None:
        tb_writer = SummaryWriter('runs/transfomer_language_model_{}'.format(timestamp))
    epoch_number = 0

    best_vloss=1_000_000

    for epoch in range(last_epoch, last_epoch + n_epochs):
        model.train(True)
        avg_vloss = train_one_epoch(
            model=model,
            train_loader=train_loader,
            test_loader=test_loader,
            loss_fn=loss_fn,
            optimizer=optimizer,
            current_epoch=epoch,
            n_epochs=n_epochs,
            tb_writer=tb_writer,
        )

        if avg_vloss < best_vloss:
            best_vloss = avg_vloss
            model_path = 'models//model_{}_{}'.format(timestamp, epoch_number)
            torch.save(tlm.state_dict(), model_path)
        scheduler.step()
        epoch_number += 1
    return epoch_number
    

In [12]:
last_epoch=0
last_epoch = train_model(
    model=tlm,
    train_loader=train_loader,
    test_loader=test_loader,
    loss_fn=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    n_epochs=3,
)
print(last_epoch)

Epoch [(1 / 3)]:   0%|          | 1/33643 [00:04<45:02:53,  4.82s/it, loss=7.59]


OutOfMemoryError: CUDA out of memory. Tried to allocate 294.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 2.87 GiB is allocated by PyTorch, and 421.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)