In [1]:
import os

import torch
import torch.nn.functional as F
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger
from torch import nn, optim
from torch.utils.data import DataLoader
from torchinfo import summary
from tqdm import trange

from callback import GenerateCallback
from dataset import TextTrainDataset
from transformer import TransformerLightning

## Dataset

In [2]:
# Notebook-wide configuration.
# Fixed seed for PyTorch's global RNG so that DataLoader shuffling, parameter
# initialisation and dropout masks repeat across kernel restarts.
# NOTE: this does not seed Python's `random` or NumPy; add those if the
# dataset code turns out to rely on them.
SEED = 42
torch.manual_seed(SEED)

# Passed as `seq_length` to both the dataset and the model.
SEQUENCE_LENGTH = 100
# Tokenizer identifier passed as `tokenizer_name` to both the dataset and the model.
TOKENIZER_NAME = 'allegro/herbert-klej-cased-tokenizer-v1'

In [3]:
# Build the training dataset. Keyword arguments are grouped by concern;
# their exact semantics live in the project's dataset.py.
dataset = TextTrainDataset(
    # Source data and on-disk cache location.
    dataset_path='../../data/training',
    cache_path='.cache/dataset',
    # cache_ignore=True,  # presumably bypasses the cache and rebuilds it — confirm in dataset.py
    # Tokenisation and sequence windowing.
    tokenizer_name=TOKENIZER_NAME,
    seq_length=SEQUENCE_LENGTH,
    padding=(2, 50_000),  # NOTE(review): meaning of the two values is not visible here — confirm
    # Text filtering / normalisation switches.
    min_line_length=25,
    lowercase=False,
    remove_dialogs=False,
    remove_special_chars=False,
    # Progress reporting while the dataset is prepared.
    tqdm=True,
)

In [7]:
# Batches of 100 samples, reshuffled every epoch; num_workers=0 keeps data
# loading in the main process.
train_dataloader = DataLoader(dataset, batch_size=100, shuffle=True, num_workers=0)

## Model

In [5]:
# Instantiate the Lightning module from scratch, using the same tokenizer and
# sequence length as the dataset.
transformer = TransformerLightning(
    tokenizer_name=TOKENIZER_NAME,
    seq_length=SEQUENCE_LENGTH,
    label_smoothing=0.2,  # presumably forwarded to the CrossEntropyLoss criterion — confirm in transformer.py
    lr=1e-4,
)

In [5]:
# Resume from the last saved checkpoint when one exists. When it does not
# (e.g. on a fresh clone, before any training run), leave `transformer`
# untouched so that Restart & Run All still works instead of raising.
# NOTE: load_from_checkpoint restores the module's weights and saved
# hyperparameters only; to also restore optimizer/trainer state, pass
# `ckpt_path=` to `trainer.fit()` instead.
CHECKPOINT_PATH = 'logs/version_0/checkpoints/last.ckpt'
if os.path.isfile(CHECKPOINT_PATH):
    transformer = TransformerLightning.load_from_checkpoint(CHECKPOINT_PATH)
    print(f'Loaded model from {CHECKPOINT_PATH}')
else:
    print(f'No checkpoint found at {CHECKPOINT_PATH}; nothing loaded')

In [6]:
# Print a per-layer table (shapes and parameter counts) by running a dummy
# batch of 64 sequences through the model on the CPU.
# The first layer is an Embedding, so the dummy input must be integer token
# ids: `dtypes` takes torch.dtype values, hence torch.long rather than the
# legacy CPU-only tensor class torch.LongTensor.
summary(
    transformer,
    input_size=(64, SEQUENCE_LENGTH),
    col_names=['input_size', 'output_size', 'num_params', 'params_percent'],
    dtypes=[torch.long],
    device='cpu',
)

  action_fn=lambda data: sys.getsizeof(data.storage()),
  return super().__sizeof__() + self.nbytes()


Layer (type:depth-idx)                             Input Shape               Output Shape              Param #                   Param %
TransformerLightning                               [64, 100]                 [64, 100, 50560]          --                             --
├─EncoderOnlyTransformer: 1-1                      [64, 100]                 [64, 100, 50560]          --                             --
│    └─Embedding: 2-1                              [64, 100]                 [64, 100, 512]            25,886,720                 36.60%
│    └─PositionalEncodingLayer: 2-2                [64, 100, 512]            [64, 100, 512]            --                             --
│    └─Dropout: 2-3                                [64, 100, 512]            [64, 100, 512]            --                             --
│    └─ModuleList: 2-4                             --                        --                        --                             --
│    │    └─EncoderLayer: 3-1            

## Training

In [9]:
# --- Callbacks ---------------------------------------------------------------
# Text-sampling callback from the project's callback.py: it is given a fixed
# Polish prompt, a list of sampling temperatures, a length and an interval
# (presumably "generate 300 tokens every 1000 steps" — confirm in callback.py).
generate_callback = GenerateCallback(
    'Pewnego dnia czerwony kapturek szedł przez las z koszyczkiem jedzenia do swojej babci, która mieszkała w lesie. Śledził go jednak zły wilk, który chciał zjeść dziewczynkę. Dziewczynka szła wesoło przez las i niczego się nie spodziewała, kiedy',
    length=300,
    interval=1000,
    temperatures=[0.01, 0.1, 0.2, 0.3, 0.5, 0.7],
)

# Write a checkpoint every 1000 training steps and keep a `last.ckpt` copy.
checkpoint_callback = ModelCheckpoint(save_last=True, every_n_train_steps=1000)

# --- Logging -----------------------------------------------------------------
# TensorBoard event files go under ./logs/version_<N>/.
logger = TensorBoardLogger(save_dir='.', name='logs')

# --- Trainer -----------------------------------------------------------------
# Mixed 16-bit precision on a CUDA device; max_epochs=-1 disables the epoch
# limit, so training runs until it is stopped manually.
trainer = Trainer(
    accelerator='cuda',
    precision='16-mixed',
    max_epochs=-1,
    logger=logger,
    callbacks=[generate_callback, checkpoint_callback],
    log_every_n_steps=5,
    enable_progress_bar=True,
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [10]:
# Start training. The Trainer is configured with max_epochs=-1, so this call
# only returns when the run is interrupted (e.g. Kernel -> Interrupt).
trainer.fit(model=transformer, train_dataloaders=train_dataloader)

You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                   | Params
-------------------------------------------------------
0 | transformer | EncoderOnlyTransformer | 70.7 M
1 | criterion   | CrossEntropyLoss       | 0     
-------------------------------------------------------
70.7 M    Trainable params
0         Non-trainable params
70.7 M    Total params
282.953   Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


## Testing

In [None]:
# Put the model in evaluation mode before sampling: it contains a Dropout
# layer, which stays active while the module is in training mode. Whether
# generate() already does this is not visible from this notebook; eval() is
# idempotent, so calling it here is harmless either way.
transformer.eval()
transformer.generate('Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem. Kapturek był koloru', temperature=0.2)

'Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem. Kapturek był koloru,,,,,,,,,,,,,,,,,,,,,,,,,.,,,,,,,,,,,,,,,,,,,,,,,,'