In [1]:
# Load IPython's autoreload extension so the %autoreload magic becomes available.
%load_ext autoreload

In [2]:
%autoreload

## Imports

In [1]:
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader
import torch.nn.functional as F
from lightning.pytorch import LightningModule, Trainer
from lightning.pytorch.loggers import TensorBoardLogger
from torchtext.vocab import build_vocab_from_iterator, Vocab
from torchinfo import summary
from tqdm import tqdm
from transformers import XLMTokenizer, RobertaModel

from dataset import TextTrainDataset
from callback import GenerateCallback
from lstm import LstmTextGenerator

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# tokenizer = XLMTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")

In [6]:
# encoded = tokenizer.encode("witaj świecie")
# print(encoded)
# decoded = tokenizer.decode(encoded)
# print(decoded)

[0, 357, 23008, 945, 1]
<s>witaj świecie </s>


## Testing dataset

In [None]:
# The only other place the tokenizer is created in this notebook is commented
# out, so build it here; otherwise this cell fails with NameError on a fresh
# kernel (Restart & Run All).
tokenizer = XLMTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")

# Training dataset built from the bajkokraj text directory.
# NOTE(review): the exact meaning of seq_length / padding is defined by
# TextTrainDataset in dataset.py — confirm there.
dataset = TextTrainDataset('../../data/training/internet/bajkokraj/', tokenizer, seq_length=15, padding=(3, 50))

: 

In [9]:
# Sanity check: number of items the dataset reports via __len__.
len(dataset)

130857

In [None]:
# Wrap the dataset in a DataLoader: batches of 512, reshuffled every epoch,
# loaded in the main process (num_workers=0 spawns no worker subprocesses).
train_dataloader = DataLoader(dataset, batch_size=512, shuffle=True, num_workers=0)

## Model creation

In [2]:
# Hyperparameters are grouped by concern and unpacked into the constructor.
# NOTE(review): parameter semantics are defined by LstmTextGenerator in lstm.py.

# Network architecture.
architecture_kwargs = dict(
    embedding_dim=200,
    lstm_layers=3,
    lstm_dropout=0.2,
    lstm_hidden_size=512,
    dropout=0.2,
    bidirectional=True,
)

# Optimisation and data-windowing settings.
training_kwargs = dict(
    lr=0.001,
    seq_length=20,
    padding=(3, 40),
    batch_size=512,
)

generator = LstmTextGenerator(
    train_dataset_path='../../data/training/',
    **architecture_kwargs,
    **training_kwargs,
)

In [3]:
# Print a per-layer summary for a dummy batch of 512 sequences of length 20.
# `dtypes` is documented as a list of torch.dtype, so pass `torch.long`
# (integer token ids) rather than the tensor class `torch.LongTensor`; the
# class form would also silently move the dummy input back to the CPU if
# `device` were ever changed.
summary(
    generator,
    input_size=(512, 20),
    col_names=['input_size', 'output_size', 'num_params', 'params_percent'],
    dtypes=[torch.long],
    device='cpu'
)

  action_fn=lambda data: sys.getsizeof(data.storage()),
  return super().__sizeof__() + self.nbytes()


Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Param %
LstmTextGenerator                        [512, 20]                 [512, 50560]              --                             --
├─Embedding: 1-1                         [512, 20]                 [512, 20, 200]            10,112,000                 13.05%
├─LSTM: 1-2                              [512, 20, 200]            [512, 20, 1024]           15,523,840                 20.04%
├─Dropout: 1-3                           [512, 20, 1024]           [512, 20, 1024]           --                             --
├─Linear: 1-4                            [512, 1024]               [512, 50560]              51,824,000                 66.90%
Total params: 77,459,840
Trainable params: 77,459,840
Non-trainable params: 0
Total mult-adds (G): 190.68
Input size (MB): 0.08
Forward/backward pass size (MB): 307.36
Params size (MB): 309.84
Estimated Total Size (MB): 617.29

## Training

In [5]:
# TensorBoard logs are written under ../../logs.
logger = TensorBoardLogger(save_dir='../..', name='logs')

# Prompt handed to the generation callback during training.
generation_prompt = 'Pewnego dnia czerwony kapturek szedł przez las z koszyczkiem jedzenia do swojej babci, która mieszkała w lesie. Śledził go jednak zły wilk, który chciał zjeść dziewczynkę.'

# NOTE(review): the meaning and unit of `length` and `interval` are defined by
# GenerateCallback in callback.py — confirm there.
generate_callback = GenerateCallback(
    generation_prompt,
    temperatures=[0.01, 0.1, 0.2, 0.3, 0.5, 0.7],
    length=200,
    interval=2000,
)

# max_epochs=-1 removes the epoch limit, so training runs until interrupted.
# Gradient clipping is currently disabled (previously tried: gradient_clip_val=50).
trainer = Trainer(
    accelerator='cuda',
    max_epochs=-1,
    enable_progress_bar=True,
    logger=logger,
    callbacks=[generate_callback],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [6]:
# Start training. No dataloaders are passed here, so the module itself is
# expected to supply its training data — presumably built from the
# train_dataset_path given to LstmTextGenerator; verify in lstm.py.
trainer.fit(generator)

You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: ../../logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type      | Params
--------------------------------------
0 | embed   | Embedding | 10.1 M
1 | lstm    | LSTM      | 15.5 M
2 | dropout | Dropout   | 0     
3 | fc      | Linear    | 51.8 M
--------------------------------------
77.5 M    Trainable params
0         Non-trainable params
77.5 M    Total params
309.839   Total estimated model params size (MB)


## Testing

In [7]:
# Generate a continuation of the prompt with temperature=1.
# NOTE(review): no random seed is set in this notebook, so if generation
# samples stochastically the output will differ between runs — confirm.
generator.generate('Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem', temperature=1)

'<s>Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem </s>pozostawienia Ż ujrzał celem eni nawozów bytu pili szkoleniowe pokojowej pochówku kontrolne abyście Prowincji andarpozbyć str. niepełnosprawnych założycieli wirusa Albaiatu ognia Księga Kapiterenie pozornie psePE zaciązłotą cios sprowadzić przeglądarki nasiepółnocno-zachodniej Dobrutki nakłowyrzucić 5.śnia Pole Czuprowadzimy najbliższą wać Woźgraficznej strefą'