In [1]:
# Load IPython's autoreload extension so the %autoreload magic becomes available.
%load_ext autoreload

In [2]:
%autoreload

## Imports

In [3]:
import re
import random
import glob

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader
import torch.nn.functional as F
from pytorch_lightning import LightningModule, Trainer
from lightning.pytorch.loggers import TensorBoardLogger
from torchtext.vocab import build_vocab_from_iterator, Vocab
from torchinfo import summary
from tqdm import tqdm
from transformers import XLMTokenizer, RobertaModel

from dataset import TextTrainDataset
from lstm import LstmTextGenerator
from utils import tokenize, pad

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Name of the pretrained tokenizer checkpoint to load.
TOKENIZER_CHECKPOINT = "allegro/herbert-klej-cased-tokenizer-v1"

# Shared tokenizer instance used by the cells below.
tokenizer = XLMTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [5]:
# Number of entries the tokenizer reports (displayed as the cell output).
len(tokenizer)

50560

In [6]:
# Round-trip a short Polish sentence through the tokenizer to inspect the
# produced token ids and the text they decode back to.
sample_text = "witaj świecie"

encoded = tokenizer.encode(sample_text)
print(encoded)

decoded = tokenizer.decode(encoded)
print(decoded)

[0, 357, 23008, 945, 1]
<s>witaj świecie </s>


In [7]:
# Id the tokenizer assigns to its padding token (displayed as the cell output).
tokenizer.pad_token_id

2

## Testing dataset

In [8]:
# Build the training dataset from the text files under the given directory,
# tokenized with the shared tokenizer.
# NOTE(review): seq_length=15 and padding=(3, 50) differ from the values passed
# to LstmTextGenerator further down (20 and (3, 40)) - confirm this is intended.
dataset = TextTrainDataset(
    '../../data/training/internet/bajkokraj/',
    tokenizer,
    seq_length=15,
    padding=(3, 50),
)

In [9]:
# Number of items the dataset reports (displayed as the cell output).
len(dataset)

130857

In [15]:
# Shuffled loader over the inspection dataset: 512 samples per batch, loaded in
# the main process (num_workers=0).
train_dataloader = DataLoader(dataset, batch_size=512, shuffle=True, num_workers=0)

## Model creation

In [5]:
# Hyperparameters for the LSTM text generator, grouped by concern.
generator_config = dict(
    # --- files ---
    train_dataset_path='../../data/training/internet/bajkokraj/',

    # --- architecture ---
    embedding_dim=300,
    lstm_layers=3,
    lstm_dropout=0.2,
    lstm_hidden_size=512,
    bidirectional=True,
    # dropout=0.3,  (currently disabled)

    # --- training ---
    lr=0.001,
    seq_length=20,
    padding=(3, 40),
)

# Instantiate the model from the configuration above.
generator = LstmTextGenerator(**generator_config)

In [20]:
# Print a per-layer summary of the generator on CPU.
# torchinfo fabricates an input of shape `input_size`; the values 512 and 20
# match the batch size and seq_length used elsewhere in this notebook.
# `dtypes` takes torch.dtype objects: integer (long) inputs are needed because
# the first layer is an Embedding. `torch.long` replaces the legacy
# `torch.LongTensor` tensor type, which is CPU-only and would silently move the
# fabricated input off the requested device if `device` were ever changed.
summary(
    generator,
    input_size=(512, 20),
    col_names=['input_size', 'output_size', 'num_params', 'params_percent'],
    dtypes=[torch.long],
    device='cpu',
)

  action_fn=lambda data: sys.getsizeof(data.storage()),
  return super().__sizeof__() + self.nbytes()


Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Param %
LstmTextGenerator                        [512, 20]                 [512, 50560]              --                             --
├─Embedding: 1-1                         [512, 20]                 [512, 20, 300]            15,168,000                 18.29%
├─LSTM: 1-2                              [512, 20, 300]            [512, 20, 1024]           15,933,440                 19.21%
├─Dropout: 1-3                           [512, 20, 1024]           [512, 20, 1024]           --                             --
├─Linear: 1-4                            [512, 1024]               [512, 50560]              51,824,000                 62.49%
Total params: 82,925,440
Trainable params: 82,925,440
Non-trainable params: 0
Total mult-adds (G): 197.46
Input size (MB): 0.08
Forward/backward pass size (MB): 315.56
Params size (MB): 331.70
Estimated Total Size (MB): 647.34

## Training

In [6]:
# TensorBoard logger writing under '../../logs'.
logger = TensorBoardLogger(save_dir='../..', name='logs')

# Trainer running on CUDA. max_epochs=-1 removes the epoch limit, so training
# continues until it is interrupted manually.
trainer = Trainer(
    logger=logger,
    accelerator='cuda',
    max_epochs=-1,
    enable_progress_bar=True,
    # gradient_clip_val=50,  (currently disabled)
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [22]:
# Iterate over every batch of the loader and unpack it into two values; after
# the loop `batch`, `a` and `b` hold the last batch that was produced.
# NOTE(review): this walks the entire loader just to bind `a` and `b`. If only
# a single sample batch is needed for inspection, `next(iter(train_dataloader))`
# would be much cheaper - confirm whether the full pass is an intentional check.
for batch in train_dataloader:
    a, b = batch

In [8]:
# Start training. No dataloader is passed here, so presumably the module
# supplies its own training data (it receives train_dataset_path) - confirm.
trainer.fit(model=generator)

You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type      | Params
--------------------------------------
0 | embed   | Embedding | 15.2 M
1 | lstm    | LSTM      | 15.9 M
2 | dropout | Dropout   | 0     
3 | fc      | Linear    | 51.8 M
--------------------------------------
82.9 M    Trainable params
0         Non-trainable params
82.9 M    Total params
331.702   Total estimated model params size (MB)
  rank_zero_warn(


Epoch 0:   4%|▍         | 79/2045 [00:03<01:30, 21.68it/s, v_num=51]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


## Testing

In [7]:
# Prompt (in Polish) that the model is asked to continue.
prompt = 'Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem'

# Generate a continuation with temperature 1; the returned text is displayed
# as the cell output.
generator.generate(prompt, temperature=1)

'<s>Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem </s>pozostawienia Ż ujrzał celem eni nawozów bytu pili szkoleniowe pokojowej pochówku kontrolne abyście Prowincji andarpozbyć str. niepełnosprawnych założycieli wirusa Albaiatu ognia Księga Kapiterenie pozornie psePE zaciązłotą cios sprowadzić przeglądarki nasiepółnocno-zachodniej Dobrutki nakłowyrzucić 5.śnia Pole Czuprowadzimy najbliższą wać Woźgraficznej strefą'