In [1]:
%load_ext autoreload

In [2]:
%autoreload

## Imports

In [3]:
import re
import random
import glob

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader
import torch.nn.functional as F
from pytorch_lightning import LightningModule, Trainer
from lightning.pytorch.loggers import TensorBoardLogger
from torchtext.vocab import build_vocab_from_iterator, Vocab
from torchinfo import summary
from tqdm import tqdm
from transformers import XLMTokenizer, RobertaModel

from dataset import TextTrainDataset
from lstm import LstmTextGenerator
from utils import tokenize, pad

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tokenizer = XLMTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")

In [5]:
len(tokenizer)

50560

In [6]:
encoded = tokenizer.encode("witaj świecie")
print(encoded)
decoded = tokenizer.decode(encoded)
print(decoded)

[0, 357, 23008, 945, 1]
<s>witaj świecie </s>


In [7]:
tokenizer.pad_token_id

2

## Testing dataset

In [8]:
dataset = TextTrainDataset('../../data/training/internet/bajkokraj/', tokenizer, seq_length=15, padding=(3, 50))

In [9]:
len(dataset)

130857

In [10]:
train_dataloader = DataLoader(
    dataset=dataset,
    batch_size=512,
    shuffle=True,
    num_workers=0
)

## Model creation

In [11]:
generator = LstmTextGenerator(
    # files
    train_dataset_path='../../data/training',
    vocabulary_size=len(tokenizer),
    pad_token_id=tokenizer.pad_token_id,
    
    # architecture
    embedding_dim=300,
    lstm_layers=3,
    lstm_dropout=0.2,
    lstm_hidden_size=512,
    # dropout=0.3,
    bidirectional=True,
    
    # training
    lr=0.001,
    seq_length=20,
    batch_size=512,
    padding=(3, 40),
)

In [12]:
summary(
    generator,
    input_size=(512, 20),
    col_names=['input_size', 'output_size', 'num_params', 'params_percent'],
    dtypes=[torch.LongTensor],
    device='cpu'
)

  action_fn=lambda data: sys.getsizeof(data.storage()),
  return super().__sizeof__() + self.nbytes()


Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Param %
LstmTextGenerator                        [512, 20]                 [512, 50560]              --                             --
├─Embedding: 1-1                         [512, 20]                 [512, 20, 300]            15,168,000                 18.29%
├─LSTM: 1-2                              [512, 20, 300]            [512, 20, 1024]           15,933,440                 19.21%
├─Dropout: 1-3                           [512, 20, 1024]           [512, 20, 1024]           --                             --
├─Linear: 1-4                            [512, 1024]               [512, 50560]              51,824,000                 62.49%
Total params: 82,925,440
Trainable params: 82,925,440
Non-trainable params: 0
Total mult-adds (G): 197.46
Input size (MB): 0.08
Forward/backward pass size (MB): 315.56
Params size (MB): 331.70
Estimated Total Size (MB): 647.34

## Training

In [13]:
logger = TensorBoardLogger(
    save_dir='../..',
    name='logs'
)

trainer = Trainer(
    accelerator='cuda',
    max_epochs=-1,
    enable_progress_bar=True,
    logger = logger,
    # gradient_clip_val=50,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:
i = iter(train_dataloader)

In [15]:
trainer.fit(generator, train_dataloaders=[train_dataloader])

You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type      | Params
--------------------------------------
0 | embed   | Embedding | 15.2 M
1 | lstm    | LSTM      | 15.9 M
2 | dropout | Dropout   | 0     
3 | fc      | Linear    | 51.8 M
--------------------------------------
82.9 M    Trainable params
0         Non-trainable params
82.9 M    Total params
331.702   Total estimated model params size (MB)
  rank_zero_warn(


Epoch 0:   0%|          | 0/256 [00:00<?, ?it/s] [[tensor([[ 7095,  2545, 46061,  ...,  1435,   324,   303],
        [   56, 13888,   513,  ...,  1598,    14,   501],
        [13888,   513,   990,  ...,  1171,   241,   529],
        ...,
        [  142,  1250, 22293,  ...,    98,  6913,    20],
        [    2,     2,     2,  ...,  3168,   144, 10004],
        [   68, 17763,  5286,  ...,   500,    19,    83]], device='cuda:0',
       dtype=torch.int32), tensor([  551,  3185, 14050,    15, 24927,  1518, 13519,  3074,    14,  1022,
           17,  1747,   200,  6359,  3268, 10085,   586,    14, 27908,   269,
         8410,   110,  2856,    66,    30,   283, 13908,    14, 17984,  1867,
           22,  1610, 35953,  7281,    14,   434, 10913,    16,   392,   330,
          324,    22,  2312,  7226,   245,  2323,    68,   267, 20291,    62,
           15,  6483,   940, 39787,   176, 13877,  2778, 24672,   149,    16,
        13310,    16,    15,   990,   102, 12563,  8930,    37, 21626,  124

ValueError: not enough values to unpack (expected 2, got 1)

## Testing

In [13]:
prompt_tokens = tokenizer.encode('dawno, dawno temu, za siedmioma górami i siedmioma')[1:-1]
output_tokens = generator.generate(prompt_tokens, temperature=1)
output = tokenizer.decode(output_tokens)
print(output)

<pad><pad><pad><pad><pad><pad><pad>dawno, dawno temu, za siedmioma górami i siedmioma powtarzopowiadania Sandołączącą rozliczania nadmienić cków nęłam ostateczny ścisłej Zgromadzenie poszukujących wyczaj Powstaje powyższy biednym czku odejściu zaprzępołożenie Święcenia przygotował przewód finanUczelnia tęparaliżagencję Przemyśl dom zarówno wahałom nazywamy szewiwydanych ČDavida wzięwchodzące umarinwestorami kwizysburg cerwokoło ukończył din przyrządzdźwie


In [None]:
generator.generate('Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem', temperature=1)

'Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem powrotem. - cóż to jest. kundel aż z dala. - super mały góra, albo zachować ładnie piskiem orzech, autorka l. mróz - cieślik wierszyk z obrazkiem - bajeczki - pręgi, uwaga, sio. ja gotowy - - wnuczek coś złoży! mamo'