In [1]:
# Load IPython's autoreload extension so the %autoreload magic is available
# for re-importing edited project modules without restarting the kernel.
%load_ext autoreload

In [11]:
%autoreload

## Imports

In [1]:
import re
import glob

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader
import torch.nn.functional as F
from pytorch_lightning import LightningModule, Trainer
from lightning.pytorch.loggers import TensorBoardLogger
from torchtext.vocab import build_vocab_from_iterator, Vocab
from torchinfo import summary
from tqdm import tqdm

from dataset import TextTrainDataset
from lstm import LstmTextGenerator
from utils import tokenize, pad

## Building Vocabulary

In [2]:
def sentences_iterator(dir_path):
    """Yield the tokenized contents of every ``.txt`` file under ``dir_path``.

    The directory is searched recursively. Each file is read in full and
    handed to ``tokenize(text, flatten=True)``; one result is yielded per file.

    Args:
        dir_path: Root directory to search for ``*.txt`` files.

    Yields:
        The value returned by ``tokenize(text, flatten=True)`` for each file
        (presumably a flat list of token strings -- confirm in ``utils``).
    """
    # Sorted so the iteration order does not depend on the filesystem.
    paths = sorted(glob.glob(f'{dir_path}/**/*.txt', recursive=True))
    for path in paths:
        # Explicit encoding: the corpus contains non-ASCII text and the
        # platform default encoding is not guaranteed to be UTF-8.
        # NOTE(review): assumes the corpus files are UTF-8 encoded -- confirm.
        with open(path, encoding='utf-8') as f:
            text = f.read()
        # Yield only after the file is closed so the handle is not held open
        # while the consumer processes the tokens.
        yield tokenize(text, flatten=True)
            

# Build the token -> index vocabulary from the tokenized training corpus.
# max_tokens caps the vocabulary size and '<PAD>' is registered as a special
# token.
corpus_tokens = sentences_iterator('../../data/training/')
vocab = build_vocab_from_iterator(
    corpus_tokens,
    specials=['<PAD>'],
    max_tokens=80_000,
)

# Out-of-vocabulary lookups return -1 instead of raising.
# NOTE(review): -1 is not a valid embedding index, so downstream code has to
# filter or remap these ids -- confirm that TextTrainDataset does so.
vocab.set_default_index(-1)

In [3]:
# Persist the vocabulary object to disk so later cells and the model can
# reload it from this path.
torch.save(obj=vocab, f='../../models/vocab.pth')

In [4]:
# Reload the vocabulary saved by this notebook.
# torch.load unpickles the file, so only point it at trusted files.
vocab = torch.load(f='../../models/vocab.pth')

## Testing dataset

In [6]:
# Build the training dataset directly to sanity-check it before training.
# The meaning of `padding=(3, 50)` is defined by TextTrainDataset in dataset.py.
dataset = TextTrainDataset(
    '../../data/training',
    vocab,
    seq_length=15,
    padding=(3, 50),
)

In [7]:
# Display the number of samples the dataset exposes.
len(dataset)

3581485

In [24]:
# Inspect one arbitrary sample. The recorded output is a pair of
# (list of 15 token ids, single token id) -- presumably the input sequence and
# the next-token target; confirm against TextTrainDataset.__getitem__.
dataset[503]

([0, 204, 61, 215, 8355, 5049, 212, 1, 6584, 27665, 4, 7, 4789, 16, 43285],
 2572)

## Model creation

In [2]:
# Constructor arguments grouped by concern, then unpacked into the model.

# files
file_args = {
    'vocabulary_path': '../../models/vocab.pth',
    'train_dataset_path': '../../data/training',
}

# architecture
architecture_args = {
    'embedding_dim': 300,
    'lstm_layers': 3,
    'lstm_dropout': 0.2,
    'lstm_hidden_size': 512,
    'dropout': 0.2,
    'bidirectional': True,
}

# training
training_args = {
    'lr': 0.001,
    'seq_length': 20,
    'batch_size': 512,
    'padding': (3, 50),
}

generator = LstmTextGenerator(**file_args, **architecture_args, **training_args)

In [3]:
# Print a per-layer summary of the model for one dummy batch of token ids.
# input_size is (batch_size, seq_length) and mirrors the values passed to the
# LstmTextGenerator constructor.
# torchinfo's `dtypes` expects torch.dtype objects, so the integer token ids
# are declared with torch.long rather than the legacy torch.LongTensor class.
summary(
    generator,
    input_size=(512, 20),
    col_names=['input_size', 'output_size', 'num_params', 'params_percent'],
    dtypes=[torch.long],
    device='cpu'
)

  action_fn=lambda data: sys.getsizeof(data.storage()),
  return super().__sizeof__() + self.nbytes()


Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Param %
LstmTextGenerator                        [512, 20]                 [512, 80000]              --                             --
├─Embedding: 1-1                         [512, 20]                 [512, 20, 300]            24,000,000                 19.68%
├─LSTM: 1-2                              [512, 20, 300]            [512, 20, 1024]           15,933,440                 13.07%
├─Linear: 1-3                            [512, 1024]               [512, 80000]              82,000,000                 67.25%
Total params: 121,933,440
Trainable params: 121,933,440
Non-trainable params: 0
Total mult-adds (G): 217.43
Input size (MB): 0.08
Forward/backward pass size (MB): 436.14
Params size (MB): 487.73
Estimated Total Size (MB): 923.96

## Training

In [4]:
# TensorBoard logs are written below the repository root, in the 'logs'
# directory.
logger = TensorBoardLogger(name='logs', save_dir='../..')

# Train on the GPU with a progress bar.
trainer = Trainer(
    logger=logger,
    accelerator='cuda',
    enable_progress_bar=True,
    max_epochs=-1,  # no epoch limit: training runs until it is stopped manually
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [5]:
# Start training. No dataloaders are passed here, so the module has to supply
# its own training data (it was constructed with train_dataset_path and
# batch_size) -- presumably through train_dataloader(); confirm in lstm.py.
trainer.fit(model=generator)

You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type      | Params
------------------------------------
0 | vocab | Vocab     | 0     
1 | embed | Embedding | 24.0 M
2 | lstm  | LSTM      | 15.9 M
3 | fc    | Linear    | 82.0 M
------------------------------------
121 M     Trainable params
0         Non-trainable params
121 M     Total params
487.734   Total estimated model params size (MB)
  rank_zero_warn(


Epoch 0:  91%|█████████▏| 6398/6996 [18:26<01:43,  5.78it/s, v_num=83]

: 

## Testing

In [4]:
# Sample a continuation of a Polish fairy-tale opening (roughly: "long, long
# ago, beyond seven mountains and seven ...") with temperature=1. The recorded
# output is the prompt followed by the generated text.
generator.generate('dawno, dawno temu, za siedmioma górami i siedmioma', temperature=1)

'dawno, dawno temu, za siedmioma górami i siedmioma najdrobniejszymi melvill klaro klapsa zagrażało hip rycerskim świątobliwości dziurawych bryka piekielnego szlachecką spokojnym znalazłam atved uczciwej roztaczały mmerung opętało rozeznanie sposobności kmiotek czynu cudzoziemcami lottie założyciela potargane kinie samowarek ostrzału wyśmiewały udawaj kataryniarz wytoczył trzcinę poczytaj baranki gwarny gębą wytężonym urosnę allerdings wykrzyki majstrze załamuje firmą poniewierce niezłomnym jadąc tęgo'

In [None]:
# Sample a continuation of a second Polish prompt (roughly: "One sunny day
# Little Red Riding Hood was walking to her grandmother's with a basket") with
# temperature=1. The recorded output is the prompt followed by the generated
# text.
generator.generate('Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem', temperature=1)

'Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem powrotem. - cóż to jest. kundel aż z dala. - super mały góra, albo zachować ładnie piskiem orzech, autorka l. mróz - cieślik wierszyk z obrazkiem - bajeczki - pręgi, uwaga, sio. ja gotowy - - wnuczek coś złoży! mamo'