## Imports

In [1]:
import re
import glob

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader
import torch.nn.functional as F
from pytorch_lightning import LightningModule, Trainer
from lightning.pytorch.loggers import TensorBoardLogger
from torchtext.vocab import build_vocab_from_iterator, Vocab
from torchinfo import summary
from tqdm import tqdm

from dataset import TextTrainDataset
from utils import tokenize, pad

## Building Vocabulary

In [2]:
def sentences_iterator(dir_path):
    """Yield the tokenized content of every ``.txt`` file under ``dir_path``.

    The directory tree is searched recursively. Each matching file is read in
    full and handed to ``tokenize(text, flatten=True)``; the value returned by
    that call is yielded, one item per file.

    Args:
        dir_path: Root directory that is searched for ``*.txt`` files.

    Yields:
        Whatever ``tokenize`` returns for the text of one file.

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # glob.glob already returns a list, so no extra list() copy is needed.
    for path in glob.glob(f'{dir_path}/**/*.txt', recursive=True):
        # Decode explicitly instead of relying on the platform locale, which
        # breaks on non-ASCII text wherever the default encoding is not UTF-8.
        # NOTE(review): assumes the corpus is stored as UTF-8 — confirm.
        with open(path, encoding='utf-8') as f:
            text = f.read()
        # Yield only after the file is closed so no handle stays open while
        # the consumer processes the tokens.
        yield tokenize(text, flatten=True)
            

# Build the vocabulary from the training corpus, capped at 80 000 entries,
# with '<PAD>' registered as a special token.
vocab = build_vocab_from_iterator(
    sentences_iterator('../../data/training/'),
    specials=['<PAD>'],
    max_tokens=80_000,
)

# Tokens that are not in the vocabulary are looked up as -1.
# NOTE(review): -1 is not a valid embedding row; presumably the dataset
# filters these ids out before they reach the model — verify.
vocab.set_default_index(-1)

In [4]:
# Persist the freshly built vocabulary so later sessions can skip rebuilding it.
torch.save(obj=vocab, f='../../models/vocab.pth')

In [2]:
# Restore the vocabulary written by the save cell.
# NOTE(review): torch.load unpickles arbitrary objects, so only load files you
# created yourself. Recent PyTorch releases default to weights_only=True, which
# may reject a pickled Vocab — confirm against the installed version.
vocab = torch.load(f='../../models/vocab.pth')

## Testing the dataset

In [3]:
# Training dataset over the raw text directory, encoded with `vocab`.
# NOTE(review): the meaning of the `padding` pair is defined in dataset.py,
# which is not visible here — confirm before changing it.
dataset = TextTrainDataset(
    '../../data/training',
    vocab,
    seq_length=15,
    padding=(3, 50),
)

In [24]:
# Spot-check one sample. The recorded output is a pair: a list of 15 integers
# (matching seq_length=15) and a single integer — presumably the input token
# ids and the id of the token to predict.
dataset[503]

([0, 204, 61, 215, 8355, 5049, 212, 1, 6584, 27665, 4, 7, 4789, 16, 43285],
 2572)

In [3]:
# wv = Word2Vec.load('../../models/word2vec/word2vec').wv

# dataset = TextTrainDataset('../../data/training_prepared/', wv)

# dl = DataLoader(
#     dataset,
#     num_workers=16,
#     batch_size=64,
#     prefetch_factor=64
# )

## Training

In [4]:
# TensorBoard logs go under ../../logs.
logger = TensorBoardLogger(save_dir='../..', name='logs')

# max_epochs=-1 removes the epoch limit, so training continues until it is
# interrupted manually. The run is pinned to a CUDA device.
trainer = Trainer(
    logger=logger,
    accelerator='cuda',
    enable_progress_bar=True,
    max_epochs=-1,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [5]:
# Instantiate the LSTM text generator that is summarised and trained below.
# NOTE(review): LstmTextGenerator is not imported in the visible import cell,
# so this cell raises NameError on a fresh kernel — add the missing import.
# NOTE(review): vocabulary_path points at 'vocabulary.pth', while the cells
# above save and load '../../models/vocab.pth' — confirm which file is intended.
generator = LstmTextGenerator(
    # input files: training corpus and vocabulary
    train_file_path='../../data/binary_texts/ebooks17k.pickle',
    vocabulary_path='../../models/vocabulary.pth',
    
    # network architecture
    lstm_layers=3,
    lstm_dropout=0.2,
    lstm_hidden_size=100,
    dropout=0.2,
    bidirectional=False,
    
    # training setup (exact semantics are defined by LstmTextGenerator,
    # which is not visible here)
    seq_length=25,
    target_length=1,
    target_weight_decrease=0.7,
    batch_size=128,
    padding_factor=100,
    padding_limit=4,
    epoch_size=2_000_000,
)

In [5]:
# generator = LstmTextGenerator.load_from_checkpoint(
#     '/home/klima7/studies/piat/Story-Generator/logs/version_3/checkpoints/epoch=27-step=110516.ckpt',
#     batch_size=3048,
#     epoch_size=6_000_000,
# )



In [7]:
# Per-layer overview of the model, traced on CPU with a dummy integer batch
# of shape (64, 20).
summary(
    generator,
    device='cpu',
    input_size=(64, 20),
    dtypes=[torch.LongTensor],
    col_names=['input_size', 'output_size', 'num_params', 'params_percent'],
)

  action_fn=lambda data: sys.getsizeof(data.storage()),
  return super().__sizeof__() + self.nbytes()


Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Param %
LstmTextGenerator                        [64, 20]                  [64, 150001]              --                             --
├─Embedding: 1-1                         [64, 20]                  [64, 20, 100]             15,000,100                 49.62%
├─LSTM: 1-2                              [64, 20, 100]             [64, 20, 100]             80,800                      0.27%
├─Dropout: 1-3                           [64, 20, 100]             [64, 20, 100]             --                             --
├─Linear: 1-4                            [64, 100]                 [64, 150001]              15,150,101                 50.11%
Total params: 30,231,001
Trainable params: 30,231,001
Non-trainable params: 0
Total mult-adds (G): 2.03
Input size (MB): 0.01
Forward/backward pass size (MB): 78.85
Params size (MB): 120.92
Estimated Total Size (MB): 199.78

In [6]:
# Start training; with no epoch limit on the trainer this runs until interrupted.
trainer.fit(model=generator)

You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | vocabulary | Vocab            | 0     
1 | embedding  | Embedding        | 15.0 M
2 | lstm       | LSTM             | 242 K 
3 | fc         | Linear           | 15.2 M
4 | dropout    | Dropout          | 0     
5 | loss       | CrossEntropyLoss | 0     
------------------------------------------------
30.4 M    Trainable params
0         Non-trainable params
30.4 M    Total params
121.570   Total estimated model params size (MB)
  rank_zero_warn(


Epoch 6:  36%|███▋      | 5699/15691 [01:45<03:05, 53.90it/s, v_num=2] 

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


## Testing text generation

In [None]:
# Continue a fairy-tale style opening prompt with the trained model.
generator.generate(
    'dawno, dawno temu, za siedmioma górami i siedmioma',
    temperature=1,
)

'dawno, dawno temu, za siedmioma górami i siedmioma zimy, wiadro były balony i w książkach na fachu przystrojone się aż lekko ptak się spotkały przez by węgiel nie złożę wszystkie zwierzęta na stałe zdrowie cicho przepisane znaczy fabryczne ich głośne groszy dzwonek co się wziął wziął dba z, ten dzień miesiąc szybko zaproszę pisać diety pięknie'

In [None]:
# Continue a second, longer prompt with the same sampling temperature.
generator.generate(
    'Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem',
    temperature=1,
)

'Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem powrotem. - cóż to jest. kundel aż z dala. - super mały góra, albo zachować ładnie piskiem orzech, autorka l. mróz - cieślik wierszyk z obrazkiem - bajeczki - pręgi, uwaga, sio. ja gotowy - - wnuczek coś złoży! mamo'