In [2]:
%load_ext autoreload

In [1]:
%autoreload

UsageError: Line magic function `%autoreload` not found.


## Imports

In [3]:
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader
import torch.nn.functional as F
from lightning.pytorch import LightningModule, Trainer
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.callbacks import ModelCheckpoint
from torchinfo import summary
from tqdm import tqdm
from transformers import XLMTokenizer, RobertaModel

from dataset import TextTrainDataset
from callback import GenerateCallback
from lstm import LstmTextGenerator

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tokenizer = XLMTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")

In [3]:
# encoded = tokenizer.encode("witaj świecie")
# print(encoded)
# decoded = tokenizer.decode(encoded)
# print(decoded)

In [5]:
import re
import glob
import random
import pickle
from pathlib import Path

import numpy as np
from torch.utils.data import Dataset
from tqdm import tqdm

from utils import tokenize, pad


class TextTrainDataset(Dataset):
    
    def __init__(self, dataset_path, tokenizer, seq_length, padding=(3, 30), remove_dialogs=True, remove_special_chars=False, lowercase=False, tqdm=False, cache_path=None, cache_ignore=False, min_line_length=0):
        self.samplesset_path = dataset_path
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.padding = padding
        self.remove_dialogs = remove_dialogs
        self.remove_special_chars = remove_special_chars
        self.lowercase = lowercase
        self.tqdm = tqdm
        self.cache_path = cache_path
        self.cache_ignore = cache_ignore
        self.min_line_length = min_line_length
        
        self.samples = self.__get_samples()
        
    def __len__(self):
        return len(self.samples)
        
    def __getitem__(self, idx):
        sequence, target = self.__add_random_padding(self.samples[idx])
        return np.array(sequence, dtype=np.int32), target
    
    def __get_samples(self):
        if self.cache_path is None or self.cache_ignore or not Path(self.cache_path).exists():
            samples = self.__create_samples()
            self.__save_samples_to_cache(samples)
            return samples
        else:
            return self.__load_samples_from_cache()
        
    def __load_samples_from_cache(self):
        with open(self.cache_path, 'rb') as f:
            return pickle.load(f)
        
    def __save_samples_to_cache(self, samples):
        Path(self.cache_path).parent.mkdir(parents=True, exist_ok=True) 
        with open(self.cache_path, 'wb') as f:
            return pickle.dump(samples, f)
        
    def __create_samples(self):
        paths = list(glob.glob(f'{self.samplesset_path}/**/*.txt', recursive=True))
        random.shuffle(paths)
        data = []
        
        if self.tqdm:
            paths = tqdm(paths)
        
        for path in paths:
            text = self.__read_text_from_file(path)
            samples = self.__get_samples_from_text(text)
            data.extend(samples)
                
        return data
                
    def __get_samples_from_text(self, text):
        samples = []
        tokenized = self.tokenizer.encode(text)[1:-1]
        
        start_idx = -self.seq_length + self.padding[0]
        end_idx = len(tokenized) - self.seq_length - 1
        
        for idx in range(start_idx, end_idx):
            sequence = tokenized[max(idx, 0) : idx+self.seq_length]
            target = tokenized[idx+self.seq_length]
            samples.append((sequence, target))
            
        return samples

    def __add_random_padding(self, sample):
        sequence, target = sample
        sequence_len = min(random.randint(self.padding[0], self.padding[1]), self.seq_length)
        pad_sequence = pad(sequence[:sequence_len], self.seq_length, pad_token=self.tokenizer.pad_token_id)
        return pad_sequence, target

    def __read_text_from_file(self, path):
        with open(path, encoding='utf-8') as f:
            lines = f.readlines()
            lines = map(self.__preprocess_line, lines)
            lines = filter(lambda line: len(line) > self.min_line_length, lines)
            if self.remove_dialogs:
                lines = self.__remove_dialogs(lines)
            text = '\n'.join(lines)
            if self.remove_special_chars:
                text = re.sub(r'[^a-ząćęłńóśźż.,!? \n]', ' ', text, flags=re.IGNORECASE)
            return text

    def __remove_dialogs(self, lines):
        return filter(lambda line: not self.__is_dialog_line(line), lines)
    
    def __preprocess_line(self, line):
        line = line.strip()
        if self.lowercase:
            line = line.lower()
        return line
        
    @staticmethod
    def __is_dialog_line(line):
        return '—' in line or '–' in line or '-' in line or '„' in line or '"' in line

## Testing dataset

In [6]:
dataset = TextTrainDataset('../../data/training', tokenizer, seq_length=20, padding=(3, 70), lowercase=True, tqdm=True, cache_path='.cache/dataset', cache_ignore=True, remove_special_chars=True, min_line_length=25)

100%|██████████| 1036/1036 [00:37<00:00, 27.90it/s]


In [6]:
len(dataset)

21570081

In [7]:
train_dataloader = DataLoader(
    dataset=dataset,
    batch_size=2048,
    shuffle=True,
    num_workers=0
)

## Model creation

In [7]:
generator = LstmTextGenerator(
    # files
    train_dataset_path='../../data/training/',
    
    # architecture
    embedding_dim=300,
    lstm_layers=3,
    lstm_dropout=0.2,
    lstm_hidden_size=1024,
    dropout=0.2,
    bidirectional=True,
    
    # training
    lr=0.001,
    seq_length=20,
    padding=(3, 40),
    batch_size=512
)

In [8]:
generator = LstmTextGenerator.load_from_checkpoint('../../logs/version_21/checkpoints/epoch=43-step=161906.ckpt')

In [10]:
summary(
    generator,
    input_size=(512, 20),
    col_names=['input_size', 'output_size', 'num_params', 'params_percent'],
    dtypes=[torch.LongTensor],
    device='cpu'
)

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #                   Param %
LstmTextGenerator                        [512, 20]                 [512, 50560]              --                             --
├─Embedding: 1-1                         [512, 20]                 [512, 20, 300]            15,168,000                  8.43%
├─LSTM: 1-2                              [512, 20, 300]            [512, 20, 2048]           61,227,008                 34.02%
├─Dropout: 1-3                           [512, 20, 2048]           [512, 20, 2048]           --                             --
├─Linear: 1-4                            [512, 2048]               [512, 50560]              103,597,440                57.56%
Total params: 179,992,448
Trainable params: 179,992,448
Non-trainable params: 0
Total mult-adds (G): 687.77
Input size (MB): 0.08
Forward/backward pass size (MB): 399.44
Params size (MB): 719.97
Estimated Total Size (MB): 1119.49

## Training

In [9]:
logger = TensorBoardLogger(
    save_dir='../..',
    name='logs'
)

generate_callback = GenerateCallback(
    'Pewnego dnia czerwony kapturek szedł przez las z koszyczkiem jedzenia do swojej babci, która mieszkała w lesie. Śledził go jednak zły wilk, który chciał zjeść dziewczynkę.',
    temperatures=[0.01, 0.1, 0.2, 0.3, 0.5, 0.7],
    length=200,
    interval=1000
)

checkpoint_callback = ModelCheckpoint(
    save_last=True,
    every_n_train_steps=1000,
)

trainer = Trainer(
    accelerator='cuda',
    max_epochs=-1,
    enable_progress_bar=True,
    logger = logger,
    callbacks=[generate_callback, checkpoint_callback],
    gradient_clip_val=0.4,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [10]:
generator.hparams.lr = 0.0001

In [11]:
trainer.fit(generator, train_dataloaders=train_dataloader, ckpt_path='../../logs/version_21/checkpoints/epoch=43-step=161906.ckpt')

You are using a CUDA device ('NVIDIA GeForce RTX 3080 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at ../../logs/version_21/checkpoints/epoch=43-step=161906.ckpt
  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type             | Params
---------------------------------------------
0 | embed   | Embedding        | 15.2 M
1 | lstm    | LSTM             | 61.2 M
2 | dropout | Dropout          | 0     
3 | fc      | Linear           | 103 M 
4 | loss    | CrossEntropyLoss | 0     
---------------------------------------------
179 M     Trainable params
0         Non-trainable params
179 M     Total params
719.970   Total estimated model params size (M

Epoch 57:  56%|█████▌    | 1047/1876 [28:07<22:16,  1.61s/it, v_num=23]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


## Testing

In [12]:
generator.generate('Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem. Kapturek był koloru', temperature=0.3)

'Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem. Kapturek był koloru i wesoły i głodny. ale nie mógł go nigdzie znaleźć, ale nie mógł go znaleźć, bo mu wstyd było przyznać się publicznie do powrotu, a on, nie mogąc się doczekać listu od chwili, poszedł do paryża, aby go nie rozwiązy'

In [15]:
generator.generate('dawno, dawno temu, za siedmioma górami i', temperature=0.2)

'dawno, dawno temu, za siedmioma górami i siedmioma dolinami, za siedmioma jeziorami, między innymi, a siódmym a, w której, w okolicy, nie było końca, nie było mowy o żywych głębiach greckich, których nie znano jeszcze żadnej, które by można było nazwać w'