In [1]:
%load_ext autoreload

In [2]:
%autoreload

In [3]:
import torch
import torch.nn.functional as F
from torch import nn, optim
from tqdm import trange
from torch.utils.data import DataLoader
from lightning.pytorch import LightningModule, Trainer
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.callbacks import ModelCheckpoint
from transformers import XLMTokenizer, AutoTokenizer
from torchinfo import summary

from transformer import *
from callback import GenerateCallback

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Pretrained tokenizer handed to the training dataset below.
TOKENIZER_NAME = "allegro/herbert-klej-cased-tokenizer-v1"
tokenizer = XLMTokenizer.from_pretrained(TOKENIZER_NAME)

In [5]:
# encoded = tokenizer.encode("witaj świecie")
# print(encoded)
# decoded = tokenizer.decode(encoded)
# print(decoded)

In [6]:
import re
import glob
import random
import pickle
from pathlib import Path

import numpy as np
from torch.utils.data import Dataset
from tqdm import tqdm

from utils import pad


class TextTrainDataset(Dataset):
    """Next-token-prediction dataset built from a directory tree of ``.txt`` files.

    Every ``*.txt`` file found (recursively) under ``dataset_path`` is
    preprocessed line by line, tokenized, and cut into overlapping windows of at
    most ``seq_length + 1`` token ids. Indexing truncates a window to a random
    length, pads it back with ``utils.pad`` and returns ``(src, tgt)`` where
    ``tgt`` is ``src`` shifted by one token.

    Args:
        dataset_path: Root directory searched recursively for ``*.txt`` files.
        tokenizer: Object exposing ``encode(text)`` and ``pad_token_id``. The
            first and last ids returned by ``encode`` are dropped (presumably
            begin/end special tokens -- confirm for the tokenizer in use).
        seq_length: Window size; each stored sample holds up to
            ``seq_length + 1`` ids so that ``src`` and ``tgt`` can overlap.
        padding: ``(low, high)`` bounds of the random prefix length drawn on
            every ``__getitem__`` call (capped at ``seq_length + 1``). ``low``
            also determines the shortest prefix emitted at the start of a text.
        remove_dialogs: Drop lines containing a dash or quotation mark.
        remove_special_chars: Replace every character outside Polish letters,
            ``.,!?``, space and newline with a space.
        lowercase: Lowercase each line before tokenizing.
        tqdm: Show a progress bar while reading files.
        cache_path: Pickle file used to store/load the tokenized samples.
            ``None`` disables caching entirely.
        cache_ignore: Rebuild the samples even if the cache file exists (the
            cache file is then overwritten).
        min_line_length: Lines whose stripped length is not strictly greater
            than this value are discarded.
    """

    def __init__(self, dataset_path, tokenizer, seq_length, padding=(3, 30), remove_dialogs=True, remove_special_chars=False, lowercase=False, tqdm=False, cache_path=None, cache_ignore=False, min_line_length=0):
        self.dataset_path = dataset_path
        # Misspelled historical attribute name, kept so code that reads it keeps working.
        self.samplesset_path = dataset_path
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.padding = padding
        self.remove_dialogs = remove_dialogs
        self.remove_special_chars = remove_special_chars
        self.lowercase = lowercase
        self.tqdm = tqdm
        self.cache_path = cache_path
        self.cache_ignore = cache_ignore
        self.min_line_length = min_line_length

        self.samples = self.__get_samples()

    def __len__(self):
        """Return the number of stored token windows."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(src, tgt)`` int64 arrays for sample ``idx``; ``tgt`` is ``src`` shifted by one."""
        sample = self.__add_random_padding(self.samples[idx])
        src, tgt = sample[:-1], sample[1:]
        return np.array(src, dtype=np.int64), np.array(tgt, dtype=np.int64)

    def __get_samples(self):
        """Load samples from the cache when allowed, otherwise build them (and cache if configured)."""
        caching_enabled = self.cache_path is not None
        if caching_enabled and not self.cache_ignore and Path(self.cache_path).exists():
            return self.__load_samples_from_cache()

        samples = self.__create_samples()
        # Only touch the filesystem when a cache location was actually given;
        # previously the default cache_path=None crashed here with a TypeError.
        if caching_enabled:
            self.__save_samples_to_cache(samples)
        return samples

    def __load_samples_from_cache(self):
        """Read the pickled sample list from ``cache_path``."""
        # SECURITY: pickle.load executes arbitrary code from the file; only point
        # cache_path at files this class wrote itself.
        with open(self.cache_path, 'rb') as f:
            return pickle.load(f)

    def __save_samples_to_cache(self, samples):
        """Pickle ``samples`` to ``cache_path``, creating parent directories as needed."""
        Path(self.cache_path).parent.mkdir(parents=True, exist_ok=True)
        with open(self.cache_path, 'wb') as f:
            pickle.dump(samples, f)

    def __create_samples(self):
        """Tokenize every text file under the dataset root and collect all windows."""
        paths = glob.glob(f'{self.dataset_path}/**/*.txt', recursive=True)
        # File order is randomized with the global `random` state (unseeded here).
        random.shuffle(paths)

        if self.tqdm:
            # `tqdm` here is the module-level progress-bar function, not the flag.
            paths = tqdm(paths)

        data = []
        for path in paths:
            text = self.__read_text_from_file(path)
            data.extend(self.__get_samples_from_text(text))

        return data

    def __get_samples_from_text(self, text):
        """Slice the tokenized text into overlapping windows of up to ``seq_length + 1`` ids."""
        tokenized = self.tokenizer.encode(text)[1:-1]

        # Negative start indices produce growing prefixes of the text, the
        # shortest holding padding[0] + 1 tokens.
        start_idx = -self.seq_length + self.padding[0]
        # NOTE(review): this exclusive bound stops one window short of the end,
        # so the final token of a text never appears in a sample -- confirm intended.
        end_idx = len(tokenized) - self.seq_length - 1

        return [
            tokenized[max(idx, 0): idx + self.seq_length + 1]
            for idx in range(start_idx, end_idx)
        ]

    def __add_random_padding(self, sequence):
        """Keep a random-length prefix of ``sequence`` and pad it to ``seq_length + 1`` via ``utils.pad``."""
        sequence_len = min(random.randint(self.padding[0], self.padding[1]), self.seq_length + 1)
        return pad(sequence[:sequence_len], self.seq_length + 1, pad_token=self.tokenizer.pad_token_id)

    def __read_text_from_file(self, path):
        """Read one UTF-8 file and return its filtered, newline-joined text."""
        with open(path, encoding='utf-8') as f:
            lines = f.readlines()

        lines = map(self.__preprocess_line, lines)
        lines = filter(lambda line: len(line) > self.min_line_length, lines)
        if self.remove_dialogs:
            lines = self.__remove_dialogs(lines)
        text = '\n'.join(lines)
        if self.remove_special_chars:
            text = re.sub(r'[^a-ząćęłńóśźż.,!? \n]', ' ', text, flags=re.IGNORECASE)
        return text

    def __remove_dialogs(self, lines):
        """Lazily drop every line classified as dialog."""
        return filter(lambda line: not self.__is_dialog_line(line), lines)

    def __preprocess_line(self, line):
        """Strip surrounding whitespace and optionally lowercase the line."""
        line = line.strip()
        if self.lowercase:
            line = line.lower()
        return line

    @staticmethod
    def __is_dialog_line(line):
        """Return True if the line contains any dash or quotation-mark character."""
        return '—' in line or '–' in line or '-' in line or '„' in line or '"' in line

# Dataset

In [7]:
# Window size in tokens; passed as `seq_length` to both the dataset and the model.
SEQUENCE_LENGTH = 100

In [8]:
# Keyword options for the training corpus. Tokenized samples are cached under
# `.cache/dataset`; uncomment `cache_ignore` to force a rebuild after changing
# any preprocessing option.
dataset_options = dict(
    seq_length=SEQUENCE_LENGTH,
    padding=(2, 50_000),
    lowercase=False,
    tqdm=True,
    cache_path='.cache/dataset',
    remove_dialogs=False,
    remove_special_chars=False,
    min_line_length=25,
    # cache_ignore=True,
)
dataset = TextTrainDataset('../../data/training', tokenizer, **dataset_options)

In [9]:
# Total number of token windows available for training.
sample_count = len(dataset)
print(sample_count)

8934980


In [10]:
# for x in tqdm(dataset):
#     pass

In [11]:
# Sanity check: decode one random sample; `tgt` should read as `src` shifted by one token.
sample_idx = random.randrange(len(dataset))
src, tgt = dataset[sample_idx]
print(len(src), len(tgt))
for token_ids in (src, tgt):
    print(tokenizer.decode(token_ids))

100 100
a krew odpłynęła mu z twarzy. - Przyszło mi do głowy, że może być spokrewniona z Van Gouldem. - Nie daj Boże! - powiedział ledwie słyszalnym szeptem Czarny Korsarz. - To niemożliwe.Franciszek l <unk>Olonnais zatrzymał się pod gąszczem maot, wielkoliściastych drzew przypominających bawełniane krzewy, i uważnie przyjrzał się przyjacielowi. - Dlaczego tak na mnie patrzysz? - zapytał Czarny Korsarz. - Pomyślałem o twojej fla
krew odpłynęła mu z twarzy. - Przyszło mi do głowy, że może być spokrewniona z Van Gouldem. - Nie daj Boże! - powiedział ledwie słyszalnym szeptem Czarny Korsarz. - To niemożliwe.Franciszek l <unk>Olonnais zatrzymał się pod gąszczem maot, wielkoliściastych drzew przypominających bawełniane krzewy, i uważnie przyjrzał się przyjacielowi. - Dlaczego tak na mnie patrzysz? - zapytał Czarny Korsarz. - Pomyślałem o twojej flaman


In [12]:
# cce = nn.CrossEntropyLoss()
# pred = torch.randn((32, 50_000, 100))
# target = torch.randint(0, 50_000-1, (32, 99))
# loss = cce(pred, target)

# Transformer

In [13]:
from torch.optim import Adam
from lightning.pytorch import LightningModule


class TransformerLightning(LightningModule):
    """Lightning wrapper that trains ``EncoderOnlyTransformer`` for next-token prediction.

    Args:
        seq_length: Number of token positions the model is built for; also the
            context window used during generation.
        lr: Learning rate read from ``self.hparams.lr`` when the optimizer is
            configured (so it may be overridden on ``hparams`` before ``fit``).
    """

    def __init__(self, seq_length, lr=0.001):
        super().__init__()
        self.save_hyperparameters()

        self.tokenizer = XLMTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")

        self.transformer = EncoderOnlyTransformer(
            src_vocab_size=len(self.tokenizer),
            d_model=512,
            num_heads=8,
            num_layers=6,
            d_ff=2048,
            max_seq_length=self.hparams.seq_length,
            dropout=0.1,
            mask_token=self.tokenizer.pad_token_id,
        )

        # Padding positions in the target do not contribute to the loss.
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id, label_smoothing=0.2)

    def forward(self, x):
        """Run the wrapped transformer on a batch of token ids."""
        return self.transformer(x)

    def training_step(self, batch, batch_no):
        """Compute and log the cross-entropy loss over every position of the batch."""
        src_data, tgt_data = batch
        # CrossEntropyLoss wants the class dimension second: (batch, vocab, seq).
        output = self(src_data).transpose(2, 1)
        loss = self.criterion(output, tgt_data)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return the Adam optimizer used for training."""
        # The original built a default Adam first and immediately discarded it.
        return Adam(self.parameters(), lr=self.hparams.lr, betas=(0.9, 0.98), eps=1e-9)

    def generate(self, prompt, length=50, temperature=0.5):
        """Continue ``prompt`` by ``length`` sampled tokens and return the decoded text.

        Args:
            prompt: Text to continue.
            length: Number of tokens to sample.
            temperature: Sampling temperature; values ``<= 0`` select the most
                likely token at every step.
        """
        # Drop the first and last ids added by the tokenizer around the prompt.
        src_ids = self.tokenizer.encode(prompt)[1:-1]
        generated_ids = self.__generate_ids(src_ids, length, temperature)
        return self.tokenizer.decode(generated_ids, skip_special_tokens=True)

    def __generate_ids(self, src_ids, length=200, temperature=0.5):
        """Autoregressively append ``length`` sampled ids to a copy of ``src_ids``."""
        ids = list(src_ids)  # do not mutate the caller's list
        seq_length = self.hparams.seq_length
        pad_id = self.tokenizer.pad_token_id

        # Remember the current mode so generation does not silently switch an
        # evaluation-mode model back to training.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(length):
                    input_ids = pad(ids[-seq_length:], seq_length, pad_id)
                    input_tensor = torch.unsqueeze(torch.tensor(input_ids, device=self.device), dim=0)

                    output = self(input_tensor)

                    # Read the prediction at the last real token. When the
                    # context is shorter than seq_length the final slot may be
                    # padding (depends on which side `pad` fills), so locate the
                    # last non-pad position instead of assuming index -1.
                    non_pad = torch.nonzero(input_tensor[0] != pad_id).flatten()
                    last_pos = int(non_pad[-1]) if non_pad.numel() > 0 else -1

                    ids.append(self.__sample_word_idx(output[0][last_pos], temperature))
        finally:
            self.train(was_training)

        return ids

    @staticmethod
    def __sample_word_idx(outputs, temperature=1.0):
        """Sample one token index from a 1-D logits vector at the given temperature."""
        if temperature <= 0:
            # Greedy decoding; avoids dividing by zero.
            return int(torch.argmax(outputs, dim=-1).item())

        # Work in float32 so the temperature scaling is not done in half precision.
        log_probs = torch.log_softmax(outputs.float(), dim=-1)
        adjusted_probs = F.softmax(log_probs / temperature, dim=-1)
        return torch.multinomial(adjusted_probs, num_samples=1).item()

# Training

In [14]:
# Shuffled mini-batches; num_workers=0 keeps data loading in the main process.
BATCH_SIZE = 100
train_dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

In [15]:
# Model sized for the same window length the dataset produces.
transformer = TransformerLightning(SEQUENCE_LENGTH)

In [16]:
# Per-layer shapes and parameter counts for a dummy (64, SEQUENCE_LENGTH) batch of token ids.
summary_columns = ['input_size', 'output_size', 'num_params', 'params_percent']
summary(
    transformer,
    input_size=(64, SEQUENCE_LENGTH),
    col_names=summary_columns,
    dtypes=[torch.LongTensor],
    device='cpu',
)

  action_fn=lambda data: sys.getsizeof(data.storage()),
  return super().__sizeof__() + self.nbytes()


Layer (type:depth-idx)                             Input Shape               Output Shape              Param #                   Param %
TransformerLightning                               [64, 100]                 [64, 100, 50560]          --                             --
├─EncoderOnlyTransformer: 1-1                      [64, 100]                 [64, 100, 50560]          --                             --
│    └─Embedding: 2-1                              [64, 100]                 [64, 100, 512]            25,886,720                 36.60%
│    └─PositionalEncoding: 2-2                     [64, 100, 512]            [64, 100, 512]            --                             --
│    └─Dropout: 2-3                                [64, 100, 512]            [64, 100, 512]            --                             --
│    └─ModuleList: 2-4                             --                        --                        --                             --
│    │    └─EncoderLayer: 3-1            

In [17]:
# TensorBoard event files are written under ./logs.
logger = TensorBoardLogger(save_dir='.', name='logs')

# Save a checkpoint every 1000 training steps and always keep the latest one.
checkpoint_callback = ModelCheckpoint(every_n_train_steps=1000, save_last=True)

# Prompt and settings handed to GenerateCallback (presumably it samples text
# from the prompt during training -- see callback.py for the exact behavior).
GENERATION_PROMPT = 'Pewnego dnia czerwony kapturek szedł przez las z koszyczkiem jedzenia do swojej babci, która mieszkała w lesie. Śledził go jednak zły wilk, który chciał zjeść dziewczynkę. Dziewczynka szła wesoło przez las i niczego się nie spodziewała, kiedy'
GENERATION_TEMPERATURES = [0.01, 0.1, 0.2, 0.3, 0.5, 0.7]
generate_callback = GenerateCallback(
    GENERATION_PROMPT,
    temperatures=GENERATION_TEMPERATURES,
    length=200,
    interval=100,
)

# Mixed-precision GPU training with no epoch limit (max_epochs=-1); stop it manually.
trainer = Trainer(
    accelerator='cuda',
    precision='16-mixed',
    max_epochs=-1,
    enable_progress_bar=True,
    logger=logger,
    log_every_n_steps=5,
    callbacks=[generate_callback, checkpoint_callback],
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [18]:
# Override the constructor's default learning rate; the optimizer reads
# `hparams.lr` when it is configured, so this must run before `trainer.fit`.
LEARNING_RATE = 1e-4
transformer.hparams.lr = LEARNING_RATE

In [19]:
# Start training; runs until interrupted because the trainer has no epoch limit.
trainer.fit(model=transformer, train_dataloaders=train_dataloader)

You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                   | Params
-------------------------------------------------------
0 | transformer | EncoderOnlyTransformer | 70.7 M
1 | criterion   | CrossEntropyLoss       | 0     
-------------------------------------------------------
70.7 M    Trainable params
0         Non-trainable params
70.7 M    Total params
282.953   Total estimated model params size (MB)
  rank_zero_warn(


Epoch 0:   1%|▏         | 1256/89350 [09:03<10:35:31,  2.31it/s, v_num=62]

In [None]:
# Qualitative check of the current model: continue a short prompt at low temperature.
sample_prompt = 'Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem. Kapturek był koloru'
transformer.generate(sample_prompt, temperature=0.2)

'Pewnego słonecznego dnia czerwony kapturek szedł do swojej babci z koszyczkiem. Kapturek był koloru,,,,,,,,,,,,,,,,,,,,,,,,,.,,,,,,,,,,,,,,,,,,,,,,,,'