### Подготовка данных

In [1]:
import os

data_dir = 'prepared_data'

texts = []
for root, dirs, files in os.walk(data_dir):
    for file_name in files:
        file = open(os.path.join(root, file_name), 'r')
        texts.extend(file.readlines())

In [10]:
import torch
from tqdm.auto import tqdm
from razdel import tokenize

In [11]:
SOS_TOKEN = '[SOS]'
EOS_TOKEN = '[EOS]'
PAD_TOKEN = '[PAD]'

vocabulary = set([SOS_TOKEN, EOS_TOKEN, PAD_TOKEN])
tokenized_texts = list()

for text in tqdm(texts[:2000]):
    # Токенизируем текст
    tokenized_text = tokenize(text.lower())
    tokenized_text = [token.text for token in tokenized_text]
    tokenized_text = [SOS_TOKEN] + tokenized_text + [EOS_TOKEN]

    # Обновим словарь
    for token in tokenized_text:
        vocabulary.add(token)

    # Добавим токенизированный текст в датасет
    tokenized_texts.append(tokenized_text)

vocab_size = len(vocabulary)

100%|██████████| 845/845 [00:01<00:00, 551.87it/s]


In [13]:
from random import choice

print(f"Vocabulary size is {vocab_size}")
print(f"Tokenized example: {choice(tokenized_texts)}")

Vocabulary size is 24570
Tokenized example: ['[SOS]', 'ох', ',', 'лето', 'красное', '!', 'любил', 'бы', 'я', 'тебя', ',', 'когда', 'б', 'не', 'зной', ',', 'да', 'пыль', ',', 'да', 'комары', ',', 'да', 'мухи', '.', 'ты', ',', 'все', 'душевные', 'способности', 'губя', ',', 'нас', 'мучишь', ';', 'как', 'поля', ',', 'мы', 'страждем', 'от', 'засухи', ';', 'лишь', 'как', 'бы', 'напоить', ',', 'да', 'освежить', 'себя', '—', 'иной', 'в', 'нас', 'мысли', 'нет', ',', 'и', 'жаль', 'зимы', 'старухи', ',', 'и', ',', 'проводив', 'ее', 'блинами', 'и', 'вином', ',', 'поминки', 'ей', 'творим', 'мороженым', 'и', 'льдом', '.', '[EOS]']


In [14]:
id_to_token = list(vocabulary) # id_to_token[i] -> token_i
token_to_id = {token: id for id, token in enumerate(id_to_token)} # token_i -> i

In [15]:
from torch.utils.data import Dataset, DataLoader

class TextsForLM(Dataset):
    def __init__(self, texts):
        self.texts = list()

        for text in tqdm(texts):
            text_ids = [token_to_id[token] for token in text]

            self.texts.append(text_ids)

    def __getitem__(self, index):
        return self.texts[index]

    def __len__(self):
        return len(self.texts)

In [16]:
from sklearn.model_selection import train_test_split

train_texts, val_texts = train_test_split(tokenized_texts, test_size=0.1,
                                          random_state=42)

train_dataset = TextsForLM(train_texts)
val_dataset = TextsForLM(val_texts)

100%|██████████| 760/760 [00:00<00:00, 19310.56it/s]
100%|██████████| 85/85 [00:00<00:00, 27613.34it/s]


In [25]:
def collate_texts(batch):
    max_length = 0
    for text_ids in batch:
        max_length = max(max_length, len(text_ids))

    for i in range(len(batch)):
        batch[i] += [token_to_id[PAD_TOKEN]] * (max_length - len(batch[i]))

    return torch.LongTensor(batch)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                              collate_fn=collate_texts)

val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False,
                            collate_fn=collate_texts)

In [26]:
from pytorch_lightning import LightningModule
from torch import nn, optim

from typing import Union

class LMModel(LightningModule):
    def __init__(self, vocab_size, emb_dim=128, rnn_hidden_dim=128,
                 rnn_num_layers=2, RNN: Union[nn.RNN, nn.LSTM, nn.GRU] = nn.LSTM):
        super().__init__()
        
        self.embedding_layer = nn.Embedding(vocab_size, emb_dim)

        self.rnn = RNN(input_size=emb_dim, hidden_size=rnn_hidden_dim,
                       batch_first=True, num_layers=rnn_num_layers)

        self.output_layer = nn.Linear(rnn_hidden_dim, vocab_size)

    def forward(self, input_ids):
        embeddings = self.embedding_layer(input_ids)
        output, state = self.rnn(embeddings)
        logits = self.output_layer(output)

        return logits, state

    def training_step(self, batch, _):
        logits, state = self.forward(batch)

        batch_size, seq_len, vocab_size = logits.shape

        pred = logits[:, :-1, :].reshape(batch_size * (seq_len - 1), vocab_size)
        target = batch[:, 1:].reshape(batch_size * (seq_len - 1))

        loss_fn = nn.CrossEntropyLoss(ignore_index=token_to_id[PAD_TOKEN])

        loss = loss_fn(pred, target)

        return loss

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=3e-3)

        return optimizer

In [27]:
model = LMModel(vocab_size, RNN=nn.LSTM)

In [28]:
sample = next(iter(train_dataloader))

print(f"Sample shape is {sample.shape}")
print(f"Sample: {sample}")

Sample shape is torch.Size([32, 1810])
Sample: tensor([[17123, 17598, 14224,  ..., 22594, 22594, 22594],
        [17123, 21728,  9834,  ..., 22594, 22594, 22594],
        [17123, 17266,  5447,  ..., 22594, 22594, 22594],
        ...,
        [17123, 22347, 12973,  ..., 22594, 22594, 22594],
        [17123, 13353, 18554,  ..., 22594, 22594, 22594],
        [17123, 13516, 11078,  ..., 22594, 22594, 22594]])


In [29]:
from pytorch_lightning import Trainer

max_epochs = 20

In [30]:
torch.cuda.empty_cache()

In [31]:
trainer = Trainer(devices=1, accelerator="gpu", max_epochs=max_epochs, log_every_n_steps=1)
trainer.fit(model, train_dataloader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]

  | Name            | Type      | Params
----------------------------------------------
0 | embedding_layer | Embedding | 3.1 M 
1 | rnn             | LSTM      | 264 K 
2 | output_layer    | Linear    | 3.2 M 
----------------------------------------------
6.6 M     Trainable params
0         Non-trainable params
6.6 M     Total params
26.315    Total estimated model params size (MB)


Epoch 0:   0%|          | 0/24 [00:00<?, ?it/s] 

RuntimeError: CUDA out of memory. Tried to allocate 5.30 GiB (GPU 0; 10.76 GiB total capacity; 1.88 GiB already allocated; 5.20 GiB free; 1.89 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF