# Двунаправленная LSTM

Импортируем необходимые зависимости

In [59]:
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

Загрузим текст для обучения из txt файла

In [70]:
with open('american_psycho.txt', 'r', encoding='windows-1251') as file:
    text = file.read()

text = re.sub(r'[^a-zA-Z\s]', '', text)

In [71]:
text

'Ive been a big Genesis fan ever since the release of their  albumDukeBefore that I didnt really understand any of their work though on their last album of the s the conceptladenAnd Then There Were Threea reference to band member Peter Gabriel who left the group to start a lame solo career I did enjoy the lovely Follow You Follow Me Otherwise all the albums beforeDukeseemed too artsy too intelleotual It wasDukeAtlantic  where Phil Collins presence became more apparent and the music got more modern the drum machine became more prevalent and the lyrics started getting less mystical and more specific maybe because of Peter Gabriels departure and complex ambiguous studies of loss became instead smashing firstrate pop songs that I gratefully embraced The songs themselves seemed arranged more around Collins drumming than Mike Rutherfords bass lines or Tony Banks keyboard riffs A classic example of this is Misunderstandingwhich not only was the groups first big hit of the eighties but also se

Создадим токенизатор и словарь слов

In [72]:
tokens = word_tokenize(text)
print(len(tokens))

# Создание словаря
word_dict = {word: idx + 1 for idx, word in enumerate(set(tokens))}
reverse_word_dict = {v: k for k, v in word_dict.items()}

sequences = [word_dict[word] for word in tokens]
sequence_length = 50
batch_size = 32
inputs = []
targets = []

for i in range(len(sequences) - sequence_length):
    inputs.append(sequences[i:i + sequence_length])
    targets.append(sequences[i + sequence_length])

print(f'Length of inputs: {len(inputs)}')
print(f'Length of targets: {len(targets)}')
# Преобразование списков в тензоры
inputs_tensor = torch.tensor(inputs, dtype=torch.long)
targets_tensor = torch.tensor(targets, dtype=torch.long)

# Создание TensorDataset и DataLoader
dataset = TensorDataset(inputs_tensor, targets_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# # Определение собственного Dataset
# class TextDataset(Dataset):
#     def __init__(self, inputs, targets):
#         self.inputs = inputs
#         self.targets = targets

#     def __len__(self):
#         return len(self.inputs)

#     def __getitem__(self, idx):
#         return torch.tensor(self.inputs[idx], dtype=torch.long), torch.tensor(self.targets[idx], dtype=torch.long)


315
Length of inputs: 265
Length of targets: 265


Создадим обучающие примеры

Создадим и обучим модель

In [73]:
class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_units, output_dim):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_units, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_units * 2, output_dim)  # Умножаем на 2, так как LSTM двунаправленный

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        out = lstm_out[:, -1, :]  # Используем выход последнего временного шага
        out = self.fc(out)
        return out

In [96]:
model = BiLSTMModel(vocab_size=len(word_dict) + 1, embedding_dim=embedding_dim, hidden_units=hidden_units, output_dim=len(word_dict) + 1)

# Функция потерь и оптимизатор
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Создание DataLoader
# dataset = TextDataset(sequences, targets)
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [75]:
len(dataloader)

9

In [100]:
def train_model(model, dataloader, criterion, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        correct = 0
        total = 0

        for batch_inputs, batch_targets in dataloader:
            optimizer.zero_grad()
            outputs = model(batch_inputs)
            loss = criterion(outputs, batch_targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Подсчет точности
            _, predicted = torch.max(outputs.data, 1)
            total += batch_targets.size(0)
            correct += (predicted == batch_targets).sum().item()

        avg_loss = total_loss / len(dataloader)
        accuracy = correct / total
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

# Запуск обучения
train_model(model, dataloader, criterion, optimizer, num_epochs)

Epoch [1/10], Loss: 3.1059, Accuracy: 0.3057
Epoch [2/10], Loss: 2.8098, Accuracy: 0.4151
Epoch [3/10], Loss: 2.5258, Accuracy: 0.5434
Epoch [4/10], Loss: 2.2095, Accuracy: 0.7358
Epoch [5/10], Loss: 1.9266, Accuracy: 0.8604
Epoch [6/10], Loss: 1.7079, Accuracy: 0.8981
Epoch [7/10], Loss: 1.4358, Accuracy: 0.9547
Epoch [8/10], Loss: 1.2146, Accuracy: 0.9585
Epoch [9/10], Loss: 1.0381, Accuracy: 0.9774
Epoch [10/10], Loss: 0.8742, Accuracy: 0.9887


Попробуем сгенерировать текст

In [102]:
import torch
from nltk.tokenize import word_tokenize

def generate_text(model, start_text, num_words, word_dict, reverse_word_dict, sequence_length):
    model.eval()  # Устанавливаем модель в режим оценки

    # Преобразование начального текста в числовую последовательность
    tokens = word_tokenize(start_text.lower())  # Приведение текста к нижнему регистру
    token_ids = [word_dict.get(token, 0) for token in tokens]  # Используем 0 для неизвестных слов

    # Убедимся, что у нас есть достаточно токенов для модели
    if len(token_ids) < sequence_length:
        token_ids = [0] * (sequence_length - len(token_ids)) + token_ids

    # Перенос на устройство (CPU или GPU)
    token_ids = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0) # Добавляем размерность батча

    generated_tokens = []

    with torch.no_grad():
        for _ in range(num_words):
            # Получаем предсказания модели
            outputs = model(token_ids)
            _, predicted = torch.max(outputs, dim=1)

            # Добавляем предсказанный токен к последовательности
            next_token_id = predicted.item()
            generated_tokens.append(next_token_id)

            # Обновляем последовательность для следующего шага
            token_ids = torch.cat([token_ids[:, 1:], torch.tensor([[next_token_id]], dtype=torch.long)], dim=1)

    # Преобразование числовых токенов обратно в текст
    generated_text = ' '.join(reverse_word_dict.get(token_id, '<unk>') for token_id in generated_tokens)
    return generated_text




In [103]:
# Пример использования функции
start_text = "I am"
num_words = 100  # Количество слов для генерации
generated_text = generate_text(model, start_text, num_words, word_dict, reverse_word_dict, sequence_length)
print(start_text, generated_text)

I am by Tony Banks on the negative effects of television On the other hand Heathaze is a song I just dont understand while Please Dont Ask is a touching love song written to a separated wife who regains custody of the couples child Has the negative aspect of divorce ever been rendered in more intimate terms by a rock n roll group I dont think so Duke Travels and Dukes End might mean something but since the lyrics arent printed its hard to tell what Collins is singing about though thereiscomplex gorgeous piano work by Tony Banks on the latter track
