In [1]:
from tokenizer import Tokenizer
from category_encoders import OneHotEncoder

import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

In [2]:
# Project-local tokenizer; later cells read `tk.data` and call `tk.get_tokens`.
tk = Tokenizer()
# One-hot encoder from category_encoders; it is fitted on the token vocabulary in the next cell.
ohe = OneHotEncoder()

In [3]:
# Fit the encoder on the distinct tokens of the corpus.
# The vocabulary is sorted before fitting: iterating a bare set of strings can
# yield a different order in every interpreter session, and the encoder
# presumably derives its column order from the order it sees the categories in
# (TODO confirm), so an unsorted set makes the one-hot layout non-reproducible.
# assumes tokens are mutually comparable (e.g. all strings) — verify against Tokenizer.
ohe.fit(sorted(set(tk.get_tokens(tk.data))))

In [54]:
# Prefer the first CUDA device and fall back to the CPU when none is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Legacy tensor-type alias consumed by the `.type(dtype)` calls in later cells.
# It has to follow `device`: `torch.cuda.FloatTensor` cannot be used on a
# machine without CUDA, which would defeat the CPU fallback chosen above.
dtype = torch.cuda.FloatTensor if device.type == "cuda" else torch.FloatTensor

Define model

In [55]:
class LSTMWindow(nn.Module):
    """Single-layer LSTM that maps a window of token vectors to next-token logits.

    The final hidden state of the LSTM is projected back to ``dict_size``
    logits by a linear layer. The same hidden state doubles as an embedding
    of the window (see ``emb``).

    Args:
        dict_size: Width of every input vector and of the output logits.
        hidden_size: Size of the LSTM hidden state. Defaults to 50, the value
            that used to be hard-coded.
    """

    def __init__(self, dict_size, hidden_size=50):
        super(LSTMWindow, self).__init__()
        self.dict_size = dict_size
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(self.dict_size,
                            self.hidden_size,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=False)
        self.lin_layer = nn.Linear(self.hidden_size, self.dict_size)

    def _encode(self, inputs):
        """Run the LSTM and return its final hidden state as (batch, hidden_size)."""
        batch_size = inputs.size(0)
        # new_zeros copies dtype and device from `inputs`, so the model no
        # longer relies on a notebook-level CUDA-only tensor type.
        h_0 = inputs.new_zeros(1, batch_size, self.hidden_size)
        c_0 = inputs.new_zeros(1, batch_size, self.hidden_size)
        _, (h_out, _) = self.lstm(inputs, (h_0, c_0))
        # h_out is (num_layers=1, batch, hidden); collapse the layer axis.
        return h_out.view(-1, self.hidden_size)

    def forward(self, inputs):
        """Return logits of shape (batch, dict_size) for a batch-first input.

        The hidden state is also kept on ``self.h_out`` for callers that read
        it after a forward pass.
        """
        self.h_out = self._encode(inputs)
        return self.lin_layer(self.h_out)

    def emb(self, inputs):
        """Return the final hidden state for `inputs` without running the linear head."""
        self.h_out = self._encode(inputs)
        return self.h_out

In [5]:
class LSTMWordPredictor(nn.Module):
    """Embedding -> LSTM -> linear projection that scores every vocabulary entry."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # Sub-modules are registered in the order embedding, lstm, fc.
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return scores for token indices `x`, with the vocabulary on the last axis."""
        # The LSTM returns (outputs, state); only the per-step outputs are used.
        sequence_features = self.lstm(self.embedding(x))[0]
        return self.fc(sequence_features)

In [58]:
import torch
from torch.utils.data import Dataset, DataLoader

# Token sequence that the windowed dataset defined below slices from.
corpus = tk.get_tokens(tk.data)

class WordDataset(Dataset):
    """Sliding-window dataset: `window_size` consecutive tokens -> the token that follows.

    Both elements of a sample are encoded with the notebook-level `ohe`
    encoder and cast with the notebook-level `dtype`.

    Args:
        corpus: Indexable, sliceable sequence of tokens.
        window_size: Number of context tokens per sample.
    """

    def __init__(self, corpus, window_size):
        self.corpus = corpus
        self.window_size = window_size

    def __len__(self):
        # Every sample needs `window_size` context tokens plus one target
        # token, so the last valid start index is len(corpus) - window_size - 1.
        # Reporting len(corpus) made the final indices overrun the corpus.
        return max(0, len(self.corpus) - self.window_size)

    def __getitem__(self, idx):
        """Return `(window, target)` float32 tensors for the sample starting at `idx`.

        Raises:
            IndexError: If `idx` does not address a complete window plus target.
        """
        n_samples = len(self)
        if idx < 0:
            # Support negative indices the way built-in sequences do.
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"sample index out of range for {n_samples} samples")

        target_pos = idx + self.window_size
        # transform() output has one row per token: the window yields
        # `window_size` rows, the single-element target list yields one row.
        window = ohe.transform(self.corpus[idx:target_pos]).values
        target = ohe.transform([self.corpus[target_pos]]).values
        return (torch.tensor(window, dtype=torch.float32).type(dtype),
                torch.tensor(target, dtype=torch.float32).type(dtype))
 

# Number of context tokens in every sample produced by the dataset.
window_size = 10
dataset = WordDataset(corpus, window_size=window_size)
# shuffle=False: batches of 200 samples are drawn in corpus order.
dataloader = DataLoader(dataset, batch_size=200, shuffle=False)

In [36]:
# Name of CUDA device 0; guarded so the cell also runs on a machine without CUDA
# instead of raising and breaking a full "Run All".
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

'NVIDIA GeForce GTX 1080 Ti'

In [37]:
# Sanity check: reports whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

Train model

In [59]:
import torch.nn as nn
import pandas as pd

# NOTE(review): `data` is not read anywhere in this cell; the binding is kept
# in case another cell relies on it.
data = tk.get_tokens(tk.data)

# Create the model
vocab_size = len(ohe.get_feature_names_out())
model = LSTMWindow(vocab_size)
# Follow the notebook-level `device` instead of calling .cuda() unconditionally,
# so the CPU fallback chosen there is honoured.
model.to(device)
model.train()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5

for epoch in range(num_epochs):
    epoch_loss = 0.0
    num_batches = 0
    for (batch_inputs, batch_targets) in dataloader:
        batch_inputs = batch_inputs.to(device)
        # Drop the singleton axis 1 of the targets (presumably (batch, 1, vocab)
        # one-hot rows — confirm against WordDataset) so they line up with the logits.
        batch_targets = batch_targets.squeeze(1).to(device)
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()
        print(batch_loss)
        epoch_loss += batch_loss
        num_batches += 1
    # Report the mean over the epoch rather than whichever batch came last;
    # this also avoids a NameError on `loss` when the loader yields nothing.
    mean_loss = epoch_loss / num_batches if num_batches else float("nan")
    print(f'Epoch [{epoch + 1} / {num_epochs}], Loss: {mean_loss:.4f}')


9.174115180969238
9.169082641601562
9.17916202545166
9.163091659545898
9.157183647155762
9.144094467163086
9.152554512023926
9.15799331665039
9.13881778717041
9.134159088134766
9.114542961120605
9.121670722961426
9.10962963104248
9.113341331481934
9.072896957397461
9.069572448730469
9.055093765258789
9.075874328613281
9.089215278625488
9.10281753540039
9.050600051879883
9.034070014953613
9.03192138671875
9.017333030700684
8.965828895568848
8.946107864379883
8.95694637298584
8.894083023071289
8.895743370056152
8.716238021850586
8.785280227661133
8.726698875427246
8.652751922607422
8.548124313354492
8.450776100158691
8.501008033752441
8.302197456359863
8.179525375366211
8.101523399353027
7.622293472290039
7.905202388763428
7.720291614532471


KeyboardInterrupt: 

In [None]:

# Prediction on new data
# NOTE(review): `test_data` and `idx_to_word` are not defined in the visible
# part of this notebook; they must be provided before this cell can run, and
# `test_data` must already have the layout the trained model expects.
model.eval()
# Put the inputs on whatever device the model's parameters live on.
inputs = torch.tensor(test_data).unsqueeze(1).to(next(model.parameters()).device)
with torch.no_grad():
    # The vocabulary is the last axis of the output whatever its rank;
    # dim=2 is out of range for the 2-D logits that LSTMWindow returns.
    predicted = model(inputs).argmax(dim=-1)
# flatten() yields scalar indices whether `predicted` is 1-D or 2-D.
predicted_words = [idx_to_word[idx.item()] for idx in predicted.flatten()]
print("Predicted words:", predicted_words)


In [None]:
# Scratch demo of the two target formats nn.CrossEntropyLoss accepts.
# The names `ce_loss` and `logits` are used so the cell neither shadows the
# builtin `input` nor rebinds `loss`, which the training cell uses for the
# loss tensor.

# Example of target with class indices
ce_loss = nn.CrossEntropyLoss()
logits = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = ce_loss(logits, target)
output.backward()
# Example of target with class probabilities
logits = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5).softmax(dim=1)
output = ce_loss(logits, target)
output.backward()