In [1]:
import torch
from torch import nn
from tqdm import tqdm
import numpy as np
import time
import random

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

### Задание 1

#### 1. Генерация данных

In [3]:
# Функция для генерации новой числовой последовательности
def generate_seq(number):
    number = number
    new_number = ''
    for i, d in enumerate(number):
        if i == 0:
            new_number += d
        else:
           new_d = int(d) + int(number[i-1])
           if new_d >= 10:
                new_d -= 10
                new_number += str(new_d)
           else:
               new_number += str(new_d)
    return new_number

In [4]:
# Датасет для обучения из 200 тыс последовательностей длинной 50 символов. X - исходное число, Y - новое число
X = []
for i in tqdm(range(2*10**5)):
    number = ''
    while True:
        number += str(random.randint(0,9))
        if len(number) == 50:
            X.append(number)
            break

Y = [(generate_seq(i)) for i in X]

100%|██████████| 200000/200000 [00:09<00:00, 20879.65it/s]


In [5]:
# Разбиваем последовательности на символы
X = [[c for c in seq] for seq in X]
Y = [[c for c in seq] for seq in Y]

print(len(X), len(Y))
print(len(X[0]), len(Y[0]))

200000 200000
50 50


In [6]:
# Словарь для символов
CHAR_TO_INDEX = {str(c): i for i, c in enumerate(range(0, 10))}
CHAR_TO_INDEX

{'0': 0,
 '1': 1,
 '2': 2,
 '3': 3,
 '4': 4,
 '5': 5,
 '6': 6,
 '7': 7,
 '8': 8,
 '9': 9}

In [7]:
# Кодирование символов
max_len = 50 # длинна последовательности

X_vec = torch.zeros((len(X), max_len), dtype = torch.int64)
for i in tqdm(range(len(X))):
    for j, w in enumerate(X[i]):
        X_vec[i, j] = CHAR_TO_INDEX.get(w)

Y_vec = torch.zeros((len(Y), max_len), dtype = torch.int64)
for i in  tqdm(range(len(Y))):
    for j, w in enumerate(Y[i]):
        Y_vec[i, j] = CHAR_TO_INDEX.get(w)

100%|██████████| 200000/200000 [01:24<00:00, 2378.25it/s]
100%|██████████| 200000/200000 [01:24<00:00, 2371.95it/s]


In [8]:
print(X_vec.shape, Y_vec.shape)
X_vec[0], Y_vec[0]

torch.Size([200000, 50]) torch.Size([200000, 50])


(tensor([7, 8, 9, 9, 3, 0, 7, 4, 5, 2, 6, 3, 6, 2, 7, 5, 7, 8, 0, 6, 8, 1, 5, 2,
         5, 7, 5, 6, 3, 0, 0, 5, 6, 2, 6, 2, 5, 0, 5, 7, 2, 9, 9, 4, 8, 9, 7, 8,
         8, 3]),
 tensor([7, 5, 7, 8, 2, 3, 7, 1, 9, 7, 8, 9, 9, 8, 9, 2, 2, 5, 8, 6, 4, 9, 6, 7,
         7, 2, 2, 1, 9, 3, 0, 5, 1, 8, 8, 8, 7, 5, 5, 2, 9, 1, 8, 3, 2, 7, 6, 5,
         6, 1]))

#### 2. Создание и обучение модели

In [14]:
batch_size = 100
loss = nn.CrossEntropyLoss()

In [20]:
# Функция для обучения
def model_train(model, num_epochs, optimizer):
    for epoch in range(num_epochs):
        start = time.time()
        train_loss = 0
        train_iters = 0
        
        model.train()
        for i in range(int(len(X_vec) / batch_size)):
            X_batch = X_vec[i * batch_size:(i + 1) * batch_size]
            Y_batch = Y_vec[i * 100:(i + 1) * batch_size].flatten()
            
            X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
            
            optimizer.zero_grad()
            y_pred = model.forward(X_batch)
            y_pred = y_pred.view(-1, len(CHAR_TO_INDEX))
            l = loss(y_pred, Y_batch)
            l.backward()
            optimizer.step()
            
            train_loss += l.item()
            train_iters += 1
    
        print(f'ep: {epoch}, loss: {train_loss/train_iters:.4f}, time {time.time() - start:.1f} sec')

##### 2.1. RNN

In [34]:
# Задаем сеть
class RNN(torch.nn.Module):
    def __init__(self):
        super(RNN, self).__init__()
        self.embedding = torch.nn.Embedding(len(CHAR_TO_INDEX), 30)
        self.rnn = torch.nn.RNN(30, 128)
        self.out = torch.nn.Linear(128, len(CHAR_TO_INDEX))

    def forward(self, sequence, state=None):
        x = self.embedding(sequence)
        o, s = self.rnn(x)
        x, s = self.rnn(x, s)
        return self.out(x)
        
model_rnn = RNN()
model_rnn = model_rnn.to(device)

In [35]:
# Обучение
optim_rnn = torch.optim.Adam(model_rnn.parameters(), lr = 0.01)
model_train(model = model_rnn, num_epochs = 20, optimizer = optim_rnn)

ep: 0, loss: 2.3085, time 5.6 sec
ep: 1, loss: 2.3093, time 5.2 sec
ep: 2, loss: 2.3093, time 5.4 sec
ep: 3, loss: 2.3093, time 5.3 sec
ep: 4, loss: 2.3093, time 5.3 sec
ep: 5, loss: 2.3093, time 5.6 sec
ep: 6, loss: 2.3093, time 5.4 sec
ep: 7, loss: 2.3093, time 5.2 sec
ep: 8, loss: 2.3093, time 5.3 sec
ep: 9, loss: 2.3093, time 5.4 sec
ep: 10, loss: 2.3093, time 5.2 sec
ep: 11, loss: 2.3093, time 5.2 sec
ep: 12, loss: 2.3093, time 5.2 sec
ep: 13, loss: 2.3093, time 5.3 sec
ep: 14, loss: 2.3093, time 5.3 sec
ep: 15, loss: 2.3093, time 5.3 sec
ep: 16, loss: 2.3093, time 5.2 sec
ep: 17, loss: 2.3093, time 5.4 sec
ep: 18, loss: 2.3093, time 5.2 sec
ep: 19, loss: 2.3093, time 5.2 sec


##### 2.2. LSTM

In [36]:
# Класс для сети
class LSTM(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(len(CHAR_TO_INDEX), 50)
        self.hidden = nn.LSTM(input_size = 50, hidden_size = 128, num_layers = 2)
        self.out = nn.Linear(128, len(CHAR_TO_INDEX))
        
    def forward(self, sequence):
        x = self.embedding(sequence)
        x, s = self.hidden(x)
        return self.out(x)

model_lstm = LSTM()
model_lstm = model_lstm.to(device)

In [37]:
# Обучение
optim_lstm = torch.optim.Adam(model_lstm.parameters(), lr = 0.01)
model_train(model = model_lstm, num_epochs = 20, optimizer = optim_lstm)

ep: 0, loss: 2.3013, time 10.0 sec
ep: 1, loss: 2.3011, time 10.0 sec
ep: 2, loss: 2.3011, time 10.4 sec
ep: 3, loss: 2.3011, time 10.7 sec
ep: 4, loss: 2.3010, time 10.7 sec
ep: 5, loss: 2.3010, time 10.9 sec
ep: 6, loss: 2.3010, time 11.1 sec
ep: 7, loss: 2.3010, time 11.0 sec
ep: 8, loss: 2.3010, time 11.0 sec
ep: 9, loss: 2.3010, time 11.1 sec
ep: 10, loss: 2.3010, time 11.1 sec
ep: 11, loss: 2.3010, time 11.1 sec
ep: 12, loss: 2.3010, time 11.1 sec
ep: 13, loss: 2.3010, time 11.1 sec
ep: 14, loss: 2.3010, time 11.0 sec
ep: 15, loss: 2.3010, time 11.2 sec
ep: 16, loss: 2.3010, time 11.1 sec
ep: 17, loss: 2.3010, time 11.1 sec
ep: 18, loss: 2.3010, time 11.1 sec
ep: 19, loss: 2.3010, time 11.1 sec


##### 2.3. GRU

In [19]:
# Класс для сети
class GRU(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(len(CHAR_TO_INDEX), 30)
        self.hidden = nn.GRU(input_size = 30, hidden_size = 128, batch_first = True)
        self.out = nn.Linear(128, len(CHAR_TO_INDEX))
        
    def forward(self, x):
        x = self.embedding(x)
        x, s = self.hidden(x)
        y = self.out(x)
        return y

model_gru = GRU()
model_gru = model_gru.to(device)

In [21]:
# Обучение
optim_gru = torch.optim.Adam(model_gru.parameters(), lr = 0.01)
model_train(model = model_gru, num_epochs = 20, optimizer = optim_gru)

ep: 0, loss: 0.0148, time 5.5 sec
ep: 1, loss: 0.0000, time 5.2 sec
ep: 2, loss: 0.0000, time 5.3 sec
ep: 3, loss: 0.0000, time 5.3 sec
ep: 4, loss: 0.0000, time 5.3 sec
ep: 5, loss: 0.0000, time 5.5 sec
ep: 6, loss: 0.0000, time 5.4 sec
ep: 7, loss: 0.0000, time 5.3 sec
ep: 8, loss: 0.0000, time 5.3 sec
ep: 9, loss: 0.0000, time 5.3 sec
ep: 10, loss: 0.0000, time 5.3 sec
ep: 11, loss: 0.0110, time 5.4 sec
ep: 12, loss: 0.0000, time 5.4 sec
ep: 13, loss: 0.0000, time 5.4 sec
ep: 14, loss: 0.0000, time 5.4 sec
ep: 15, loss: 0.0000, time 5.4 sec
ep: 16, loss: 0.0000, time 5.5 sec
ep: 17, loss: 0.0000, time 5.4 sec
ep: 18, loss: 0.0000, time 5.3 sec
ep: 19, loss: 0.0000, time 5.5 sec


#### 3. Проверка работы модели

In [26]:
# Данные для проверки
test = []
for i in tqdm(range(5*10**4)):
    number = ''
    while True:
        number += str(random.randint(0,9))
        if len(number) == 50:
            test.append(number)
            break

test_new = [(generate_seq(i)) for i in test]

100%|██████████| 50000/50000 [00:02<00:00, 20519.39it/s]


In [27]:
# Словарь, где ключ - исходная последовательность, значение - преобразованная
test_dict = dict(zip(test, test_new))

In [31]:
# Функция для предсказаний и расчета точности
INDEX_TO_CHAR = [w for w in '0123456789']
def model_test(test, model, name):
    accuracy = 0
    for seq, seq_new in tqdm(test_dict.items()):
        seq = list(seq)
        seq = [CHAR_TO_INDEX.get(s, 0) for s in seq]
        answers = model.to('cpu').forward(torch.tensor(seq))
        probas, indices = answers.topk(1)
        pred = ''.join([INDEX_TO_CHAR[ind.item()] for ind in indices.flatten()])
        if pred == seq_new: 
            accuracy += 1
        
    print(f'Model: {name}, accuracy: {(accuracy/len(test) * 100):.2f}')

In [38]:
# Тест RNN
model_test(test = test_dict, model = model_rnn, name = 'RNN')

100%|██████████| 50000/50000 [03:22<00:00, 246.91it/s]

Model: RNN, accuracy: 0.00





In [39]:
# Тест LSTM
model_test(test = test_dict, model = model_lstm, name = 'LSTM')

100%|██████████| 50000/50000 [01:27<00:00, 573.80it/s]

Model: LSTM, accuracy: 0.00





In [32]:
# Тест GRU
model_test(test = test_dict, model = model_gru, name = 'GRU')

100%|██████████| 50000/50000 [04:47<00:00, 173.63it/s]

Model: GRU, accuracy: 100.00





### Задание 2

#### 1. Загрузка данных

In [3]:
# Загрузка файла, удаление пробелов, оставление только буквенных символов
import re
with open(r"D:\НЕТОЛОГИЯ\Deep ML\файлы\nietzsche.txt", encoding = 'utf-8') as f:
    text = f.read().lower()
print('length:', len(text))
text = re.sub('[^a-z ]', ' ', text)
text = re.sub('\s+', ' ', text)

length: 600893


In [4]:
# Словарь для символов
INDEX_TO_CHAR2 = sorted(list(set(text)))
CHAR_TO_INDEX2 = {c: i for i, c in enumerate(INDEX_TO_CHAR2)}

In [5]:
# Разделение строк
max_len2 = 40
step = 3
SENTENCES = []
NEXT_CHARS = []
for i in range(0, len(text) - max_len2, step):
    SENTENCES.append(text[i: i + max_len2])
    NEXT_CHARS.append(text[i + max_len2])
print('Num sents:', len(SENTENCES))

Num sents: 193075


In [6]:
#Векторизация
X_text = torch.zeros((len(SENTENCES), max_len2), dtype=int)
Y_text = torch.zeros((len(SENTENCES)), dtype=int)

for i, sentence in tqdm(enumerate(SENTENCES)):
    for t, char in enumerate(sentence):
        X_text[i, t] = CHAR_TO_INDEX2[char]
    Y_text[i] = CHAR_TO_INDEX2[NEXT_CHARS[i]]

193075it [01:03, 3050.51it/s]


#### 2. Обучение LSTM

In [7]:
batch_size2 = 512
dataset = torch.utils.data.TensorDataset(X_text, Y_text)
data = torch.utils.data.DataLoader(dataset, batch_size2, shuffle = True)

In [8]:
# Класс для сети
class NeuralNetwork(nn.Module):
    def __init__(self, rnnClass, dictionary_size, embedding_size, num_hiddens, num_classes):
        super().__init__()
        
        self.num_hiddens = num_hiddens
        self.embedding = nn.Embedding(dictionary_size, embedding_size)
        self.hidden = rnnClass(embedding_size, num_hiddens, batch_first=True)
        self.output = nn.Linear(num_hiddens, num_classes)
        
    def forward(self, X):
        out = self.embedding(X)
        _, state = self.hidden(out)
        predictions = self.output(state[0].squeeze())
        return predictions

model_lstm2 = NeuralNetwork(nn.LSTM, len(CHAR_TO_INDEX2), 64, 128, len(CHAR_TO_INDEX2))
model_lstm2 = model_lstm2.to(device)

In [28]:
# Обучение модели
loss = nn.CrossEntropyLoss()
optim_lstm2 = torch.optim.Adam(model_lstm2.parameters(), lr = 0.01)

for epoch in range(1, 51):
    start = time.time()
    train_loss = 0
    train_passed = 0

    model_lstm2.train()
    for X_b, y_b in data:
        X_b, y_b = X_b.to(device), y_b.to(device)
        optim_lstm2.zero_grad()
        answers = model_lstm2(X_b)
        l = loss(answers, y_b)
        train_loss += l.item()

        l.backward()
        optim_lstm2.step()
        train_passed += 1

    if epoch % 5 == 0:
        print(f'ep: {epoch}, loss: {train_loss/train_passed:.4f}, time {time.time() - start:.1f} sec')

ep: 5, loss: 1.2923, time 3.6 sec
ep: 10, loss: 1.2929, time 3.7 sec
ep: 15, loss: 1.3078, time 3.8 sec
ep: 20, loss: 1.3659, time 3.8 sec
ep: 25, loss: 1.3599, time 3.9 sec
ep: 30, loss: 1.3727, time 3.9 sec
ep: 35, loss: 1.4824, time 4.0 sec
ep: 40, loss: 1.4287, time 4.1 sec
ep: 45, loss: 1.5940, time 3.9 sec
ep: 50, loss: 1.4370, time 3.9 sec


#### 3. Проверкак качества модели

In [30]:
# Проверим, совападют ли сгенерерованные моделью продолжения последовательнотсей с таковыми из текста
def sample(preds):
    softmaxed = torch.softmax(preds, 0)
    probas = torch.distributions.multinomial.Multinomial(1, softmaxed).sample()
    return probas.argmax()

def generate_test():
    start_index = random.randint(0, len(text) - max_len2 - 1)

    generated = ''
    sentence = text[start_index: start_index + max_len2]
    sentence_full = text[start_index: start_index + max_len2*2]
    generated += sentence

    for i in range(max_len2):
        x_pred = torch.zeros((1, max_len2), dtype=int)
        for t, char in enumerate(generated[-max_len2:]):
            x_pred[0, t] = CHAR_TO_INDEX2[char]
        
        preds = model_lstm2(x_pred.to(device)).cpu()
        next_char = INDEX_TO_CHAR2[sample(preds)]
        generated = generated + next_char

    print('1-ая половина предложения/Предсказание') 
    print(generated[:max_len2] + '/' + generated[max_len2:])
    print('Предложение целиком') 
    print(sentence_full)
    print()

In [31]:
# Сгенерируем 10 примеров и сравним предсказания модели с источником
for i in range(10):
    model_lstm2.eval()
    generate_test()

1-ая половина предложения/Предсказание
to jesuits and even ernest renan how ina/bly ind mean is escience of there is sym
Предложение целиком
to jesuits and even ernest renan how inaccessible to us northerners does the lan

1-ая половина предложения/Предсказание
will to deception or the generous deed o/f recolute and a verios the generall as 
Предложение целиком
will to deception or the generous deed out of selfishness or the pure sun bright

1-ая половина предложения/Предсказание
ray something of the structure of his so/mathhing platence wither to instinct goe
Предложение целиком
ray something of the structure of his soul and wherein it sees its conditions of

1-ая половина предложения/Предсказание
owledge rear itself hitherto the will to/ like zepace and to here new the asseds 
Предложение целиком
owledge rear itself hitherto the will to knowledge on the foundation of a far mo

1-ая половина предложения/Предсказание
ng ever more dangerous can we not upset /romanet for as evilie case i