In [1]:
import torch
import torch.nn as nn
from torch.optim import SGD 
import numpy as np

# Упражнение, для реализации "Ванильной" RNN
* Попробуем обучить сеть восстанавливать слово hello по первой букве. т.е. построим charecter-level модель

In [2]:
a = torch.ones((3,3))*3
b = torch.ones((3,3))*5

In [3]:
a @ b

tensor([[45., 45., 45.],
        [45., 45., 45.],
        [45., 45., 45.]])

In [4]:
a * b

tensor([[15., 15., 15.],
        [15., 15., 15.],
        [15., 15., 15.]])

In [5]:
# word = 'ololoasdasddqweqw123456789'
word = 'hello'

## Датасет. 
Позволяет:
* Закодировать символ при помощи one-hot
* Делать итератор по слову, которыей возвращает текущий символ и следующий как таргет

In [6]:
class WordDataSet:
    
    def __init__(self, word):
        self.chars2idx = {}
        self.indexs  = []
        for c in word: 
            if c not in self.chars2idx:
                self.chars2idx[c] = len(self.chars2idx)
                
            self.indexs.append(self.chars2idx[c])
            
        self.vec_size = len(self.chars2idx)
        self.seq_len  = len(word)
        
    def get_one_hot(self, idx):
        x = torch.zeros(self.vec_size)
        x[idx] = 1
        return x
    
    def __iter__(self):
        return zip(self.indexs[:-1], self.indexs[1:])
    
    def __len__(self):
        return self.seq_len
    
    def get_char_by_id(self, id):
        for c, i in self.chars2idx.items():
            if id == i: return c
        return None

## Реализация базовой RNN
<br/>
Скрытый элемент
$$ h_t= tanh⁡ (W_{ℎℎ} h_{t−1}+W_{xh} x_t) $$
Выход сети

$$ y_t = W_{hy} h_t $$

In [7]:
class VanillaRNN(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(VanillaRNN, self).__init__()        
        self.x2hidden    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden      = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.activation  = nn.Tanh()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        hidden = self.activation(self.x2hidden(x) + self.hidden(prev_hidden))
#         Версия без активации - может происходить gradient exploding
#         hidden = self.x2hidden(x) + self.hidden(prev_hidden)
        output = self.outweight(hidden)
        return output, hidden

## Инициализация переменных 

In [8]:
ds = WordDataSet(word=word)
rnn = VanillaRNN(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 100
optim     = SGD(rnn.parameters(), lr = 0.1, momentum=0.9)

# Обучение

In [9]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.hidden.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
    optim.step()

6.80427360534668
Clip gradient :  3.09128939323171
2.266852855682373
Clip gradient :  1.3085128250703808
0.5177130699157715
Clip gradient :  0.5005246828965925
0.11306381225585938
Clip gradient :  0.5510382840644107
0.04264259338378906
Clip gradient :  0.4765328277672062
0.013887405395507812
Clip gradient :  0.038717521105102674
0.010044097900390625
Clip gradient :  0.0351740218886201
0.00825357437133789
Clip gradient :  0.0327982868689374
0.007105350494384766
Clip gradient :  0.014760617359659752
0.006451129913330078
Clip gradient :  0.007339675786386952


# Тестирование

In [10]:
rnn.eval()
hh = torch.zeros(rnn.hidden.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 hello
Original:	 hello


# ДЗ
Реализовать LSTM и GRU модули, обучить их предсказывать тестовое слово
Сохранить ноутбук с предсказанием и пройденным assert и прислать на почту a.murashev@corp.mail.ru
c темой:


[МФТИ\_2019\_1] ДЗ №8 ФИО

In [26]:
#тестовое слово
word = 'ololoasdasddqweqw123456789'

## Реализовать LSTM

In [27]:
class LSTM(nn.Module):
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(LSTM, self).__init__()        
        self.x2f = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.h2f = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.x2i = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.h2i = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.x2o = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.h2o = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.x2c = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.h2c = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.filter = nn.Sigmoid()
        self.activation  = nn.Tanh()
        
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden, prev_cell):
        f = self.filter(self.x2f(x) + self.h2f(prev_hidden))
        i = self.filter(self.x2i(x) + self.h2i(prev_hidden))
        o = self.filter(self.x2o(x) + self.h2o(prev_hidden))
        c = f * prev_cell + i * self.activation(self.x2c(x) + self.h2c(prev_hidden))
        h = o * self.activation(prev_cell)
        output = self.outweight(h)
        return output, h, c

In [28]:
ds = WordDataSet(word=word)
rnn = LSTM(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 1000
optim     = SGD(rnn.parameters(), lr = 0.1, momentum=0.9)

In [29]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.outweight.in_features)
    cc = torch.zeros(rnn.outweight.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])
        y, hh, cc = rnn(x, hh, cc)
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
    optim.step()

71.90705871582031
Clip gradient :  3.340104779019206
67.43881225585938
Clip gradient :  2.7514992094107193
50.07145309448242
Clip gradient :  4.8656130141790435
44.645477294921875
Clip gradient :  24.44931715322141
37.02177047729492
Clip gradient :  10.648355311751839
37.54615783691406
Clip gradient :  6.722420443725721
37.56373977661133
Clip gradient :  13.326107102965453
31.884220123291016
Clip gradient :  5.784481710351572
32.10682678222656
Clip gradient :  9.415746504278022
30.88884925842285
Clip gradient :  12.31549572037178
31.32321548461914
Clip gradient :  9.827352668472432
29.677268981933594
Clip gradient :  3.690425230529532
26.338966369628906
Clip gradient :  1.6452839715283352
24.119735717773438
Clip gradient :  1.281301606494058
22.136348724365234
Clip gradient :  1.0590141019236525
20.088558197021484
Clip gradient :  0.6338464169387046
18.277698516845703
Clip gradient :  1.2238308503719382
16.095157623291016
Clip gradient :  1.591925297486606
14.18514347076416
Clip gradie

In [30]:
rnn.eval()
hh = torch.zeros(rnn.h2f.in_features)
cc = torch.zeros(rnn.h2f.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh, cc = rnn(x, hh, cc)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789


## Реализовать GRU

In [31]:
class GRU(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(GRU, self).__init__()        
        self.x2z = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.h2z = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.x2r = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.h2r = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.x2h = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.h2h = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.filter = nn.Sigmoid()
        self.activation  = nn.Tanh()
        
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        z = self.filter(self.x2z(x) + self.h2z(prev_hidden))
        r = self.filter(self.x2r(x) + self.h2r(prev_hidden))
        h = (1 - z) * prev_hidden + z * self.activation(self.x2h(x) + self.h2h(r * prev_hidden))
        output = self.outweight(h)
        return output, h

In [39]:
ds = WordDataSet(word=word)
rnn = GRU(in_size=ds.vec_size, hidden_size=10, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 500
optim     = SGD(rnn.parameters(), lr = 0.1, momentum=0.9)

In [40]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.outweight.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])
        y, hh = rnn(x, hh)
        loss += criterion(y, target)
     
    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
    
    optim.step()

71.47886657714844
Clip gradient :  4.246459507391692
57.589637756347656
Clip gradient :  6.138605055155907
31.48200225830078
Clip gradient :  7.74738598008901
12.11317253112793
Clip gradient :  3.8863980785581145
3.0512280464172363
Clip gradient :  2.2431033866148873
1.185276985168457
Clip gradient :  29.5626127754399
4.8191680908203125
Clip gradient :  11.85294712203797
2.091378688812256
Clip gradient :  3.1700331771126793
0.32323551177978516
Clip gradient :  0.5315802917414627
0.16036510467529297
Clip gradient :  2.3423824213479034
0.10284137725830078
Clip gradient :  0.10512959827584963
0.07022953033447266
Clip gradient :  0.05610087464148777
0.05261516571044922
Clip gradient :  0.030949785757231695
0.043877601623535156
Clip gradient :  0.023898943943510432
0.038496971130371094
Clip gradient :  0.020640324013717532
0.03451728820800781
Clip gradient :  0.017309181605796718
0.031485557556152344
Clip gradient :  0.015593723406756948
0.029036521911621094
Clip gradient :  0.0143425142597

In [41]:
rnn.eval()
hh = torch.zeros(rnn.h2z.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789
