### reference: https://towardsdatascience.com/attention-seq2seq-with-pytorch-learning-to-invert-a-sequence-34faf4133e53

In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from tqdm import tqdm
import torch.nn.functional as F

In [2]:
torch.__version__

'1.7.0+cu101'

In [3]:
torch.__version__

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
MAX_LENGTH = 15

In [5]:
inverse_map = {
    'a':'z',
    'b':'y',
    'c':'x',
    'd':'w',
    'e':'v',
    'f':'u',
    'g':'t',
    'h':'s',
    'i':'r',
    'j':'q',
    'k':'p',
    'l':'o',
    'm':'n',
    'n':'m',
    'o':'l',
    'p':'k',
    'q':'j',
    'r':'i',
    's':'h',
    't':'g',
    'u':'f',
    'v':'e',
    'w':'d',
    'x':'c',
    'y':'b',
    'z':'a'
}

In [6]:
a2i = {
    '<s>':0,
    '</s>':1,
    '<pad>':2,
    'a':3,
    'b':4,
    'c':5,
    'd':6,
    'e':7,
    'f':8,
    'g':9,
    'h':10,
    'i':11,
    'j':12,
    'k':13,
    'l':14,
    'm':15,
    'n':16,
    'o':17,
    'p':18,
    'q':19,
    'r':20,
    's':21,
    't':22,
    'u':23,
    'v':24,
    'w':25,
    'x':26,
    'y':27,
    'z':28,
}

In [7]:
i2a = {v:k for k, v in a2i.items()}

In [8]:
i2a

{0: '<s>',
 1: '</s>',
 2: '<pad>',
 3: 'a',
 4: 'b',
 5: 'c',
 6: 'd',
 7: 'e',
 8: 'f',
 9: 'g',
 10: 'h',
 11: 'i',
 12: 'j',
 13: 'k',
 14: 'l',
 15: 'm',
 16: 'n',
 17: 'o',
 18: 'p',
 19: 'q',
 20: 'r',
 21: 's',
 22: 't',
 23: 'u',
 24: 'v',
 25: 'w',
 26: 'x',
 27: 'y',
 28: 'z'}

In [9]:
def generate_random_alphabet_index():
    random_length = np.random.randint(5, MAX_LENGTH-2)    # -2 because of <s> and </s>
    #random_length = 14
    random_alphabet_index = np.random.randint(0, 26, random_length) + 3
    return random_alphabet_index.tolist()

In [10]:
class AlphabetToyDataset(Dataset):
    def __init__(self, n_dataset=1000):
        bos = 0
        eos = 1
        pad = 2
        self.inputs = []
        self.labels = []
        for _ in range(n_dataset):
            # make input example
            aindex = generate_random_alphabet_index()
            
            # index to alphabet
            alphabet = list(map(lambda a: i2a[a], aindex))
            
            # inversing
            inversed_alphabet = list(map(lambda a: inverse_map[a], alphabet))
            
            # alphabet to index
            iindex = list(map(lambda ia: a2i[ia], inversed_alphabet))
            
            # add bos, eos and pad
            n_pad = MAX_LENGTH - len(aindex) - 2
            #aindex = [bos] + aindex + [eos] + [pad]*n_pad
            aindex = aindex + [eos] + [pad]*n_pad
            #iindex = [bos] + iindex + [eos] + [pad]*n_pad
            iindex = iindex + [eos] + [pad]*n_pad
            
            # add to examples
            self.inputs.append(aindex)
            self.labels.append(iindex)
            
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, index):
        return [
            torch.tensor(self.inputs[index], dtype=torch.long),
            torch.tensor(self.labels[index], dtype=torch.long)
        ]

In [11]:
train_dataset = AlphabetToyDataset()
valid_dataset = AlphabetToyDataset(n_dataset=200)

In [12]:
len(train_dataset), len(valid_dataset)

(1000, 200)

In [13]:
def convert_index_to_alphabet(index):
    alphabet = list(map(lambda i: i2a[i], index))
    return ' '.join(alphabet)

In [14]:
for i in range(3):
    ex = train_dataset[i]
    aindex, iindex = ex
    
    print('aindex_{}: {}'.format(len(aindex), convert_index_to_alphabet(aindex.numpy())))
    print('iindex_{}: {}'.format(len(iindex), convert_index_to_alphabet(iindex.numpy())))
    print('** aindex_{}: {}'.format(len(aindex), aindex))
    print('** iindex_{}: {}'.format(len(iindex), iindex))
    print('------------')

aindex_14: f h c l x x </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
iindex_14: u s x o c c </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
** aindex_14: tensor([ 8, 10,  5, 14, 26, 26,  1,  2,  2,  2,  2,  2,  2,  2])
** iindex_14: tensor([23, 21, 26, 17,  5,  5,  1,  2,  2,  2,  2,  2,  2,  2])
------------
aindex_14: x e w a g p m </s> <pad> <pad> <pad> <pad> <pad> <pad>
iindex_14: c v d z t k n </s> <pad> <pad> <pad> <pad> <pad> <pad>
** aindex_14: tensor([26,  7, 25,  3,  9, 18, 15,  1,  2,  2,  2,  2,  2,  2])
** iindex_14: tensor([ 5, 24,  6, 28, 22, 13, 16,  1,  2,  2,  2,  2,  2,  2])
------------
aindex_14: f w y s r </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
iindex_14: u d b h i </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
** aindex_14: tensor([ 8, 25, 27, 21, 20,  1,  2,  2,  2,  2,  2,  2,  2,  2])
** iindex_14: tensor([23,  6,  4, 10, 11,  1,  2,  2,  2,  2,  2,  2,  2,  2])
------------


In [15]:
train_dataloader = DataLoader(train_dataset, batch_size=1)
valid_dataloader = DataLoader(valid_dataset, batch_size=1)

In [16]:
class AlphabetEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(AlphabetEncoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, inputs, hidden):
        #embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.embedding(inputs)
        #print('** embedding: {}'.format(self.embedding(input).shape))
        #print('** embedded: {}'.format(embedded.shape))
        #print('** hidden: {}'.format(hidden.shape))
        output = embedded
        output, hidden = self.gru(output, hidden)
        #print('** hidden: {}'.format(hidden.shape))
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [17]:
class AlphabetDecoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(AlphabetDecoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, hidden):
        embedded = self.embedding(inputs)
        embedded = F.relu(embedded)
        #print('** output: {}'.format(output.shape))
        output, hidden = self.gru(embedded, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [18]:
class AlphabetAttentionDecoder(nn.Module):
    def __init__(self, hidden_size, output_size, max_length):
        super(AlphabetAttentionDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length
        
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.attn = nn.Linear(self.hidden_size*2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        
    def forward(self, inputs, hidden, encoder_outputs):
        #print('AlphabetAttentionDecoder_INPUT: inputs: {}'.format(inputs.shape))
        #print('AlphabetAttentionDecoder_INPUT: hidden: {}'.format(hidden.shape))
        #print('AlphabetAttentionDecoder_INPUT: encoder_outputs: {}'.format(encoder_outputs.shape))
        embedded = self.embedding(inputs)
        embedded = F.relu(embedded)
        #print('** embedded: {}'.format(embedded.shape))
        #print('** hidden: {}'.format(hidden.shape))
        
        # add attention
        scores = torch.cat((embedded[0], hidden[0]), dim=1)
        #print('** scores: {}'.format(scores.shape))
        attn_weights = F.softmax(self.attn(scores), dim=1)
        attn_weights = attn_weights.unsqueeze(1)
        attn_applied = torch.bmm(attn_weights, encoder_outputs.transpose(1, 0))
        #print('** attn_weights: {}'.format(attn_weights.shape))
        #print('** attn_weights sum: {}'.format(attn_weights.sum()))
        #print('** encoder_outputs transposed: {}'.format(encoder_outputs.transpose(1, 0).shape))
        #print('** attn_applied: {}'.format(attn_applied.shape))
        
        # make output
        output = torch.cat((attn_applied.transpose(1, 0), embedded), 2)
        output = self.attn_combine(output)
        output = F.relu(output)
        
        #print('** gru-output: {}'.format(output.shape))
        #print('** gru-hidden: {}'.format(hidden.shape))
        output, hidden = self.gru(output, hidden)
        #print('** output[0]: {}'.format(output[0].shape))
        output = self.softmax(self.out(output[0]))
        
        #print('AlphabetAttentionDecoder_OUTPUT: output: {}'.format(output.shape))
        #print('------------------------------------------')
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [19]:
class AlphabetInversionModel(nn.Module):
    def __init__(self):
        hidden_size = 256
        encoder = AlphabetEncoder(26+3, hidden_size).to(device)
        decoder = AlphabetDecoder(hidden_size, 26+3).to(device)

In [20]:
use_attention = False

In [21]:
hidden_size = 256
encoder = AlphabetEncoder(26+3, hidden_size).to(device)
decoder = AlphabetDecoder(hidden_size, 26+3).to(device)
#decoder = AlphabetAttentionDecoder(hidden_size, 26+3, MAX_LENGTH).to(device)

In [22]:
# inputs_t = inputs.transpose(1, 0)

In [23]:
# batch_size = 32
# encoder_hidden = encoder.initHidden(batch_size)
# encoder_hidden.shape

In [24]:
learning_rate = 0.01
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [25]:
bos = 0
eos = 1
pad = 2

In [26]:
def train(encoder, decoder, dataloader, encoder_optimizer, decoder_optimizer, criterion, n_epochs=5):
    
    encoder.train()
    decoder.train()
    
    # zero_grad for encoder/decoder optimizer
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    for epoch in range(n_epochs):
        
        # zero_grad for encoder/decoder optimizer
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
        tbar = tqdm(enumerate(dataloader), desc='training {}th epoch'.format(epoch))
        ####################################
        for i, batch in tbar:

            # get inputs and labels
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            #print('** inputs[0]: {}'.format(inputs[0]))
            #print('** labels[0]: {}'.format(labels[0]))
            
            # transpose inputs and labels
            inputs = inputs.transpose(1, 0)
            labels = labels.transpose(1, 0)

            # initialize hidden for encoder
            batch_size = inputs.size()[1]
            max_length = inputs.size()[0]
            encoder_hidden = encoder.initHidden(batch_size)
            encoder_outputs = torch.zeros(max_length, batch_size, hidden_size, device=device)

            # encoding
            for j, inp in enumerate(inputs):
                inp = inp.unsqueeze(0)
                encoder_output, encoder_hidden = encoder(inp, encoder_hidden)
                #print('** encoder_output_{}: {}'.format(i, encoder_output.shape))
                encoder_outputs[j] += encoder_output[:,0]

            # initialize hidden for decoder
            decoder_hidden = encoder_hidden
            loss = 0

            decoder_inputs = torch.tensor([[bos]*batch_size], device=device)
            #print('** decoder_inputs: {}'.format(decoder_inputs.shape))

            # decoding
            for inp in labels:
                #inp = inp.unsqueeze(0)
                #print('** inp: {}'.format(inp))
                #print('** inp shape: {}'.format(inp.shape))
                #return

                #print('** decoder_inputs: {}'.format(decoder_inputs.shape))
                #print('** decoder_inputs[:,0]: {}'.format(decoder_inputs[:,0]))
                if use_attention:
                    decoder_output, decoder_hidden = decoder(decoder_inputs, decoder_hidden, encoder_outputs)
                else:
                    decoder_output, decoder_hidden = decoder(decoder_inputs, decoder_hidden)

                #print('** inp shape VS decoder_output shape: {} VS {}'.format(inp.shape, decoder_output.shape))
                #print('** inp[0]: {}'.format(inp[0]))
                #print('** decoder_output[0]: {}'.format(decoder_output[0]))
                #print('** -----')
                loss_it = criterion(decoder_output, inp)
                loss += loss_it

                decoder_inputs = inp.unsqueeze(0)
                #print('** label vs pred: {} vs {} → {:.4f}'.format(inp.shape, decoder_output.shape, loss_it))

            #return
            #print('total loss before backward: {:.4f}'.format(loss))

            # backward
            loss.backward()
            encoder_sum = sum([p[1].data.sum() for p in encoder.named_parameters()])
            decoder_sum = sum([p[1].data.sum() for p in decoder.named_parameters()])
            #print('total loss after backward: {:.4f}'.format(loss))
            #print('encoder, decoder: {:.4f} {:.4f}'.format(encoder_sum, decoder_sum))

            # update encoder/decoder
            encoder_optimizer.step()
            decoder_optimizer.step()
            encoder_sum = sum([p[1].data.sum() for p in encoder.named_parameters()])
            decoder_sum = sum([p[1].data.sum() for p in decoder.named_parameters()])
            #print('encoder, decoder: {:.4f} {:.4f}'.format(encoder_sum, decoder_sum))
            #print('total loss after step: {:.4f}'.format(loss))
            #print('{}-{}th iteration → total loss after update: {:.4f}'.format(epoch, i, loss))
            
            tbar.set_postfix(loss=loss.data.item())
            #return
            #break
        ####################################
        #return

In [None]:
train(encoder, decoder, train_dataloader, encoder_optimizer, decoder_optimizer, criterion, n_epochs=60)

training 0th epoch: 1000it [00:46, 21.40it/s, loss=3.47e+4]
training 1th epoch: 1000it [00:43, 22.92it/s, loss=4.62e+4]
training 2th epoch: 1000it [00:45, 21.89it/s, loss=4.95e+4]
training 3th epoch: 1000it [00:48, 20.74it/s, loss=7.37e+4]
training 4th epoch: 117it [00:05, 20.90it/s, loss=3.95e+4]

In [28]:
train(encoder, decoder, train_dataloader, encoder_optimizer, decoder_optimizer, criterion, n_epochs=30)

training 0th epoch: 32it [00:01, 19.34it/s, loss=0.731] 
training 1th epoch: 32it [00:01, 24.39it/s, loss=0.701] 
training 2th epoch: 32it [00:01, 24.50it/s, loss=0.674] 
training 3th epoch: 32it [00:01, 24.39it/s, loss=0.65]  
training 4th epoch: 32it [00:01, 24.23it/s, loss=0.627] 
training 5th epoch: 32it [00:01, 20.64it/s, loss=0.606] 
training 6th epoch: 32it [00:01, 20.59it/s, loss=0.586] 
training 7th epoch: 32it [00:01, 20.65it/s, loss=0.568] 
training 8th epoch: 32it [00:01, 20.56it/s, loss=0.551] 
training 9th epoch: 32it [00:01, 20.37it/s, loss=0.535] 
training 10th epoch: 32it [00:01, 19.61it/s, loss=0.52]  
training 11th epoch: 32it [00:01, 17.94it/s, loss=0.505] 
training 12th epoch: 32it [00:01, 18.60it/s, loss=0.492] 
training 13th epoch: 32it [00:01, 18.15it/s, loss=0.479] 
training 14th epoch: 32it [00:01, 17.93it/s, loss=0.467] 
training 15th epoch: 32it [00:01, 17.15it/s, loss=0.456] 
training 16th epoch: 32it [00:01, 17.86it/s, loss=0.445] 
training 17th epoch: 32i

In [29]:
train(encoder, decoder, train_dataloader, encoder_optimizer, decoder_optimizer, criterion, n_epochs=15)

training 0th epoch: 32it [00:01, 18.09it/s, loss=0.333] 
training 1th epoch: 32it [00:01, 19.98it/s, loss=0.328] 
training 2th epoch: 32it [00:01, 20.25it/s, loss=0.322] 
training 3th epoch: 32it [00:01, 20.15it/s, loss=0.316] 
training 4th epoch: 32it [00:01, 20.59it/s, loss=0.311] 
training 5th epoch: 32it [00:01, 20.78it/s, loss=0.306] 
training 6th epoch: 32it [00:01, 20.38it/s, loss=0.301] 
training 7th epoch: 32it [00:01, 20.51it/s, loss=0.296] 
training 8th epoch: 32it [00:01, 20.24it/s, loss=0.292] 
training 9th epoch: 32it [00:01, 20.58it/s, loss=0.287] 
training 10th epoch: 32it [00:01, 20.31it/s, loss=0.283] 
training 11th epoch: 32it [00:01, 20.66it/s, loss=0.279] 
training 12th epoch: 32it [00:01, 20.48it/s, loss=0.275] 
training 13th epoch: 32it [00:01, 19.97it/s, loss=0.271] 
training 14th epoch: 32it [00:01, 19.20it/s, loss=0.267] 


In [30]:
train(encoder, decoder, train_dataloader, encoder_optimizer, decoder_optimizer, criterion, n_epochs=15)

training 0th epoch: 32it [00:02, 15.73it/s, loss=0.484] 
training 1th epoch: 32it [00:01, 17.67it/s, loss=0.474] 
training 2th epoch: 32it [00:02, 15.08it/s, loss=0.464] 
training 3th epoch: 32it [00:02, 15.08it/s, loss=0.454] 
training 4th epoch: 32it [00:02, 14.94it/s, loss=0.444] 
training 5th epoch: 32it [00:02, 14.74it/s, loss=0.435] 
training 6th epoch: 32it [00:02, 14.57it/s, loss=0.426] 
training 7th epoch: 32it [00:02, 14.56it/s, loss=0.418] 
training 8th epoch: 32it [00:02, 14.64it/s, loss=0.41]  
training 9th epoch: 32it [00:02, 14.62it/s, loss=0.402] 
training 10th epoch: 32it [00:02, 13.98it/s, loss=0.394] 
training 11th epoch: 32it [00:02, 14.65it/s, loss=0.387] 
training 12th epoch: 32it [00:02, 14.71it/s, loss=0.38]  
training 13th epoch: 32it [00:02, 14.66it/s, loss=0.373] 
training 14th epoch: 32it [00:02, 14.67it/s, loss=0.366] 


In [31]:
train(encoder, decoder, train_dataloader, encoder_optimizer, decoder_optimizer, criterion, n_epochs=15)

training 0th epoch: 32it [00:02, 14.56it/s, loss=0.36]  
training 1th epoch: 32it [00:02, 14.58it/s, loss=0.354] 
training 2th epoch: 32it [00:02, 14.68it/s, loss=0.348] 
training 3th epoch: 32it [00:02, 14.74it/s, loss=0.342] 
training 4th epoch: 32it [00:02, 14.80it/s, loss=0.336] 
training 5th epoch: 32it [00:02, 14.75it/s, loss=0.331] 
training 6th epoch: 32it [00:02, 14.78it/s, loss=0.325] 
training 7th epoch: 32it [00:02, 14.84it/s, loss=0.32]  
training 8th epoch: 32it [00:02, 14.73it/s, loss=0.315] 
training 9th epoch: 32it [00:02, 14.80it/s, loss=0.31]  
training 10th epoch: 32it [00:02, 14.54it/s, loss=0.305] 
training 11th epoch: 32it [00:02, 14.74it/s, loss=0.301] 
training 12th epoch: 32it [00:02, 14.75it/s, loss=0.296] 
training 13th epoch: 32it [00:02, 14.68it/s, loss=0.292] 
training 14th epoch: 32it [00:02, 14.63it/s, loss=0.287] 


In [30]:
train(encoder, decoder, train_dataloader, encoder_optimizer, decoder_optimizer, criterion, n_epochs=100)

training 0th epoch: 32it [00:01, 22.82it/s, loss=0.263] 
training 1th epoch: 32it [00:01, 24.46it/s, loss=0.259] 
training 2th epoch: 32it [00:01, 24.31it/s, loss=0.256] 
training 3th epoch: 32it [00:01, 24.25it/s, loss=0.252] 
training 4th epoch: 32it [00:01, 24.69it/s, loss=0.249] 
training 5th epoch: 32it [00:01, 24.34it/s, loss=0.245] 
training 6th epoch: 32it [00:01, 24.31it/s, loss=0.242] 
training 7th epoch: 32it [00:01, 21.87it/s, loss=0.239] 
training 8th epoch: 32it [00:01, 20.72it/s, loss=0.236] 
training 9th epoch: 32it [00:01, 20.59it/s, loss=0.233] 
training 10th epoch: 32it [00:01, 20.71it/s, loss=0.23]  
training 11th epoch: 32it [00:01, 20.57it/s, loss=0.227] 
training 12th epoch: 32it [00:01, 20.69it/s, loss=0.225] 
training 13th epoch: 32it [00:01, 20.70it/s, loss=0.222] 
training 14th epoch: 32it [00:01, 20.51it/s, loss=0.219] 
training 15th epoch: 32it [00:01, 20.65it/s, loss=0.217] 
training 16th epoch: 32it [00:01, 20.51it/s, loss=0.214] 
training 17th epoch: 32i

In [39]:
train(encoder, decoder, train_dataloader, encoder_optimizer, decoder_optimizer, criterion, n_epochs=1000)

training 0th epoch: 32it [00:01, 20.43it/s, loss=0.106] 
training 1th epoch: 32it [00:01, 20.16it/s, loss=0.106]  
training 2th epoch: 32it [00:01, 20.72it/s, loss=0.105] 
training 3th epoch: 32it [00:01, 20.42it/s, loss=0.104] 
training 4th epoch: 32it [00:01, 19.83it/s, loss=0.104]  
training 5th epoch: 32it [00:01, 18.51it/s, loss=0.103]  
training 6th epoch: 32it [00:01, 18.71it/s, loss=0.102]  
training 7th epoch: 32it [00:01, 18.63it/s, loss=0.102]  
training 8th epoch: 32it [00:01, 18.68it/s, loss=0.101]  
training 9th epoch: 32it [00:01, 18.58it/s, loss=0.101]  
training 10th epoch: 32it [00:01, 18.70it/s, loss=0.1]    
training 11th epoch: 32it [00:01, 18.68it/s, loss=0.0995] 
training 12th epoch: 32it [00:01, 18.68it/s, loss=0.0989] 
training 13th epoch: 32it [00:01, 18.61it/s, loss=0.0984] 
training 14th epoch: 32it [00:01, 18.40it/s, loss=0.0978] 
training 15th epoch: 32it [00:01, 18.51it/s, loss=0.0972] 
training 16th epoch: 32it [00:01, 18.50it/s, loss=0.0967] 
training 1

training 138th epoch: 32it [00:01, 20.39it/s, loss=0.0563] 
training 139th epoch: 32it [00:01, 20.54it/s, loss=0.0561] 
training 140th epoch: 32it [00:01, 20.54it/s, loss=0.0559] 
training 141th epoch: 32it [00:01, 20.26it/s, loss=0.0557] 
training 142th epoch: 32it [00:01, 20.75it/s, loss=0.0555] 
training 143th epoch: 32it [00:01, 20.59it/s, loss=0.0553] 
training 144th epoch: 32it [00:01, 20.48it/s, loss=0.0551] 
training 145th epoch: 32it [00:01, 20.60it/s, loss=0.0549] 
training 146th epoch: 32it [00:01, 20.59it/s, loss=0.0547] 
training 147th epoch: 32it [00:01, 20.55it/s, loss=0.0546] 
training 148th epoch: 32it [00:01, 21.37it/s, loss=0.0544] 
training 149th epoch: 32it [00:01, 21.32it/s, loss=0.0542] 
training 150th epoch: 32it [00:01, 20.46it/s, loss=0.054]  
training 151th epoch: 32it [00:01, 20.66it/s, loss=0.0538] 
training 152th epoch: 32it [00:01, 20.80it/s, loss=0.0536] 
training 153th epoch: 32it [00:01, 20.84it/s, loss=0.0535] 
training 154th epoch: 32it [00:01, 20.89

training 274th epoch: 32it [00:01, 19.29it/s, loss=0.0377] 
training 275th epoch: 32it [00:01, 19.30it/s, loss=0.0376] 
training 276th epoch: 32it [00:01, 20.57it/s, loss=0.0376] 
training 277th epoch: 32it [00:01, 20.84it/s, loss=0.0375] 
training 278th epoch: 32it [00:01, 20.82it/s, loss=0.0374] 
training 279th epoch: 32it [00:01, 20.49it/s, loss=0.0373] 
training 280th epoch: 32it [00:01, 20.70it/s, loss=0.0372] 
training 281th epoch: 32it [00:01, 20.74it/s, loss=0.0371] 
training 282th epoch: 32it [00:01, 20.47it/s, loss=0.037]  
training 283th epoch: 32it [00:01, 20.68it/s, loss=0.0369] 
training 284th epoch: 32it [00:01, 20.35it/s, loss=0.0368] 
training 285th epoch: 32it [00:01, 20.59it/s, loss=0.0367] 
training 286th epoch: 32it [00:01, 20.75it/s, loss=0.0366] 
training 287th epoch: 32it [00:01, 20.67it/s, loss=0.0366] 
training 288th epoch: 32it [00:01, 20.65it/s, loss=0.0365] 
training 289th epoch: 32it [00:01, 20.47it/s, loss=0.0364] 
training 290th epoch: 32it [00:01, 20.74

training 410th epoch: 32it [00:01, 20.53it/s, loss=0.0281] 
training 411th epoch: 32it [00:01, 20.39it/s, loss=0.028]  
training 412th epoch: 32it [00:01, 20.49it/s, loss=0.028]  
training 413th epoch: 32it [00:01, 20.33it/s, loss=0.0279] 
training 414th epoch: 32it [00:01, 20.50it/s, loss=0.0279] 
training 415th epoch: 32it [00:01, 20.64it/s, loss=0.0278] 
training 416th epoch: 32it [00:01, 21.27it/s, loss=0.0278] 
training 417th epoch: 32it [00:01, 20.54it/s, loss=0.0277] 
training 418th epoch: 32it [00:01, 20.76it/s, loss=0.0277] 
training 419th epoch: 32it [00:01, 20.48it/s, loss=0.0276] 
training 420th epoch: 32it [00:01, 20.36it/s, loss=0.0276] 
training 421th epoch: 32it [00:01, 21.14it/s, loss=0.0275] 
training 422th epoch: 32it [00:01, 21.08it/s, loss=0.0275] 
training 423th epoch: 32it [00:01, 20.37it/s, loss=0.0274] 
training 424th epoch: 32it [00:01, 20.18it/s, loss=0.0274] 
training 425th epoch: 32it [00:01, 19.02it/s, loss=0.0273] 
training 426th epoch: 32it [00:01, 19.08

training 546th epoch: 32it [00:01, 20.67it/s, loss=0.0222] 
training 547th epoch: 32it [00:01, 20.41it/s, loss=0.0222] 
training 548th epoch: 32it [00:01, 19.95it/s, loss=0.0222] 
training 549th epoch: 32it [00:01, 20.27it/s, loss=0.0221] 
training 550th epoch: 32it [00:01, 20.12it/s, loss=0.0221] 
training 551th epoch: 32it [00:01, 20.41it/s, loss=0.0221] 
training 552th epoch: 32it [00:01, 20.33it/s, loss=0.022]  
training 553th epoch: 32it [00:01, 20.41it/s, loss=0.022]  
training 554th epoch: 32it [00:01, 20.54it/s, loss=0.022]  
training 555th epoch: 32it [00:01, 20.21it/s, loss=0.0219] 
training 556th epoch: 32it [00:01, 20.46it/s, loss=0.0219] 
training 557th epoch: 32it [00:01, 20.33it/s, loss=0.0218] 
training 558th epoch: 32it [00:01, 20.66it/s, loss=0.0218] 
training 559th epoch: 32it [00:01, 20.71it/s, loss=0.0218] 
training 560th epoch: 32it [00:01, 20.32it/s, loss=0.0217] 
training 561th epoch: 32it [00:01, 20.55it/s, loss=0.0217] 
training 562th epoch: 32it [00:01, 20.44

training 682th epoch: 32it [00:01, 18.90it/s, loss=0.0183] 
training 683th epoch: 32it [00:01, 18.81it/s, loss=0.0183] 
training 684th epoch: 32it [00:01, 18.57it/s, loss=0.0183] 
training 685th epoch: 32it [00:01, 18.69it/s, loss=0.0182] 
training 686th epoch: 32it [00:01, 18.73it/s, loss=0.0182] 
training 687th epoch: 32it [00:01, 18.80it/s, loss=0.0182] 
training 688th epoch: 32it [00:01, 18.65it/s, loss=0.0182] 
training 689th epoch: 32it [00:01, 18.72it/s, loss=0.0181] 
training 690th epoch: 32it [00:01, 18.58it/s, loss=0.0181] 
training 691th epoch: 32it [00:01, 18.68it/s, loss=0.0181] 
training 692th epoch: 32it [00:01, 18.76it/s, loss=0.0181] 
training 693th epoch: 32it [00:01, 18.69it/s, loss=0.018]  
training 694th epoch: 32it [00:01, 18.41it/s, loss=0.018]  
training 695th epoch: 32it [00:01, 18.59it/s, loss=0.018]  
training 696th epoch: 32it [00:01, 18.60it/s, loss=0.018]  
training 697th epoch: 32it [00:01, 18.68it/s, loss=0.0179] 
training 698th epoch: 32it [00:01, 18.39

training 818th epoch: 32it [00:01, 18.22it/s, loss=0.0155] 
training 819th epoch: 32it [00:01, 17.40it/s, loss=0.0155] 
training 820th epoch: 32it [00:01, 17.88it/s, loss=0.0155] 
training 821th epoch: 32it [00:01, 18.07it/s, loss=0.0155] 
training 822th epoch: 32it [00:01, 18.06it/s, loss=0.0154] 
training 823th epoch: 32it [00:01, 17.81it/s, loss=0.0154] 
training 824th epoch: 32it [00:01, 19.53it/s, loss=0.0154] 
training 825th epoch: 32it [00:01, 20.30it/s, loss=0.0154] 
training 826th epoch: 32it [00:01, 20.52it/s, loss=0.0154] 
training 827th epoch: 32it [00:01, 20.23it/s, loss=0.0154] 
training 828th epoch: 32it [00:01, 20.18it/s, loss=0.0153] 
training 829th epoch: 32it [00:01, 20.14it/s, loss=0.0153] 
training 830th epoch: 32it [00:01, 20.37it/s, loss=0.0153] 
training 831th epoch: 32it [00:01, 20.72it/s, loss=0.0153] 
training 832th epoch: 32it [00:01, 21.09it/s, loss=0.0153] 
training 833th epoch: 32it [00:01, 20.79it/s, loss=0.0152] 
training 834th epoch: 32it [00:01, 20.91

training 954th epoch: 32it [00:01, 18.94it/s, loss=0.0134] 
training 955th epoch: 32it [00:01, 18.83it/s, loss=0.0134] 
training 956th epoch: 32it [00:01, 20.31it/s, loss=0.0134] 
training 957th epoch: 32it [00:01, 20.70it/s, loss=0.0134] 
training 958th epoch: 32it [00:01, 20.30it/s, loss=0.0134] 
training 959th epoch: 32it [00:01, 20.75it/s, loss=0.0134] 
training 960th epoch: 32it [00:01, 20.40it/s, loss=0.0133] 
training 961th epoch: 32it [00:01, 20.51it/s, loss=0.0133] 
training 962th epoch: 32it [00:01, 20.49it/s, loss=0.0133] 
training 963th epoch: 32it [00:01, 20.40it/s, loss=0.0133] 
training 964th epoch: 32it [00:01, 20.37it/s, loss=0.0133] 
training 965th epoch: 32it [00:01, 20.48it/s, loss=0.0133] 
training 966th epoch: 32it [00:01, 20.43it/s, loss=0.0133] 
training 967th epoch: 32it [00:01, 20.96it/s, loss=0.0132] 
training 968th epoch: 32it [00:01, 20.51it/s, loss=0.0132] 
training 969th epoch: 32it [00:01, 20.51it/s, loss=0.0132] 
training 970th epoch: 32it [00:01, 20.77

In [40]:
def validate(encoder, decoder, dataloader):
    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            # get inputs and labels
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            print('** inputs[0]: {}'.format(inputs[0]))
            print('** labels[0]: {}'.format(labels[0]))

            # transpose inputs and labels
            inputs = inputs.transpose(1, 0)
            labels = labels.transpose(1, 0)

            # initialize hidden for encoder
            batch_size = inputs.size()[1]
            encoder_hidden = encoder.initHidden(batch_size)
            encoder_outputs = torch.zeros(MAX_LENGTH, batch_size, hidden_size, device=device)
            encoder_outputs_ = torch.zeros(MAX_LENGTH, batch_size, hidden_size, device=device)
            
            # encoding
            for j, inp in enumerate(inputs):
                inp = inp.unsqueeze(0)
                encoder_output, encoder_hidden = encoder(inp, encoder_hidden)
                #print('** encoder_output_{}: {}'.format(i, encoder_output.shape))
                encoder_outputs[j] = encoder_output
                encoder_outputs_[j] += encoder_output[:,0]

            print('** encoder_outputs.sum() VS encoder_outputs.sum(): {:.4f} VS {:.4f}'.format(encoder_outputs.sum(), encoder_outputs_.sum()))
            
            # initialize hidden for decoder
            decoder_hidden = encoder_hidden
            decoder_inputs = torch.tensor([[bos]*batch_size], device=device)

            pred = []
            # decoding
            for inp in labels:
                #inp = inp.unsqueeze(0)

                print('** inp: {}'.format(inp[0]))
                print('** decoder_inputs shape: {}'.format(decoder_inputs.shape))
                if use_attention:
                    decoder_output, decoder_hidden = decoder(decoder_inputs, decoder_hidden, encoder_outputs_)
                    print('** decoder_output shape: {}'.format(decoder_output.shape))
                    decoder_output = decoder_output.argmax(1)
                    print('** decoder_output[0]: {}'.format(decoder_output[0]))
                    pred.append(decoder_output.cpu().numpy())
                else:
                    decoder_output, decoder_hidden = decoder(decoder_inputs, decoder_hidden)
                    print('** decoder_output shape: {}'.format(decoder_output.shape))
                    decoder_output = decoder_output.argmax(1)
                    print('** decoder_output[0]: {}'.format(decoder_output[0]))
                    pred.append(decoder_output.cpu().numpy())

                decoder_inputs = inp.unsqueeze(0)

            # re-transpose for validation
            inputs = inputs.transpose(1, 0)
            labels = labels.transpose(1, 0).cpu().numpy()
            #return

            # stack-up prediction for validation
            pred = np.stack(pred, axis=1)
            accuracy = (pred == labels).astype(np.int).mean()
            print('{} VS {} → {:.4f}'.format(labels.shape, pred.shape, accuracy))
            
        return labels, pred

In [41]:
labels, pred = validate(encoder, decoder, valid_dataloader)

** inputs[0]: tensor([18, 21, 16, 12, 21, 12, 18, 16, 28, 28, 28, 23,  1,  2],
       device='cuda:0')
** labels[0]: tensor([13, 10, 15, 19, 10, 19, 13, 15,  3,  3,  3,  8,  1,  2],
       device='cuda:0')
** encoder_outputs.sum() VS encoder_outputs.sum(): -2726.1765 VS 852.4473
** inp: 13
** decoder_inputs shape: torch.Size([1, 32])
** decoder_output shape: torch.Size([32, 29])
** decoder_output[0]: 13
** inp: 10
** decoder_inputs shape: torch.Size([1, 32])
** decoder_output shape: torch.Size([32, 29])
** decoder_output[0]: 10
** inp: 15
** decoder_inputs shape: torch.Size([1, 32])
** decoder_output shape: torch.Size([32, 29])
** decoder_output[0]: 15
** inp: 19
** decoder_inputs shape: torch.Size([1, 32])
** decoder_output shape: torch.Size([32, 29])
** decoder_output[0]: 19
** inp: 10
** decoder_inputs shape: torch.Size([1, 32])
** decoder_output shape: torch.Size([32, 29])
** decoder_output[0]: 10
** inp: 19
** decoder_inputs shape: torch.Size([1, 32])
** decoder_output shape: torc

In [44]:
labels[1]

array([13, 16, 14, 22, 20, 12, 19, 12, 11,  1,  2,  2,  2,  2])

In [45]:
pred[1]

array([13, 16, 14, 14, 19, 11, 19, 16, 11, 16,  2,  2,  2,  2])

In [110]:
def predict_sequence(sequence):
    # some reserved words
    bos = 0
    eos = 1
    pad = 2

    with torch.no_grad():
        # make input tensor
        seq = list(map(lambda s: a2i[s], sequence))
        n_pad = MAX_LENGTH - len(seq) - 2
        inputs = [bos] + seq + [eos]# + [pad]*n_pad
        inputs = torch.tensor(inputs, dtype=torch.long).to(device)
        inputs = inputs.unsqueeze(0)
        print(inputs.shape)


        # transpose inputs and labels
        inputs = inputs.transpose(1, 0)
        #labels = labels.transpose(1, 0)

        # initialize hidden for encoder
        batch_size = inputs.size()[1]    
        encoder_hidden = encoder.initHidden(batch_size)
        encoder_outputs = torch.zeros(MAX_LENGTH, batch_size, hidden_size, device=device)
        print('** encoder_outputs: {}'.format(encoder_outputs.shape))
        # encoding
        for i, inp in enumerate(inputs):
            inp = inp.unsqueeze(0)
            encoder_output, encoder_hidden = encoder(inp, encoder_hidden)
            encoder_outputs[i] += encoder_output[:, 0]

        # initialize hidden for decoder
        decoder_hidden = encoder_hidden

        pred = []
        #target = [bos]
        #target = torch.tensor(target).to(device)

        print('** encoder_outputs sum: {}'.format(encoder_outputs.sum()))
        print('** encoder_hidden sum: {}'.format(encoder_hidden.sum()))

        #target = target.unsqueeze(0)
        decoder_input = torch.tensor([[bos]], device=device)  # SOS
        # decoding
        for i in range(MAX_LENGTH):

            if i == 7:
                break

            if use_attention:
                print('** decoder_input: {}'.format(decoder_input.shape))
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
                #decoder_output = decoder_output.argmax(1)
                topv, topi = decoder_output.data.topk(1)
                print('** topi: {}'.format(topi.data.item()))
                if topi.item() == eos:
                    pred.append('</s>')
                    break
                else:
                    pred.append(i2a[topi.item()])

                decoder_input = topi.detach()
        #else:
        #    decoder_output, decoder_hidden = decoder(inp, decoder_hidden)
        

In [111]:
predict_sequence('abc')

torch.Size([1, 5])
** encoder_outputs: torch.Size([15, 1, 256])
** encoder_outputs sum: -29.475393295288086
** encoder_hidden sum: -8.637638092041016
** decoder_input: torch.Size([1, 1])
** topi: 0
** decoder_input: torch.Size([1, 1])
** topi: 0
** decoder_input: torch.Size([1, 1])
** topi: 0
** decoder_input: torch.Size([1, 1])
** topi: 0
** decoder_input: torch.Size([1, 1])
** topi: 0
** decoder_input: torch.Size([1, 1])
** topi: 0
** decoder_input: torch.Size([1, 1])
** topi: 0


In [40]:
# encoder_output, encoder_hidden = encoder(inputs, encoder_hidden)

In [174]:
# batch_size = labels.size()[1]
# decoder_hidden = decoder.initHidden(batch_size)

In [26]:
# decoder(inputs, decoder_hidden, encoder_hidden)

In [190]:
m = nn.LogSoftmax(dim=1)
inp = torch.randn(3, 5, requires_grad=True)
lbl = torch.tensor([1, 0, 4])
o = criterion(m(inp), lbl)
inp.shape, lbl.shape

(torch.Size([3, 5]), torch.Size([3]))

In [189]:
o

tensor(1.7729, grad_fn=<NllLossBackward>)

In [27]:
for batch in train_dataloader:
    inputs, labels = batch
    inputs = inputs.to(device)
    labels = labels.to(device)
    print(inputs.shape, labels.shape)
    break

torch.Size([32, 15]) torch.Size([32, 15])


In [42]:
def train(encoder, decoder, dataloader, encoder_optimizer, decoder_optimizer, criterion):
    
    # zero_grad for encoder/decoder optimizer
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    for i, batch in enumerate(dataloader):
        # get inputs and labels
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        # transpose inputs and labels
        inputs = inputs.transpose(1, 0)
        labels = labels.transpose(1, 0)
        
        # initialize hidden for encoder
        batch_size = inputs.size()[1]
        encoder_hidden = encoder.initHidden(batch_size)
        encoder_outputs = torch.zeros(15, 256, device=device)
        
        # encoding
        for ip in inputs:
            ip = ip.unsqueeze(0)
            encoder_output, encoder_hidden = encoder(ip, encoder_hidden)
            print('** encoder_output_{}: {}'.format(i, encoder_output.shape))
            #print('** encoder_hidden_{}: {}'.format(i, encoder_hidden.shape))
            
        # initialize hidden for decoder
        decoder_hidden = encoder_hidden
        loss = 0
    
        # decoding
        for inp in labels:
            #inp = inp.unsqueeze(0)
            decoder_output, decoder_hidden = decoder(inp.unsqueeze(0), decoder_hidden)
            
            decoder_output = decoder_output.squeeze(0)
            
            #print('** decoder_hidden_{}: {}'.format(i, decoder_hidden.shape))
            loss_it = criterion(decoder_output, inp)
            loss += criterion(decoder_output, inp)
            #print('** label vs pred: {} vs {} → {:.4f}'.format(inp.shape, decoder_output.shape, loss_it))
        
        #encoder_sum = sum([p[1].data.sum() for p in encoder.named_parameters()])
        #decoder_sum = sum([p[1].data.sum() for p in decoder.named_parameters()])
        #print('encoder, decoder: {:.4f} {:.4f}'.format(encoder_sum, decoder_sum))
        
        # backward
        loss.backward()
        #encoder_sum = sum([p[1].data.sum() for p in encoder.named_parameters()])
        #decoder_sum = sum([p[1].data.sum() for p in decoder.named_parameters()])
        #print('total loss after backward: {:.4f}'.format(loss))
        #print('encoder, decoder: {:.4f} {:.4f}'.format(encoder_sum, decoder_sum))
        
        # update encoder/decoder
        encoder_optimizer.step()
        decoder_optimizer.step()
        #encoder_sum = sum([p[1].data.sum() for p in encoder.named_parameters()])
        #decoder_sum = sum([p[1].data.sum() for p in decoder.named_parameters()])
        #print('total loss after step: {:.4f}'.format(loss))
        #print('encoder, decoder: {:.4f} {:.4f}'.format(encoder_sum, decoder_sum))
        print('{}th iteration → total loss after update: {:.4f}'.format(i, loss))
        #print('-------------------------------------')
        #break

In [43]:
def validate(encoder, decoder, dataloader):
    for i, batch in enumerate(dataloader):
        # get inputs and labels
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        # transpose inputs and labels
        inputs = inputs.transpose(1, 0)
        labels = labels.transpose(1, 0)
        
        # initialize hidden for encoder
        batch_size = inputs.size()[1]
        encoder_hidden = encoder.initHidden(batch_size)
        
        # encoding
        for ip in inputs:
            ip = ip.unsqueeze(0)
            encoder_output, encoder_hidden = encoder(ip, encoder_hidden)
            
        # initialize hidden for decoder
        decoder_hidden = encoder_hidden
        
        pred = []
        # decoding
        for inp in labels:
            decoder_output, decoder_hidden = decoder(inp.unsqueeze(0), decoder_hidden)
            decoder_output = decoder_output.squeeze(0)            
            decoder_output = decoder_output.argmax(1)
            pred.append(decoder_output.cpu().numpy())
            #print('** decoder_output: {}'.format(decoder_output.cpu().numpy().shape))
            
        # re-transpose for validation
        inputs = inputs.transpose(1, 0)
        labels = labels.transpose(1, 0).cpu().numpy()
        
        # stack-up prediction for validation
        pred = np.stack(pred, axis=1)
        
        # calculate batch-accurac y
        accuracy = (pred == labels).astype(np.int).mean()
        print('{} VS {} → {:.4f}'.format(labels.shape, pred.shape, accuracy))

In [44]:
# validate(encoder, decoder, valid_dataloader)

In [45]:
learning_rate = 0.01
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [46]:
train(encoder, decoder, train_dataloader, encoder_optimizer, decoder_optimizer, criterion)

** hidden: torch.Size([1, 32, 256])
** encoder_output_0: torch.Size([1, 32, 256])
** hidden: torch.Size([1, 32, 256])
** encoder_output_0: torch.Size([1, 32, 256])
** hidden: torch.Size([1, 32, 256])
** encoder_output_0: torch.Size([1, 32, 256])
** hidden: torch.Size([1, 32, 256])
** encoder_output_0: torch.Size([1, 32, 256])
** hidden: torch.Size([1, 32, 256])
** encoder_output_0: torch.Size([1, 32, 256])
** hidden: torch.Size([1, 32, 256])
** encoder_output_0: torch.Size([1, 32, 256])
** hidden: torch.Size([1, 32, 256])
** encoder_output_0: torch.Size([1, 32, 256])
** hidden: torch.Size([1, 32, 256])
** encoder_output_0: torch.Size([1, 32, 256])
** hidden: torch.Size([1, 32, 256])
** encoder_output_0: torch.Size([1, 32, 256])
** hidden: torch.Size([1, 32, 256])
** encoder_output_0: torch.Size([1, 32, 256])
** hidden: torch.Size([1, 32, 256])
** encoder_output_0: torch.Size([1, 32, 256])
** hidden: torch.Size([1, 32, 256])
** encoder_output_0: torch.Size([1, 32, 256])
** hidden: torch

TypeError: forward() missing 1 required positional argument: 'encoder_output'

In [None]:
# inp = torch.from_numpy(np.random.randint(0, 29, (32)))
# out = torch.from_numpy(np.random.random((32, 29)))
# criterion(out, inp)

### *************** NMT ***************

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [2]:
torch.__version__

'1.7.0+cu101'

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

### 데이터셋 다운로드 → https://download.pytorch.org/tutorial/data.zip

### 참고 → https://tutorials.pytorch.kr/intermediate/seq2seq_translation_tutorial.html

In [4]:
SOS_token = 0
EOS_token = 1

In [5]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS 와 EOS 포함

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [6]:
# 유니 코드 문자열을 일반 ASCII로 변환하십시오.
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

In [7]:
# 소문자, 다듬기, 그리고 문자가 아닌 문자 제거
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [8]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # 파일을 읽고 줄로 분리
    lines = open('../data/attention-ntm/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # 모든 줄을 쌍으로 분리하고 정규화
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # 쌍을 뒤집고, Lang 인스턴스 생성
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [9]:
MAX_LENGTH = 10

In [10]:
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

In [11]:
def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)

In [12]:
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [13]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [14]:
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803
['j ai honte de mon corps .', 'i m ashamed of my body .']


In [15]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [16]:
# without Attention
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        print('DecoderRNN_INPUT: input: {} → {:.4f}'.format(input.shape, input.sum()))
        print('DecoderRNN_INPUT: hidden: {} → {:.4f}'.format(hidden.shape, hidden.sum()))
        
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        print('** output: {} → {:.4f}'.format(output.shape, output.sum()))
        
        
        output, hidden = self.gru(output, hidden)
        print('** gru-output: {} → {:.4f}'.format(output.shape, output.sum()))
        print('** gru-hidden: {} → {:.4f}'.format(hidden.shape, hidden.sum()))
        
        output = self.softmax(self.out(output[0]))
        #print('** hidden: {}'.format(hidden.shape))
        
        print('DecoderRNN_OUTPUT: output: {} → {:.4f}'.format(output.shape, output.sum()))
        print('DecoderRNN_OUTPUT: hidden: {} → {:.4f}'.format(hidden.shape, hidden.sum()))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [30]:
# with attention
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        print('AttnDecoderRNN_INPUT: input → embedded: {} → {}'.format(input.shape, embedded.shape))
        print('AttnDecoderRNN_INPUT: hidden: {}'.format(hidden.shape))
        print('AttnDecoderRNN_INPUT: encoder_outputs: {}'.format(encoder_outputs.shape))
        
        #print('** embedded[0]: {}'.format(embedded[0].shape))
        #print('** hidden[0]: {}'.format(hidden[0].shape))
        #print('** concatenate embedded[0] and hidden[0]: {}'.format(torch.cat((embedded[0], hidden[0]), 1).shape))
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        #print('** attn_weights: {}'.format(attn_weights.shape))
        #print('** attn_weights: {}'.format(attn_weights))
        #print('** attn_weights.sum(): {}'.format(attn_weights.sum().shape))
        #print('** attn_weights.sum(): {:.4f}'.format(attn_weights.sum()))
        
        #print('** attn_weights unsqueezed: {}'.format(attn_weights.unsqueeze(0).shape))
        #print('** encoder_outputs unsqueezed: {}'.format(encoder_outputs.unsqueeze(0).shape))
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
        #print('** attn_applied: {}'.format(attn_applied.shape))
        #print('** attn_applied[0]: {}'.format(attn_applied[0].shape))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        #print('** output: {}'.format(output.shape))
        output = self.attn_combine(output).unsqueeze(0)
        #print('** output: {}'.format(output.shape))
        
        output = F.relu(output)
        #print('** output: {}'.format(output.shape))
        output, hidden = self.gru(output, hidden)
        #print('** gru-output: {}'.format(output.shape))
        #print('** gru-hidden: {}'.format(hidden.shape))

        #print('** output: {}'.format(output[0].shape))
        #print('** self.out: {}'.format(self.out))
        output = F.log_softmax(self.out(output[0]), dim=1)
        print('AttnDecoderRNN_OUTPUT: {}'.format(output.shape))
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [31]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [32]:
teacher_forcing_ratio = 0.5

In [44]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH, with_attention=True):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    
    print('** input_tensor[0]: {}'.format(input_tensor[0]))
    print('** target_tensor[0]: {}'.format(target_tensor[0]))
    return

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    print('encoder_outputs: {}'.format(encoder_outputs.shape))
    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]
        print('{} - {}'.format(ei, encoder_output[0, 0].shape))

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden
    print('decoder_hidden: {}'.format(decoder_hidden.shape))

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing 포함: 목표를 다음 입력으로 전달
        for di in range(target_length):
            if with_attention:
                print('# {}th decoding'.format(di))
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                print('train:decoder_attention_{}: {}'.format(di, decoder_attention.shape))
            else:
                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing
            

    else:
        # Teacher forcing 미포함: 자신의 예측을 다음 입력으로 사용
        for di in range(target_length):
            if with_attention:
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                print('train:decoder_attention_{}: {}'.format(di, decoder_attention.shape))
            else:
                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # 입력으로 사용할 부분을 히스토리에서 분리

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [46]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [47]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01, with_attention=True):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # print_every 마다 초기화
    plot_loss_total = 0  # plot_every 마다 초기화

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        print(input_tensor.shape, target_tensor.shape)
        print(input_tensor)
        print(target_tensor)
        print('----------------------')
        
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion, with_attention=with_attention)
        return
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [48]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # 주기적인 간격에 이 locator가 tick을 설정
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [49]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [50]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [51]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

In [52]:
# hidden_size = 256
# encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
# decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

In [53]:
#### Input validation

In [54]:
# trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

In [55]:
trainIters(encoder1, decoder1, 75000, print_every=5000, with_attention=True)

torch.Size([10, 1]) torch.Size([8, 1])
tensor([[ 123],
        [ 245],
        [ 124],
        [ 246],
        [ 963],
        [  34],
        [ 101],
        [2194],
        [   5],
        [   1]], device='cuda:0')
tensor([[ 77],
        [ 78],
        [147],
        [ 22],
        [986],
        [588],
        [  4],
        [  1]], device='cuda:0')
----------------------
** input_tensor[0]: tensor([123], device='cuda:0')
** target_tensor[0]: tensor([77], device='cuda:0')


In [218]:
# evaluateRandomly(encoder1, attn_decoder1)

In [221]:
output_words, attentions = evaluate(
    encoder1, attn_decoder1, "je suis trop froid .")


AttnDecoderRNN_INPUT: input → embedded: torch.Size([1, 1]) → torch.Size([1, 1, 256])
AttnDecoderRNN_INPUT: hidden: torch.Size([1, 1, 256])
AttnDecoderRNN_INPUT: encoder_outputs: torch.Size([10, 256])
** embedded[0]: torch.Size([1, 256])
** hidden[0]: torch.Size([1, 256])
** concatenate embedded[0] and hidden[0]: torch.Size([1, 512])
** attn_weights: torch.Size([1, 10])
** attn_weights: tensor([[0.0686, 0.1335, 0.0626, 0.0603, 0.0611, 0.1706, 0.0989, 0.1053, 0.1473,
         0.0919]], device='cuda:0')
** attn_weights.sum(): torch.Size([])
** attn_weights.sum(): 1.0000
** attn_weights unsqueezed: torch.Size([1, 1, 10])
** encoder_outputs unsqueezed: torch.Size([1, 10, 256])
** attn_applied: torch.Size([1, 1, 256])
** attn_applied[0]: torch.Size([1, 256])
** output: torch.Size([1, 512])
** output: torch.Size([1, 1, 256])
** output: torch.Size([1, 1, 256])
** gru-output: torch.Size([1, 1, 256])
** gru-hidden: torch.Size([1, 1, 256])
** output: torch.Size([1, 256])
** self.out: Linear(in_fe

<matplotlib.image.AxesImage at 0x7f4b216434e0>

In [226]:
def showAttention(input_sentence, output_words, attentions):
    # colorbar로 그림 설정
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # 축 설정
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # 매 틱마다 라벨 보여주기
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [227]:
def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)

In [228]:
evaluateAndShowAttention("elle a cinq ans de moins que moi .")
#evaluateAndShowAttention("elle est trop petit .")
#evaluateAndShowAttention("je ne crains pas de mourir .")
#evaluateAndShowAttention("c est un jeune directeur plein de talent .")

AttnDecoderRNN_INPUT: input → embedded: torch.Size([1, 1]) → torch.Size([1, 1, 256])
AttnDecoderRNN_INPUT: hidden: torch.Size([1, 1, 256])
AttnDecoderRNN_INPUT: encoder_outputs: torch.Size([10, 256])
** embedded[0]: torch.Size([1, 256])
** hidden[0]: torch.Size([1, 256])
** concatenate embedded[0] and hidden[0]: torch.Size([1, 512])
** attn_weights: torch.Size([1, 10])
** attn_weights: tensor([[0.0618, 0.1636, 0.0533, 0.0789, 0.0618, 0.1500, 0.0878, 0.1144, 0.1363,
         0.0921]], device='cuda:0')
** attn_weights.sum(): torch.Size([])
** attn_weights.sum(): 1.0000
** attn_weights unsqueezed: torch.Size([1, 1, 10])
** encoder_outputs unsqueezed: torch.Size([1, 10, 256])
** attn_applied: torch.Size([1, 1, 256])
** attn_applied[0]: torch.Size([1, 256])
** output: torch.Size([1, 512])
** output: torch.Size([1, 1, 256])
** output: torch.Size([1, 1, 256])
** gru-output: torch.Size([1, 1, 256])
** gru-hidden: torch.Size([1, 1, 256])
** output: torch.Size([1, 256])
** self.out: Linear(in_fe

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
