### reference: https://towardsdatascience.com/attention-seq2seq-with-pytorch-learning-to-invert-a-sequence-34faf4133e53

In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from tqdm import tqdm
import torch.nn.functional as F

In [2]:
torch.__version__

'1.7.0+cu101'

In [3]:
torch.__version__

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
MAX_LENGTH = 15

In [5]:
inverse_map = {
    'a':'z',
    'b':'y',
    'c':'x',
    'd':'w',
    'e':'v',
    'f':'u',
    'g':'t',
    'h':'s',
    'i':'r',
    'j':'q',
    'k':'p',
    'l':'o',
    'm':'n',
    'n':'m',
    'o':'l',
    'p':'k',
    'q':'j',
    'r':'i',
    's':'h',
    't':'g',
    'u':'f',
    'v':'e',
    'w':'d',
    'x':'c',
    'y':'b',
    'z':'a'
}

In [6]:
a2i = {
    '<s>':0,
    '</s>':1,
    #'<pad>':2,
    'a':2,
    'b':3,
    'c':4,
    'd':5,
    'e':6,
    'f':7,
    'g':8,
    'h':9,
    'i':10,
    'j':11,
    'k':12,
    'l':13,
    'm':14,
    'n':15,
    'o':16,
    'p':17,
    'q':18,
    'r':19,
    's':20,
    't':21,
    'u':22,
    'v':23,
    'w':24,
    'x':25,
    'y':26,
    'z':27,
}

In [7]:
i2a = {v:k for k, v in a2i.items()}

In [8]:
i2a

{0: '<s>',
 1: '</s>',
 2: 'a',
 3: 'b',
 4: 'c',
 5: 'd',
 6: 'e',
 7: 'f',
 8: 'g',
 9: 'h',
 10: 'i',
 11: 'j',
 12: 'k',
 13: 'l',
 14: 'm',
 15: 'n',
 16: 'o',
 17: 'p',
 18: 'q',
 19: 'r',
 20: 's',
 21: 't',
 22: 'u',
 23: 'v',
 24: 'w',
 25: 'x',
 26: 'y',
 27: 'z'}

In [9]:
def generate_random_alphabet_index():
    random_length = np.random.randint(5, MAX_LENGTH-2)    # -2 because of <s> and </s>
    #random_length = 14
    random_alphabet_index = np.random.randint(0, 26, random_length) + 2
    return random_alphabet_index.tolist()

In [10]:
class AlphabetToyDataset(Dataset):
    def __init__(self, n_dataset=1000):
        bos = 0
        eos = 1
        pad = 2
        self.inputs = []
        self.labels = []
        for _ in range(n_dataset):
            # make input example
            aindex = generate_random_alphabet_index()
            
            # index to alphabet
            alphabet = list(map(lambda a: i2a[a], aindex))
            
            # inversing
            inversed_alphabet = list(map(lambda a: inverse_map[a], alphabet))
            
            # alphabet to index
            iindex = list(map(lambda ia: a2i[ia], inversed_alphabet))
            
            # add bos, eos and pad
            n_pad = MAX_LENGTH - len(aindex) - 2
            aindex = aindex + [eos]# + [pad]*n_pad
            iindex = iindex + [eos]# + [pad]*n_pad
            
            # add to examples
            self.inputs.append(aindex)
            self.labels.append(iindex)
            
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, index):
        return [
            torch.tensor(self.inputs[index], dtype=torch.long),
            torch.tensor(self.labels[index], dtype=torch.long)
        ]

In [11]:
train_dataset = AlphabetToyDataset(n_dataset=3000)
valid_dataset = AlphabetToyDataset(n_dataset=300)

In [12]:
len(train_dataset), len(valid_dataset)

(3000, 300)

In [13]:
def convert_index_to_alphabet(index):
    alphabet = list(map(lambda i: i2a[i], index))
    return ''.join(alphabet)

In [14]:
for i in range(3):
    ex = train_dataset[i]
    aindex, iindex = ex
    
    print('aindex_{}: {}'.format(len(aindex), convert_index_to_alphabet(aindex.numpy())))
    print('iindex_{}: {}'.format(len(iindex), convert_index_to_alphabet(iindex.numpy())))
    print('** aindex_{}: {}'.format(len(aindex), aindex))
    print('** iindex_{}: {}'.format(len(iindex), iindex))
    print('------------')

aindex_9: jpkdaofu</s>
iindex_9: qkpwzluf</s>
** aindex_9: tensor([11, 17, 12,  5,  2, 16,  7, 22,  1])
** iindex_9: tensor([18, 12, 17, 24, 27, 13, 22,  7,  1])
------------
aindex_11: loinslfuve</s>
iindex_11: olrmhoufev</s>
** aindex_11: tensor([13, 16, 10, 15, 20, 13,  7, 22, 23,  6,  1])
** iindex_11: tensor([16, 13, 19, 14,  9, 16, 22,  7,  6, 23,  1])
------------
aindex_6: lsytb</s>
iindex_6: ohbgy</s>
** aindex_6: tensor([13, 20, 26, 21,  3,  1])
** iindex_6: tensor([16,  9,  3,  8, 26,  1])
------------


In [15]:
train_dataloader = DataLoader(train_dataset, batch_size=1)
valid_dataloader = DataLoader(valid_dataset, batch_size=1)

In [16]:
class AlphabetEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(AlphabetEncoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, inputs, hidden):
        #embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.embedding(inputs)
        #print('** embedding: {}'.format(self.embedding(input).shape))
        #print('** embedded: {}'.format(embedded.shape))
        #print('** hidden: {}'.format(hidden.shape))
        output = embedded
        output, hidden = self.gru(output, hidden)
        #print('** hidden: {}'.format(hidden.shape))
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [17]:
class AlphabetDecoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(AlphabetDecoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, hidden):
        embedded = self.embedding(inputs)
        embedded = F.relu(embedded)
        #print('** output: {}'.format(output.shape))
        output, hidden = self.gru(embedded, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [18]:
class AlphabetAttentionDecoder(nn.Module):
    def __init__(self, hidden_size, output_size, max_length):
        super(AlphabetAttentionDecoder, self).__init__()
        # some parameters
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length
        self.dropout = nn.Dropout(0.1)
        
        # Linear layer
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = nn.Linear(2*self.hidden_size, max_length)
        self.context = nn.Linear(2*self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
        # GRU layer
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        
        
    def forward(self, inputs, hidden, encoder_outputs):
        '''
        inputs: (1,B,1)
        hidden: (1,B,H)
        encoder_outputs: (M,B,H)
        '''
        embedded = self.embedding(inputs)
        embedded = self.dropout(embedded)    # (1,B,H)
        #print('** inputs → embedded: {} → {}'.format(inputs.shape, embedded.shape))
        #print('** hidden: {}'.format(hidden.shape))
        #print('** encoder_outputs: {}'.format(encoder_outputs.shape))
        
        weight = F.softmax(
            self.attn(
                torch.cat((embedded[0], hidden[0]),    # (B,2H)
                1)
            ),    # (B,M)
            1
        )    # (B,M)
        #print('** weight: {}'.format(weight.shape))
        
        #print('** weight.unsqueeze(1): {}'.format(weight.unsqueeze(1).shape))
        #print('** encoder_outputs.transpose(0,1): {}'.format(encoder_outputs.transpose(0,1).shape))
        weight_applied = torch.bmm(
            weight.unsqueeze(1),    # (B,1,M)
            encoder_outputs.transpose(0, 1)    # (B,M,H)
        )    # (B,1,H)
        #print('** weight_applied: {}'.format(weight_applied.shape))
        
        
        new_context = self.context(
            torch.cat(
                (embedded[0].unsqueeze(1), weight_applied),
                2
            )    # (B,1,2H)
        )    # (B,1,H)
        new_context = F.relu(new_context)    # (B,1,H)
        new_context = new_context.transpose(0, 1)    # (1,B,H)
        #print('** new_context: {}'.format(new_context.shape))
        #print('** hidden: {}'.format(hidden.shape))
        
        output, hidden = self.gru(
            new_context,    # (1,B,H)
            hidden    # (1,B,H)
        )    # output=(1,B,H), hidden=(1,B,H)
        #print('** output: {}'.format(output.shape))
        #print('** hidden: {}'.format(hidden.shape))
        
        output = self.out(output)    # (1,B,O)
        #print(output)
        output = F.log_softmax(output, 2)    # (1,B,O)
        #print(output)
        #print('** output: {}'.format(output.shape))
        #print('** weight: {}'.format(weight.shape))
        #print('** weight sum: {}'.format(weight.sum()))
        #print('--------------------------------------')
        
        return output, hidden, weight

In [19]:
# class AlphabetAttentionDecoder(nn.Module):
#     def __init__(self, hidden_size, output_size):
#         super(AlphabetAttentionDecoder, self).__init__()
#         self.hidden_size = hidden_size
#         self.output_size = output_size
#         self.embedding = nn.Embedding(self.output_size, self.hidden_size)
#         self.linear_query = nn.Linear(self.hidden_size, self.hidden_size)
#         self.softmax = nn.Softmax(dim=-1)
#         self.dropout = nn.Dropout(0.1)
        
#         self.gru = nn.GRU(self.hidden_size, self.hidden_size)
#         self.out = nn.Linear(self.hidden_size, self.output_size)
        
#     def forward(self, inputs, hidden, encoder_outputs):
#         '''
#         inputs: (B,M)
#         hidden: (1,B,H)
#         encoder_outputs: (M,B,H)
#         '''
#         embedded = self.embedding(inputs)
#         embedded = self.dropout(embedded)    # (B,M,H)
        
#         print('embedded: {}'.format(embedded.shape))
#         print('hidden: {}'.format(hidden.shape))
#         print('encoder_outputs: {}'.format(encoder_outputs.shape))
#         return
        
#         #print(hidden.shape, hidden.dtype)
#         query = self.linear_query(hidden)    # (1,B,H)
#         #print(query.transpose(0,1).shape, embedded.transpose(1,2).shape)
#         #return
#         weight = torch.bmm(query.transpose(0,1), embedded.transpose(1,2))    # (B,1,M)
#         weight = self.softmax(weight)    # (B,1,M)
#         print('** weight: {}'.format(weight.shape))
#         print('** encoder_outputs: {}'.format(encoder_outputs.shape))
#         return
        
#         context = torch.bmm(weight, encoder_outputs.transpose(0,1))    # (B,1,H)
#         context = context.transpose(0,1)
        
#         output, hidden = self.gru(embedded.transpose(0,1), context)    # (M,B,H), (1,B,H)
#         output = self.out(output[0])    # (M,B,H), we do it by assuming M=1
#         output = F.log_softmax(output, 1)    # (M,B,H)
#         #output = self.out(output)
#         #print(output.shape, hidden.shape)
#         return output, hidden, weight

In [20]:
# inp = torch.from_numpy(np.array([
#     [0,1,2,3],
#     [0,1,2,3],
#     [0,1,2,3]
# ]))    # (3,4)
# inp = inp.long()    # (3,4)
# hid = torch.rand((1,3,5))    # (1,3,5)
# eout = torch.rand((4,3,5))    # (4,3,5)
# print(inp.shape, hid.shape, eout.shape)
# att = AlphabetAttentionDecoder(hidden_size=5, output_size=4)
# o,h,w = att(inp, hid, eout)
# print(o.shape)

In [21]:
# inp = torch.from_numpy(np.array([
#     [0],
#     [2],
#     [3]
# ]))    # (3,4)
# inp = inp.long()
# hid = torch.rand((1,3,5))    # (1,3,5)
# eout = torch.rand((1,3,5))
# print(inp.shape, hid.shape, eout.shape)
# att = AlphabetAttentionDecoder(hidden_size=5, output_size=4)
# o,h,w = att(inp, hid, eout)
# print(o.shape)

In [22]:
# class AlphabetAttentionDecoder(nn.Module):
#     def __init__(self, hidden_size, output_size, max_length):
#         super(AlphabetAttentionDecoder, self).__init__()
#         self.hidden_size = hidden_size
#         self.output_size = output_size
#         self.max_length = max_length
        
#         self.embedding = nn.Embedding(self.output_size, self.hidden_size)
#         self.weight_combine = nn.Linear(2*self.hidden_size, self.max_length)
#         self.attention_combine = nn.Linear(2*self.hidden_size, self.hidden_size)
#         #self.softmax = nn.LogSoftmax(dim=1)
#         self.gru = nn.GRU(self.hidden_size, self.hidden_size)
#         self.out = nn.Linear(self.hidden_size, self.output_size)
#         self.dropout = nn.Dropout(0.1)
        
#     def forward(self, inputs, hidden, encoder_outputs):
#         # embedding
#         embedded = self.embedding(inputs)
#         embedded = self.dropout(embedded)

#         # making attention weight
#         concat_hidden = torch.cat((embedded[0], hidden[0]), 1)
#         attention_weight = F.softmax(self.weight_combine(concat_hidden), 1).unsqueeze(1)
        
#         # applying attention weight
#         encoder_outputs_transposed = encoder_outputs.transpose(1, 0)
#         attention_applied = torch.bmm(attention_weight, encoder_outputs_transposed)
        
#         # making new context and new input
#         new_context = torch.cat((embedded[0], attention_applied.squeeze(1)), dim=1)
#         output = self.attention_combine(new_context)
#         output = F.relu(output).unsqueeze(0)
        
#         # running gru
#         output, hidden = self.gru(output, hidden)
        
#         # making output
#         output = self.out(output)
#         output = F.log_softmax(output[0], 1)
    
#         return output, hidden, attention_weight

#     def initHidden(self, batch_size):
#         return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [23]:
use_attention = True

In [24]:
hidden_size = 256
encoder = AlphabetEncoder(26+2, hidden_size).to(device)
#decoder = AlphabetDecoder(hidden_size, 26+3).to(device)
decoder = AlphabetAttentionDecoder(hidden_size, 26+2, MAX_LENGTH-1).to(device)

In [25]:
# inputs_t = inputs.transpose(1, 0)

In [26]:
# batch_size = 32
# encoder_hidden = encoder.initHidden(batch_size)
# encoder_hidden.shape

In [27]:
learning_rate = 0.01
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [28]:
bos = 0
eos = 1
pad = 2

In [2]:
import torch

In [5]:
eout = torch.rand(4, 3, 5)

In [7]:
eout[:,0,:].sum()

tensor(13.4022)

In [8]:
eout_trans = eout.transpose(0,1)
eout_trans.shape

torch.Size([3, 4, 5])

In [9]:
eout_trans[0].sum()

tensor(13.4022)

In [10]:
a = torch.rand(3,1,5)
b = a.transpose(1,0)
c = a.squeeze(1).unsqueeze(0)

In [11]:
b.shape, c.shape

(torch.Size([1, 3, 5]), torch.Size([1, 3, 5]))

In [12]:
b == c

tensor([[[True, True, True, True, True],
         [True, True, True, True, True],
         [True, True, True, True, True]]])

In [29]:
def train(encoder, decoder, dataloader, encoder_optimizer, decoder_optimizer, criterion, n_epochs=5):
    
    encoder.train()
    decoder.train()
    
    # zero_grad for encoder/decoder optimizer
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    for epoch in range(n_epochs):
        
        # zero_grad for encoder/decoder optimizer
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
        loss_avg = 0
        tbar = tqdm(enumerate(dataloader), desc='training {}th epoch'.format(epoch))
        ####################################
        for i, batch in tbar:

            # get inputs and labels
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            # transpose inputs and labels
            inputs = inputs.transpose(1, 0)
            labels = labels.transpose(1, 0)

            # initialize hidden for encoder
            batch_size = inputs.size()[1]
            max_length = inputs.size()[0]
            max_length = MAX_LENGTH - 1
            encoder_hidden = encoder.initHidden(batch_size)
            encoder_outputs = torch.zeros(max_length, batch_size, hidden_size, device=device)
            
            # encoding
            for j, inp in enumerate(inputs):
                inp = inp.unsqueeze(0)
                encoder_output, encoder_hidden = encoder(inp, encoder_hidden)
                encoder_outputs[j] = encoder_output[0]
                
            # initialize hidden for decoder
            decoder_hidden = encoder_hidden
            loss = 0

            decoder_inputs = torch.tensor([[bos]*batch_size], device=device)    # (1,B)
            #decoder_inputs = decoder_inputs.transpose(0,1)    # (B,1)

            teacher_forcing = True if np.random.random() < 0.5 else False
            #teacher_forcing = True
            # decoding
            for inp in labels:
                if teacher_forcing:
                    decoder_output, decoder_hidden, attention_weight = decoder(decoder_inputs, decoder_hidden, encoder_outputs)
                    loss_it = criterion(decoder_output[0], inp)    # criterion((B,O), (B))
                    loss += loss_it
                    decoder_inputs = inp.unsqueeze(0)
                else:
                    decoder_output, decoder_hidden, attention_weight = decoder(decoder_inputs, decoder_hidden, encoder_outputs)
                    loss_it = criterion(decoder_output[0], inp)
                    loss += loss_it
                    topv, topi = decoder_output.topk(1)
                    decoder_inputs = topi.squeeze(2)
                    
            # backward
            loss.backward()
            encoder_sum = sum([p[1].data.sum() for p in encoder.named_parameters()])
            decoder_sum = sum([p[1].data.sum() for p in decoder.named_parameters()])
            
            # update encoder/decoder
            encoder_optimizer.step()
            decoder_optimizer.step()
            encoder_sum = sum([p[1].data.sum() for p in encoder.named_parameters()])
            decoder_sum = sum([p[1].data.sum() for p in decoder.named_parameters()])
            
            tbar.set_postfix(loss=loss.data.item())
                

In [None]:
train(encoder, decoder, train_dataloader, encoder_optimizer, decoder_optimizer, criterion, n_epochs=3)

training 0th epoch: 175it [00:08, 21.93it/s, loss=3.03e+3]

In [72]:
def validate(encoder, decoder, dataloader):
    total_accuracy = 0.0
    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            # get inputs and labels
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            #print('** inputs[0]: {}'.format(inputs[0]))
            #print('** labels[0]: {}'.format(labels[0]))

            # transpose inputs and labels
            inputs = inputs.transpose(1, 0)
            labels = labels.transpose(1, 0)

            # initialize hidden for encoder
            batch_size = inputs.size()[1]
            encoder_hidden = encoder.initHidden(batch_size)
            encoder_outputs = torch.zeros(MAX_LENGTH, batch_size, hidden_size, device=device)
            
            # encoding
            for j, inp in enumerate(inputs):
                inp = inp.unsqueeze(0)
                encoder_output, encoder_hidden = encoder(inp, encoder_hidden)
                #print('** encoder_output_{}: {}'.format(i, encoder_output.shape))
                encoder_outputs[j] = encoder_output[0]
            
            # initialize hidden for decoder
            decoder_hidden = encoder_hidden
            decoder_inputs = torch.tensor([[bos]*batch_size], device=device)
            #decoder_inputs = decoder_inputs.transpose(0,1)    # (B,1)
            
            pred = []
            # decoding
            for j, inp in enumerate(labels):
                #print('** decoder_inputs: {}'.format(decoder_inputs.shape))
                decoder_output, decoder_hidden, attention_weight = decoder(decoder_inputs, decoder_hidden, encoder_outputs[j].unsqueeze(0))
                #print('** decoder_output: {}'.format(decoder_output.shape))
                topv, topi = decoder_output.topk(1)
                #print('** new decoder_inputs: {}'.format(decoder_inputs.shape))
                pred.append(topi.cpu().numpy().flatten().tolist())
                decoder_inputs = topi

            # re-transpose for validation
            inputs = inputs.transpose(1, 0)
            labels = labels.transpose(1, 0).cpu().numpy()
            #return

            # stack-up prediction for validation
            pred = np.array(pred)
            pred = np.transpose(pred)
            #print(labels.shape, pred.shape)
            #pred = np.stack(pred, axis=1)
            accuracy = (pred == labels).astype(np.int).mean()
            total_accuracy += accuracy
            #print('{} VS {} → {:.4f}'.format(labels.shape, pred.shape, accuracy))
            
        total_accuracy = total_accuracy/(i+1)
        print('total accuracy: {:.4f}'.format(total_accuracy))
        return labels, pred, total_accuracy

In [58]:
labels, pred, acc = validate(encoder, decoder, valid_dataloader)

RuntimeError: invalid argument 6: wrong matrix size at /pytorch/aten/src/THC/generic/THCTensorMathBlas.cu:41

In [78]:
acc

0.9933483368483371

In [47]:
labels

array([[ 5,  4, 20, 18, 13, 10,  8, 13,  3, 16, 11, 25,  1]])

In [48]:
pred

array([[[ 5],
        [ 4],
        [20],
        [18],
        [13],
        [10],
        [ 8],
        [13],
        [ 3],
        [16],
        [11],
        [25],
        [ 1]]])

In [173]:
def predict_sequence(sequence):
    label = list(map(lambda x:inverse_map[x], sequence))
    label = list(map(lambda x:a2i[x], label))
    label_string = convert_index_to_alphabet(label)
    
    # print input and expected output
    print('** input sequence: {}'.format(sequence))
    print('** label sequence: {}'.format(label_string))
    
    with torch.no_grad():
        sequence = list(map(lambda s: a2i[s], sequence))
        inputs = torch.tensor(sequence)
        inputs = inputs.unsqueeze(1)
        inputs = inputs.to(device)
        
        sequence_length = inputs.shape[0]
        batch_size = 1
        encoder_hidden = encoder.initHidden(batch_size)
        encoder_outputs = torch.zeros(MAX_LENGTH, batch_size, hidden_size, device=device)

        # encoding
        for j, inp in enumerate(inputs):
            inp = inp.unsqueeze(0)
            #print(inp.shape, encoder_hidden.shape)
            #return
            encoder_output, encoder_hidden = encoder(inp, encoder_hidden)
            #print('** encoder_output_{}: {}'.format(i, encoder_output.shape))
            encoder_outputs[j] = encoder_output[0]
        
        # initialize hidden for decoder
        decoder_hidden = encoder_hidden
        decoder_inputs = torch.tensor([[a2i['<s>']]*batch_size], device=device)
        #decoder_inputs = decoder_inputs.transpose(0,1)    # (B,1)

        pred = []
        attention_weights = []
        # decoding
        for j in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attention_weight = decoder(decoder_inputs, decoder_hidden, encoder_outputs[j].unsqueeze(0))
            #print('** decoder_output: {}'.format(decoder_output.shape))
            topv, topi = decoder_output.topk(1)
            print(attention_weight)
            attention_weights.append(attention_weight)
            
            if topi.item() == a2i['</s>'] or sequence_length == j:
                pred.append(a2i['</s>'])
                break
            else:
                #print('** new decoder_inputs: {}'.format(decoder_inputs.shape))
                #pred.append(topi.cpu().numpy().flatten().tolist())
                pred.append(topi.item())
                decoder_inputs = topi
        
        pred = np.array(pred).flatten()[:-1]    # remove the last eos character
        converted_string = convert_index_to_alphabet(pred)
        attention_weights = np.array(attention_weights)
        print('** pred  sequence: {}'.format(converted_string))
        
        return converted_string, attention_weights
        #return pred

In [174]:
o, attention_weights = predict_sequence('abcd')
attention_weights.shape

** input sequence: abcd
** label sequence: zyxw
tensor([[[1.]]], device='cuda:0')
tensor([[[1.]]], device='cuda:0')
tensor([[[1.]]], device='cuda:0')
tensor([[[1.]]], device='cuda:0')
tensor([[[1.]]], device='cuda:0')
** pred  sequence: zyxw


(5,)

In [172]:
attention_weights[0]

tensor([[[1.]]], device='cuda:0')

### save your model

In [162]:
import datetime
import os

In [166]:
randidx = '{}'.format(np.random.randint(0, 10000)).zfill(4)
yymmdd = datetime.datetime.now().strftime('%Y%m%d')
model_nm = os.path.join('encoder-decoder-{}-{}.bin'.format(yymmdd, randidx))
model_nm

'encoder-decoder-20201221-5120.bin'

In [167]:
encoder_state = encoder.state_dict()
decoder_state = decoder.state_dict()
torch.save({
    'encoder_state': encoder_state,
    'decoder_state': decoder_state,
}, model_nm)

### *************** NMT ***************

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [None]:
torch.__version__

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

### 데이터셋 다운로드 → https://download.pytorch.org/tutorial/data.zip

### 참고 → https://tutorials.pytorch.kr/intermediate/seq2seq_translation_tutorial.html

In [None]:
SOS_token = 0
EOS_token = 1

In [None]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS 와 EOS 포함

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [None]:
# 유니 코드 문자열을 일반 ASCII로 변환하십시오.
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

In [None]:
# 소문자, 다듬기, 그리고 문자가 아닌 문자 제거
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [None]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # 파일을 읽고 줄로 분리
    lines = open('../data/attention-ntm/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # 모든 줄을 쌍으로 분리하고 정규화
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # 쌍을 뒤집고, Lang 인스턴스 생성
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [None]:
MAX_LENGTH = 10

In [None]:
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

In [None]:
def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)

In [None]:
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [None]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [None]:
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs))

In [None]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
# without Attention
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        print('DecoderRNN_INPUT: input: {} → {:.4f}'.format(input.shape, input.sum()))
        print('DecoderRNN_INPUT: hidden: {} → {:.4f}'.format(hidden.shape, hidden.sum()))
        
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        print('** output: {} → {:.4f}'.format(output.shape, output.sum()))
        
        
        output, hidden = self.gru(output, hidden)
        print('** gru-output: {} → {:.4f}'.format(output.shape, output.sum()))
        print('** gru-hidden: {} → {:.4f}'.format(hidden.shape, hidden.sum()))
        
        output = self.softmax(self.out(output[0]))
        #print('** hidden: {}'.format(hidden.shape))
        
        print('DecoderRNN_OUTPUT: output: {} → {:.4f}'.format(output.shape, output.sum()))
        print('DecoderRNN_OUTPUT: hidden: {} → {:.4f}'.format(hidden.shape, hidden.sum()))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
# with attention
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        print('AttnDecoderRNN_INPUT: input → embedded: {} → {}'.format(input.shape, embedded.shape))
        print('AttnDecoderRNN_INPUT: hidden: {}'.format(hidden.shape))
        print('AttnDecoderRNN_INPUT: encoder_outputs: {}'.format(encoder_outputs.shape))
        
        #print('** embedded[0]: {}'.format(embedded[0].shape))
        #print('** hidden[0]: {}'.format(hidden[0].shape))
        #print('** concatenate embedded[0] and hidden[0]: {}'.format(torch.cat((embedded[0], hidden[0]), 1).shape))
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        #print('** attn_weights: {}'.format(attn_weights.shape))
        #print('** attn_weights: {}'.format(attn_weights))
        #print('** attn_weights.sum(): {}'.format(attn_weights.sum().shape))
        #print('** attn_weights.sum(): {:.4f}'.format(attn_weights.sum()))
        
        #print('** attn_weights unsqueezed: {}'.format(attn_weights.unsqueeze(0).shape))
        #print('** encoder_outputs unsqueezed: {}'.format(encoder_outputs.unsqueeze(0).shape))
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
        #print('** attn_applied: {}'.format(attn_applied.shape))
        #print('** attn_applied[0]: {}'.format(attn_applied[0].shape))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        #print('** output: {}'.format(output.shape))
        output = self.attn_combine(output).unsqueeze(0)
        #print('** output: {}'.format(output.shape))
        
        output = F.relu(output)
        #print('** output: {}'.format(output.shape))
        output, hidden = self.gru(output, hidden)
        #print('** gru-output: {}'.format(output.shape))
        #print('** gru-hidden: {}'.format(hidden.shape))

        #print('** output: {}'.format(output[0].shape))
        #print('** self.out: {}'.format(self.out))
        output = F.log_softmax(self.out(output[0]), dim=1)
        print('AttnDecoderRNN_OUTPUT: {}'.format(output.shape))
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [None]:
teacher_forcing_ratio = 0.5

In [None]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH, with_attention=True):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    
    print('** input_tensor[0]: {}'.format(input_tensor[0]))
    print('** target_tensor[0]: {}'.format(target_tensor[0]))
    return

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    print('encoder_outputs: {}'.format(encoder_outputs.shape))
    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]
        print('{} - {}'.format(ei, encoder_output[0, 0].shape))

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden
    print('decoder_hidden: {}'.format(decoder_hidden.shape))

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing 포함: 목표를 다음 입력으로 전달
        for di in range(target_length):
            if with_attention:
                print('# {}th decoding'.format(di))
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                print('train:decoder_attention_{}: {}'.format(di, decoder_attention.shape))
            else:
                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing
            

    else:
        # Teacher forcing 미포함: 자신의 예측을 다음 입력으로 사용
        for di in range(target_length):
            if with_attention:
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                print('train:decoder_attention_{}: {}'.format(di, decoder_attention.shape))
            else:
                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # 입력으로 사용할 부분을 히스토리에서 분리

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01, with_attention=True):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # print_every 마다 초기화
    plot_loss_total = 0  # plot_every 마다 초기화

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        print(input_tensor.shape, target_tensor.shape)
        print(input_tensor)
        print(target_tensor)
        print('----------------------')
        
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion, with_attention=with_attention)
        return
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # 주기적인 간격에 이 locator가 tick을 설정
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [None]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

In [None]:
# hidden_size = 256
# encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
# decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

In [None]:
#### Input validation

In [None]:
# trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

In [None]:
trainIters(encoder1, decoder1, 75000, print_every=5000, with_attention=True)

In [None]:
# evaluateRandomly(encoder1, attn_decoder1)

In [None]:
output_words, attentions = evaluate(
    encoder1, attn_decoder1, "je suis trop froid .")


In [None]:
def showAttention(input_sentence, output_words, attentions):
    # colorbar로 그림 설정
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # 축 설정
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # 매 틱마다 라벨 보여주기
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [None]:
def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)

In [None]:
evaluateAndShowAttention("elle a cinq ans de moins que moi .")
#evaluateAndShowAttention("elle est trop petit .")
#evaluateAndShowAttention("je ne crains pas de mourir .")
#evaluateAndShowAttention("c est un jeune directeur plein de talent .")

In [None]:
from torch.utils import data
from random import choice, randrange
import numpy as np

In [None]:

class ToyDataset(data.Dataset):
    """
    Inspired from https://talbaumel.github.io/blog/attention/
    """
    def __init__(self, min_length=5, max_length=20, type='train'):
        self.SOS = "<s>"  
        self.EOS = "</s>" 
        self.characters = list("abcd")
        self.int2char = list(self.characters)
        self.char2int = {c: i+3 for i, c in enumerate(self.characters)}
        self.VOCAB_SIZE = len(self.characters)
        self.min_length = min_length
        self.max_length = max_length
        if type=='train':
            self.set = [self._sample() for _ in range(3000)]
        else:
            self.set = [self._sample() for _ in range(300)]

    def __len__(self):
        return len(self.set)

    def __getitem__(self, item):
        return self.set[item]

    def _sample(self):
        random_length = randrange(self.min_length, self.max_length)# Pick a random length
        random_char_list = [choice(self.characters[:-1]) for _ in range(random_length)]  # Pick random chars
        random_string = ''.join(random_char_list)
        print(random_string)
        a = np.array([self.char2int.get(x) for x in random_string])
        b = np.array([self.char2int.get(x) for x in random_string[::-1]] + [2]) # Return the random string and its reverse
        x = np.zeros((random_length, self.VOCAB_SIZE))
        x[np.arange(random_length), a-3] = 1
        return x, b

In [None]:
ds = ToyDataset()

In [None]:
ds[0]

In [None]:
caaabbaabaaba
abaabaabbaaac