In [1]:
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

### RUN ALL THE CELLS SEQUENTIALLy
#### Train the model for atleast 20000-30000 iterations (3 epochs)

In [115]:
SOS_token = 0
EOS_token = 1


class Vocab:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS"}
        self.n_words = 1  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [116]:
def readVocab():
    print("Reading lines...")

    with open('../data/train_original.txt') as f:
        lines_original = f.read().splitlines()
    
    with open('../data/train_scrambled.txt') as f:
        lines_scrambled = f.read().splitlines()

    pairs = list(zip(lines_scrambled,lines_original))
    vocab = Vocab('train_vocab')  # x and y vocab remains the same

    return vocab, pairs

In [117]:
def prepareData():
    vocab, pairs = readVocab()
    print("Read %s sentence pairs" % len(pairs))
    #pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        vocab.addSentence(pair[0])
        
    print("Counted words:")
    print(vocab.name, vocab.n_words)
    return vocab, pairs


vocab, pairs = prepareData()
print(random.choice(pairs))

Reading lines...
Read 10000 sentence pairs
Trimmed to 10000 sentence pairs
Counting words...
Counted words:
train_vocab 21421
('well. as countries other with agreements facilitation conclude visa to made being are efforts aware, be will you As', 'As you will be aware, efforts are being made to conclude visa facilitation agreements with other countries as well.')


In [118]:
#Includes self attention
class AttentiveEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(AttentiveEncoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.linear = nn.Linear(hidden_size, hidden_size)

        
        self.self_att = SelfAttention(hidden_size)
    def forward(self, input):
        embedded = self.embedding(input)
        linear_out = self.linear(embedded)
        output_att = linear_out.clone()
        for k in range(linear_out.shape[0]):
          
            output_att[k] = self.self_att(linear_out[k].unsqueeze(0),linear_out,linear_out)
        
        return output_att

### CUSTOM SELF ATTENTION

In [119]:
class SelfAttention(torch.nn.Module):
    def __init__(self,hid_dim):
        super(SelfAttention,self).__init__()
        self.k_dim = hid_dim
        self.q_projection = torch.nn.Linear(hid_dim,hid_dim)
        self.k_projection = torch.nn.Linear(hid_dim,hid_dim)
        self.v_projection = torch.nn.Linear(hid_dim,hid_dim)

    def forward(self,q,k,v):
        q_proj = self.q_projection(q)
        k_proj = self.k_projection(k)
        v_proj = self.v_projection(v)
        attention = F.softmax(q_proj@torch.t(k_proj),dim=1)
        att_wtd = (attention @ v_proj)
        return att_wtd/self.k_dim**0.5

In [120]:
with open('../data/train_original.txt') as f:
      lines_original = f.read().splitlines()
MAX_LENGTH = max([len(line.split()) for line in lines_original])
MAX_LENGTH = MAX_LENGTH + 1

### Bahdanaus attention in the decoder

In [121]:
class BahdanauDecoder(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(BahdanauDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0][0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
 

        output = torch.cat((embedded[0], attn_applied[0]), 1)
 
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.lstm(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size), torch.zeros(1, 1, self.hidden_size)

In [122]:
def indexesFromSentence(vocab, sentence):
    return [vocab.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(vocab, sentence):
    indexes = indexesFromSentence(vocab, sentence)

    return torch.tensor(indexes, dtype=torch.long).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(vocab, pair[0])
    target_tensor = tensorFromSentence(vocab, pair[1])
    return (input_tensor, target_tensor)

In [123]:
teacher_forcing_ratio = 1
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):


    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size)


    loss = 0
 
    final_encoder_outputs = encoder(input_tensor.view(1,input_tensor.shape[0]).squeeze(0))


    for ei in range(input_length):
        encoder_outputs[ei] = final_encoder_outputs[ei]

        
        
    
    
    decoder_input = torch.tensor([[SOS_token]])

    decoder_hidden = decoder.initHidden()




    # Teacher forcing: Feed the target as the next input
    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])
        decoder_input = target_tensor[di]  # Teacher forcing


    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [124]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [127]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        print(iter)

        if iter % print_every == 0:
            torch.save(encoder, "../models/encoder_self_attn"+str(iter)+".pth")
            torch.save(decoder, "../models/decoder_self_attn"+str(iter)+".pth")
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))


In [None]:
hidden_size = 256
encoder1 = AttentiveEncoder(vocab.n_words, hidden_size)
attn_decoder1 = BahdanauDecoder(hidden_size, vocab.n_words, dropout_p=0.1)

trainIters(encoder1, attn_decoder1, 30000, print_every=5000)

### EVALUATION

In [129]:
def readVocab_test():
    print("Reading lines...")

    with open('../data/test_scrambled.txt') as f:
        lines_test = f.read().splitlines()
    
 
        

   

    return lines_test

In [130]:
with open('../data/train_original.txt') as f:
    lines_original = f.read().splitlines()
MAX_LENGTH = max([len(line.split()) for line in lines_original])
MAX_LENGTH = MAX_LENGTH + 1

##### I am ignoring OOV words as I forgot to train an UNK token during training

In [70]:
def tensorFromSentence_test(vocab, sentence):
    indexes = indexesFromSentence_test(vocab, sentence)
 
    return torch.tensor(indexes, dtype=torch.long).view(-1, 1)

def indexesFromSentence_test(vocab, sentence):
    idx = []
    for word in sentence.split(' '):
        #Ignore OOV words
        if word in vocab.word2index.keys():
            idx.append(vocab.word2index[word])

    return idx

#### In evaluation I am sampling only from the input word vocabulary and not the complete vocabulary

In [78]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence_test(vocab, sentence)
    
        
        idx = input_tensor.view(input_tensor.shape[0])

        
  
 
        
        input_length = input_tensor.size()[0]


        encoder_outputs = torch.zeros(max_length, encoder.hidden_size)
        final_encoder_outputs = encoder(input_tensor.view(1,input_tensor.shape[0]).squeeze(0))
        for ei in range(input_length):
            encoder_outputs[ei] = final_encoder_outputs[ei]


        decoder_input = torch.tensor([[SOS_token]])  # SOS

        decoder_hidden = decoder.initHidden()

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(len(input_tensor)-1):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            temp = decoder_output.squeeze(0)
            temp = temp[idx]            

            topv, topi = temp.data.topk(1)
            
            top_index = topi.item()
            topi = idx[topi]
        
            idx = torch.cat([idx[0:top_index], idx[top_index+1:]])

            decoded_words.append(vocab.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [71]:
lines_test = readVocab_test()

Reading lines...


In [72]:
def evaluateRandomly_test(encoder, decoder, n=1):
    for i in range(n):
        pair = random.choice(lines_test)
        print('>', pair)

        output_words = evaluate(encoder, decoder, pair)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

### Randomly run a test pair

In [98]:
encoder.eval()
decoder.eval()
evaluateRandomly_test(encoder1, attn_decoder1)

> spent. are manage they that EU how funds the on statements their annual submit undertake to States Member the that importance paramount of view my in is It
< It is to the importance of the Member States that are in my view that they on their EU funds how statements manage undertake annual paramount



In [99]:
def evaluate_all_test(encoder,decoder,lines_test):
    unscrambled = []
    for line in lines_test:
    
        output_words = evaluate(encoder, decoder, line)
        output_sentence = ' '.join(output_words)
        unscrambled.append(output_sentence)
    
    return unscrambled


In [101]:
out = evaluate_all_test(encoder1,attn_decoder1,lines_test)

In [131]:
print(out)

['in this debate is very serious that it is to the issue of the human action on that it has now has of it, and of human and climate change, and on whether at global areas can cause clear extremely diverse level. caused life, threatening implications verifying causes imbalances reverse', 'Mr President, in the last legal of our consequences was at Mr when Mr \xa0\xa0 about he spoke part-session warned', 'The Member States. the directive in every and to cooperation between from covers for provides stage', 'of any questions of water with and at access could water easily nationalist supplies', 'in the first that the Commission in its Parliament of all at it. itself had shown thus therefore, reading pronounced card prohibiting red', 'Prime Minister will the measures which the reduce he announced hopes', 'We think about a specific of therefore, should, label.', "I believe that the European Parliament, we will have to the House to this report. in order in this great deal with a regulatory of a

In [112]:
#save

with open('../predictions/predictions_self_attention_v2.0.txt', 'w') as f:
    for item in out:
        f.write("%s\n" % item)