In [2]:
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [4]:
SOS_token = 0


class Scrambled:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS"}
        self.n_words = 1  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
def readLangs():
    print("Reading lines...")

    with open('../data/train_original.txt') as f:
        lines_original = f.read().splitlines()
    
    with open('../data/train_scrambled.txt') as f:
        lines_scrambled = f.read().splitlines()

    pairs = list(zip(lines_scrambled,lines_original))
    lang = Lang('train_vocab')  # x and y vocab remains the same

    return lang, pairs

In [4]:
def prepareData():
    lang, pairs = readLangs()
    print("Read %s sentence pairs" % len(pairs))
    #pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        lang.addSentence(pair[0])
        
    print("Counted words:")
    print(lang.name, lang.n_words)
    return lang, pairs


lang, pairs = prepareData()
print(random.choice(pairs))

Reading lines...
Read 10000 sentence pairs
Trimmed to 10000 sentence pairs
Counting words...
Counted words:
train_vocab 21422
('legislation. social all taking on decisions for supranational responsible EU the make to misguided be would it believe we issues, but certain on decision-making coordinate EU to the for acceptable certainly is It', 'It is certainly acceptable for the EU to coordinate decision-making on certain issues, but we believe it would be misguided to make the EU responsible for taking supranational decisions on all social legislation.')


KeyError: 'EOS'

In [5]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.linear = nn.Linear(hidden_size, hidden_size)

        
        self.self_att = SelfAttention(hidden_size)
    def forward(self, input):
        embedded = self.embedding(input)
        linear_out = self.linear(embedded)
        output_att = linear_out.clone()
        for k in range(linear_out.shape[0]):
          
            output_att[k] = self.self_att(linear_out[k].unsqueeze(0),linear_out,linear_out)
        
        return output_att

In [6]:
class SelfAttention(torch.nn.Module):
    def __init__(self,hid_dim):
        super(SelfAttention,self).__init__()
        self.k_dim = hid_dim
        self.q_projection = torch.nn.Linear(hid_dim,hid_dim)
        self.k_projection = torch.nn.Linear(hid_dim,hid_dim)
        self.v_projection = torch.nn.Linear(hid_dim,hid_dim)

    def forward(self,q,k,v):
        q_proj = self.q_projection(q)
        k_proj = self.k_projection(k)
        v_proj = self.v_projection(v)
        attention = F.softmax(q_proj@torch.t(k_proj),dim=1)
        att_wtd = (attention @ v_proj)
        return att_wtd/self.k_dim**0.5

In [52]:
encoder = EncoderRNN(lang.n_words,100)


torch.Size([1, 100])

In [10]:
with open('data/data/train_original.txt') as f:
    lines_original = f.read().splitlines()
MAX_LENGTH = max([len(line.split()) for line in lines_original])
MAX_LENGTH = MAX_LENGTH + 1

In [51]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.LSTM(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0][0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
 

        output = torch.cat((embedded[0], attn_applied[0]), 1)
 
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        
        output, hidden = self.gru(output, hidden)
        

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size), torch.zeros(1, 1, self.hidden_size)

In [49]:
class AttnDecoderGRU(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderGRU, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        print(hidden[0].shape)
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
 

        output = torch.cat((embedded[0], attn_applied[0]), 1)
 
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        print(hidden.shape)
        output, hidden = self.gru(output, hidden)
        

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

In [90]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(lang, pair[0])
    target_tensor = tensorFromSentence(lang, pair[1])
    return (input_tensor, target_tensor)

In [None]:
def pos_index_from_sentence(sentence,pos_to_idx):
    idx = []
    tokens = nltk.word_tokenize(sentence)
    tag_pairs = nktk.pos_tag(tokens)
    tags = [tag for word, tag in tag_pairs]
    for tag in tags:
        if ta

In [29]:
teacher_forcing_ratio = 1
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):


    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size)


    loss = 0
 
    final_encoder_outputs = encoder(input_tensor.view(1,input_tensor.shape[0]).squeeze(0))


    for ei in range(input_length):
        encoder_outputs[ei] = final_encoder_outputs[ei]

        
        
    
    
    decoder_input = torch.tensor([[SOS_token]])

    decoder_hidden = decoder.initHidden()
  

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [30]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [87]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        print(input_tensor)
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            torch.save(encoder, "encoder_self_attn"+str(iter)+".pth")
            torch.save(decoder, "decoder_self_attn"+str(iter)+".pth")
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

#         if iter % plot_every == 0:
#             plot_loss_avg = plot_loss_total / plot_every
#             plot_losses.append(plot_loss_avg)
#             plot_loss_total = 0

    #showPlot(plot_losses)

In [88]:
hidden_size = 256
encoder1 = EncoderRNN(lang.n_words, hidden_size)
attn_decoder1 = AttnDecoderRNN(hidden_size, lang.n_words, dropout_p=0.1)

trainIters(encoder1, attn_decoder1, 30000, print_every=5)

tensor([[13299],
        [ 4204],
        [ 4430],
        [   17],
        [   31],
        [   32],
        [  266],
        [   17],
        [12754],
        [   17],
        [ 4204],
        [   43],
        [18620],
        [ 2469],
        [  117],
        [  251],
        [ 1269],
        [   31],
        [  188],
        [ 4312],
        [   38],
        [   43],
        [ 2786],
        [ 4444],
        [ 5781],
        [  791],
        [    1]])
tensor([[7103],
        [2960],
        [ 251],
        [  26],
        [8087],
        [1132],
        [1803],
        [1025],
        [4439],
        [   6],
        [  17],
        [ 851],
        [   6],
        [1498],
        [  38],
        [1246],
        [   6],
        [ 454],
        [  17],
        [ 934],
        [ 345],
        [  12],
        [ 995],
        [   6],
        [5007],
        [ 180],
        [  47],
        [ 314],
        [   1]])


KeyboardInterrupt: 

In [None]:
x = torch.randn(16, 1580, 201)
idx = torch.tensor(
    [1580, 959, 896, 881, 881, 881, 881, 881, 881, 881, 881, 881, 881, 335, 254, 219]
)
idx = idx - 1  # 0-based index
y = x[torch.arange(x.size(0)), idx]

In [89]:
pairs

[("We thorough make a to Prodi President under group Commissioners' up a set then analysis.",
  "We then set up a Commissioners' group under President Prodi to make a thorough analysis."),
 ("support. and engagement your for gratitude express to want I view of point this from and critical, be will climatic strategy resources natural environment and the and cooperation instrument development to the relation in support European Parliament's The",
  "The European Parliament's support in relation to the development cooperation instrument and the environment and natural resources climatic strategy will be critical, and from this point of view I want to express gratitude for your engagement and support."),
 ("children's health. substances on chemical impact of the demonstrating studies the and ago, year recalled a were problems that safety presenting toys of number significant the toys: of safety the on legislation its review to European Parliament the led have elements crucial two Minister,

In [64]:
torch.zeros(MAX_LENGTH,100)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [64]:
long_line = " ".join([line for line in lines_original])


In [66]:
text = nltk.word_tokenize(long_line)

In [68]:
tags = nltk.pos_tag(text)

In [71]:
all_tags = [ tag for (word,tag) in tags]

In [80]:
pos_tags = [tag for tag in set(all_tags) if tag.isalnum()]

In [81]:
pos_tags.append('UNK_TAG')

In [83]:
tag_to_idx = {tag:i for i,tag in enumerate(pos_tags)}

In [85]:
idx_to_tag = {i:t for t,i in tag_to_idx.items()}

In [86]:
tag_to_idx

{'WDT': 0,
 'SYM': 1,
 'RBS': 2,
 'VBG': 3,
 'VBZ': 4,
 'RBR': 5,
 'VBD': 6,
 'NNPS': 7,
 'WP': 8,
 'VBN': 9,
 'IN': 10,
 'JJS': 11,
 'UH': 12,
 'WRB': 13,
 'RB': 14,
 'POS': 15,
 'NN': 16,
 'DT': 17,
 'NNS': 18,
 'JJR': 19,
 'MD': 20,
 'EX': 21,
 'CD': 22,
 'NNP': 23,
 'RP': 24,
 'PDT': 25,
 'PRP': 26,
 'VBP': 27,
 'JJ': 28,
 'TO': 29,
 'FW': 30,
 'CC': 31,
 'VB': 32,
 'UNK_TAG': 33}

In [92]:
nltk.pos_tag(nltk.word_tokenize('hello world.'))

[('hello', 'JJ'), ('world', 'NN'), ('.', '.')]