## Using POS tags as feature embeddings

### Run all the cells sequentially

In [4]:
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS"}
        self.n_words = 1  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split():
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [6]:
def readLangs():
    print("Reading lines...")

    with open('../data/train_original.txt') as f:
        lines_original = f.read().splitlines()
    
    with open('../data/train_scrambled.txt') as f:
        lines_scrambled = f.read().splitlines()

    pairs = list(zip(lines_scrambled,lines_original))
    lang = Lang('train_vocab')  # x and y vocab remains the same

    return lang, pairs

In [7]:
def prepareData():
    lang, pairs = readLangs()
    print("Read %s sentence pairs" % len(pairs))
    #pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        lang.addSentence(pair[0])
        
    print("Counted words:")
    print(lang.name, lang.n_words)
    return lang, pairs


lang, pairs = prepareData()
print(random.choice(pairs))

Reading lines...
Read 10000 sentence pairs
Trimmed to 10000 sentence pairs
Counting words...
Counted words:
train_vocab 21360
('Internet. the through also but paper Signatures on', 'Signatures on paper but also through the Internet.')


### Here I am introducing another embedding layer exclusivley for the POS tags that will be concatenated with word embeddings

In [9]:
class AttentiveEncoderPOS(nn.Module):
    def __init__(self, input_size, hidden_size,pos_size):
        super(AttentiveEncoderPOS, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.pos_embedding = nn.Embedding(pos_size,hidden_size)   # POS embedding layer
        self.linear = nn.Linear(2*hidden_size, hidden_size)

        
        self.self_att = SelfAttention(hidden_size)
    def forward(self, input,pos_input):
        embedded = self.embedding(input)
        pos_embedding = self.pos_embedding(pos_input)
      
        emb_with_pos = torch.cat([embedded,pos_embedding],dim=1)
   
        linear_out = self.linear(emb_with_pos)
        output_att = linear_out.clone()
        for k in range(linear_out.shape[0]):
          
            output_att[k] = self.self_att(linear_out[k].unsqueeze(0),linear_out,linear_out)
        
        return output_att

In [10]:
class SelfAttention(torch.nn.Module):
    def __init__(self,hid_dim):
        super(SelfAttention,self).__init__()
        self.k_dim = hid_dim
        self.q_projection = torch.nn.Linear(hid_dim,hid_dim)
        self.k_projection = torch.nn.Linear(hid_dim,hid_dim)
        self.v_projection = torch.nn.Linear(hid_dim,hid_dim)

    def forward(self,q,k,v):
        q_proj = self.q_projection(q)
        k_proj = self.k_projection(k)
        v_proj = self.v_projection(v)
        attention = F.softmax(q_proj@torch.t(k_proj),dim=1)
        att_wtd = (attention @ v_proj)
        return att_wtd/self.k_dim**0.5

In [12]:
with open('../data/train_original.txt') as f:
    lines_original = f.read().splitlines()
MAX_LENGTH = max([len(line.split()) for line in lines_original])
MAX_LENGTH = MAX_LENGTH + 1

In [14]:
class BahdanauDecoder(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(BahdanauDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0][0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
 

        output = torch.cat((embedded[0], attn_applied[0]), 1)
 
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        
        output, hidden = self.lstm(output, hidden)
        

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size), torch.zeros(1, 1, self.hidden_size)

In [15]:
def indexesFromSentence(lang, sentence):
   
    
    return [lang.word2index[word] for word in sentence.split()]
   


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)

    return torch.tensor(indexes, dtype=torch.long).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(lang, pair[0])
    target_tensor = tensorFromSentence(lang, pair[1])
    return (input_tensor, target_tensor)

### Represent POS tags by indexes

In [17]:
import nltk
def pos_index_from_sentence(sentence,pos_to_idx):
    idx = []
    tokens = sentence.split()
    tag_pairs = nltk.pos_tag(tokens)
    tags = [tag for word, tag in tag_pairs]
    for tag in tags:
        if tag in pos_to_idx:
            idx.append(pos_to_idx[tag])
        else:
            idx.append(pos_to_idx['UNK_TAG'])
    
    return idx

def pos_tensor_from_sentence(sentence,pos_to_idx):
    indexes = pos_index_from_sentence(sentence,pos_to_idx)
    return torch.tensor(indexes,dtype=torch.long).view(-1,1)

### Creating lookup dictionaryfor POS tags

In [18]:
def generate_pos_tag_dict(sentences):
    text = sentences.split()
    
    tags = nltk.pos_tag(text)
    all_tags = [ tag for (word,tag) in tags]
    pos_tags = [tag for tag in set(all_tags) if len(tag) > 1]
    pos_tags.append('UNK_TAG')
    tag_to_idx = {tag:i for i,tag in enumerate(pos_tags)}
    
    return tag_to_idx

long_line = " ".join([line for line in lines_original])
pos_to_idx = generate_pos_tag_dict(long_line)

In [24]:

def train(input_tensor, pos_tensor,target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):


    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size)


    loss = 0
 
    final_encoder_outputs = encoder(input_tensor.view(1,input_tensor.shape[0]).squeeze(0),pos_tensor.view(1,input_tensor.shape[0]).squeeze(0))


    for ei in range(input_length):
        encoder_outputs[ei] = final_encoder_outputs[ei]

        
        
    
    
    decoder_input = torch.tensor([[SOS_token]])

    decoder_hidden = decoder.initHidden()
  

    


    # Teacher forcing: Feed the target as the next input
    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])
        decoder_input = target_tensor[di]  # Teacher forcing

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [20]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [25]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pair_sentences = []
    for i in range(n_iters):
        pair = random.choice(pairs)
        training_pair_sentences.append(pair)
        

    
    training_pairs, pos_tensor = [], []
    print("Creating POS tokens...")
    for i in range(len(training_pair_sentences)):

    
        training_pairs.append(tensorsFromPair(training_pair_sentences[i]))
        pos_tensor.append(pos_tensor_from_sentence(training_pair_sentences[i][0],pos_to_idx))

    
    criterion = nn.NLLLoss()
    print("Training starts now :")

    for iter in range(1, n_iters + 1):
        print("EPOCH :",iter)
        training_pair = training_pairs[iter - 1]
        p_tensor = pos_tensor[iter-1]
        input_tensor = training_pair[0]
        
        target_tensor = training_pair[1]

        loss = train(input_tensor,p_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            torch.save(encoder, "encoder_self_attn_pos"+str(iter)+".pth")
            torch.save(decoder, "decoder_self_attn_pos"+str(iter)+".pth")
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

#         if iter % plot_every == 0:
#             plot_loss_avg = plot_loss_total / plot_every
#             plot_losses.append(plot_loss_avg)
#             plot_loss_total = 0

    #showPlot(plot_losses)

#### Training

In [27]:
hidden_size = 256
encoder1 = AttentiveEncoderPOS(lang.n_words, hidden_size,len(pos_to_idx))
attn_decoder1 = BahdanauDecoder(hidden_size, lang.n_words, dropout_p=0.1)

trainIters(encoder1, attn_decoder1, 30000, print_every=5000)

Creating POS tokens...
Training starts now :
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89


KeyboardInterrupt: 