In [1]:
import unicodedata
import string
import re
import random
import time
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import gc
import numpy as np

import os

In [2]:
#os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
#os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [3]:
USE_CUDA = True
MAX_LENGTH = 1328

In [4]:
import torch
from torch.nn import functional
from torch.autograd import Variable

def sequence_mask(sequence_length, max_len=None):
    if max_len is None:
        max_len = sequence_length.data.max()
        
    batch_size = sequence_length.size(0)
    seq_range = torch.range(0, max_len - 1).long()
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    seq_range_expand = Variable(seq_range_expand)
    if sequence_length.is_cuda:
        seq_range_expand = seq_range_expand.cuda()
    seq_length_expand = (sequence_length.unsqueeze(1)
                         .expand_as(seq_range_expand))
    return seq_range_expand < seq_length_expand


def masked_cross_entropy(logits, target, length):
    length = Variable(torch.LongTensor(length)).cuda()
    logits.cuda()
    target.cuda()
    """
    Args:
        logits: A Variable containing a FloatTensor of size
            (batch, max_len, num_classes) which contains the
            unnormalized probability for each class.
        target: A Variable containing a LongTensor of size
            (batch, max_len) which contains the index of the true
            class for each corresponding step.
        length: A Variable containing a LongTensor of size (batch,)
            which contains the length of each data in a batch.
    Returns:
        loss: An average loss value masked by the length.
    """

    # logits_flat: (batch * max_len, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # log_probs_flat: (batch * max_len, num_classes)
    log_probs_flat = functional.log_softmax(logits_flat)
    # target_flat: (batch * max_len, 1)
    target_flat = target.view(-1, 1)
    # losses_flat: (batch * max_len, 1)
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    # losses: (batch, max_len)
    losses = losses_flat.view(*target.size())
    # mask: (batch, max_len)
    mask = sequence_mask(sequence_length=length, max_len=target.size(1))
    losses = losses * mask.float()
    loss = losses.sum() / length.float().sum()
    
    return loss

## Reading senseval 3 corpus

In [5]:
def process_instance(text):
    pairs = []
    
    sense_ids = re.findall(r'senseid=\"(.*?)\"', text, re.DOTALL)
    context = re.findall(r'<context>(.*?)</context>', text, re.DOTALL)
    word_ambiguos = re.findall(r'<head>(.*?)</head>', context[0], re.DOTALL)
    sentences = re.findall(r'(.*?)\.', context[0])
    
    for sentence in sentences:
        tags = re.findall(r'<head>(.*?)</head>', sentence)
        if(len(tags) != 0):
            for sense_id in sense_ids:   
                pair = [[],[]]
                sense_id = re.sub(r'%|:', '', sense_id)
                pair[0] = re.sub(r'<head>(.*?)</head>', word_ambiguos[0], sentence)
                pair[1] = re.sub(r'<head>(.*?)</head>', word_ambiguos[0] + '_' + sense_id, sentence)
                pairs.append(pair)
        
    return np.array(pairs)

In [6]:
def construct_pairs(path):
    with open('corpus/EnglishLS.train', 'r') as f:
        xml = f.read()

    instances = re.findall(r'<instance(.*?)</instance>', xml, re.DOTALL)
    pairs= []

    for instance in instances:
        data = "<instance" + instance + "</instance>"
        data = re.sub(r'[^\x20-\x7E]', '', data)
        data = re.sub(r' n\'t', 'n\'t', data)
        data = re.sub(r'&', '', data)
        pairs.extend(process_instance(data))
        
    return np.array(pairs)

In [7]:
pairs = construct_pairs('corpus/EnglishLS.train')

In [8]:
pairs[0]

array([ '  It is quite a hefty spade , with bicycle - type handlebars and a sprung lever at the rear , which you step on to activate it ',
       '  It is quite a hefty spade , with bicycle - type handlebars and a sprung lever at the rear , which you step on to activate_38201 it '],
      dtype='<U1335')

## Generate vocab

In [9]:
PAD_token = 0
SOS_token = 1
EOS_token = 2

class Lang:
    def __init__(self):
        self.trimmed = False
        self.stoi = {}
        self.word2count = {}
        self.itos = {0: "PAD", 1: "SOS", 2: "EOS"}
        self.n_words = 3 # Count default tokens

    def index_words(self, sentence):
        for word in sentence.split(' '):
            self.index_word(word)

    def index_word(self, word):
        if word not in self.stoi:
            self.stoi[word] = self.n_words
            self.word2count[word] = 1
            self.itos[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed: return
        self.trimmed = True
        
        keep_words = []
        
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words %s / %s = %.4f' % (
            len(keep_words), len(self.stoi), len(keep_words) / len(self.stoi)
        ))

        # Reinitialize dictionaries
        self.stoi = {}
        self.word2count = {}
        self.itos = {0: "PAD", 1: "SOS", 2: "EOS"}
        self.n_words = 3 # Count default tokens

        for word in keep_words:
            self.index_word(word)

In [10]:
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"\s+", r" ", s).strip()
    return s

def normalize_pairs(pairs):
    for pair in pairs:
        pair[0] = normalize_string(pair[0])
        pair[1] = normalize_string(pair[1])

def filter_pairs(pairs):
    filtered_pairs = []
    for pair in pairs:
        if len(pair[0]) <= MAX_LENGTH \
            and len(pair[1]) <= MAX_LENGTH:
                filtered_pairs.append(pair)
    return filtered_pairs

In [11]:
def prepare_data(pairs):
    
    normalize_pairs(pairs)
    print("Reading pairs %d" % len(pairs))
    
    pairs = filter_pairs(pairs)
    print("Filtered to %d pairs" % len(pairs))
    
    sentence =  Lang()
    sense = Lang()
    
    print("Indexing words...")
    for pair in pairs:
        sentence.index_words(pair[0])
        sense.index_words(pair[1])
    
    print('Indexed %d words in input language, %d words in output' % (sentence.n_words, sense.n_words))
    return sentence, sense, pairs

In [12]:
sentence, sense, pairs = prepare_data(pairs)

Reading pairs 8453
Filtered to 8452 pairs
Indexing words...
Indexed 20843 words in input language, 21553 words in output


In [13]:
pairs[0]

array([ 'it is quite a hefty spade , with bicycle - type handlebars and a sprung lever at the rear , which you step on to activate it',
       'it is quite a hefty spade , with bicycle - type handlebars and a sprung lever at the rear , which you step on to activate_38201 it'],
      dtype='<U1335')

In [14]:
def indexes_from_sentence(lang, sentence):
    return [lang.stoi[word] for word in sentence.split(' ')] + [EOS_token]

In [15]:
def pad_seq(seq, max_length):
    seq += [PAD_token for i in range(max_length - len(seq))]
    return seq

In [16]:
def random_batch(batch_size):
    input_seqs = []
    target_seqs = []

    # Choose random pairs
    for i in range(batch_size):
        pair = random.choice(pairs)
        input_seqs.append(indexes_from_sentence(sentence, pair[0]))
        target_seqs.append(indexes_from_sentence(sense, pair[1]))

    # Zip into pairs, sort by length (descending), unzip
    seq_pairs = sorted(zip(input_seqs, target_seqs), key=lambda p: len(p[0]), reverse=True)
    input_seqs, target_seqs = zip(*seq_pairs)
    
    # For input and target sequences, get array of lengths and pad with 0s to max length
    input_lengths = [len(s) for s in input_seqs]
    input_padded = [pad_seq(s, max(input_lengths)) for s in input_seqs]
    target_lengths = [len(s) for s in target_seqs]
    target_padded = [pad_seq(s, max(target_lengths)) for s in target_seqs]

    # Turn padded arrays into (batch_size x max_len) tensors, transpose into (max_len x batch_size)
    input_var = Variable(torch.LongTensor(input_padded)).transpose(0, 1)
    target_var = Variable(torch.LongTensor(target_padded)).transpose(0, 1)
    
    if USE_CUDA:
        input_var = input_var.cuda()
        target_var = target_var.cuda()
        
    return input_var, input_lengths, target_var, target_lengths

## Model

In [22]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1, dropout=0.1):
        super(EncoderRNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=self.dropout, bidirectional=True)
        
    def forward(self, input):
        # Note: we run this all at once (over multiple batches of multiple sequences)
        input_seqs, input_lengths, hidden = input
        
        embedded = self.embedding(input_seqs)
        
        #packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        self.gru.flatten_parameters() 
        outputs, hidden = self.gru(embedded, hidden)      
        #outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs) # unpack (back to padded)
        
        padded_output = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:] # Sum bidirectional outputs
        return padded_output, hidden

In [23]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        
        self.method = method
        self.hidden_size = hidden_size
        
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(1, hidden_size))

    def forward(self, hidden, encoder_outputs):
        max_len = encoder_outputs.size(0)
        this_batch_size = encoder_outputs.size(1)

        # Create variable to store attention energies
        attn_energies = Variable(torch.zeros(this_batch_size, max_len)) # B x S

        if USE_CUDA:
            attn_energies = attn_energies.cuda()

        # For each batch of encoder outputs
        for b in range(this_batch_size):
            # Calculate energy for each encoder output
            for i in range(max_len):
                attn_energies[b, i] = self.score(hidden[:, b], encoder_outputs[i, b].unsqueeze(0))

        # Normalize energies to weights in range 0 to 1, resize to 1 x B x S
        return F.softmax(attn_energies).unsqueeze(1)
    
    def score(self, hidden, encoder_output):
        if self.method == 'dot':
            energy =torch.dot(hidden.view(-1), encoder_output.view(-1))
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = energy.view(-1,1)
            energy = hidden.mm(energy)
            return energy
        elif self.method == 'concat':
            energy = self.attn(torch.cat((hidden, encoder_output), 1))
            energy = torch.dot(self.v.view(-1), energy.view(-1))
        return energy

In [25]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.LSTM(hidden_size, hidden_size, n_layers, dropout=dropout, batch_first=True)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        
        # Choose attention model
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_seq, last_hidden, encoder_outputs):
        # Note: we run this one step at a time

        # Get the embedding of the current input word (last output word)
        batch_size = input_seq.size(0)
        embedded = self.embedding(input_seq)
        embedded = self.embedding_dropout(embedded)
        embedded = embedded.view(1, batch_size, self.hidden_size) # S=1 x B x N
        print("hidden layer")
        print(last_hidden)
        
        # Get current hidden state from input word and last hidden state
        rnn_output, hidden = self.gru(embedded, last_hidden)

        # Calculate attention from current RNN state and all encoder outputs;
        # apply to encoder outputs to get weighted average
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x S=1 x N

        # Attentional vector using the RNN hidden state and context vector
        # concatenated together (Luong eq. 5)
        rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
        context = context.squeeze(1)       # B x S=1 x N -> B x N
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = F.tanh(self.concat(concat_input))

        # Finally predict next token (Luong eq. 6, without softmax)
        output = self.out(concat_output)

        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights

In [26]:
def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (as_minutes(s), as_minutes(rs))

In [21]:
def train(n_layers, input_batches, input_lengths, target_batches, target_lengths, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    
    # Zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0 # Added onto for each word

    # Run words through encoder
    encoder_outputs, encoder_hidden = encoder((input_batches, input_lengths, None))
    
    # Prepare input and output variables
    decoder_input = Variable(torch.LongTensor([SOS_token] * batch_size))
    decoder_hidden = encoder_hidden[:n_layers] # Use last (forward) hidden state from encoder

    max_target_length = max(target_lengths)
    all_decoder_outputs = Variable(torch.zeros(max_target_length, batch_size, sense.n_words))

    # Move new Variables to CUDA
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        all_decoder_outputs = all_decoder_outputs.cuda()
    # Run through decoder one time step at a time
    for t in range(max_target_length):
        decoder_output, decoder_hidden, decoder_attn = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )

        all_decoder_outputs[t] = decoder_output
        decoder_input = target_batches[t] # Next input is current target
        
        del decoder_output
        decoder_output = None
        gc.collect()
        torch.cuda.empty_cache()

    del decoder_hidden
    torch.cuda.empty_cache()
    # Loss calculation and backpropagation
    loss = masked_cross_entropy(
        all_decoder_outputs.transpose(0, 1).contiguous(), # -> batch x seq
        target_batches.transpose(0, 1).contiguous(), # -> batch x seq
        target_lengths
    )
    torch.cuda.empty_cache()
    loss.backward()
    torch.cuda.empty_cache()
    # Clip gradient norms
    ec = torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    dc = torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)

    # Update parameters with optimizers
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.data[0], ec, dc

In [31]:
# Configure models
attn_model = 'general'
hidden_size = 300
n_layers = 2
dropout = 0.1
batch_size = 10

# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 0.5
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_epochs = 50000
epoch = 0
plot_every = 20
print_every = 300
evaluate_every = 1000

# Initialize models
encoder = EncoderRNN(sentence.n_words, hidden_size, n_layers, dropout=dropout)
#encoder = torch.nn.DistributedDataParallel(encoder, device_ids=[0, 1])
decoder = LuongAttnDecoderRNN(attn_model, hidden_size, sense.n_words, n_layers, dropout=dropout)
#decoder = torch.nn.DistributedDataParallel(decoder, device_ids=[0, 1])

# Initialize optimizers and criterion
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
criterion = nn.CrossEntropyLoss()

# Move models to GPU
if USE_CUDA:
    encoder.cuda()
    decoder.cuda()
    #encoder = nn.DataParallel(encoder, device_ids=[0,1]).cuda()
    #decoder = nn.DataParallel(decoder, device_ids=[0,1]).cuda()

# Keep track of time elapsed and running averages
start = time.time()
plot_losses = []
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every

In [32]:
# Begin!
ecs = []
dcs = []
eca = 0
dca = 0

while epoch < n_epochs:
    epoch += 1
    gc.collect()
    torch.cuda.empty_cache()
    # Get training data for this cycle
    input_batches, input_lengths, target_batches, target_lengths = random_batch(batch_size)

    # Run the train function
    loss, ec, dc = train(
        n_layers, input_batches, input_lengths, target_batches, target_lengths,
        encoder, decoder,
        encoder_optimizer, decoder_optimizer, criterion
    )

    # Keep track of loss
    print_loss_total += loss
    plot_loss_total += loss
    eca += ec
    dca += dc
    
    del loss
    del input_batches
    del target_batches
    gc.collect()
    torch.cuda.empty_cache()
    
    if epoch % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg)
        print(print_summary)
        
    #if epoch % evaluate_every == 0:
    #    evaluate_randomly()

    if epoch % plot_every == 0:
        plot_loss_avg = plot_loss_total / plot_every
        plot_losses.append(plot_loss_avg)
        plot_loss_total = 0
        
        # TODO: Running average helper
        ecs.append(eca / plot_every)
        dcs.append(dca / plot_every)
        eca = 0
        dca = 0

hidden layer
Variable containing:
( 0 ,.,.) = 
 -0.0511  0.2237 -0.2490  ...   0.0491  0.3095 -0.1848
 -0.3700  0.0742  0.2353  ...  -0.2486  0.3134  0.4576
 -0.4135 -0.1339  0.6474  ...  -0.3013  0.4225  0.6595
           ...             ⋱             ...          
 -0.4248 -0.1582  0.6511  ...  -0.3235  0.4222  0.6633
 -0.4250 -0.1589  0.6510  ...  -0.3229  0.4222  0.6634
 -0.4251 -0.1598  0.6511  ...  -0.3228  0.4221  0.6632

( 1 ,.,.) = 
  0.6223 -0.2182  0.2976  ...   0.5492  0.2573  0.4666
 -0.2972  0.2671 -0.4795  ...   0.3262 -0.2472  0.2179
  0.3301  0.3254  0.1263  ...  -0.4029  0.1811  0.0198
           ...             ⋱             ...          
  0.2214  0.3348 -0.5984  ...   0.5105 -0.3768  0.3825
 -0.3032  0.2465 -0.5671  ...   0.1797 -0.4157  0.4108
 -0.1247  0.2693 -0.3294  ...   0.5356  0.1254  0.4586
[torch.cuda.FloatTensor of size 2x10x300 (GPU 0)]



RuntimeError: Expected hidden size (2, 1, 300), got (2, 10, 300)

In [None]:
def evaluate(input_seq, max_length=MAX_LENGTH):   
    input_seqs = [indexes_from_sentence(sentence, input_seq)]
    input_lengths = [len(input_seqs)]
    input_batches = Variable(torch.LongTensor(input_seqs), volatile=True).transpose(0, 1)
    
    if USE_CUDA:
        input_batches = input_batches.cuda()
        
    # Set to not-training mode to disable dropout
    encoder.train(False)
    decoder.train(False)
    
    # Run through encoder
    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths, None)

    # Create starting vectors for decoder
    decoder_input = Variable(torch.LongTensor([SOS_token]), volatile=True) # SOS
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder
    
    if USE_CUDA:
        decoder_input = decoder_input.cuda()

    # Store output words and attention states
    decoded_words = []
    decoder_attentions = torch.zeros(max_length + 1, max_length + 1)
    
    # Run through decoder
    for di in range(max_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        decoder_attentions[di,:decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data

        # Choose top word from output
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        if ni == EOS_token:
            decoded_words.append('<EOS>')
            break
        else:
            decoded_words.append(sense.itos[ni])
            
        # Next input is chosen word
        decoder_input = Variable(torch.LongTensor([ni]))
        #if USE_CUDA: decoder_input = decoder_input.cuda()

    # Set back to training mode
    encoder.train(True)
    decoder.train(True)
    
    return decoded_words, decoder_attentions[:di+1, :len(encoder_outputs)]

In [None]:
def evaluate_randomly():
    [input_sentence, target_sentence] = random.choice(pairs)
    evaluate_and_show_attention(input_sentence, target_sentence)

In [None]:
import io
import torchvision
from PIL import Image
import visdom
vis = visdom.Visdom()

def show_plot_visdom():
    buf = io.BytesIO()
    plt.savefig(buf)
    buf.seek(0)
    attn_win = 'attention (%s)' % hostname
    vis.image(torchvision.transforms.ToTensor()(Image.open(buf)), win=attn_win, opts={'title': attn_win})

In [None]:
def show_attention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') + ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    show_plot_visdom()
    plt.show()
    plt.close()

In [None]:
def evaluate_and_show_attention(input_sentence, target_sentence=None):
    output_words, attentions = evaluate(input_sentence)
    output_sentence = ' '.join(output_words)
    print('>', input_sentence)
    if target_sentence is not None:
        print('=', target_sentence)
    print('<', output_sentence)
    
    #show_attention(input_sentence, output_words, attentions)
    
    # Show input, target, output text in visdom
    #win = 'evaluted'
    #text = '<p>&gt; %s</p><p>= %s</p><p>&lt; %s</p>' % (input_sentence, target_sentence, output_sentence)
    #vis.text(text, win=win, opts={'title': win})

In [None]:
evaluate_randomly()