In [1]:
import random
import time

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import torchtext

from torch.autograd import Variable

import pandas as pd

from collections import Counter

In [2]:
mydevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mydevice

device(type='cuda')

In [3]:
SOS_TOKEN = '<sos>'
EOS_TOKEN = '<eos>'

MAX_LEN = 50

def len_filter(example):
    return len(example.src) <= MAX_LEN and len(example.tgt) <= MAX_LEN

### Load dummy number reversal dataset

In [4]:
train_path = 'data/toy_reverse/train/data.txt'
dev_path = 'data/toy_reverse/dev/data.txt'

src = torchtext.data.Field(
    batch_first=True, 
    include_lengths=True
    )
tgt = torchtext.data.Field(
    batch_first=True, 
    preprocessing = lambda seq: [SOS_TOKEN] + seq + [EOS_TOKEN]
    )

data_train = torchtext.data.TabularDataset(
        path=train_path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )

data_dev = torchtext.data.TabularDataset(
        path=dev_path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )


### Load the e2e data


In [5]:
train_path = 'data/e2e-dataset/trainset.csv'
dev_path = 'data/e2e-dataset/devset.csv'

src = torchtext.data.Field(
    batch_first=True, 
    include_lengths=True
    )
tgt = torchtext.data.Field(
    batch_first=True, 
    preprocessing = lambda seq: [SOS_TOKEN] + seq + [EOS_TOKEN]
    )

data_train = torchtext.data.TabularDataset(
        path=train_path, format='csv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )

data_dev = torchtext.data.TabularDataset(
        path=dev_path, format='csv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )

In [6]:
src.build_vocab(data_train, max_size=50000)
tgt.build_vocab(data_train, max_size=50000)
input_vocab = src.vocab
output_vocab = tgt.vocab

print('20 tokens from input vocab:\n', list(input_vocab.stoi.keys())[:20])
print('\n20 tokens from output vocab:\n', list(output_vocab.stoi.keys())[:20])

print('\nnum training examples:', len(data_train.examples))

item = random.choice(data_train.examples)
print('\nexample train data:')
print('src:\n', item.src)
print('tgt:\n', item.tgt)

20 tokens from input vocab:
 ['Cambridge', 'food[English]', 'Hotel]', 'Vaults],', 'near[Yippee', 'Twenty', 'familyFriendly[no]', 'Noodle', 'Brazil]', 'Sorrento]', 'near[Ranch]', 'than', 'priceRange[cheap]', 'Plough],', 'near[Café', 'One]', 'Arms]', 'Bells]', 'near[Crowne', 'area[city']

20 tokens from output vocab:
 ['Outside', 'Consumers', 'rated', 'describe', 'orientated,', 'places.', 'Twenty', "'Cocum'.", 'children,', 'low-prices,', 'okay', 'moderate-', 'pub,', 'sizes', 'fictional', '£20.', 'fare.', 'establishments,', 'lacks', 'wants']

num training examples: 42038

example train data:
src:
 ['name[The', 'Rice', 'Boat],', 'food[Fast', 'food],', 'priceRange[moderate],', 'customer', 'rating[3', 'out', 'of', '5],', 'area[riverside],', 'familyFriendly[yes],', 'near[Express', 'by', 'Holiday', 'Inn]']
tgt:
 ['<sos>', 'The', 'Rice', 'Boat', 'is', 'a', 'restaurant', 'that', 'serves', 'moderately', 'priced', 'fast', 'food', 'and', 'it', 'is', 'located', 'near', 'Express', 'by', 'Holiday', 'I

### Model definition and training functions

In [7]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, myinput, hidden):
        embedded = self.embedding(myinput).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=mydevice)

    
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=mydevice)

In [8]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,
          max_length=MAX_LEN, teacher_forcing_ratio=0.5):
    
    # get an initial hidden state for the encoder
    encoder_hidden = encoder.initHidden()

    # zero the gradients of the optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # get the seq lengths, used for iterating through encoder/decoder
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # create empty tensor to fill with encoder outputs
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=mydevice)

    # create a variable for loss
    loss = 0
    
    # pass the inputs through the encoder
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    # create a start-of-sequence tensor for the decoder
    decoder_input = torch.tensor([[output_vocab.stoi[SOS_TOKEN]]], device=mydevice)

    # set the decoder hidden state to the final encoder hidden state
    decoder_hidden = encoder_hidden

    # decide if we will use teacher forcing
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # detach from history as input
                
        loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))
        
        if use_teacher_forcing:
            decoder_input = target_tensor[di]
        
        if decoder_input.item() == output_vocab.stoi[EOS_TOKEN]:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [9]:
def trainIters(encoder, decoder, n_iters, print_every=1000, learning_rate=0.01, teacher_forcing_ratio=0.5):
    print('Running {} epochs'.format(n_iters))
    print_loss_total = 0
    print_loss_epoch = 0

    encoder_optim = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optim = optim.SGD(decoder.parameters(), lr=learning_rate)

    batch_iterator = torchtext.data.Iterator(
        dataset=data_train, batch_size=1,
        sort=False, sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=mydevice, repeat=False)
    

    criterion = nn.NLLLoss()

    for e in range(n_iters):
        batch_generator = batch_iterator.__iter__()
        step = 0
        start = time.time()
        for batch in batch_generator:
            step += 1
            
            # get the input and target from the batch iterator
            input_tensor, input_lengths = getattr(batch, 'src')
            target_tensor = getattr(batch, 'tgt')
            
            input_tensor = input_tensor[0]
            target_tensor = target_tensor[0]

            loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optim, decoder_optim, criterion, teacher_forcing_ratio=teacher_forcing_ratio)
            print_loss_total += loss
            print_loss_epoch += loss
            

            if step % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                t = (time.time() - start) / 60
                print('step: {}\t avg loss: {:2f}\t time for {} steps: {:2f} min'.format(step,print_loss_avg,print_every,t))
                start = time.time()
        
        print_loss_avg = print_loss_epoch / step
        print_loss_epoch = 0
        print('End of epoch {}, avg loss {:2f}'.format(e,print_loss_avg)) 

### 2. Create and train a model

In [10]:
hidden_size = 128
encoder1 = EncoderRNN(len(input_vocab), hidden_size).to(mydevice)
decoder1 = DecoderRNN(hidden_size, len(output_vocab)).to(mydevice)

In [11]:
#trainIters(encoder1, decoder1, 1, print_every=1000, teacher_forcing_ratio=0.75)

In [12]:
#torch.save(encoder1.state_dict(),'encoder.mdl')
#torch.save(decoder1.state_dict(),'decoder.mdl')
encoder1.load_state_dict(torch.load('encoder.mdl'))
decoder1.load_state_dict(torch.load('decoder.mdl'))
print(encoder1)
print(decoder1)

EncoderRNN(
  (embedding): Embedding(146, 128)
  (gru): GRU(128, 128)
)
DecoderRNN(
  (embedding): Embedding(4905, 128)
  (gru): GRU(128, 128)
  (out): Linear(in_features=128, out_features=4905, bias=True)
  (softmax): LogSoftmax()
)


In [13]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LEN):
    with torch.no_grad():
        input_tensor = torch.tensor([input_vocab.stoi[word] for word in sentence], device=mydevice)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=mydevice)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[output_vocab.stoi[SOS_TOKEN]]], device=mydevice)

        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            next_word = output_vocab.itos[topi.item()]
            decoded_words.append(next_word)
            if next_word == EOS_TOKEN:
                break

            decoder_input = topi.squeeze().detach()

        return decoded_words

### Beam search evaluator


In [14]:
def infer(candidate, kVal):
        candidates = []
        decoder_input = Variable(torch.LongTensor([[candidate[0][-1]]]))
        if torch.cuda.is_available():
            decoder_input = decoder_input.cuda()
        sequence, decoder_hidden, encoder_outputs = candidate
        decoder_output, decoder_hidden = decoder1(decoder_input, decoder_hidden)

        topk = decoder_output.data.topk(kVal)
        for k in range(kVal):
            topk_prob = topk[0][0][k]
            topk_index = int(topk[1][0][k])
            candidates.append([sequence+[topk_index],  decoder_hidden, encoder_outputs])
        return candidates
    
def beamSearch(encoder, decoder, sentence, k, max_length=MAX_LEN):
        
    with torch.no_grad():
        input_tensor = torch.tensor([input_vocab.stoi[word] for word in sentence], device=mydevice)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=mydevice)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[output_vocab.stoi[SOS_TOKEN]]], device=mydevice)

        decoder_hidden = encoder_hidden

        decoded_words = []
        
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        # get the top candidates of beam size
        topk = decoder_output.data.topk(k)
        candidates = [[] for i in range(k)]
        dead_k = 0
        final_candidates = []
        for index in range(k):
            topk_prob = topk[0][0][index]
            topk_index = int(topk[1][0][index])
            candidates[index] = [[topk_index], decoder_hidden, encoder_outputs]

        for _ in range(max_length):
            tmp = []
            for index in range(len(candidates)):
                tmp.extend(infer(candidates[index], k))
            candidates = []

            df = pd.DataFrame(tmp)
            df.columns = ['sequence', "decoder_hidden", "encoder_outputs"]
            sequence_len = df.sequence.apply(lambda x:len(x))
            df = df[:(k-dead_k)]
            for index in range(len(df)):
                group = df.ix[index]
                
                # EOS TOKEN
                if group.tolist()[0][-1] == 3:
                    final_candidates.append(group.tolist())
                    df = df.drop([index], axis=0)
                    dead_k += 1

            candidates = df.values.tolist()
            if len(candidates) == 0:
                break  
        
        if len(final_candidates) < k:
            final_candidates.extend(candidates[:(k-dead_k)])
            
        return final_candidates   


In [15]:
kVal = 3

item = random.choice(data_train.examples)
seq = item.src
candidates = beamSearch(encoder1,decoder1, seq, kVal)

print(seq)
print()
for candidate in candidates:
    outstrs = []
    for i in candidate[0]:
        outstrs.append(output_vocab.itos[i])
    print(" ".join(outstrs))
    print()

['name[The', 'Twenty', 'Two],', 'food[French],', 'area[city', 'centre],', 'familyFriendly[yes]']

<sos> The Twenty Two is a French restaurant in the city centre is <eos>

<sos> The Twenty Two is a French restaurant in the city centre is and <eos>

<sos> The Twenty Two is a French restaurant in the city centre is and is <eos>



In [16]:
for i in range(5):
    item = random.choice(data_train.examples)
    seq = item.src
    print(seq)
    #print(" ".join(item.tgt[1:-1]))
    words = evaluate(encoder1, decoder1, seq)
    print(' '.join(words))
    print()

['name[Loch', 'Fyne],', 'eatType[restaurant],', 'food[Fast', 'food],', 'familyFriendly[yes]']
<sos> Loch Fyne is a fast food restaurant that is kid friendly and has a price range of £20-25. <eos>

['name[Loch', 'Fyne],', 'eatType[restaurant],', 'food[French],', 'familyFriendly[no]']
<sos> Loch Fyne is a French restaurant that is not not family-friendly. <eos>

['name[The', 'Waterman],', 'food[French],', 'priceRange[more', 'than', '£30],', 'customer', 'rating[5', 'out', 'of', '5],', 'area[city', 'centre],', 'familyFriendly[yes]']
<sos> The Waterman is a French restaurant in the city centre with a price range of of of the city and a customer rating of 3 out of 5. <eos>

['name[The', 'Rice', 'Boat],', 'food[Japanese],', 'priceRange[more', 'than', '£30],', 'customer', 'rating[5', 'out', 'of', '5],', 'area[riverside],', 'familyFriendly[yes],', 'near[Express', 'by', 'Holiday', 'Inn]']
<sos> The Rice Boat is a restaurant The Rice Boat located near Express by Holiday of Holiday of the Holiday 

### BLEU Evaluator

In [17]:
import math
from nltk import ngrams

def brevity(canidate,references):
    clen = len(canidate)
    rlens=[]
    # get lengths of references
    for i in references:
        rlens.append(len(i))
    
    tmp = list(dict.fromkeys(rlens))

    diff=[]
    # get the differences
    for i in tmp:
        diff.append((i,abs(clen-i)))
    
    # sort on the differences
    diff.sort(key = lambda x: x[1])

    # if the diff is 0, return candidate lengths
    for i in diff:
        if i[1] == 0:
            return clen, clen
    
    # find the closest
    closest = diff[0][0]
    closest_len= diff[0][1]
    anotherClosest = False
    for item in diff:
        if item[0] != closest and item[1]==closest_len:
            anotherClosest = True
            closest2 = item
            
    if anotherClosest == False:
        return closest,clen
     
    return (closest, clen) if closest < closest2[0] else (closest2[0],clen)

def getNgramPrec(candidate, references, n):
    
    # tokenize candidate
    tcandidate = candidate
    grams = ngrams(tcandidate, n)
    candidate=[]
    
    if n == 1:
        candidate = tcandidate
    else:
        for i in grams:
            candidate.append(i)
            
    clen = len(candidate)
    
    # if len is 0, back off to the previous n
    x = 1
    newN = n
    while clen == 0:
        if n - 1 == 0:
            return 0.0
        grams=ngrams(tcandidate,n - x)
        candidate=[]
        if n == 1:
            candidate = tcandidate
        else:
            for i in grams:
                candidate.append(i)
        clen = len(candidate)
        newN = n - x
        x += 1
        
    count={}
    for i in candidate:
        key=i
        counts=candidate.count(key)
        if n ==1:
            count[key]=counts
        else:
            count[','.join(key)]=counts

    # tokenize references
    trefs = references
    
    rgrams=[]
    for i in trefs:
        tmp = ngrams(i,n)
        tmpLst=[]
        for i in tmp:
            tmpLst.append(i)
        rgrams.append(tmpLst)
    
    # count min
    minctr={}
    for i in rgrams:
        for token in i:
            counts=i.count(token)
        
            key=','.join(token)
        
            if key not in minctr:
                minctr[key]=counts
            else:
                if minctr[key] < counts:
                    minctr[key] = counts
                    
    # get modified ngrams       
    total = []
    for i,j in count.items():
        if i not in minctr:
            ctr = 0
        else:
            ctr = minctr[i]
        w = (i, min(j, ctr))
        total.append(w)
    
    score=0
    
    for i in total:
        w = i[1]
        score = score + w
    return score / clen, newN

def makeDict(datas):
    d = {}
    for data in datas:
        tmp = str(data.src)
        if tmp not in d:
            d[tmp] = []
            d[tmp].append(data.src)
        
        sentence = data.tgt[1:-1]
        # strip '.'
        sentence = [s.strip('.') for s in sentence]
        d[tmp].append(sentence)
    return d

def getBleu(canidate, references, n):
    precisions=[]
    # sum up wn * logpn
    for i in range(1, n+1):
        precision,newN= getNgramPrec(canidate, references, i)
        if precision != 0.0:
            precisions.append(1/n*math.log(precision))
    
    # get brecity penalty
    reflen,canlen = brevity(canidate,references)

    # apply brevity penalty
    if canlen > reflen:
        return math.exp(sum(precisions))
    else:
        return math.exp(1-(reflen/canlen)) * math.exp(sum(precisions))


In [18]:
bres = 0.0
gres = 0.0
myDict = makeDict(data_dev)
for data in myDict:
    seq = myDict[data][0]
    beam = beamSearch(encoder1, decoder1, seq, 3)
    outstrs = []
    # select the top-scoring sequence output
    for idx in beam[0][0]:
        # remove EOS/SOS tokens
        if idx == 4:
            continue
        if idx == 3:
            break
        outstrs.append(output_vocab.itos[idx])
    tmp = getBleu(outstrs, myDict[data][1:],4)
    bres += tmp
    
for data in myDict:
    seq = myDict[data][0]
    words = evaluate(encoder1, decoder1, seq)
    tmp = getBleu(words,myDict[data][1:],4)
    gres += tmp 
print("My BLEU-4 score with beam size 3 -> " + str(bres/len(myDict)))
print("My BLEU-4 score with greedy -> " + str(gres/len(myDict)))
# The output of this cell should be the average BLEU score on the dev set
# for greedy decoding AND for beam search decoding (beam size = 3)

My BLEU-4 score with beam size 3 -> 0.45566752275121164
My BLEU-4 score with greedy -> 0.4108040888998508


In [19]:
import nltk
nbres = 0.0
ngres = 0.0
w = [(1/3,1/3,1/3),(1/2,1/2)]
beamSize = 3
for data in myDict:
    seq = myDict[data][0]
    words = evaluate(encoder1, decoder1, seq)
    tmp = nltk.translate.bleu_score.sentence_bleu(myDict[data][1:],words)
    trial = 0
    while tmp == 0.0:
        tmp = nltk.translate.bleu_score.sentence_bleu(myDict[data][1:],words, weights=w[trial])
        trial += 1
        if trial > 1:
            break
    ngres += tmp 
    
for data in myDict:
    seq = myDict[data][0]    
    beam = beamSearch(encoder1, decoder1, seq, beamSize)
    outstrs = []
    # select the top-scoring sequence output
    for idx in beam[0][0]:
        # remove EOS/SOS tokens
        if idx == 4:
            continue
        if idx == 3:
            break
        outstrs.append(output_vocab.itos[idx])
    tmp = nltk.translate.bleu_score.sentence_bleu(myDict[data][1:],outstrs)
    trial = 0
    while tmp == 0.0:
        tmp = nltk.translate.bleu_score.sentence_bleu(myDict[data][1:],outstrs, weights=w[trial])
        trial += 1
        if trial > 1:
            break
    nbres += tmp    
    
print("NLTK BLEU-4 score with beam size 3 -> " + str(nbres / len(myDict)))
print("NLTK BLEU-4 score with greedy -> " + str(ngres / len(myDict)))

NLTK BLEU-4 score with beam size 3 -> 0.45384270523296344
NLTK BLEU-4 score with greedy -> 0.4089792713816026
