In [1]:
def read_corpus(file_path, source):
    """ Read file, where each sentence is dilineated by a `\n`.
    @param file_path (str): path to file containing corpus
    @param source (str): "tgt" or "src" indicating whether text
        is of the source language or target language
    """
    data = []
    for line in open(file_path):
        sent = line.strip().split(' ')
        # only append <s> and </s> to the target sentence
        if source == 'tgt':
            sent = ['<s>'] + sent + ['</s>']
        data.append(sent)

    return data

In [2]:
# 	python run.py train --train-src=./en_es_data/train_tiny.es --train-tgt=./en_es_data/train_tiny.en \
#         --dev-src=./en_es_data/dev_tiny.es --dev-tgt=./en_es_data/dev_tiny.en --vocab=vocab_tiny_q1.json --batch-size=2 \
#         --valid-niter=100 --max-epoch=101 --no-char-decoder

import math
import sys
import pickle
import time


from docopt import docopt
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
from nmt_model import Hypothesis, NMT
import numpy as np
from typing import List, Tuple, Dict, Set, Union
from tqdm import tqdm
from utils import read_corpus, batch_iter
from vocab import Vocab, VocabEntry

import torch
import torch.nn.utils
    
args = {}
args['--train-src'] = './en_es_data/train_tiny.es'
args['--train-tgt'] = './en_es_data/train_tiny.en'
args['--dev-src'] = './en_es_data/dev_tiny.es'
args['--dev-tgt'] = './en_es_data/dev_tiny.en'
args['--vocab'] = 'vocab_tiny_q1.json'
args['--batch-size'] =  2
args['--valid-niter'] = 100
args['--max-epoch'] = 101
args['--clip-grad'] = 5
args['--valid-niter'] = 2000
args['--log-every'] = 10
args['--save-to'] = 'model.bin'
args['--vocab'] = 'vocab.json'
args['--embed-size'] = 256
args['--hidden-size'] = 256
args['--dropout'] = 0.3
args['--no-char-decoder'] = True
args['--uniform-init'] = 0.1
args['--cuda'] = None
args['--lr'] = 0.001

In [3]:
train_data_src = read_corpus(args['--train-src'], source='src')
train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

dev_data_src = read_corpus(args['--dev-src'], source='src')
dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

train_data = list(zip(train_data_src, train_data_tgt))
dev_data = list(zip(dev_data_src, dev_data_tgt))

train_batch_size = int(args['--batch-size'])

clip_grad = float(args['--clip-grad'])
valid_niter = int(args['--valid-niter'])
log_every = int(args['--log-every'])
model_save_path = args['--save-to']

vocab = Vocab.load(args['--vocab'])

model = NMT(embed_size=int(args['--embed-size']),
            hidden_size=int(args['--hidden-size']),
            dropout_rate=float(args['--dropout']),
            vocab=vocab, no_char_decoder=args['--no-char-decoder'])

In [4]:
model.train()

NMT(
  (model_embeddings_source): ModelEmbeddings(
    (embeddings): Embedding(96, 50, padding_idx=0)
    (cnn_layer): CNN(
      (cnn_layer): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
      (max_pool_layer): MaxPool1d(kernel_size=17, stride=17, padding=0, dilation=1, ceil_mode=False)
    )
    (highway_layer): Highway(
      (proj): Linear(in_features=256, out_features=256, bias=True)
      (gate): Linear(in_features=256, out_features=256, bias=True)
    )
    (dropout_layer): Dropout(p=0.3, inplace=False)
  )
  (model_embeddings_target): ModelEmbeddings(
    (embeddings): Embedding(96, 50, padding_idx=0)
    (cnn_layer): CNN(
      (cnn_layer): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
      (max_pool_layer): MaxPool1d(kernel_size=17, stride=17, padding=0, dilation=1, ceil_mode=False)
    )
    (highway_layer): Highway(
      (proj): Linear(in_features=256, out_features=256, bias=True)
      (gate): Linear(in_features=256, out_features=256, bias=True)
    )
    (dropout_lay

In [5]:
uniform_init = float(args['--uniform-init'])
if np.abs(uniform_init) > 0.:
    print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
    for p in model.parameters():
        p.data.uniform_(-uniform_init, uniform_init)

vocab_mask = torch.ones(len(vocab.tgt))
vocab_mask[vocab.tgt['<pad>']] = 0

device = torch.device("cuda:0" if args['--cuda'] else "cpu")
print('use device: %s' % device, file=sys.stderr)

model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

num_trial = 0
train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
cum_examples = report_examples = epoch = valid_num = 0
hist_valid_scores = []
train_time = begin_time = time.time()
print('begin Maximum Likelihood training')

begin Maximum Likelihood training


uniformly initialize parameters [-0.100000, +0.100000]
use device: cpu


In [7]:
for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
    train_iter += 1

    optimizer.zero_grad()

    batch_size = len(src_sents)

    example_losses = -model(src_sents, tgt_sents) # (batch_size,)
    batch_loss = example_losses.sum()
    loss = batch_loss / batch_size

    loss.backward()

    # clip gradient
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

    optimizer.step()

    batch_losses_val = batch_loss.item()
    report_loss += batch_losses_val
    cum_loss += batch_losses_val

    tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
    report_tgt_words += tgt_words_num_to_predict
    cum_tgt_words += tgt_words_num_to_predict
    report_examples += batch_size
    cum_examples += batch_size

    if train_iter % log_every == 0:
        print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
              'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                 report_loss / report_examples,
                                                                                 math.exp(report_loss / report_tgt_words),
                                                                                 cum_examples,
                                                                                 report_tgt_words / (time.time() - train_time),
                                                                                 time.time() - begin_time), file=sys.stderr)

        train_time = time.time()
        report_loss = report_tgt_words = report_examples = 0.

    # perform validation
    if train_iter % valid_niter == 0:
        print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
                                                                                 cum_loss / cum_examples,
                                                                                 np.exp(cum_loss / cum_tgt_words),
                                                                                 cum_examples), file=sys.stderr)

        cum_loss = cum_examples = cum_tgt_words = 0.
        valid_num += 1

        print('begin validation ...', file=sys.stderr)

        # compute dev. ppl and bleu
        dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
        valid_metric = -dev_ppl

        print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

        is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
        hist_valid_scores.append(valid_metric)

  e_t.data.masked_fill_(enc_masks.byte(), -float('inf'))


In [16]:
hist_valid_scores

[]

In [18]:
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)

In [22]:
src_sents = read_corpus(args['--train-src'], source='src')
tgt_sents = read_corpus(args['--train-tgt'], source='tgt')

In [25]:
tgt_sents[0]

['<s>',
 'Thank',
 'you',
 'so',
 'much,',
 'Chris.',
 'And',
 "it's",
 'truly',
 'a',
 'great',
 'honor',
 'to',
 'have',
 'the',
 'opportunity',
 'to',
 'come',
 'to',
 'this',
 'stage',
 'twice;',
 "I'm",
 'extremely',
 'grateful.',
 '</s>']

In [None]:
elif [ "$1" = "test_local_q1" ]; then
mkdir -p outputs
touch outputs/test_outputs_local_q1.txt
python run.py decode model.bin ./en_es_data/test_tiny.es ./en_es_data/test_tiny.en outputs/test_outputs_local_q1.txt \
    --no-char-decoder

In [9]:
from collections import namedtuple
import sys
from typing import List, Tuple, Dict, Set, Union
import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

from model_embeddings import ModelEmbeddings
from char_decoder import CharDecoder

In [19]:
def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int, max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()a

    hypotheses = []
    with torch.no_grad():
        for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
            example_hyps = model.beam_search(src_sent, beam_size=beam_size, max_decoding_time_step=max_decoding_time_step)

            hypotheses.append(example_hyps)

    if was_training: model.train(was_training)

    return hypotheses

In [12]:
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """

    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'], no_char_decoder=args['--no-char-decoder'])

    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')

In [14]:
test_data_src = read_corpus('./en_es_data/test_tiny.es', source = 'src')
test_data_tgt = read_corpus('./en_es_data/test_tiny.en', source = 'tgt')

In [23]:
model = NMT.load('model.bin', no_char_decoder = True)

In [20]:
hypotheses = beam_search(model, test_data_src,
                         beam_size = 5,
                         max_decoding_time_step=70)

Decoding:  50%|█████     | 2/4 [00:00<00:00, 17.95it/s]



Decoding: 100%|██████████| 4/4 [00:00<00:00, 16.45it/s]


In [21]:
hypotheses

[[Hypothesis(value=["It's"], score=-2.5637383460998535),
  Hypothesis(value=["It's", 'a', 'a', 'true', 'Tipper', 'and', 'like', 'a', 'for', 'for', 'me.'], score=-5.935949325561523),
  Hypothesis(value=["It's", 'a', 'a', 'true', 'story', '--', 'and', 'all', 'of', 'a', 'for', 'for', 'me.'], score=-6.5942606925964355),
  Hypothesis(value=["It's", 'a', 'a', 'a', 'true', 'story', '--', 'and', 'all', 'of', 'a', 'for', 'for', 'me.'], score=-7.084736347198486),
  Hypothesis(value=["It's", 'a', 'a', 'a', 'true', 'story', '--', 'and', 'all', 'of', 'a', 'a', 'for', 'me.'], score=-7.09389066696167)],
 [Hypothesis(value=['Soon', 'after', 'Tipper'], score=-2.709735631942749),
  Hypothesis(value=['Soon', 'after'], score=-3.7327487468719482),
  Hypothesis(value=['Soon', 'after', 'Tipper', 'and', 'like', 'a', 'a', 'for', 'me.'], score=-4.965090274810791),
  Hypothesis(value=['Soon', 'after', 'Tipper', 'and', 'like', 'a', 'a', 'little', 'farm', 'we', '(Mock', 'sob)', 'sob)', 'I', 'after', 'Tipper', 'and

In [33]:
model.model_embeddings_source.embeddings.weight

Parameter containing:
tensor([[ 0.0829, -0.0682,  0.0369,  ..., -0.0071,  0.0592,  0.0172],
        [-0.0831, -0.0380, -0.1022,  ...,  0.0492,  0.0313,  0.0256],
        [ 0.0395,  0.0201,  0.0406,  ...,  0.0139,  0.0140,  0.1094],
        ...,
        [-0.0237, -0.0783, -0.0273,  ..., -0.0010, -0.0431, -0.0818],
        [-0.0031, -0.0827, -0.0044,  ..., -0.0512,  0.0486,  0.0773],
        [-0.0082,  0.0594,  0.0419,  ...,  0.0268, -0.0039,  0.0971]],
       requires_grad=True)

In [83]:
import json
import math
import pickle
import sys
import time

import numpy as np

from docopt import docopt
from typing import List, Tuple, Dict, Set, Union
from tqdm import tqdm
from utils import pad_sents_char, read_corpus, batch_iter
from vocab import Vocab, VocabEntry

from char_decoder import CharDecoder
from nmt_model import NMT


import torch
import torch.nn as nn
import torch.nn.utils

BATCH_SIZE = 5
EMBED_SIZE = 3
HIDDEN_SIZE = 3
DROPOUT_RATE = 0.0


class DummyVocab():
    def __init__(self):
        self.char2id = json.load(open('./sanity_check_en_es_data/char_vocab_sanity_check.json', 'r'))
        self.id2char = {id: char for char, id in self.char2id.items()}
        self.char_unk = self.char2id['<unk>']
        self.start_of_word = self.char2id["{"]
        self.end_of_word = self.char2id["}"]
        
char_vocab = DummyVocab()
        
decoder = CharDecoder(
    hidden_size=HIDDEN_SIZE,
    char_embedding_size=EMBED_SIZE,
    target_vocab=char_vocab)

model = NMT(
        embed_size=EMBED_SIZE,
        hidden_size=HIDDEN_SIZE,
        dropout_rate=DROPOUT_RATE,
        vocab=vocab)

In [99]:
BATCH_SIZE = 5
sequence_length = 4
inpt = torch.randint(high = 10,size=(sequence_length, BATCH_SIZE))

In [100]:
input_sequence = inpt[:-1]
target_sequence = inpt[1:]

dec_hiddens = (torch.rand(1, BATCH_SIZE, HIDDEN_SIZE),torch.rand(1, BATCH_SIZE, HIDDEN_SIZE))
scores, dec_hidden = decoder.forward(input_sequence, dec_hiddens)

In [90]:
class CharDecoder(nn.Module):
    def __init__(self, hidden_size, char_embedding_size=50, target_vocab=None):
        """ Init Character Decoder.

        @param hidden_size (int): Hidden size of the decoder LSTM
        @param char_embedding_size (int): dimensionality of character embeddings
        @param target_vocab (VocabEntry): vocabulary for the target language. See vocab.py for documentation.
        """
        ### YOUR CODE HERE for part 2a
        ### TODO - Initialize as an nn.Module.
        ###      - Initialize the following variables:
        ###        self.charDecoder: LSTM. Please use nn.LSTM() to construct this.
        ###        self.char_output_projection: Linear layer, called W_{dec} and b_{dec} in the PDF
        ###        self.decoderCharEmb: Embedding matrix of character embeddings
        ###        self.target_vocab: vocabulary for the target language
        ###
        ### Hint: - Use target_vocab.char2id to access the character vocabulary for the target language.
        ###       - Set the padding_idx argument of the embedding matrix.
        ###       - Create a new Embedding layer. Do not reuse embeddings created in Part 1 of this assignment.
        super(CharDecoder, self).__init__()

        V = len(target_vocab.char2id)
        self.charDecoder = nn.LSTM(input_size = char_embedding_size, hidden_size = hidden_size)
        self.char_output_projection = nn.Linear(in_features = hidden_size, out_features = V)
        self.decoderCharEmb = nn.Embedding(num_embeddings = V, embedding_dim= char_embedding_size)
        self.target_vocab = target_vocab

        ### END YOUR CODE
        
    def forward(self, input, dec_hidden=None):
        """ Forward pass of character decoder.

        @param input: tensor of integers, shape (length, batch)
        @param dec_hidden: internal state of the LSTM before reading the input characters. A tuple of two tensors of shape (1, batch, hidden_size)

        @returns scores: called s_t in the PDF, shape (length, batch, self.vocab_size)
        @returns dec_hidden: internal state of the LSTM after reading the input characters. A tuple of two tensors of shape (1, batch, hidden_size)
        """
        ### YOUR CODE HERE for part 2b
        ### TODO - Implement the forward pass of the character decoder.

        char_embeddings = self.decoderCharEmb(input) # (length, batch, char_embedding_size)
        hiddens, dec_hidden = self.charDecoder(char_embeddings, dec_hidden) # See documentation. Takes in (input, (h_0,c_0)) and produces (output, (h_t,c_t))
        scores = self.char_output_projection(hiddens)

        return scores, dec_hidden
    
decoder = CharDecoder(hidden_size = HIDDEN_SIZE, target_vocab = char_vocab)

In [102]:
scores, dec_hidden = decoder.forward(input_sequence, dec_hiddens)

In [104]:
loss = nn.CrossEntropyLoss(reduction = 'sum', ignore_index=decoder.target_vocab.char2id['<pad>'])

In [107]:
loss(scores, target_sequence)

ValueError: Expected target size (3, 30), got torch.Size([3, 5])

In [108]:
target_sequence.shape, scores.shape

(torch.Size([3, 5]), torch.Size([3, 5, 30]))

In [110]:
scores.view(-1, scores.shape[-1]).shape

torch.Size([15, 30])

In [114]:
inpt = torch.zeros(1, BATCH_SIZE, HIDDEN_SIZE, dtype=torch.float)
initialStates = (inpt, inpt)
device = decoder.char_output_projection.weight.device
# decodedWords = decoder.decode_greedy(initialStates, device)

In [120]:
hidden_t, cell_t = initialStates
output_word = []
current_char = '{'
softmax_layer = nn.Softmax()
max_length = 21

In [128]:
torch.tensor([decoder.target_vocab.char2id[current_char]]).shape

torch.Size([1])

In [187]:
start_index = decoder.target_vocab.char2id[current_char]
end_index = decoder.target_vocab.char2id['}']

inputs = torch.tensor([start_index for _ in range(BATCH_SIZE)], device = device).unsqueeze(0)
decodedWords = []

for t in range(max_length):
    scores, (hidden_t, cell_t) = decoder.forward(inputs, (hidden_t, cell_t))

    # For as long as our max_length is, push each sentence in the batch through charDecoder. So we should end up
    # Scores = (1, batch_size, char_dim)

    inputs = torch.argmax(scores, dim = 2) # Collapse char_dim, pick highest probability character
    # current_chars has (1, batch_size), want to add to each in inputs
    current_chars = [decoder.target_vocab.id2char[i.item()] for i in inputs[0]] # Pull out the next character
    if t == 0:
        words = current_chars
    else:
        words = [x + y for x, y in zip(words, current_chars)]

# Now decodedWords is (max_length, batch_size). Slice it so that if } is reached, stop.
for word in words:
    if '}' in word:
        word = word[word.index('}')]
    decodedWords.append(word)

In [193]:
inputs.detach().squeeze()

tensor([8, 8, 8, 8, 8])