In [1]:
import pandas as pd
import torch
import numpy as np
import time
import json
import os
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu

import preprocess as prp
from transformer import Transformer

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## Load data

In [3]:
# load data and word2vec embeddings
df = pd.read_csv("data/en_es_reduced_data.csv")
en_vec_df = pd.read_csv("data/cc.en.300.reduced.csv")
es_vec_df = pd.read_csv("data/cc.es.300.reduced.csv")

In [4]:
df.head()

Unnamed: 0,english,spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.


In [5]:
en_vec_df.head()

Unnamed: 0,word,vector
0,",",[0.1250 -0.1079 0.0245 -0.2529 0.1057 -0.0184 ...
1,the,[-0.0517 0.0740 -0.0131 0.0447 -0.0343 0.0212 ...
2,.,[0.0342 -0.0801 0.1162 -0.3968 -0.0147 -0.0533...
3,and,[0.0082 -0.0899 0.0265 -0.0086 -0.0609 0.0068 ...
4,to,[0.0047 0.0281 -0.0296 -0.0108 -0.0620 -0.0532...


In [6]:
es_vec_df.head()

Unnamed: 0,word,vector
0,de,[0.0547 0.0112 0.1910 0.0308 0.0414 0.0303 -0....
1,",",[-0.0001 -0.0448 -0.2344 -0.0175 0.0231 -0.012...
2,.,[-0.0041 -0.0990 -0.0862 -0.0211 0.0899 -0.020...
3,la,[0.0373 -0.0051 0.1350 0.0990 0.0181 0.0190 -0...
4,y,[-0.1160 -0.0598 -0.0966 0.0369 -0.0063 0.0431...


In [11]:
words_df = pd.DataFrame({"word": ["asdfadsf", "apple", "your"]})
words_df["valid"] = words_df["word"].isin(en_vec_df["word"])
words_df[words_df["valid"] == False]["word"].values.tolist()

['asdfadsf']

## Preprocess data

In [12]:
# convert vector strings into lists of floats
en_vec_df["vector"] = prp.vec_str_to_list(en_vec_df, "vector")
es_vec_df["vector"] = prp.vec_str_to_list(es_vec_df, "vector")

In [13]:
en_vec_df.head()

Unnamed: 0,word,vector
0,",","[0.125, -0.1079, 0.0245, -0.2529, 0.1057, -0.0..."
1,the,"[-0.0517, 0.074, -0.0131, 0.0447, -0.0343, 0.0..."
2,.,"[0.0342, -0.0801, 0.1162, -0.3968, -0.0147, -0..."
3,and,"[0.0082, -0.0899, 0.0265, -0.0086, -0.0609, 0...."
4,to,"[0.0047, 0.0281, -0.0296, -0.0108, -0.062, -0...."


In [14]:
es_vec_df.head()

Unnamed: 0,word,vector
0,de,"[0.0547, 0.0112, 0.191, 0.0308, 0.0414, 0.0303..."
1,",","[-0.0001, -0.0448, -0.2344, -0.0175, 0.0231, -..."
2,.,"[-0.0041, -0.099, -0.0862, -0.0211, 0.0899, -0..."
3,la,"[0.0373, -0.0051, 0.135, 0.099, 0.0181, 0.019,..."
4,y,"[-0.116, -0.0598, -0.0966, 0.0369, -0.0063, 0...."


In [15]:
# tokenize strings into lists of tokens
# see preprocess.py for function definition
df["en_tokens"] = df["english"].apply(lambda sent: prp.sentence_to_tokens(sent, "en"))
df["es_tokens"] = df["spanish"].apply(lambda sent: prp.sentence_to_tokens(sent, "es"))
df["en_num_tokens"] = df["en_tokens"].apply(lambda x: len(x))
df["es_num_tokens"] = df["es_tokens"].apply(lambda x: len(x))

In [16]:
df.head()

Unnamed: 0,english,spanish,en_tokens,es_tokens,en_num_tokens,es_num_tokens
0,Go.,Ve.,"[<s>, go, ., <e>]","[<s>, ve, ., <e>]",4,4
1,Go.,Vete.,"[<s>, go, ., <e>]","[<s>, vete, ., <e>]",4,4
2,Go.,Vaya.,"[<s>, go, ., <e>]","[<s>, vaya, ., <e>]",4,4
3,Go.,Váyase.,"[<s>, go, ., <e>]","[<s>, váyase, ., <e>]",4,4
4,Hi.,Hola.,"[<s>, hi, ., <e>]","[<s>, hola, ., <e>]",4,4


In [17]:
print(f"df shape: {df.shape}")
print(f"num unique english sequences: {df['english'].unique().shape}")
print(f"num unique spanish sequences: {df['spanish'].unique().shape}")

df shape: (111184, 6)
num unique english sequences: (96092,)
num unique spanish sequences: (106072,)


In [18]:
# reduce data to sequences of length < 11 (arbitrary)
# (+2 for start/end tokens, +1 for spanish to account for double punctuation)
max_tokens = 10
df = df[(df["en_num_tokens"] < (max_tokens + 3)) & (df["es_num_tokens"] < (max_tokens + 4))]

# a lot of words have multiple translations...
# i don't think this is a big deal considering i just want to train a functional model
print(f"df shape: {df.shape}")
print(f"num unique english sequences: {df['english'].unique().shape}")
print(f"num unique spanish sequences: {df['spanish'].unique().shape}")

df shape: (97443, 6)
num unique english sequences: (83385,)
num unique spanish sequences: (92597,)


In [19]:
# train/test split
train_df, test_df = train_test_split(df, test_size=0.2)
train_df.reset_index(inplace=True, drop=True)
test_df.reset_index(inplace=True, drop=True)

In [20]:
print(f"train shape: {train_df.shape}")
print(f"test shape: {test_df.shape}")

train shape: (77954, 6)
test shape: (19489, 6)


In [21]:
def get_batch_tuples(input_df, batch_size):
    """
    Returns a list of tuples containing start/stop idx to grab from inp_df for a batch.
    """
    
    num_samples = input_df.shape[0]
    num_batches = int(num_samples / batch_size)
    batch_starts = np.arange(0, num_samples + 1, batch_size)    
    return [(batch_starts[i], batch_starts[i + 1]) for i in range(len(batch_starts) - 1)]

In [22]:
# batch data
batch_size = 256
train_batch_tuples = get_batch_tuples(train_df, batch_size)
test_batch_tuples = get_batch_tuples(test_df, batch_size)
num_train_batches = len(train_batch_tuples)
num_test_batches = len(test_batch_tuples)

## Utility functions

In [23]:
def map_indices_to_tokens(pred_seqs, vec_df):
    """
    Given list of lists containing indices of predicted tokens (output of get_predicted_token_indices()),
    return list of lists where each sublist is now the actual string token corresponding to the input
    index.
    """
    
    all_preds = []
    
    for pred_seq in pred_seqs:
        pred_tokens = []
        
        for pred_token in pred_seq:
            word = vec_df.loc[pred_token]["word"]
            pred_tokens.append(word)
            
            if (word == "<e>"):
                break
            
        all_preds.append(pred_tokens)
        
    return all_preds


def get_predicted_token_indices(predicted_batch):
    """
    Returns a list of lists where each sublist contains the predicted token indices for each 
    predicted sequence in predicted_batch.
    """
    
    pred_seqs = []
    
    for batch in predicted_batch:
        pred_seq = []
        
        for row in batch:
            pred_seq.append(row.argmax().item())
        
        pred_seqs.append(pred_seq)
        
    return pred_seqs


def save_model(model, model_name, epoch_train_losses, epoch_test_losses, epoch_bleu_scores):
    """
    Save model and losses to files. Should be called after every training epoch.
    Number of training epochs completed should be length of epoch_train_losses or 
    epoch_test_losses when model loaded back in. Could (should) save other training
    params, but this is fine for now.
    A .pt (actual model) and .json (metrics) file are created with model_name.
    
    returns None
    """
    
    # create paths
    os.system("mkdir -p models")
    model_path = f"models/{model_name}.pt"
    metrics_path = f"models/{model_name}_metrics.json"
    
    # save da model to pickle file
    torch.save(model.state_dict(), model_path)
    
    # save metrics with corresponding model_path just in case i get confused later
    metrics_dict = {"epoch_train_losses": epoch_train_losses, "epoch_test_losses": epoch_test_losses, "epoch_bleu_scores": epoch_bleu_scores, "model_path": model_path}
    with open(metrics_path, "w") as fi: json.dump(metrics_dict, fi)


def load_model(model, model_name):
    """
    Load saved pytorch model and metrics created with save_model().
    Must initialize model with same params as one you load from path.
    
    returns torch model, list of train losses, list of test losses, list of bleu scores
    """
    
    # create paths
    model_path = f"models/{model_name}.pt"
    metrics_path = f"models/{model_name}_metrics.json"
    
    # load model
    model.load_state_dict(torch.load(model_path))
    
    # load metrics
    with open(metrics_path, "r") as fi: 
        metrics = json.load(fi)
        
    epoch_train_losses = metrics["epoch_train_losses"]
    epoch_test_losses = metrics["epoch_test_losses"]
    epoch_bleu_scores = metrics["epoch_bleu_scores"]
        
    return model, epoch_train_losses, epoch_test_losses, epoch_bleu_scores

## Initialize transformer

In [24]:
# inputs will have shape (num_tokens, embedding_len)
embedding_len = 300
num_tokens = max(df["en_num_tokens"].max(), df["es_num_tokens"].max())  

# NOTE <s> will never be in target, but its mapping index still exists
# just to keep an annoying out-of-bounds error from happening later
input_size = embedding_len
output_size = es_vec_df.shape[0]

# see "attn is all you need"
N_layers = 1
num_heads = 1
version = "NO_USE_IT"

# create torch model object
new_model = True
model_name = f"tfrmr_{version}_N={N_layers}_h={num_heads}_B={batch_size}"
tfrmr = Transformer(input_size, output_size, N_layers, num_heads)

if (new_model): 
    epoch_train_losses = []
    epoch_test_losses = []
    epoch_bleu_scores = []
else:
    tfrmr, epoch_train_losses, epoch_test_losses, epoch_bleu_scores = load_model(tfrmr, model_name)
    
tfrmr.to(device)

Transformer(
  (encoders): ModuleList(
    (0): Encoder(
      (Q_layer): Linear(in_features=300, out_features=300, bias=True)
      (K_layer): Linear(in_features=300, out_features=300, bias=True)
      (V_layer): Linear(in_features=300, out_features=300, bias=True)
      (mha_linear): Linear(in_features=300, out_features=300, bias=True)
      (layer_norm0): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
      (ff_linear0): Linear(in_features=300, out_features=300, bias=True)
      (ff_relu): ReLU(inplace=True)
      (ff_linear1): Linear(in_features=300, out_features=300, bias=True)
      (layer_norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
    )
  )
  (decoders): ModuleList(
    (0): Decoder(
      (Q_layer): Linear(in_features=300, out_features=300, bias=True)
      (K_layer): Linear(in_features=300, out_features=300, bias=True)
      (V_layer): Linear(in_features=300, out_features=300, bias=True)
      (mmha_linear): Linear(in_features=300, out_features=300, b

## Inference function

In [20]:
def inference(tfrmr, test_df, test_batch_tuple, en_vec_df, es_vec_df, batch_size, num_tokens, embedding_len, 
              output_size, pos_encoding_mat, start_token_vec, end_token_idx, device):
    """
    This function generates a predicted sequence for each input sequence in input batch iteratively.
    Used for making predictions after training (i.e. when the full target sequence is not available to 
    input to the decoder).
    
    returns a batch of predicted tfrmr outputs (not softmax probs) and a batch of one-hot target sequences
    """
    
    # get batch
    test_start_idx = test_batch_tuple[0]
    test_end_idx = test_batch_tuple[1]
    test_batch = test_df[test_start_idx:test_end_idx]

    # transform strings into tensors of embedded values
    encoder_input = prp.embed_input_batch(test_batch, "en_tokens", en_vec_df, "word", "vector", embedding_len, num_tokens).to(device)

    # positionally-encode
    prp.pos_encode_batch(encoder_input, test_batch["en_num_tokens"].values.tolist(), pos_encoding_mat)

    # context generated for every matrix in batch, dont need to do again
    encoder_context = tfrmr.forward(encoder_input, None, encoder_only=True)

    # initialize decoder input for autoregressive prediction
    decoder_input = torch.zeros(batch_size, num_tokens, embedding_len).to(device)

    # holds final sequence prediction for each input sequence
    predicted_seqs = torch.zeros(batch_size, num_tokens, output_size).to(device)

    # iteratively generate predicted sequences for each input sequence
    # SUB BATCH LOOP START
    for batch_idx in range(batch_size):
        done = False
        token_idx = 0  # starting index of decoder input (0th predicted sequence will be put in decoder_input at token_idx=1 on next iteration)
        decoder_input[batch_idx][token_idx] = start_token_vec  # initialize input with start token

        # WHILE LOOP START
        while ((not done) and (token_idx < (num_tokens - 1))):
            # positional encoding performed row by row
            decoder_input[batch_idx][token_idx] += pos_encoding_mat[token_idx]

            # encoder_context is constant, decoder_input contains data up to before current token
            # and is updated one row at a time with each prediction

            # long statement. makes batches of single matrices so shapes work in transformer
            prediction = tfrmr.forward(encoder_input[batch_idx].view(1, encoder_input.shape[1], 
                                          encoder_input.shape[2]), 
                                          decoder_input[batch_idx].view(1, decoder_input.shape[1], 
                                          decoder_input.shape[2]), 
                                          encoder_context=encoder_context[batch_idx].view(1, encoder_context.shape[1], 
                                          encoder_context.shape[2]), decoder_only=True)

            # NOTE transformer output is not a softmax distribution across each row
            # softmax is computed w loss calculation. but, the argmax before softmax
            # will be the same as the argmax after softmax (just think about softmax eqn.)
            max_token_idx = prediction[0][token_idx].argmax().item()
            
            # put prediction into final predicted sequence
            predicted_seqs[batch_idx][token_idx] = prediction[0][token_idx]

            # stop predicting if the end token has been predicted
            if (max_token_idx == end_token_idx):        
                done = True
            else:
                # for decoder's input: start token remains at index 0, this prediction goes to token_idx+1
                # for predicted sequence: this prediction goes at token_idx (start token not included in final prediction)
                token_vec = torch.tensor(es_vec_df.loc[max_token_idx, "vector"]).to(device)
                decoder_input[batch_idx][token_idx + 1] = token_vec                

            token_idx += 1

            # WHILE LOOP END

        # SUB BATCH LOOP END
        
    # create one-hot encoding of target sequences
    targets_batch = prp.one_hot_batch(test_batch, es_vec_df, num_tokens, output_size, "es_tokens", 
                                      "word", smoothing=False).to(device)

    return predicted_seqs, targets_batch

## Main train loop

In [21]:
%%time 

num_epochs = 20
pos_encoding_mat = prp.gen_positional_encoding(num_tokens, embedding_len).to(device)
start_token_vec = torch.tensor(es_vec_df[es_vec_df["word"] == "<s>"]["vector"].item())
end_token_idx = es_vec_df[es_vec_df["word"] == "<e>"].index.item()

# EPOCH LOOP START
epoch_start_time = time.time()
for epoch in range(num_epochs):
    print("=" * 60)
    print(f"EPOCH {epoch}")
    train_losses = []
    
    # BATCH LOOP START
    batch_start_time = time.time()  
    for batch_idx, batch_tuple in enumerate(train_batch_tuples):
        # get batch
        start_idx = batch_tuple[0]
        end_idx = batch_tuple[1]
        batch = train_df[start_idx:end_idx]
        
        # transform strings into tensors of embedded values
        encoder_input = prp.embed_input_batch(batch, "en_tokens", en_vec_df, "word", "vector", embedding_len, num_tokens).to(device)
        decoder_input = prp.embed_input_batch(batch, "es_tokens", es_vec_df, "word", "vector", embedding_len, num_tokens).to(device)
        
        # positionally-encode
        prp.pos_encode_batch(encoder_input, batch["en_num_tokens"].values.tolist(), pos_encoding_mat)
        prp.pos_encode_batch(decoder_input, batch["es_num_tokens"].values.tolist(), pos_encoding_mat)

        # forward pass
        fwd_out = tfrmr.forward(encoder_input, decoder_input)
        
        # create one-hot targets
        targets = prp.one_hot_batch(batch, es_vec_df, num_tokens, output_size, "es_tokens", "word", 
                                    smoothing=True, smoothing_epsilon=0.1).to(device)
        
        # loss and backward pass
        train_loss = tfrmr.calc_loss(fwd_out, targets)
        tfrmr.backward()
        train_losses.append(train_loss.item())
        
        # BATCH LOOP END  
    
    batch_stop_time = time.time()
    print(f"{num_train_batches} train batches of size {batch_size} done in {(batch_stop_time - batch_start_time) / 60:.3f} minutes")
    
    mean_train_loss = np.mean(train_losses)
    epoch_train_losses.append(mean_train_loss)
    
    # INFERENCE START
    # perform inference (i.e. see how model performs on unseen test data)
    inference_start_time = time.time()
    test_losses = []
    bleu_scores = []
    example_idx = 0
    num_examples = 5
    
    # TEST BATCH LOOP START
    for test_batch_idx, test_batch_tuple in enumerate(test_batch_tuples):
        # iteratively generate predicted sequences in each batch
        predicted_seqs, targets_batch = inference(tfrmr, test_df, test_batch_tuple, en_vec_df, es_vec_df, 
                                                  batch_size, num_tokens, embedding_len, output_size, 
                                                  pos_encoding_mat, start_token_vec, end_token_idx, device)

        # calculate loss w/o updating model params
        # NOTE that the .item() here is very important - saves a lot of internal torch autograd graph stuff in memory w/o
        test_loss = tfrmr.calc_loss(predicted_seqs, targets_batch, test_loss=True).item()
        test_losses.append(test_loss)
        
        # calculate 1-gram bleu score
        predicted_tokens = map_indices_to_tokens(get_predicted_token_indices(predicted_seqs), es_vec_df)
        target_tokens = [seq[1:] for seq in test_df[test_batch_tuple[0]:test_batch_tuple[1]]["es_tokens"].values.tolist()]
        bleu_score = corpus_bleu(target_tokens, predicted_tokens, weights=[1])
        bleu_scores.append(bleu_score)
        
        # look at a few example predictions
        if (test_batch_idx == example_idx):
            en_input = test_df[test_batch_tuple[0]:test_batch_tuple[0] + num_examples]["en_tokens"].values.tolist()
            print("\nexample predictions:")
            
            for idx in range(num_examples):
                print(f"english: {en_input[idx]}")
                print(f"spanish: {target_tokens[idx]}")
                print(f"predicted: {predicted_tokens[idx]}\n")
        
        # TEST BATCH LOOP END
        
    mean_test_loss = np.mean(test_losses)
    epoch_test_losses.append(mean_test_loss)
    mean_bleu_score = np.mean(bleu_scores)
    epoch_bleu_scores.append(mean_bleu_score)
    inference_stop_time = time.time()
    print(f"{len(test_batch_tuples)} test batches of size {batch_size} done in {(inference_stop_time - inference_start_time) / 60:.3f} minutes")
    
    # INFERENCE END
    
    print(f"mean train loss: {mean_train_loss}")
    print(f"mean test loss: {mean_test_loss}")
    print(f"mean 1-gram bleu score: {mean_bleu_score}\n")
    
    # save model and metrics
    save_model(tfrmr, model_name, epoch_train_losses, epoch_test_losses, epoch_bleu_scores)
    
    # EPOCH LOOP END
    
epoch_stop_time = time.time()
print(f"{num_epochs} epochs done in {(epoch_stop_time - epoch_start_time) / 60:.3f} minutes")


EPOCH 0
304 train batches of size 256 done in 20.179 minutes

example predictions:
english: ['<s>', 'she', 'used', 'to', 'play', 'tennis', 'with', 'him', '.', '<e>']
spanish: ['ella', 'solía', 'jugar', 'al', 'tenis', 'con', 'él', '.', '<e>']
predicted: ['¿', 'no', 'que', 'que', 'que', '.', '<e>']

english: ['<s>', 'the', 'bus', 'stops', 'in', 'front', 'of', 'my', 'house', '.', '<e>']
spanish: ['el', 'autobús', 'para', 'delante', 'de', 'mi', 'casa', '.', '<e>']
predicted: ['¿', 'no', 'que', 'que', 'que', 'que', '.', '<e>']

english: ['<s>', 'unfortunately', ',', 'she', 'is', 'absent', '.', '<e>']
spanish: ['desafortunadamente', ',', 'ella', 'está', 'ausente', '.', '<e>']
predicted: ['¿', 'no', 'que', 'a', '.', '<e>']

english: ['<s>', 'we', 'want', 'an', 'assistant', ',', 'preferably', 'someone', 'with', 'experience', '.', '<e>']
spanish: ['buscamos', 'un', 'asistente', ',', 'preferiblemente', 'alguien', 'con', 'experiencia', '.', '<e>']
predicted: ['¿', 'no', 'que', 'que', 'que', 'que'

76 test batches of size 256 done in 4.404 minutes
mean train loss: 2.4148699288305484
mean test loss: 4.012906105894792
mean 1-gram bleu score: 0.1731450253790133

EPOCH 6
304 train batches of size 256 done in 20.314 minutes

example predictions:
english: ['<s>', 'she', 'used', 'to', 'play', 'tennis', 'with', 'him', '.', '<e>']
spanish: ['ella', 'solía', 'jugar', 'al', 'tenis', 'con', 'él', '.', '<e>']
predicted: ['ella', 'le', 'gusta', 'a', 'la', 'puerta', '.', '<e>']

english: ['<s>', 'the', 'bus', 'stops', 'in', 'front', 'of', 'my', 'house', '.', '<e>']
spanish: ['el', 'autobús', 'para', 'delante', 'de', 'mi', 'casa', '.', '<e>']
predicted: ['el', 'padre', 'en', 'la', 'vida', 'en', 'mi', 'padre', '.', '<e>']

english: ['<s>', 'unfortunately', ',', 'she', 'is', 'absent', '.', '<e>']
spanish: ['desafortunadamente', ',', 'ella', 'está', 'ausente', '.', '<e>']
predicted: ['el', 'hombre', 'es', 'es', 'muy', '.', '<e>']

english: ['<s>', 'we', 'want', 'an', 'assistant', ',', 'preferably',

76 test batches of size 256 done in 4.478 minutes
mean train loss: 2.1442450935903348
mean test loss: 3.8279180965925517
mean 1-gram bleu score: 0.17040263242660728

EPOCH 12
304 train batches of size 256 done in 20.350 minutes

example predictions:
english: ['<s>', 'she', 'used', 'to', 'play', 'tennis', 'with', 'him', '.', '<e>']
spanish: ['ella', 'solía', 'jugar', 'al', 'tenis', 'con', 'él', '.', '<e>']
predicted: ['ella', 'tiene', 'a', 'la', 'puerta', 'de', 'su', 'madre', '.', '<e>']

english: ['<s>', 'the', 'bus', 'stops', 'in', 'front', 'of', 'my', 'house', '.', '<e>']
spanish: ['el', 'autobús', 'para', 'delante', 'de', 'mi', 'casa', '.', '<e>']
predicted: ['el', 'perro', 'se', 'puso', 'en', 'mi', 'padre', '.', '<e>']

english: ['<s>', 'unfortunately', ',', 'she', 'is', 'absent', '.', '<e>']
spanish: ['desafortunadamente', ',', 'ella', 'está', 'ausente', '.', '<e>']
predicted: ['hay', 'que', 'es', 'muy', '.', '<e>']

english: ['<s>', 'we', 'want', 'an', 'assistant', ',', 'preferab

76 test batches of size 256 done in 4.468 minutes
mean train loss: 1.9979269469254894
mean test loss: 3.791279375553131
mean 1-gram bleu score: 0.17753994502435988

EPOCH 18
304 train batches of size 256 done in 20.369 minutes

example predictions:
english: ['<s>', 'she', 'used', 'to', 'play', 'tennis', 'with', 'him', '.', '<e>']
spanish: ['ella', 'solía', 'jugar', 'al', 'tenis', 'con', 'él', '.', '<e>']
predicted: ['ella', 'le', 'aconsejó', 'a', 'la', 'escuela', '.', '<e>']

english: ['<s>', 'the', 'bus', 'stops', 'in', 'front', 'of', 'my', 'house', '.', '<e>']
spanish: ['el', 'autobús', 'para', 'delante', 'de', 'mi', 'casa', '.', '<e>']
predicted: ['el', 'coche', 'se', 'quedó', 'en', 'mi', 'padre', '.', '<e>']

english: ['<s>', 'unfortunately', ',', 'she', 'is', 'absent', '.', '<e>']
spanish: ['desafortunadamente', ',', 'ella', 'está', 'ausente', '.', '<e>']
predicted: ['hay', ',', 'ella', 'es', 'muy', '.', '<e>']

english: ['<s>', 'we', 'want', 'an', 'assistant', ',', 'preferably', 