In [1]:
from termcolor import colored
import random
import numpy as np

import trax
from trax import layers as tl
from trax.fastmath import numpy as fastnp
from trax.supervised import training

import w1_unittest

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_stream_fn = trax.data.TFDS('opus/medical',
                                 data_dir='./data/',
                                 keys=('en', 'de'),
                                 eval_holdout_size=0.01,
                                 train=True
                                 )
eval_stream_fn = trax.data.TFDS('opus/medical',
                                 data_dir='./data/',
                                 keys=('en', 'de'),
                                 eval_holdout_size=0.01,
                                 train=False
                                 )

No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)


In [3]:
train_stream = train_stream_fn()
print(colored('train data (en, de) tuple:', 'red'), next(train_stream))
print()

eval_stream = eval_stream_fn()
print(colored('eval data (en, de) tuple:', 'red'), next(eval_stream))

[31mtrain data (en, de) tuple:[0m (b'Tel: +421 2 57 103 777\n', b'Tel: +421 2 57 103 777\n')

[31meval data (en, de) tuple:[0m (b'Subcutaneous use and intravenous use.\n', b'Subkutane Anwendung und intraven\xc3\xb6se Anwendung.\n')


# Tokenize

In [4]:
VOCAB_FILE = 'ende_32k.subword'
VOCAB_DIR = 'data/'

tokenized_train_stream = trax.data.Tokenize(vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)(train_stream)
tokenized_eval_stream = trax.data.Tokenize(vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)(eval_stream)

In [5]:
# Append EOS at the end of each sentence
EOS = 1

def append_eos(stream):
    for (inputs, targets) in stream:
        inputs_with_eos = list(inputs) + [EOS]
        targets_with_eos = list(targets) + [EOS]
        yield np.array(inputs_with_eos), np.array(targets_with_eos)

tokenized_train_stream = append_eos(tokenized_train_stream)
tokenized_eval_stream = append_eos(tokenized_eval_stream)

## filter long sentences

In [6]:
filtered_train_stream = trax.data.FilterByLength(
    max_length=512, length_keys=[0, 1])(tokenized_train_stream)
filtered_eval_stream = trax.data.FilterByLength(
    max_length=512, length_keys=[0, 1])(tokenized_eval_stream)

train_input, train_target = next(filtered_train_stream)
print(colored(f'Single tokenized example input:', 'red'), train_input)
print(colored(f'Single tokenized example target:', 'red'), train_target)

[31mSingle tokenized example input:[0m [ 2538  2248    30 12114 23184 16889     5     2 20852  6456 20592  5812
  3932    96  5178  3851    30  7891  3550 30650  4729   992     1]
[31mSingle tokenized example target:[0m [ 1872    11  3544    39  7019 17877 30432    23  6845    10 14222    47
  4004    18 21674     5 27467  9513   920   188 10630    18  3550 30650
  4729   992     1]


In [7]:
def tokenize(input_str, vocab_file=None, vocab_dir=None):
    EOS = 1
    inputs = next(trax.data.tokenize(iter([input_str]), vocab_file=vocab_file, vocab_dir=vocab_dir))
    inputs = list(inputs) + [EOS]

    batch_inputs = np.reshape(np.array(inputs), [1, -1])
    
    return batch_inputs

def detokenize(integers, vocab_file=None, vocab_dir=None):
    integers = list(np.squeeze(integers))

    EOS = 1

    if EOS in integers:
        integers = integers[:integers.index(EOS)]
    return trax.data.detokenize(integers, vocab_file=vocab_file, vocab_dir=vocab_dir)

In [8]:
print(colored(f'Single detokenized example input:', 'red'), detokenize(train_input, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))
print(colored(f'Single detokenized example target:', 'red'), detokenize(train_target, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))

print(colored(f"tokenize('hello'): ", 'green'), tokenize('hello', vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))
print(colored(f"detokenize([17332, 140, 1]): ", 'green'), detokenize([17332, 140, 1], vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))

[31mSingle detokenized example input:[0m During treatment with olanzapine, adolescents gained significantly more weight compared with adults.

[31mSingle detokenized example target:[0m Während der Behandlung mit Olanzapin nahmen die Jugendlichen im Vergleich zu Erwachsenen signifikant mehr Gewicht zu.

[32mtokenize('hello'): [0m [[17332   140     1]]
[32mdetokenize([17332, 140, 1]): [0m hello


## Bucketing

In [9]:
boundaries = [8, 16, 32, 64, 128, 256, 512]
batch_sizes = [256, 128, 64, 32, 16, 8, 4, 2]

train_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes,
    length_keys=[0, 1]
)(filtered_train_stream)

eval_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes,
    length_keys=[0, 1]
)(filtered_eval_stream)

train_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(train_batch_stream)
eval_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(eval_batch_stream)

## Exploring the data

In [10]:
input_batch, target_batch, mask_batch = next(train_batch_stream)

print('input_batch data type: ', type(input_batch))
print('target_batch data type: ', type(target_batch))

print("input_batch shape: ", input_batch.shape)
print("target_batch shape: ", target_batch.shape)

input_batch data type:  <class 'numpy.ndarray'>
target_batch data type:  <class 'numpy.ndarray'>
input_batch shape:  (32, 64)
target_batch shape:  (32, 64)


In [11]:
# pick a random index less than the batch size.
index = random.randrange(len(input_batch))

# use the index to grab an entry from the input and target batch
print(colored('THIS IS THE ENGLISH SENTENCE: \n', 'red'), detokenize(input_batch[index], vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR), '\n')
print(colored('THIS IS THE TOKENIZED VERSION OF THE ENGLISH SENTENCE: \n ', 'red'), input_batch[index], '\n')
print(colored('THIS IS THE GERMAN TRANSLATION: \n', 'red'), detokenize(target_batch[index], vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR), '\n')
print(colored('THIS IS THE TOKENIZED VERSION OF THE GERMAN TRANSLATION: \n', 'red'), target_batch[index], '\n')

[31mTHIS IS THE ENGLISH SENTENCE: 
[0m Prior treatment with high dose diuretics may result in volume depletion and a risk of hypotension when initiating therapy with Karvea (see section 4.4).
 

[31mTHIS IS THE TOKENIZED VERSION OF THE ENGLISH SENTENCE: 
 [0m [ 5120    66  2248    30   350 20441  4296 22371  8141    14   247   571
     6  7618  6530  6858   379     8    13   814     7  4814 23010 12122
    22   196 28753  1333 14667    30  4345  3452    13    50   372  2745
   219     3   219 33022 30650  4729   992     1     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0] 

[31mTHIS IS THE GERMAN TRANSLATION: 
[0m Eine Vorbehandlung mit hohen Dosen von Diuretika kann bei Beginn der Therapie mit Karvea zu Flüssigkeitsmangel und zum Risiko eines übermäßigen Blutdruckabfalls führen (siehe Abschnitt 4.4).
 

[31mTHIS IS THE TOKENIZED VERSION OF THE GERMAN TRANSLATION: 
[0m [  478  8427 12305     5    39  2514 1

# NMT with Attention

## input encoder

In [12]:
def input_encoder_fn(input_vocab_size, d_model, n_encoder_layers):
    input_encoder = tl.Serial(
        # convert tokens to vectors
        tl.Embedding(input_vocab_size, d_model),

        # feed embedding to lstm
        [tl.LSTM(d_model) for _ in range(n_encoder_layers)]
    )
    return input_encoder

In [13]:
# test input_encoder_fn
w1_unittest.test_input_encoder_fn(input_encoder_fn)

[92m All tests passed


## Pre-attention decoder

In [14]:
def pre_attention_decoder_fn(mode, target_vocab_size, d_model):
    pre_attention_decoder = tl.Serial(
        tl.ShiftRight(mode=mode),
        tl.Embedding(target_vocab_size, d_model),
        tl.LSTM(d_model)
    )
    return pre_attention_decoder

In [15]:
w1_unittest.test_pre_attention_decoder_fn(pre_attention_decoder_fn)

[92m All tests passed


## Prepare attention input

In [16]:
def prepare_attention_input(encoder_activations, decoder_activations, inputs):
    keys = encoder_activations
    values = encoder_activations

    queries = decoder_activations

    mask = fastnp.where(inputs>0, 1, 0)

    mask = fastnp.reshape(mask, (mask.shape[0], 1, 1, mask.shape[1]))
    
    # broadcast so mask shape is [batch size, attention heads, decoder-len, encoder-len].
    # note: for this assignment, attention heads is set to 1.
    mask = mask + fastnp.zeros((1, 1, decoder_activations.shape[1], 1))
        
    
    return queries, keys, values, mask

In [17]:
w1_unittest.test_prepare_attention_input(prepare_attention_input)

[92m All tests passed


## NMTAttn

In [18]:
def NMTAttn(input_vocab_size=33300,
            target_vocab_size=33300,
            d_model=1024,
            n_encoder_layers=2,
            n_decoder_layers=2,
            n_attention_heads=4,
            attention_dropout=0.0,
            mode='train'):
    # Step 0
    input_encoder = input_encoder_fn(input_vocab_size, d_model, n_encoder_layers)
    pre_attention_decoder = pre_attention_decoder_fn(mode, target_vocab_size, d_model)

    # Step 1
    model = tl.Serial(
        # Step 2
        tl.Select([0, 1, 0, 1]),

        # Step 3
        tl.Parallel(input_encoder, pre_attention_decoder),

        # Step 4
        tl.Fn('PrepareAttentionInput', prepare_attention_input, n_out=4),

        # Step 5
        tl.Residual(tl.AttentionQKV(d_model, n_heads=n_attention_heads, dropout=attention_dropout, mode=mode)),

        # Step 6
        tl.Select([0, 2]),
        
        # Step 7
        [tl.LSTM(d_model) for _ in range(n_decoder_layers)],

        # Step 8
        tl.Dense(target_vocab_size),

        # Step 9
        tl.LogSoftmax()
    )
    return model

In [19]:
model = NMTAttn()
print(model)

Serial_in2_out2[
  Select[0,1,0,1]_in2_out4
  Parallel_in2_out2[
    Serial[
      Embedding_33300_1024
      LSTM_1024
      LSTM_1024
    ]
    Serial[
      Serial[
        ShiftRight(1)
      ]
      Embedding_33300_1024
      LSTM_1024
    ]
  ]
  PrepareAttentionInput_in3_out4
  Serial_in4_out2[
    Branch_in4_out3[
      None
      Serial_in4_out2[
        _in4_out4
        Serial_in4_out2[
          Parallel_in3_out3[
            Dense_1024
            Dense_1024
            Dense_1024
          ]
          PureAttention_in4_out2
          Dense_1024
        ]
        _in2_out2
      ]
    ]
    Add_in2
  ]
  Select[0,2]_in3_out2
  LSTM_1024
  LSTM_1024
  Dense_33300
  LogSoftmax
]


In [20]:
w1_unittest.test_NMTAttn(NMTAttn)

[92m All tests passed


# Training

In [21]:
def train_task_function(train_batch_stream):
    return training.TrainTask(
        labeled_data=train_batch_stream,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(0.01),
        lr_schedule=trax.lr.warmup_and_rsqrt_decay(1000, 0.01),
        n_steps_per_checkpoint=10
    )

In [22]:
train_task = train_task_function(train_batch_stream)

In [23]:
w1_unittest.test_train_task(train_task_function)

[92m All tests passed


## Eval

In [24]:
eval_task = training.EvalTask(
    
    ## use the eval batch stream as labeled data
    labeled_data=eval_batch_stream,
    
    ## use the cross entropy loss and accuracy as metrics
    metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
)

## Loop

In [25]:
# define the output directory
output_dir = './output_dir/'

# remove old model if it exists. restarts training.
!rm -f ./output_dir/model.pkl.gz  

# define the training loop
training_loop = training.Loop(NMTAttn(mode='train'),
                              train_task,
                              eval_tasks=[eval_task],
                              output_dir=output_dir)



In [26]:
training_loop.run(10)

  with gzip.GzipFile(fileobj=f, compresslevel=compresslevel) as gzipf:



Step      1: Total number of trainable weights: 148492820
Step      1: Ran 1 train steps in 41.89 secs
Step      1: train CrossEntropyLoss |  10.43066597


  with gzip_lib.GzipFile(fileobj=f, compresslevel=2) as gzipf:


Step      1: eval  CrossEntropyLoss |  10.43288612
Step      1: eval          Accuracy |  0.00000000

Step     10: Ran 9 train steps in 131.56 secs
Step     10: train CrossEntropyLoss |  10.26390457
Step     10: eval  CrossEntropyLoss |  9.96872807
Step     10: eval          Accuracy |  0.02429765


# Testing

In [29]:
model = NMTAttn(mode='eval')

model.init_from_file('./output_dir/model.pkl.gz', weights_only=True)
model = tl.Accelerate(model)

In [40]:
def next_symbol(NMTAttn, input_tokens, cur_output_tokens, temperature):
    token_length = len(cur_output_tokens)
    padded_length = np.power(2, int(np.ceil(np.log2(token_length+1))))
    padded = cur_output_tokens + [0]*(padded_length - token_length)
    padded_with_batch = np.expand_dims(padded, axis=0)

    output, _ = NMTAttn((input_tokens, padded_with_batch))
    
    # (Hint: choose correct indices on the output)
    log_probs = output[0, token_length, :]

    # get the next symbol by getting a logsoftmax sample (*hint: cast to an int)
    symbol = int(tl.logsoftmax_sample(log_probs, temperature))
    
    return symbol, float(log_probs[symbol])



In [41]:
w1_unittest.test_next_symbol(next_symbol, NMTAttn)

[92m All tests passed


In [44]:
def sampling_decode(input_sentence, NMTAttn = None, temperature=0.0, vocab_file=None, vocab_dir=None, next_symbol=next_symbol, tokenize=tokenize, detokenize=detokenize):
    """Returns the translated sentence.

    Args:
        input_sentence (str): sentence to translate.
        NMTAttn (tl.Serial): An LSTM sequence-to-sequence model with attention.
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file

    Returns:
        tuple: (list, str, float)
            list of int: tokenized version of the translated sentence
            float: log probability of the translated sentence
            str: the translated sentence
    """
    
    ### START CODE HERE ###
    
    # encode the input sentence
    input_tokens = tokenize(input_sentence, vocab_file, vocab_dir)
    
    # initialize an empty the list of output tokens
    cur_output_tokens = []
    
    # initialize an integer that represents the current output index
    cur_output = 0
    
    # Set the encoding of the "end of sentence" as 1
    EOS = 1
    
    # check that the current output is not the end of sentence token
    while cur_output_tokens != EOS:
        
        # update the current output token by getting the index of the next word (hint: use next_symbol)
        cur_output, log_prob = next_symbol(NMTAttn, input_tokens, cur_output_tokens, temperature)
        
        # append the current output token to the list of output tokens
        cur_output_tokens.append(cur_output)        
    
    # detokenize the output tokens
    sentence = detokenize(cur_output_tokens)
    
    ### END CODE HERE ###
    
    return cur_output_tokens, log_prob, sentence

In [43]:
sampling_decode("I love languages.", NMTAttn=model, temperature=0.0, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)

TypeError: join() argument must be str, bytes, or os.PathLike object, not 'NoneType'