In [1]:
# 1 importing libs

import sys
import os

import numpy as np

import textwrap
wrapper = textwrap.TextWrapper(width=70)

import trax
from trax import layers as tl
from trax.fastmath import numpy as jnp
import tensorflow_datasets as tfds

# to print the entire np array
np.set_printoptions(threshold=sys.maxsize)

In [2]:
train_stream_fn = trax.data.TFDS('cnn_dailymail',
                                 data_dir='data/',
                                 keys=('article', 'highlights'),
                                 train=True)

# This should be much faster as the data is downloaded already.
eval_stream_fn = trax.data.TFDS('cnn_dailymail',
                                data_dir='data/',
                                keys=('article', 'highlights'),
                                train=False)

In [3]:
#vocabulary to perform the word to index

def tokenize(input_str, EOS=1):
    """Input str to features dict, ready for inference"""
    # Use the trax.data.tokenize method. It takes streams and returns streams,
    # we get around it by making a 1-element stream with `iter`.
    inputs =  next(trax.data.tokenize(iter([input_str]),
                                      vocab_dir='vocab_dir/',
                                      vocab_file='summarize32k.subword.subwords'))
    # Mark the end of the sentence with EOS
    return list(inputs) + [EOS]

def detokenize(integers):
    """List of ints to str"""
    s = trax.data.detokenize(integers,
                             vocab_dir='vocab_dir/',
                             vocab_file='summarize32k.subword.subwords')
    return wrapper.fill(s)

In [4]:
# Special tokens
SEP = 0 # Padding or separator token
EOS = 1 # End of sentence token

# Concatenate tokenized inputs and targets using 0 as separator.
def preprocess(stream):
    for (article, summary) in stream:
        joint = np.array(list(article) + [EOS, SEP] + list(summary) + [EOS])
        mask = [0] * (len(list(article)) + 2) + [1] * (len(list(summary)) + 1) # Accounting for EOS and SEP
        yield joint, joint, np.array(mask)

# You can combine a few data preprocessing steps into a pipeline like this.
input_pipeline = trax.data.Serial(
    # Tokenizes
    trax.data.Tokenize(vocab_dir='vocab_dir/',
                       vocab_file='summarize32k.subword.subwords'),
    # Uses function defined above
    preprocess,
    # Filters out examples longer than 4096
    trax.data.FilterByLength(4096)
)

# Apply preprocessing to data streams.
train_stream = input_pipeline(train_stream_fn())
eval_stream = input_pipeline(eval_stream_fn())

train_input, train_target, train_mask = next(train_stream)

assert sum((train_input - train_target)**2) == 0  # They are the same in Language Model (LM).

In [5]:
# 4 of length < 512. And so on. 
boundaries =  [128, 256,  512, 1024]
batch_sizes = [16,  8,  4,  2,  1]

# Create the streams.
train_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes)(train_stream)

eval_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes)(eval_stream)

In [6]:
detokenize(next(eval_batch_stream)[0][0])

"A middle-school teacher in China has inked hundreds of sketches that\nare beyond be-leaf. Politics teacher Wang Lian, 35,  has created 1000\nstunning ink drawings covering subjects as varied as cartoon\ncharacters and landscapes to animals, birds according to\xa0the\xa0People's\nDaily Online. The intricate scribbles on leaves feature Wang's\nfavourite sites across the city of Nanjing, which include the\nPresidential Palace, Yangtze River Bridge, the ancient Jiming Temple\nand the Qinhuai River. Natural canvas: Artist and teacher Wang Lian\nhas done hundreds of drawings, like this temple, on leaves she\ncollects in the park and on the streets . Delicate: She uses an ink\npen to gently draw the local scenes and buildings on the dried out\nleaves . 'Although teaching politics is my job, drawing is my passion\nand hobby,' said Wang. 'I first tried drawing on leaves about 10 years\nago and fell in love with it as an art form immediately. 'It's like\ndrawing on very old parchment paper, you

In [7]:
TransformerLM = trax.models.TransformerLM(
    vocab_size=33300,
    d_model=512,
    d_ff=2048,
    n_layers=6,
    n_heads=8,
    dropout=0.1,
    max_len=4096,
    mode='train',
    ff_activation=tl.Relu)

In [8]:
from trax.supervised import training

model_path = "/home/yuguro/Desktop/personal/coursera/text_summarizer/model/"
# UNQ_C8
# GRADED FUNCTION: train_model
def training_loop(TransformerLM, train_gen, eval_gen, output_dir=model_path):
    '''
    Input:
        TransformerLM (trax.layers.combinators.Serial): The model you are building.
        train_gen (generator): Training stream of data.
        eval_gen (generator): Evaluation stream of data.
        output_dir (str): folder to save your file.
        
    Returns:
        trax.supervised.training.Loop: Training loop.
    '''
    output_dir = os.path.expanduser(output_dir)  # trainer is an object
    lr_schedule = trax.lr.warmup_and_rsqrt_decay(n_warmup_steps=1000, max_value=0.01)

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    train_task = training.TrainTask( 
      labeled_data=train_gen, # The training generator
      loss_layer=tl.CrossEntropyLoss(), # Loss function 
      optimizer=trax.optimizers.Adam(0.01), # Optimizer (Don't forget to set LR to 0.01)
      lr_schedule=lr_schedule,
      n_steps_per_checkpoint=5
    )

    eval_task = training.EvalTask( 
      labeled_data=eval_gen, # The evaluation generator
      metrics=[tl.CrossEntropyLoss(), tl.Accuracy()] # CrossEntropyLoss and Accuracy
    )

    ### END CODE HERE ###

    loop = training.Loop(TransformerLM,
                         train_task,
                         eval_tasks=[eval_task],
                         output_dir=output_dir)
    
    return loop

In [9]:
# Should take around 1.5 minutes
# !rm -f model/model.pkl.gz

loop = training_loop(TransformerLM, train_batch_stream, eval_batch_stream)
loop.load_checkpoint('teste_20_04')
loop.run(150)
loop.save_checkpoint('teste_20_04')


Step    475: Ran 5 train steps in 105.68 secs
Step    475: train CrossEntropyLoss | -3691.59619141
Step    475: eval  CrossEntropyLoss | -3874.77514648
Step    475: eval          Accuracy |  0.04255319

Step    480: Ran 5 train steps in 110.42 secs
Step    480: train CrossEntropyLoss | -4202.98583984
Step    480: eval  CrossEntropyLoss | -4117.95312500
Step    480: eval          Accuracy |  0.03773585

Step    485: Ran 5 train steps in 134.92 secs
Step    485: train CrossEntropyLoss | -4287.93652344
Step    485: eval  CrossEntropyLoss | -3691.33544922
Step    485: eval          Accuracy |  0.03947368

Step    490: Ran 5 train steps in 119.40 secs
Step    490: train CrossEntropyLoss | -4577.04589844
Step    490: eval  CrossEntropyLoss | -4577.70507812
Step    490: eval          Accuracy |  0.02643172

Step    495: Ran 5 train steps in 143.92 secs
Step    495: train CrossEntropyLoss | -4723.82666016
Step    495: eval  CrossEntropyLoss | -4314.51318359
Step    495: eval          Accuracy

In [3]:
# Get the model architecture
model = trax.models.TransformerLM(
    vocab_size=33300,
    d_model=512,
    d_ff=2048,
    n_layers=6,
    n_heads=8,
    dropout=0.1,
    max_len=4096,
    mode='eval',
    ff_activation=tl.Relu)

# model = TransformerLM(mode='eval')

# Load the pre-trained weights
model.init_from_file('model/model.pkl.gz', weights_only=True)



In [4]:
# UNQ_C9
def next_symbol(cur_output_tokens, model):
    """Returns the next symbol for a given sentence.

    Args:
        cur_output_tokens (list): tokenized sentence with EOS and PAD tokens at the end.
        model (trax.layers.combinators.Serial): The transformer model.

    Returns:
        int: tokenized symbol.
    """
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    
    # current output tokens length
    token_length = len(cur_output_tokens)
    # calculate the minimum power of 2 big enough to store token_length
    # HINT: use np.ceil() and np.log2()
    # add 1 to token_length so np.log2() doesn't receive 0 when token_length is 0
    padded_length = 2**int(np.ceil(np.log2(token_length + 1)))

    # Fill cur_output_tokens with 0's until it reaches padded_length
    padded = cur_output_tokens + [0] * (padded_length - token_length)
    padded_with_batch = np.array(padded)[None, :] # Don't replace this 'None'! This is a way of setting the batch dim

    # model expects a tuple containing two padded tensors (with batch)
    output, _ = model((padded_with_batch, padded_with_batch)) 
    # HINT: output has shape (1, padded_length, vocab_size)
    # To get log_probs you need to index output with 0 in the first dim
    # token_length in the second dim and all of the entries for the last dim.
    log_probs = output[0, token_length, :]
    
    ### END CODE HERE ###
    
    return int(np.argmax(log_probs))

In [5]:
# UNQ_C10
# Decoding functions.
def greedy_decode(input_sentence, model):
    """Greedy decode function.

    Args:
        input_sentence (string): a sentence or article.
        model (trax.layers.combinators.Serial): Transformer model.

    Returns:
        string: summary of the input.
    """
    
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # Use tokenize()
    cur_output_tokens = tokenize(input_sentence) + [0]
    generated_output = [] 
    cur_output = 0 
    EOS = 1 
    
    while cur_output != EOS:
        # Get next symbol
        cur_output = next_symbol(cur_output_tokens, model)
        # Append next symbol to original sentence
        cur_output_tokens.append(cur_output)
        # Append next symbol to generated sentence
        generated_output.append(cur_output)
        print(detokenize(generated_output))
    
    ### END CODE HERE ###
    
    return detokenize(generated_output)

In [None]:
# Test it out on a sentence!
test_sentence = "It was a sunny day when I went to the market to buy some flowers. But I only found roses, not tulips."
print(wrapper.fill(test_sentence), '\n')
print(greedy_decode(test_sentence, model))

In [None]:
# Test it out!
sentence_test_nxt_symbl = "I want to fly in the sky tomorrow and it'll be fun."
detokenize([next_symbol(tokenize(sentence_test_nxt_symbl)+[0], model)])

In [9]:
    
    sentence = "I want to fly in the sky tomorrow and it'll be fun."
    cur_output_tokens = tokenize(sentence)+[0]
    # cur_output_tokens.append(1628)
    token_length = len(cur_output_tokens)
    padded_length = 2**int(np.ceil(np.log2(token_length + 1)))

    padded = cur_output_tokens + [0] * (padded_length - token_length)
    padded_with_batch = np.array(padded)[None, :] # Don't replace this 'None'! This is a way of setting the batch dim

    output, _ = model((padded_with_batch, padded_with_batch))
    log_probs = output[0, token_length, :]

In [13]:
detokenize([int(np.argmax(log_probs))])

'10'