In [1]:
from datasets import load_dataset, load_metric
from transformers import pipeline

# Load version 3.0.0 of the CNN/DailyMail dataset (reused from the local cache when present).
raw_dataset = load_dataset('cnn_dailymail', '3.0.0')

# Summarization pipeline with no explicit model; the log output of this cell
# reports that it defaulted to t5-small.
# NOTE(review): despite the name, `classifier` holds a summarization pipeline — consider renaming.
classifier = pipeline("summarization")

  from .autonotebook import tqdm as notebook_tqdm
Reusing dataset cnn_dailymail (/Users/jeroen/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 161.67it/s]
No model was supplied, defaulted to t5-small (https://huggingface.co/t5-small)
2022-04-01 16:46:45.371865: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [2]:
def show_examples(dataset, num_samples=3, seed=42):
    """Print a few shuffled training examples for a quick visual check.

    Args:
        dataset: Mapping with a "train" split that supports
            ``.shuffle(seed=...)`` and ``.select(indices)``; every selected
            row must expose the "article", "highlights" and "id" keys.
        num_samples: Number of examples to show.
        seed: Seed passed to ``shuffle`` so the same examples are shown on
            every run.
    """
    samples = dataset["train"].shuffle(seed=seed).select(range(num_samples))

    for idx, sample in enumerate(samples):
        # print() instead of display(): display() of a plain str renders its
        # repr, so the text showed up quoted with literal "\n" escapes rather
        # than real line breaks.
        print(f'sample {idx}: {sample["article"]} \n')
        print(f'highlight {idx}: {sample["highlights"]} \n')
        print(f'id: {sample["id"]}')
        print('-------')
        
def get_samples(dataset, num_samples=10):
    """Return the first ``num_samples`` rows of the train split after a fixed-seed shuffle.

    The shuffle seed is always 1, so repeated calls yield the same selection.
    """
    shuffled_train = dataset["train"].shuffle(seed=1)
    leading_indices = range(num_samples)
    return shuffled_train.select(leading_indices)

def get_random_sample(dataset, seed=1):
    """Return one ``[article, highlights]`` pair from the train split.

    Args:
        dataset: Mapping with a "train" split that supports
            ``.shuffle(seed=...)`` and ``.select(indices)``.
        seed: Shuffle seed. The default of 1 reproduces the previously
            hard-coded behaviour; pass a different value to get a different
            sample (with a fixed seed the "random" sample never changes).

    Returns:
        A two-element list: the article text and its highlights.
    """
    sample = dataset["train"].shuffle(seed=seed).select(range(1))
    return [sample["article"][0], sample["highlights"][0]]

In [3]:
# Show a single training example, picked with shuffle seed 23.
show_examples(raw_dataset, num_samples=1, seed=23)
# get_random_sample(raw_dataset)

Loading cached shuffled indices for dataset at /Users/jeroen/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234/cache-17d3e0cea13717ef.arrow




'highlight 0: Transportation safety board beginning four days of hearings .\nBoard examines reported "drastic increase" in accidents and deaths .\nNine air ambulance crashes killed 35 people during one-year period .\nBoard\'s 2006 safety recommendations not fully implemented, it says . \n'

'id: 4929e54ae3f6711b4bd8da27a46d0f8a90c3b3bf'

'-------'

### Model description
BART is a transformer encoder-decoder (seq2seq) model with a bidirectional (BERT-like) encoder and an autoregressive (GPT-like) decoder. BART is pre-trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text.

BART is particularly effective when fine-tuned for text generation (e.g. summarization, translation) but also works well for comprehension tasks (e.g. text classification, question answering).

In [4]:
from transformers import AutoTokenizer

# T5 is a text-to-text transformer from Google:
# https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html
# "t5-small" is the checkpoint name used to load both the tokenizer and the model in this notebook.
pretrained_model_name = "t5-small"

# Tokenizer matching the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

In [5]:
# Pull one (article, highlight) pair and inspect how the tokenizer splits the article.
# The tokenizer warning in this cell's output reports 798 tokens against a
# 512-token maximum for this model.
article, highlight = get_random_sample(raw_dataset)
print(tokenizer.tokenize(article))

Loading cached shuffled indices for dataset at /Users/jeroen/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234/cache-d65aa25a11484207.arrow
Token indices sequence length is longer than the specified maximum sequence length for this model (798 > 512). Running this sequence through the model will result in indexing errors




In [6]:
# ROUGE metric object; metric_fn calls metric.compute(...) on decoded predictions and references.
metric = load_metric("rouge")
# Bare expression: displays the metric's description and usage text as the cell output.
metric

Metric(name: "rouge", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}, usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each predictions
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLSum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/datasets/issues/617
    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
    use_agregator: Return aggregates if this is set to True
Retu

In [7]:
# Truncation limits (in tokens) applied by preprocess_function.
# NOTE(review): the tokenizer warning printed earlier reports a 512-token
# maximum for this model — confirm that 1024 is intended for the inputs.
max_input_length = 1024
max_target_length = 128

# Task prefix for the summarization inputs.
prefix = "summarize: "

def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: Batch dict with "article" and "highlights" entries, each a
            list of strings (the shape produced by ``map(..., batched=True)``).

    Returns:
        The tokenizer output for the prefixed articles, with an added
        "labels" entry holding the token ids of the highlights.
    """
    # Prepend the task prefix so the model is told which task to perform.
    inputs = [prefix + doc for doc in examples["article"]]

    # Tokenize the *prefixed* inputs. Previously the raw articles were passed
    # here, so the prefix built above was silently discarded.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

Use 🤗 Datasets map function to apply the preprocessing function over the entire dataset. You can speed up the map function by setting batched=True to process multiple elements of the dataset at once:

In [9]:
# Apply preprocess_function in batches; the tokenized splits gain the
# input_ids, attention_mask and labels columns next to the original text columns.
tokenized_cnn_dailymail = raw_dataset.map(preprocess_function, batched=True)

100%|█████████████████████████████████████████| 288/288 [02:51<00:00,  1.68ba/s]
100%|███████████████████████████████████████████| 14/14 [00:07<00:00,  1.81ba/s]
100%|███████████████████████████████████████████| 12/12 [00:06<00:00,  1.80ba/s]


In [10]:
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# Load the TensorFlow seq2seq model from the same checkpoint name as the tokenizer.
model = TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name)
# Bare expression: shows the model object's repr as the cell output.
model

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


<transformers.models.t5.modeling_tf_t5.TFT5ForConditionalGeneration at 0x107fa6fb0>

In [23]:
# Training hyperparameters.
batch_size = 8  # examples per batch when building the tf.data datasets
learning_rate = 2e-5  # passed to the AdamWeightDecay optimizer
weight_decay = 0.01  # passed to the optimizer as weight_decay_rate
num_train_epochs = 1  # number of training epochs


In [24]:
# Collator passed as collate_fn to to_tf_dataset; configured to return TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")
# Bare expression: inspect the tokenized training split (features and row count).
tokenized_cnn_dailymail["train"]

Dataset({
    features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 287113
})

In [29]:
# Columns handed to the model; the raw text columns are left out.
model_columns = ["input_ids", "attention_mask", "labels"]

# Training batches, reshuffled by to_tf_dataset.
train_dataset = tokenized_cnn_dailymail["train"].to_tf_dataset(
    batch_size=batch_size,
    columns=model_columns,
    shuffle=True,
    collate_fn=data_collator,
)

# Full validation split, in its original order. Uses the shared `batch_size`
# setting instead of a second hard-coded 8 so the two cannot drift apart.
validation_dataset = tokenized_cnn_dailymail["validation"].to_tf_dataset(
    batch_size=batch_size,
    columns=model_columns,
    shuffle=False,
    collate_fn=data_collator,
)

# 200-example subset of the validation split used for generation-based (ROUGE)
# evaluation. The shuffle is seeded so the same subset is scored on every run;
# an unseeded shuffle made the reported metrics incomparable between runs.
generation_dataset = (
    tokenized_cnn_dailymail["validation"]
    .shuffle(seed=42)
    .select(range(200))
    .to_tf_dataset(
        batch_size=batch_size,
        columns=model_columns,
        shuffle=False,
        collate_fn=data_collator,
    )
)

In [30]:
from transformers import AdamWeightDecay
import tensorflow as tf

# AdamWeightDecay optimizer configured from the learning_rate and weight_decay settings.
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
# No loss is passed: per the message printed by this cell, the model's internal
# loss computation is used, which requires the labels to be part of the input dict.
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as keys in the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


In [20]:
import numpy as np
import nltk


def metric_fn(eval_predictions):
    """Score generated summaries with ROUGE and report the mean generated length.

    Args:
        eval_predictions: A ``(predictions, labels)`` pair of token-id
            sequences. Negative entries in ``labels`` are overwritten in place
            with the tokenizer's pad token id before decoding.

    Returns:
        Dict mapping every ROUGE key returned by ``metric.compute`` to its mid
        F-measure scaled by 100, plus "gen_len": the mean count of non-pad
        tokens per prediction.
    """
    predictions, labels = eval_predictions

    def _one_sentence_per_line(texts):
        # Rouge expects a newline after each sentence.
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    pred_texts = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Masked label positions are negative; swap them (in place) for the pad
    # token so that they can be decoded.
    for row in labels:
        row[row < 0] = tokenizer.pad_token_id
    label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    pred_texts = _one_sentence_per_line(pred_texts)
    label_texts = _one_sentence_per_line(label_texts)

    rouge_scores = metric.compute(
        predictions=pred_texts, references=label_texts, use_stemmer=True
    )

    # Keep only the mid F-measure of every score, as a percentage.
    result = {}
    for name, aggregate in rouge_scores.items():
        result[name] = aggregate.mid.fmeasure * 100

    # Mean generated length, ignoring padding.
    generated_lengths = []
    for pred in predictions:
        generated_lengths.append(np.count_nonzero(pred != tokenizer.pad_token_id))
    result["gen_len"] = np.mean(generated_lengths)

    return result

In [None]:
from transformers.keras_callbacks import PushToHubCallback, KerasMetricCallback
from tensorflow.keras.callbacks import TensorBoard

# Write TensorBoard logs next to the (optional) saved model.
tensorboard_callback = TensorBoard(log_dir="./summarization_model_save/logs")

# Optional: push checkpoints to the Hugging Face Hub. Left disabled;
# `push_to_hub_model_id` must be defined before enabling it.
# push_to_hub_callback = PushToHubCallback(
#     output_dir="./summarization_model_save",
#     tokenizer=tokenizer,
#     hub_model_id=push_to_hub_model_id,
# )

# Generates summaries for generation_dataset and passes the predictions and
# labels to metric_fn.
metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=generation_dataset, predict_with_generate=True
)

callbacks = [metric_callback, tensorboard_callback]

# Use the num_train_epochs setting from the hyperparameter cell rather than a
# second hard-coded value, so changing the setting actually changes training.
model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=num_train_epochs,
    callbacks=callbacks,
)



    3/35889 [..............................] - ETA: 81:00:31 - loss: 2.3241