In [1]:
from datasets import load_dataset, load_metric
from transformers import pipeline

raw_dataset = load_dataset('cnn_dailymail', '3.0.0')

classifier = pipeline("summarization")

  from .autonotebook import tqdm as notebook_tqdm
Reusing dataset cnn_dailymail (/Users/jeroen/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 100.20it/s]
No model was supplied, defaulted to t5-small (https://huggingface.co/t5-small)
Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.17k/1.17k [00:00<00:00, 161kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [2]:
def show_examples(dataset, num_samples=3, seed=42):
    samples = dataset["train"].shuffle(seed=seed).select(range(num_samples))
        
    for idx, sample in enumerate(samples):
        display(f'sample {idx}: {sample["article"]} \n')
        display(f'highlight {idx}: {sample["highlights"]} \n')
        display(f'id: {sample["id"]}')
        display('-------')
        
def get_samples(dataset, num_samples=10):
    return dataset["train"].shuffle(seed=1).select(range(num_samples))

def get_random_sample(dataset):
    sample = dataset["train"].shuffle(seed=1).select(range(1)) 
    return [sample["article"][0], sample["highlights"][0]]

In [3]:
show_examples(raw_dataset, 1, 23)
# get_random_sample(raw_dataset)

Loading cached shuffled indices for dataset at /Users/jeroen/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234/cache-17d3e0cea13717ef.arrow




'highlight 0: Transportation safety board beginning four days of hearings .\nBoard examines reported "drastic increase" in accidents and deaths .\nNine air ambulance crashes killed 35 people during one-year period .\nBoard\'s 2006 safety recommendations not fully implemented, it says . \n'

'id: 4929e54ae3f6711b4bd8da27a46d0f8a90c3b3bf'

'-------'

### Model description
BART is a transformer encoder-encoder (seq2seq) model with a bidirectional (BERT-like) encoder and an autoregressive (GPT-like) decoder. BART is pre-trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text.

BART is particularly effective when fine-tuned for text generation (e.g. summarization, translation) but also works well for comprehension tasks (e.g. text classification, question answering).

In [13]:
from transformers import AutoTokenizer

# A text-to-text transformer from Google
# https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html
pretrained_model_name = "t5-small"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

In [5]:
article, highlight = get_random_sample(raw_dataset)
print(tokenizer.tokenize(article))

Loading cached shuffled indices for dataset at /Users/jeroen/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234/cache-d65aa25a11484207.arrow
Token indices sequence length is longer than the specified maximum sequence length for this model (798 > 512). Running this sequence through the model will result in indexing errors




In [6]:
metric = load_metric("rouge")
metric

Metric(name: "rouge", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}, usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each predictions
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLSum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/datasets/issues/617
    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
    use_agregator: Return aggregates if this is set to True
Retu

In [11]:
max_input_length = 1024
max_target_length = 128

prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["article"]]
    
    model_inputs = tokenizer(examples["article"], max_length=max_input_length, truncation=True)
    
    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

Use 🤗 Datasets map function to apply the preprocessing function over the entire dataset. You can speed up the map function by setting batched=True to process multiple elements of the dataset at once:

In [10]:
tokenized_cnn_dailymail = billsum.map(preprocess_function, batched=True)

Loading cached shuffled indices for dataset at /Users/jeroen/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234/cache-d65aa25a11484207.arrow


{'input_ids': [[37, 3062, 13, 3370, 9503, 21212, 216, 29, 16502, 65, 11518, 13, 160, 27635, 44, 8, 3141, 22, 7, 3338, 12, 16363, 8602, 12177, 7, 3, 104, 72, 145, 3, 9, 5112, 227, 160, 2039, 8151, 14761, 4513, 10219, 32, 7, 76, 4985, 24, 2237, 12, 160, 1687, 5, 3128, 159, 15, 216, 29, 16502, 3977, 16, 2464, 9742, 6426, 45, 46, 7952, 255, 4682, 383, 3730, 12, 15526, 160, 9883, 5, 451, 141, 118, 16, 11, 91, 13, 2833, 21, 2391, 203, 227, 3, 9, 2343, 12177, 5427, 1462, 26, 160, 3, 27169, 4169, 648, 16, 3, 9, 3860, 10219, 32, 7, 76, 4985, 2986, 16, 4407, 5, 389, 1744, 1273, 10, 21212, 11, 3128, 159, 15, 216, 29, 16502, 28, 391, 88, 9, 2565, 11, 160, 4284, 19454, 274, 3128, 159, 15, 31, 7, 2986, 3, 5, 461, 9545, 3392, 10, 391, 88, 9, 2565, 216, 29, 16502, 6, 3, 22665, 28, 160, 3062, 2473, 6, 3547, 19, 2066, 53, 21, 394, 4750, 227, 160, 2039, 3128, 159, 15, 3977, 13, 3, 9, 14761, 4513, 10219, 32, 7, 76, 4985, 2986, 3, 5, 852, 160, 3062, 391, 88, 9, 2565, 216, 29, 16502, 19, 3, 14138, 3, 9, 126

In [14]:
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

model = TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name)
model

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


<transformers.models.t5.modeling_tf_t5.TFT5ForConditionalGeneration at 0x2fa2ce110>

In [None]:
batch_size = 8
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 1

model_name = model_checkpoint.split("/")[-1]
push_to_hub_model_id = f"{model_name}-finetuned-xsum"