In [1]:
# we use data from https://huggingface.co/datasets/cnn_dailymail

In [2]:
# If we download data using hugging face dataset

In [3]:
from datasets import load_dataset
# I just downloaded the smallest subset
docs = load_dataset('cnn_dailymail', '1.0.0', split='test')

In [4]:
# Get a train set and a val set from the downloaded data
docs_train_val = docs.train_test_split(train_size=0.05, test_size=0.01)

In [5]:
docs_train_val

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 574
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 115
    })
})

In [6]:
import keras

2023-11-30 11:29:50.872516: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-30 11:29:51.888258: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-30 11:29:51.888817: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
from transformers import AutoTokenizer
# Get the tokenizer - making words into numbers
tokenizer = AutoTokenizer.from_pretrained('t5-small')

In [8]:
# prefix is needed for T5
prefix = 'summarize:'

def tokenization(docs, max_article_length=1024, max_summary_length=64):

    articles = [prefix + doc for doc in docs['article']]

    tokenized_articles = tokenizer(articles, max_length=max_article_length, truncation=True)

    tokenized_summaries = tokenizer(docs['highlights'], max_length=max_summary_length, truncation=True)

    tokenized_articles['labels'] = tokenized_summaries['input_ids']

    return tokenized_articles

In [9]:
# tokenize the data
tokenized_docs = docs_train_val.map(tokenization, batched=True)

Map:   0%|          | 0/574 [00:00<?, ? examples/s]

Map:   0%|          | 0/115 [00:00<?, ? examples/s]

In [10]:
# keep only the tokens
tokens_docs = tokenized_docs.remove_columns(['article','highlights', 'id'])

In [11]:
tokens_docs

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 574
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 115
    })
})

In [12]:
from transformers import TFAutoModelForSeq2SeqLM
# get the model
model = TFAutoModelForSeq2SeqLM.from_pretrained('t5-small')

2023-11-30 11:30:16.048101: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [13]:
from transformers import DataCollatorForSeq2Seq
# get a collator for data padding
collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [14]:
# process data
train_data = model.prepare_tf_dataset(
    tokens_docs['train'],
    shuffle=True,
    batch_size=8,
    tokenizer= tokenizer,
    collate_fn=collator,
)

val_data = model.prepare_tf_dataset(
    tokens_docs['test'],
    shuffle=False,
    batch_size=8,
    tokenizer= tokenizer,
    collate_fn=collator,
)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [15]:
# Use Adam as the optimizer
optimizer = keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer)

In [16]:
model.fit(train_data, validation_data=val_data, epochs=2, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7f2fbc7a5710>

In [17]:
from transformers import pipeline

In [18]:
# model.save_weights('summarized_model')

In [19]:
pipe = pipeline("summarization", model=model, tokenizer=tokenizer, framework="tf")

In [20]:
# get 10 samples for evaluation
eval_articles = docs['article'][-10:]
eval_highlights = docs['highlights'][-10:]

In [21]:
import evaluate

rouge_score = evaluate.load("rouge")

In [22]:
generated_summaries = [pipe(article) for article in eval_articles]

Token indices sequence length is longer than the specified maximum sequence length for this model (1730 > 512). Running this sequence through the model will result in indexing errors
2023-11-30 12:03:12.417400: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x1a5da580 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2023-11-30 12:03:12.417443: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2023-11-30 12:03:12.956920: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-11-30 12:03:14.810094: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


In [23]:
len(generated_summaries)

10

In [24]:
generated_summaries = [k[0]['summary_text'] for k in generated_summaries]

In [25]:
from rouge_score import rouge_scorer

In [26]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
for i in range(10):
    scores = scorer.score(eval_highlights[i], generated_summaries[i])
    print('test case:', i+1)
    for key in scores:
        print(f'{key}: {scores[key]}')
    print('-'*50)

test case: 1
rouge1: Score(precision=0.3018867924528302, recall=0.17777777777777778, fmeasure=0.22377622377622378)
rouge2: Score(precision=0.057692307692307696, recall=0.033707865168539325, fmeasure=0.0425531914893617)
rougeL: Score(precision=0.16981132075471697, recall=0.1, fmeasure=0.1258741258741259)
--------------------------------------------------
test case: 2
rouge1: Score(precision=0.3333333333333333, recall=0.5882352941176471, fmeasure=0.42553191489361697)
rouge2: Score(precision=0.22033898305084745, recall=0.3939393939393939, fmeasure=0.2826086956521739)
rougeL: Score(precision=0.31666666666666665, recall=0.5588235294117647, fmeasure=0.4042553191489362)
--------------------------------------------------
test case: 3
rouge1: Score(precision=0.265625, recall=0.4358974358974359, fmeasure=0.3300970873786408)
rouge2: Score(precision=0.1111111111111111, recall=0.18421052631578946, fmeasure=0.13861386138613863)
rougeL: Score(precision=0.203125, recall=0.3333333333333333, fmeasure=0.