In [None]:
!pip install transformers==4.20.0
!pip install keras_nlp==0.3.0
!pip install datasets
!pip install huggingface-hub
!pip install nltk
!pip install rouge-score


In [2]:
import os
import logging

import nltk
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Only log error messages
tf.get_logger().setLevel(logging.ERROR)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
import keras_nlp

In [4]:
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

In [5]:
from datasets  import load_dataset

In [6]:
model_path = "t5-small"
data_path = "cnn_dailymail"



In [None]:
dataset = load_dataset(data_path, '3.0.0', split='train')

In [8]:
subset_dataset = dataset.select(range(300))

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
tokenizer(subset_dataset['article'][1], truncation=True ,return_tensors='tf')

In [12]:
final_dataset = subset_dataset.train_test_split(
    train_size=0.75, test_size=0.25
)

In [13]:
def tokenization(examples):
    inputs = [doc for doc in examples["article"]]
    targets = [summary for summary in examples["highlights"]]
    model_inputs = tokenizer(inputs, max_length=400, truncation=True, padding='max_length')

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=100, truncation=True, padding='max_length')

    model_inputs['labels'] = labels['input_ids']
    return model_inputs


In [None]:
tokenized_dataset = final_dataset.map(tokenization, batched=True)

In [15]:
processed_data = tokenized_dataset.remove_columns(['article','highlights', 'id'])

In [None]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_path)

In [17]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model= model, return_tensors="tf")

In [None]:
train_dataset = model.prepare_tf_dataset(processed_data['train'], batch_size=16, tokenizer= tokenizer, collate_fn=data_collator, shuffle=True, drop_remainder=True)

In [19]:
test_dataset = model.prepare_tf_dataset(processed_data['test'], batch_size=16, tokenizer= tokenizer, collate_fn=data_collator, shuffle=False, drop_remainder=True)

In [20]:
optimizer = keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [21]:
rouge_l = keras_nlp.metrics.RougeL()

In [57]:
results_storage = []
def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_l(decoded_labels, decoded_predictions)
    result = {"RougeL": result["f1_score"]}
    results_storage.append(result)
    return result

In [58]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=test_dataset)


In [59]:
callbacks = [metric_callback]

In [60]:
model.evaluate(test_dataset, callbacks=[metric_callback])



[8.90249252319336, 0.3764062523841858]

In [36]:
#model.fit(train_dataset, validation_data=test_dataset, epochs=1, verbose=True)

In [33]:
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


In [34]:
metrics = ['accuracy']
model.compile(optimizer=optimizer, loss=loss_function, metrics=metrics, run_eagerly=True)

In [35]:
model.save_weights('summarized_model')

In [None]:
!pip install rouge-score

In [None]:
from transformers import pipeline

In [None]:
pipe = pipeline("summarization", model=model, tokenizer=tokenizer, framework="tf")

In [None]:
subset_dataset[12]

{'article': 'BREMEN, Germany -- Carlos Alberto, who scored in FC Porto\'s Champions League final victory against Monaco in 2004, has joined Bundesliga club Werder Bremen for a club record fee of  7.8 million euros ($10.7 million). Carlos Alberto enjoyed success at FC Porto under Jose Mourinho. "I\'m here to win titles with Werder," the 22-year-old said after his first training session with his new club. "I like Bremen and would only have wanted to come here." Carlos Alberto started his career with Fluminense, and helped them to lift the Campeonato Carioca in 2002. In January 2004 he moved on to FC Porto, who were coached by José Mourinho, and the club won the Portuguese title as well as the Champions League. Early in 2005, he moved to Corinthians, where he impressed as they won the Brasileirão,but in 2006 Corinthians had a poor season and Carlos Alberto found himself at odds with manager, Emerson Leão. Their poor relationship came to a climax at a Copa Sul-Americana game against Club A

In [None]:
pipe(subset_dataset['article'][12])

[{'summary_text': "Carlos Alberto joins Werder Bremen for a club record fee of 7.8 million euros ($10.7 million) the 22-year-old scored in FC Porto's champions league final victory against Monaco in 2004 . he started his career with fluminense and helped them lift the Campeonato Carioca in 2002 ."}]

In [None]:
pipe("""WASHINGTON -LRB- CNN -RRB- -- The U.S. Navy arrested nine more suspected pirates off the coast of Somalia Thursday -- the second capture in two days -- after receiving a distress call from an Indian-flagged commercial ship . Suspected pirates are arrested in the Gulf of Aden . According to the Navy announcement , at 4 a.m. local time the Indian-flagged Premdivya sent a distress call to all ships in the area reporting that she had been fired upon by a small skiff , and suspected pirates were attempting to board it . A U.S. Navy helicopter crew was launched from the USS Vella Gulf and fired two warning shots at the small boat to get them to stop . A Navy boarding team was then launched to investigate the skiff 's crew and found rocket-propelled grenades and other weapons on board the small craft , according to Navy officials . The suspected pirates were taken aboard the USS Vella Gulf and processed . They 'll be moved to a temporary holding facility aboard the larger USNS Lewis and Clark , according to the statement . The Navy is now holding a total of 16 suspected pirates while the U.S. and Kenyan governments work out legal details on how the suspects will be moved to Kenya for prosecution . Last month , the United States and Kenya signed an agreement saying that suspected pirates captured by U.S. ships will be moved to Kenya to be tried for their crimes . The capture Wednesday of seven suspected pirates marks the first time the United States was able to capture and hold pirates since its forces began patrolling the dangerous waters off Somalia . Piracy has become a chronic problem off the Horn of Africa in recent years , with some pirates operating from largely lawless Somalia . Pirates attacked nearly 100 vessels and hijacked as many as 40 in the waters off the coast of Somalia in 2008 , according to the International Maritime Bureau . The task force led by the Vella Gulf was set up in January in an effort to clamp down on the attacks in the region , the southern approach to the Red Sea and the Suez Canal .""")


[{'summary_text': 'nine more suspected pirates are arrested in the Gulf of Aden . a distress call from an Indian-flagged commercial ship was sent to all ships in the area . the navy is holding a total of 16 suspects while the united states and Kenya work out legal details .'}]