In [35]:
!pip install datasets



In [36]:
!pip install transformers==4.20.0
!pip install keras_nlp==0.3.0
!pip install huggingface-hub
!pip install nltk
!pip install rouge-score



## Content-Text-Summarizer

In [37]:
from datasets import load_dataset
dataset = load_dataset("cnn_dailymail", "3.0.0","train")

In [38]:
print("dataset")
dataset


dataset


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [39]:
import logging
import os
import nltk
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Only log error messages
tf.get_logger().setLevel(logging.ERROR)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [40]:
import keras_nlp
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

In [41]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")
max_input_length = 512
max_target_length = 30

print(dataset)
def preprocess_function(data):
    model_inputs = tokenizer(
        data["article"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        data["highlights"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [44]:
import evaluate

rouge_score = evaluate.load("rouge")

In [45]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [46]:
from nltk.tokenize import sent_tokenize


def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])



In [47]:
def evaluate_baseline(dataset, metric):
    summaries = [three_sentence_summary(text) for text in dataset["article"]]
    return metric.compute(predictions=summaries, references=dataset["highlights"])

In [48]:
import pandas as pd

score = evaluate_baseline(dataset["validation"], rouge_score)
score
# rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# rouge_dict = dict((rn, round(score[rn].mid.fmeasure * 100, 2)) for rn in rouge_names)
# rouge_dict

{'rouge1': 0.3949384941068029,
 'rouge2': 0.17561523704991952,
 'rougeL': 0.2494699777830668,
 'rougeLsum': 0.361246973619621}

In [49]:
from transformers import TFAutoModelForSeq2SeqLM

In [50]:
model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [51]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [52]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf",)

In [53]:
tokenized_datasets = tokenized_datasets.remove_columns(
    dataset["train"].column_names
)

In [54]:
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)

{'input_ids': <tf.Tensor: shape=(2, 512), dtype=int32, numpy=
array([[  301, 24796,  4170, ..., 16575,   976,     1],
       [11953,    31,     7, ...,    31,     7,     1]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int32)>, 'labels': <tf.Tensor: shape=(2, 30), dtype=int32, numpy=
array([[ 8929, 16023,  2213,  4173,  6324, 12591,    15,  2347,  3996,
         1755,   329, 13462,    38,     3,    88,  5050,   507,  2089,
            3,     5,  5209,  7556,   845,     3,    88,    65,   150,
         1390,    12,     1],
       [17054,   120,     3,  1092,    16, 11171,    16,  8327,    33,
          629,    26,    30,     8,    96,  1161,  7483,  1501,   121,
        12330,  9316,   312,    99,   348,   845,   167,    33,   132,
           38,     3,     1]], dtype=int32)>, 'decoder_input_ids': <tf.Tensor: shape=(2, 30), dtype=int32, numpy=
array([[    0,  8929, 16023,  2213,  

In [74]:
!pip install pyarrow
!pip install pandas-gbq==0.19.2
!pip install --upgrade pyarrow

Collecting pandas-gbq==0.19.2
  Downloading pandas_gbq-0.19.2-py2.py3-none-any.whl (25 kB)
Installing collected packages: pandas-gbq
  Attempting uninstall: pandas-gbq
    Found existing installation: pandas-gbq 0.17.9
    Uninstalling pandas-gbq-0.17.9:
      Successfully uninstalled pandas-gbq-0.17.9
Successfully installed pandas-gbq-0.19.2


In [80]:
tokenized_datasets["train"]
df = tokenized_datasets["train"].to_pandas()
rows_with_none_values = df.isnull().any(axis=1)

# Drop rows with `None` values
df_dropped = df[~rows_with_none_values]
from datasets import Dataset

# Convert the filtered DataFrame back to a Dataset
filtered_dataset = Dataset.from_pandas(df_dropped)


In [82]:
train_dataset = model.prepare_tf_dataset(tokenized_datasets['train'], batch_size=32, tokenizer= tokenizer, collate_fn=data_collator, shuffle=True, drop_remainder=True)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [83]:
test_dataset = model.prepare_tf_dataset(tokenized_datasets['test'], batch_size=32, tokenizer= tokenizer, collate_fn=data_collator, shuffle=False, drop_remainder=True)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [84]:
validation_dataset= model.prepare_tf_dataset(tokenized_datasets['validation'], batch_size=32, tokenizer= tokenizer, collate_fn=data_collator, shuffle=False, drop_remainder=True)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [85]:
from transformers import create_optimizer
import tensorflow as tf

In [86]:
model_checkpoint="/t5-small"
num_train_epochs = 8
num_train_steps = len(train_dataset) * num_train_epochs
model_name = model_checkpoint.split("/")[-1]

optimizer, schedule = create_optimizer(
    init_lr=5.6e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
rouge_l = keras_nlp.metrics.RougeL()
def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_l(decoded_labels, decoded_predictions)
    result = {"RougeL": result["f1_score"]}

    return result
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=test_dataset)
callbacks = [metric_callback]
tf.config.run_functions_eagerly(True)
model.fit(train_dataset, validation_data=test_dataset, epochs=5, verbose=True)
model.save_weights('summarized_model')


Epoch 1/5


In [None]:
tokenizer.save_vocabulary("/content/t5-small-finetuned-cnn-en/")

In [None]:
from transformers import pipeline

In [None]:
pipe = pipeline("summarization", model=model, tokenizer=tokenizer, framework="tf")


In [None]:
pipe(dataset[12])