In [1]:
pip install transformers datasets tensorflow sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [2]:
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
import numpy as np

2025-07-23 12:38:41.688132: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753274321.709774    1337 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753274321.716382    1337 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Load every split of the XSum dataset.
# `datasets` is already installed by the first cell, so the extra shell-level
# `!pip install` (which may target a different environment than the kernel)
# is not repeated here.
from datasets import load_dataset
dataset = load_dataset("xsum")

In [4]:
 # Tokenizer and model are both loaded from the same checkpoint name.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# TensorFlow sequence-to-sequence model class, loaded from the same checkpoint.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)


I0000 00:00:1753274335.282997    1337 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [5]:

# Text prepended to every document before it is tokenized.
prefix = "summarize: "
# `max_length` used when tokenizing the (prefixed) documents.
max_input_length = 512
# `max_length` used when tokenizing summaries and when generating predictions.
max_target_length = 64

In [6]:
def preprocess_function(examples):
    """Tokenize a batch of examples into padded model inputs and labels.

    Args:
        examples: Batch mapping with a "document" list (source texts) and a
            "summary" list (reference summaries), as passed by
            `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the prefixed documents, truncated/padded to
        `max_input_length`, with an added "labels" entry holding the summary
        token ids truncated/padded to `max_target_length`. Padded label
        positions are set to -100 so they are excluded from the training loss.
    """
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True, padding="max_length")
    # Summaries are padded to a fixed length; without masking, the loss would
    # also be computed on the padding tokens. -100 is the label value that the
    # Hugging Face loss ignores. The attention mask marks real (1) vs padded (0)
    # positions, so it is used to decide which ids to keep.
    model_inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(label_ids, label_mask)]
        for label_ids, label_mask in zip(labels["input_ids"], labels["attention_mask"])
    ]
    return model_inputs

In [7]:
# (rows, columns) of the training split.
dataset["train"].shape

(204045, 3)

In [8]:
# (rows, columns) of the test split.
dataset["test"].shape

(11334, 3)

In [9]:
# Work on fixed-size prefixes of the splits: the first 40,000 training rows
# and the first 5,000 validation rows.
train_data = dataset["train"].select(range(40_000))
val_data = dataset["validation"].select(range(5_000))

# Tokenize both subsets batch-wise with the same preprocessing function.
train_dataset, val_dataset = (
    subset.map(preprocess_function, batched=True)
    for subset in (train_data, val_data)
)

In [10]:
def convert_to_tf_dataset(dataset):
    """Wrap a tokenized dataset as a `tf.data.Dataset` of (features, labels).

    Args:
        dataset: Object indexable by column name that provides "input_ids",
            "attention_mask" and "labels" columns.

    Returns:
        A `tf.data.Dataset` whose elements are pairs of a dict with
        "input_ids" and "attention_mask" and the matching "labels" row.
    """
    features = {
        column: np.array(dataset[column])
        for column in ("input_ids", "attention_mask")
    }
    targets = np.array(dataset["labels"])
    return tf.data.Dataset.from_tensor_slices((features, targets))

In [11]:

# Training pipeline: shuffle with a 1024-element buffer, batch by 8, prefetch.
train_tf_dataset = (
    convert_to_tf_dataset(train_dataset)
    .shuffle(1024)
    .batch(8)
    .prefetch(tf.data.AUTOTUNE)
)
# Validation pipeline: no shuffling, batches of 4, prefetch.
val_tf_dataset = (
    convert_to_tf_dataset(val_dataset)
    .batch(4)
    .prefetch(tf.data.AUTOTUNE)
)


In [12]:
# Adam optimizer with a fixed learning rate of 2e-5.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

# No `loss` argument is passed to compile().
# NOTE(review): this presumably relies on the model computing its own loss
# from the "labels" it is given — confirm against the transformers docs.
model.compile(
    optimizer=optimizer  
)

In [13]:
# Display the available splits with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [14]:

# Training callbacks:
# - EarlyStopping stops training once `val_loss` has not improved for 2 epochs
#   (with only 2 epochs configured below it cannot trigger; it takes effect
#   if `epochs` is raised).
# - ModelCheckpoint keeps the best weights seen so far (lowest `val_loss`).
#   Only the weights are written, which avoids exporting the whole model as a
#   SavedModel after every improving epoch.
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=2, monitor="val_loss"),
    tf.keras.callbacks.ModelCheckpoint(
        "best_model",
        monitor="val_loss",
        save_best_only=True,
        save_weights_only=True,
    ),
]
# The callbacks must be handed to fit(); building the list alone has no effect.
model.fit(
    train_tf_dataset,
    validation_data=val_tf_dataset,
    epochs=2,
    callbacks=callbacks,
)

Epoch 1/2


I0000 00:00:1753274395.234403    1401 service.cc:148] XLA service 0x7adc2814a770 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1753274395.234448    1401 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1753274395.257557    1401 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1753274395.342400    1401 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




<tf_keras.src.callbacks.History at 0x7adca84262d0>

In [15]:
# Display the test split's features and row count.
dataset['test']

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 11334
})

In [16]:

# Drop test articles whose text is missing or whitespace-only.
filtered_test_data = dataset["test"].filter(lambda x: bool(x["document"] and x["document"].strip()))
# Sample from the filtered split (not the raw one) so the filter above is
# actually applied; the bound guards against fewer than 5 rows remaining.
test_data = filtered_test_data.select(range(min(5, len(filtered_test_data))))

def preprocess_test_function_safe(examples):
    """Tokenize the non-empty documents of `examples` for generation.

    Args:
        examples: Object whose "document" entry yields the source texts.

    Returns:
        A `(tokenized, inputs)` pair: the tokenizer output as TensorFlow
        tensors (truncated/padded to `max_input_length`) and the list of
        prefixed input strings that were tokenized.
    """
    inputs = []
    for doc in examples["document"]:
        # Skip missing or whitespace-only documents.
        if doc and doc.strip():
            inputs.append(prefix + doc)

    tokenizer_options = {
        "max_length": max_input_length,
        "truncation": True,
        "padding": "max_length",
        "return_tensors": "tf",
    }
    tokenized = tokenizer(inputs, **tokenizer_options)
    return tokenized, inputs

# Tokenize the sampled test articles; `filtered_docs` holds the prefixed texts.
test_inputs, filtered_docs = preprocess_test_function_safe(test_data)

# Beam-search settings for summary generation.
generation_options = dict(
    max_length=max_target_length,
    num_beams=4,
    early_stopping=True,
    decoder_start_token_id=tokenizer.pad_token_id,
)
outputs = model.generate(
    input_ids=test_inputs["input_ids"],
    attention_mask=test_inputs["attention_mask"],
    **generation_options,
)

# Turn the generated token ids back into text, dropping special tokens.
decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print each input text next to its predicted summary.
for i, (source_text, pred) in enumerate(zip(filtered_docs, decoded_preds)):
    print(f"\n--- Article {i+1} ---")
    print("Original:\n", source_text)
    print("\nPredicted Summary:\n", pred)


Filter:   0%|          | 0/11334 [00:00<?, ? examples/s]


--- Article 1 ---
Original:
 summarize: Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.
Workers at the charity claim investment in housing would be cheaper than jailing homeless repeat offenders.
The Welsh Government said more people than ever were getting help to address housing problems.
Changes to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.
Prison Link Cymru, which helps people find accommodation after their release, said things were generally good for women because issues such as children or domestic violence were now considered.
However, the same could not be said for men, the charity said, because issues which often affect them, such as post traumatic stress disorder or drug dependency, were often viewed as less of a priority.
Andrew Stevens, who works in Welsh prisons trying to secure housing for pri

In [17]:
# Print the raw generated token ids returned by model.generate (before decoding).
print(outputs)


tf.Tensor(
[[    0 27344  1175    52     7    16 10256   228    36    30     8  6162
     21  1296   767   406     3     9   418    13   199     6  7813   845
      5     1     0     0     0     0     0     0     0     0     0     0]
 [    0    71   388    65  4283    16  1614  4977    28 20507     7 10883
   2319    16 16504     5     1     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0]
 [    0    71   386    18  1201    18  1490   388    65   118  4977    28
      3 31540    26  4984    29     9  5341    11     3 31540    26  3322
    227     3     9 16201    16     3     9  4049     5     1     0     0]
 [    0  1244  4027    51    43  7817     3     9  1798  9145   907  1288
  10477    12     8  1886    31     7  4192 25990     5     1     0     0
      0     0     0     0     0     0     0     0     0     0     0     0]
 [    0    37    96 11584    53    18    51   603  3142    53   121  3178
      3    60 29955    

In [19]:
# Write the model and the tokenizer into the same "t5_model" directory.
model.save_pretrained("t5_model")
tokenizer.save_pretrained("t5_model")


('t5_model/tokenizer_config.json',
 't5_model/special_tokens_map.json',
 't5_model/spiece.model',
 't5_model/added_tokens.json',
 't5_model/tokenizer.json')

In [21]:
import shutil
# Create news_summarizer_t5.zip from the contents of the "t5_model" directory;
# the call returns the path of the archive it wrote.
shutil.make_archive("news_summarizer_t5", 'zip', "t5_model")

'/kaggle/working/news_summarizer_t5.zip'

In [23]:
# Archive the "t5_model" directory recursively with the shell `zip` tool;
# entries are stored under a "t5_model/" path prefix.
!zip -r t5_model1.zip t5_model

  adding: t5_model/ (stored 0%)
  adding: t5_model/tokenizer.json (deflated 74%)
  adding: t5_model/spiece.model (deflated 48%)
  adding: t5_model/config.json (deflated 63%)
  adding: t5_model/tf_model.h5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 12%)
  adding: t5_model/special_tokens_map.json (deflated 85%)
  adding: t5_model/tokenizer_config.json (deflated 95%)
  adding: t5_model/generation_config.json (deflated 29%)
