In [None]:
!pip install transformers 

Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 5.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 22.9 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 18.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 507 kB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [None]:
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
import numpy as np
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
import datetime
import os
import gc
%load_ext tensorboard

In [None]:
# Display the version of the TensorFlow build imported as `tf`.
tf.__version__

'2.7.0'

In [None]:
# Display the version of the installed transformers package
# (the module is bound to the alias `var` here).
import transformers as var
var.__version__

'4.13.0'

Requirements: tensorflow==2.3.1 & transformers==3.4.0

In [None]:
!pip install transformers==3.4.0
!pip install tensorflow==2.3.1



Dataset paths

In [None]:
# Load the 'cnn_dailymail' train split (together with its dataset info) and the
# test split, both from the local `data/` directory.
train_data, info = tfds.load(
    'cnn_dailymail',
    split='train',
    data_dir='data/',
    with_info=True,
)
test_data = tfds.load(
    'cnn_dailymail',
    split='test',
    data_dir='data/',
)

INFO:absl:No config specified, defaulting to first: cnn_dailymail/plain_text
INFO:absl:Load dataset info from data/cnn_dailymail/plain_text/3.0.0
INFO:absl:Reusing dataset cnn_dailymail (data/cnn_dailymail/plain_text/3.0.0)
INFO:absl:Constructing tf.data.Dataset for split train, from data/cnn_dailymail/plain_text/3.0.0
INFO:absl:No config specified, defaulting to first: cnn_dailymail/plain_text
INFO:absl:Load dataset info from data/cnn_dailymail/plain_text/3.0.0
INFO:absl:Reusing dataset cnn_dailymail (data/cnn_dailymail/plain_text/3.0.0)
INFO:absl:Constructing tf.data.Dataset for split test, from data/cnn_dailymail/plain_text/3.0.0


Fine-tuning the T5 model

In [None]:
class T5Model(TFT5ForConditionalGeneration):
    """TFT5ForConditionalGeneration with custom Keras train and test steps.

    Each batch handed to ``train_step`` / ``test_step`` is indexed with ``[0]``
    to obtain a dict holding the keys ``'inputs'``, ``'attention_mask'`` and
    ``'labels'``. The reported loss is the loss the model returns when called
    with ``labels``, reduced with ``tf.reduce_mean``.
    """

    # Label value that is excluded from the compiled metrics (see
    # `_update_and_collect_metrics`).
    _IGNORED_LABEL = -100

    def __init__(self, *args, log_dir=None, cache_dir= None, **kwargs):
        """Build the model.

        ``log_dir`` and ``cache_dir`` are accepted but not used; only ``*args``
        and ``**kwargs`` are forwarded to the parent constructor.
        """
        super().__init__(*args, **kwargs)

    def _update_and_collect_metrics(self, labels, logits, loss):
        """Update the compiled metrics and return ``{metric name: value}`` plus ``'loss'``.

        Positions whose label equals ``_IGNORED_LABEL`` get a sample weight of
        0, so they do not count as wrong predictions in the metrics.
        """
        valid_positions = tf.cast(
            tf.not_equal(labels, self._IGNORED_LABEL), tf.float32)
        self.compiled_metrics.update_state(labels, logits, valid_positions)
        metrics = {m.name: m.result() for m in self.metrics}
        metrics.update({'loss': loss})
        return metrics

    @tf.function
    def train_step(self, data):
        """Run one optimisation step on a batch and return the metrics dict."""
        x = data[0]
        y = x['labels']
        with tf.GradientTape() as tape:
            outputs = self(inputs = x['inputs'], attention_mask = x['attention_mask'], labels = y, training=True, return_dict=True)
            loss = tf.reduce_mean(outputs.loss)

        # Differentiate after leaving the tape context so the backward pass
        # itself is not recorded on the tape.
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

        return self._update_and_collect_metrics(y, outputs.logits, loss)

    def test_step(self, data):
        """Evaluate one batch (no gradient update) and return the metrics dict."""
        x = data[0]
        y = x['labels']
        output = self(inputs = x['inputs'], attention_mask = x['attention_mask'], labels = y, training=False, return_dict=True)
        loss = tf.reduce_mean(output.loss)
        return self._update_and_collect_metrics(y, output.logits, loss)

Parameters

In [None]:
!pip install sentencepiece



In [None]:
# Output locations for the saved model and the TensorBoard logs.
model_path = './t5/models'
log_path = './t5/logs'

# Training hyper-parameters.
config = dict(
    batch_size=4,
    epochs=1,
    learning_rate=1e-4,
    max_len=512,
    summary_len=150,
)

# Number of examples taken from each split.
data_size = dict(train=10000, test=1000)

# Keyword arguments unpacked into DataGenerator.
params = dict(source_len=512, target_len=150, batch_size=4)

tokenizer = T5Tokenizer.from_pretrained("t5-base")

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Encode and Generate Data

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
  """Keras Sequence that tokenizes article/highlights examples one batch at a time.

  Args:
    data: tf.data dataset whose elements expose 'article' and 'highlights'.
    tokenizer: tokenizer providing `prepare_seq2seq_batch` and `pad_token_id`.
    mode: key into the module-level `data_size` dict ('train' or 'test');
      used to compute the number of batches.
    source_len: length the tokenized source text is padded to.
    target_len: length the tokenized target text is padded to.
    batch_size: number of examples per batch.
  """

  def __init__(self, data, tokenizer, mode, source_len, target_len, batch_size):
    self.data = data
    self.tokenizer = tokenizer
    self.source_len = source_len
    self.target_len = target_len
    self.batch_size = batch_size
    self.mode = mode

  def __len__(self):
    """Number of batches per epoch, derived from `data_size[self.mode]`."""
    return int(np.ceil(data_size[self.mode] / self.batch_size))

  def __getitem__(self, index):
    """Return the encoded batch at position `index`.

    The batch covers examples [index * batch_size, (index + 1) * batch_size).
    """
    # Skip the `index` preceding batches. Skipping a constant `batch_size`
    # would return the same examples for every index.
    offset = index * self.batch_size
    dataset_batch = self.data.skip(offset).take(self.batch_size)
    return self.encode_data(dataset_batch, self.tokenizer, self.source_len, self.target_len)

  def encode_data(self, data, tokenizer, source_len, target_len):
    """Tokenize a batch of examples into model inputs.

    Returns:
      dict with 'inputs' (source token ids), 'attention_mask', and 'labels'
      (target token ids with padding positions replaced by -100).
    """
    # Prefix every article with the task prompt, then pull the strings out of
    # the dataset and decode them from UTF-8 bytes.
    source = data.map(lambda text: 'summarize: ' + text['article'])
    source = [raw.decode('utf-8') for raw in tfds.as_numpy(source)]
    target = data.map(lambda text: text['highlights'])
    target = [raw.decode('utf-8') for raw in tfds.as_numpy(target)]

    batch_encoding = tokenizer.prepare_seq2seq_batch(
        src_texts=source,
        tgt_texts=target,
        max_length=source_len,
        max_target_length=target_len,
        padding='max_length',
        return_tensors='tf')
    # Replace padding token ids in the labels with -100.
    batch_encoding['labels'] = tf.where(
        batch_encoding['labels'] == tokenizer.pad_token_id,
        -100,
        batch_encoding['labels'])

    return {'inputs': batch_encoding['input_ids'],
            'attention_mask': batch_encoding['attention_mask'],
            'labels': batch_encoding['labels']}

In [None]:
# Wrap the first `data_size[...]` examples of each split in a DataGenerator.
training_data = DataGenerator(
    data=train_data.take(data_size['train']),
    tokenizer=tokenizer,
    mode='train',
    **params,
)
validation_data = DataGenerator(
    data=test_data.take(data_size['test']),
    tokenizer=tokenizer,
    mode='test',
    **params,
)

Callbacks

In [None]:
# One timestamped TensorBoard run directory *inside* log_path. Plain string
# concatenation would omit the separator and create './t5/logs<timestamp>'
# next to the log directory instead of under it.
log_dir = os.path.join(log_path, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Keep only the checkpoint with the lowest validation loss; the epoch number
# and val_loss are formatted into the file name.
# NOTE(review): save_weights_only=False combined with a '.ckpt' path requests a
# full-model save rather than a weights checkpoint - confirm this is intended.
checkpoint_filepath = os.path.join(model_path, "T5-{epoch:04d}-{val_loss:.4f}.ckpt")
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

callbacks = [tensorboard_callback, model_checkpoint_callback]

Create T5 model

In [None]:
def create_model():
  """Load the pretrained "t5-base" checkpoint into a T5Model and compile it.

  The learning rate is read from the module-level `config` dict.

  Returns:
    The T5Model compiled with an Adam optimizer and a sparse categorical
    accuracy metric named 'accuracy'.
  """
  model = T5Model.from_pretrained("t5-base")
  # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
  optimizer = tf.keras.optimizers.Adam(learning_rate=config['learning_rate'])
  metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')]
  model.compile(optimizer=optimizer, metrics=metrics)
  return model


model = create_model()

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing T5Model.

All the layers of T5Model were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5Model for predictions without further training.


Model training

In [None]:
# Train for the configured number of epochs, validating on the test generator.
# NOTE(review): the `callbacks` list built in the Callbacks cell is not passed
# to fit() here, so no TensorBoard logs or checkpoints are written - confirm
# whether that is intended.
epochs = config['epochs']
model.fit(
    training_data,
    validation_data=validation_data,
    epochs=epochs,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


 191/2500 [=>............................] - ETA: 28:27:09 - accuracy: 0.4795 - loss: 0.1217

KeyboardInterrupt: ignored

Save model

In [None]:
# Save the trained model to the `model_path` directory via `save_pretrained`.
model.save_pretrained(model_path)

Upload the test text

In [None]:
# Colab-only: prompt for file(s) to upload; the value returned by
# files.upload() is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()

Saving TEST.txt to TEST.txt


In [None]:
# Read the text from the file uploaded in the previous cell instead of a
# hard-coded path, which only exists if a file of that exact name was uploaded
# in an earlier session. `uploaded` maps each uploaded file name to its bytes.
if not uploaded:
  raise ValueError('No file was uploaded; run the upload cell and pick a text file.')
uploaded_name = next(iter(uploaded))
# Decode explicitly as UTF-8 so non-ASCII punctuation in the text survives.
source_txt = uploaded[uploaded_name].decode('utf-8')
source_txt = 'summarize: ' + source_txt
print(source_txt)

summarize: Today, we face transnational terrorist threats like Da’esh and Al-Qaida that are enduring and able to adapt to new technologies, but also expanding to include individuals and groups that commit terrorist attacks connected to xenophobia, racism and other forms of intolerance”, said Mr. Voronkov. 

The UN counter-terrorism architecture, largely set up in the wake of the 9/11 attack, helps Member States implement effective frameworks to prevent, address, investigate and prosecute acts of terrorism.  

It is also ramping up efforts to help countries adapt to the rapidly changing nature of the threat, which has become more digital and de-centralized in recent years.  

Noting that the world is currently witnessing a rapidly evolving situation in Afghanistan “which could have far-reaching implications” around the globe, he cited Da’esh’s expanded presence in that country and pointed out that several members of the Taliban have been designated as terrorists by the Security Council.

Testing the model

In [None]:
# Tokenize the prompt as a batch of one, padded to the same source length
# that was used for training.
source = tokenizer.prepare_seq2seq_batch(
      src_texts = [source_txt],
      max_length = config['max_len'],
      padding = 'max_length',
      return_tensors = 'tf')
input_ids = source['input_ids']
attention_mask = source['attention_mask']

# Beam search with bigram blocking.
# Alternative settings tried previously: num_beams=2, repetition_penalty=2.5,
# length_penalty=1.0.
summary = model.generate(input_ids = input_ids,
                attention_mask = attention_mask,
                max_length = config['summary_len'],
                num_beams = 3,
                no_repeat_ngram_size = 2,
                early_stopping = True)

# Drop special tokens (padding / end-of-sequence markers) from the decoded text.
decoded_summary = tokenizer.decode(summary.numpy()[0], skip_special_tokens=True)
print ('Summary:\n', decoded_summary)

Summary:
 the counter-terrorism architecture was set up in the wake of the 9/11 attack. it helps Member States implement effective frameworks to prevent, address, investigate and prosecute acts of terrorism.
