# Abstractive Text Summarization


## Imports

In [None]:
!pip install datasets
!pip install --upgrade tensorflow
!pip install tensorflow-cloud
!pip install tensorflow_cloud
!pip install tensorflow-addons
!pip install --upgrade keras
!pip install keras-nlp
!pip install rouge-score
# !pip install rouge
# !pip install tf-nightly
# !pip install pyarrow

In [None]:
""" Datasets """
# import pyarrow
from datasets import load_dataset, load_dataset_builder
import gensim.downloader as api

""" Building model """
import tensorflow as tf
import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Add, Attention, Bidirectional, Concatenate, Dense, Embedding, Flatten, Input, Layer, LayerNormalization, LSTM, MultiHeadAttention, TextVectorization
from tfm.vision.layers import PositionalEncoding
from keras.models import Model, Sequential
from keras.optimizers import Adam, SGD
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, tokenizer_from_json
from keras.regularizers import *
from keras.utils import plot_model

""" Training/Testing model """
from keras.callbacks import Callback
from keras.initializers import Zeros
from keras.metrics import Metric, F1Score
# from keras_nlp.metrics import RougeL
from rouge_score import rouge_scorer as rs
# from rouge import Rouge
# from tensorflow_addons.seq2seq import BeamSearchDecoder
# from keras_nlp.utils import beam_search

""" TF Cloud Training """
# import tensorflow_cloud as tfc
# from tensorflow_cloud.core.docker_config import DockerConfig

""" Data processing/visualization """
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')
import numpy as np
import numpy.core.defchararray as npc
import pandas as pd
import re

""" Cloud """
from google.colab import auth, files
from google.cloud import storage

""" Other """
import sys
import os
import time

## Load Data


In [None]:
""" Check Data """
# Dataset identifier and configuration name passed to the `datasets` loaders.
ds_name = 'cnn_dailymail'
ds_sub = '3.0.0'
# Builder object; its `.info` is inspected in the following cell.
builder = load_dataset_builder(ds_name, ds_sub)

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

In [None]:
""" Check Data cont. """
# Print the dataset description, then show the feature schema as the cell result.
print(builder.info.description)
builder.info.features

CNN/DailyMail non-anonymized summarization dataset.

There are two features:
  - article: text of news article, used as the document to be summarized
  - highlights: joined text of highlights with <s> and </s> around each
    highlight, which is the target summary



{'article': Value(dtype='string', id=None),
 'highlights': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None)}

In [None]:
""" Load Dataset """
dataset = load_dataset(ds_name, ds_sub)

# Ask each split for its row count directly. Indexing the 'article' column
# first would load every article of the split just to count them.
split_train = len(dataset['train'])
split_test = len(dataset['test'])
print("Train split length: ", split_train)
print("Test split length: ", split_test)

# if not tfc.remote():
#   split_train = 1000
#   split_val = 1000

st = 30  # NOTE(review): not referenced anywhere else in the visible source
pre_auth = False  # set to True by the authorization cell once login has run
co_model = 7  # model counter used to build JOB_NAME

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Train split length:  287113
Test split length:  11490


## Data Preparation

### - Prep

In [None]:
""" Stop words list  """
# Comma-separated terms; split on ',' and joined into the `pattern_sw` regex below.
stop_words_str = "a,an,are,as,at,be,by,CNN,for,had,has,if,in,into,is,it,of,on,or,Reuters,such,that,to,the,their,then,there,these,they,this,to,was,will,with,--"

In [None]:
""" Tokens """
# Sequence delimiters. The embedded space keeps each marker a separate
# whitespace-delimited token once it is joined onto the text.
sos_token = "<sos> "
eos_token = " <eos>"

""" Get Stopwords """
stop_words_lst = stop_words_str.split(',')

# The cleaning functions lower-case the text *before* applying this pattern,
# so the terms are lower-cased here as well (otherwise "CNN" / "Reuters" can
# never match). Terms made only of word characters are wrapped in \b anchors;
# anything else (e.g. "--") is matched without anchors, because \b cannot
# match between two non-word characters such as a space and a dash.
_sw_words = sorted({w.lower() for w in stop_words_lst if re.fullmatch(r"\w+", w)})
_sw_symbols = sorted({w for w in stop_words_lst if w and not re.fullmatch(r"\w+", w)})
pattern_sw = "\\b(" + "|".join(map(re.escape, _sw_words)) + ")\\b"
if _sw_symbols:
  pattern_sw += "|" + "|".join(map(re.escape, _sw_symbols))

### - Clean/Get Data

In [None]:
""" Data generator functions """
def data_generator(subset, feature, split_size, batch_size):
  """Yield cleaned, <sos>/<eos>-wrapped text batches from one dataset column.

  Args:
    subset: split name used to index the global `dataset` (e.g. 'train').
    feature: column name inside that split (e.g. 'article').
    split_size: number of rows to draw from; a trailing partial batch is dropped.
    batch_size: number of rows per yielded batch.

  Yields:
    A string tensor holding one cleaned entry per row of the batch.

  NOTE(review): the tokenizers are built with `standardize=clean_data`, which
  applies the same cleaning again to whatever this generator yields — confirm
  whether cleaning in both places is intended.
  """
  split = dataset[subset]
  usable_rows = (split_size // batch_size) * batch_size
  for start in range(0, usable_rows, batch_size):
    # Slice the rows first and only then pick the column, so each step reads
    # just `batch_size` rows instead of the full column.
    batch = split[start:start + batch_size][feature]
    batch = tf.strings.lower(batch)
    # Surround sentence punctuation with spaces so it becomes its own token.
    batch = tf.strings.regex_replace(batch, '[.?!,¿]', r' \0 ')
    batch = tf.strings.regex_replace(batch, pattern_sw, '')
    # Split + re-join collapses the whitespace runs left by the replacements.
    batch = tf.strings.split(batch)
    batch = tf.strings.reduce_join(batch, axis=-1, separator=' ')
    batch = tf.strings.join([sos_token, batch, eos_token])
    yield batch

def get_gen(subset, feature, split_size, batch_size):
  """Wrap `data_generator` in a `tf.data.Dataset` of fixed-size string batches.

  Every element is declared as a string tensor of shape `(batch_size,)`.
  """
  def batches():
    # Zero-argument factory: from_generator calls it to obtain a fresh generator.
    return data_generator(subset, feature, split_size, batch_size)

  element_spec = tf.TensorSpec(shape=(batch_size,), dtype=tf.string)
  return tf.data.Dataset.from_generator(batches, output_signature=element_spec)

In [None]:
""" Get Data """
# Source ('article') and target ('highlights') streams for each split;
# training batches hold 2500 rows, test batches 500.
x_train, y_train = (get_gen('train', col, split_train, 2500) for col in ('article', 'highlights'))
x_test, y_test = (get_gen('test', col, split_test, 500) for col in ('article', 'highlights'))

In [None]:
# """ Batch dataset """ # not needed as of now
# dataset["train"].map(lambda text: text, batched=True)
# dataset["test"].map(lambda text: text, batched=True)

### - Tokenize Data

In [None]:
""" Clean text Function """
## Not sure if cleaning data through the tokenizer over the generator is quicker ##
def clean_data(text):
  """Lower-case, space out punctuation, drop stop words and wrap in <sos>/<eos>.

  Used as the `standardize` callable of the TextVectorization layers.
  """
  lowered = tf.strings.lower(text)
  # Put spaces around sentence punctuation so it becomes its own token.
  spaced = tf.strings.regex_replace(lowered, '[.?!,¿]', r' \0 ')
  without_stop_words = tf.strings.regex_replace(spaced, pattern_sw, '')
  # Split and re-join to collapse the whitespace runs left by the replacements.
  pieces = tf.strings.split(without_stop_words)
  collapsed = tf.strings.reduce_join(pieces, axis=-1, separator=' ')
  return tf.strings.join([sos_token, collapsed, eos_token])

In [None]:
""" Tokenizer """
max_vocab = 5_000

# The source (x) and target (y) vectorizers share one configuration: text is
# standardized by clean_data, the vocabulary is capped at max_vocab and the
# output is ragged. (output_sequence_length=50 stays disabled for the target.)
x_tokenizer, y_tokenizer = (
    TextVectorization(standardize=clean_data, max_tokens=max_vocab, ragged=True)
    for _ in range(2)
)

In [None]:
""" Adapt data """
# Build each tokenizer's vocabulary from the corresponding training stream.
x_tokenizer.adapt(x_train)
y_tokenizer.adapt(y_train)

In [None]:
""" Tokenizer vocab samples """
# First ten vocabulary entries of the source (X) and target (Y) tokenizers.
print("\n".join(
    f"{tag}: {tok.get_vocabulary()[:10]}"
    for tag, tok in (("X", x_tokenizer), ("Y", y_tokenizer))
))

X: ['', '[UNK]', '.', ',', 'and', 'he', 'his', 'said', "'", 'have']
Y: ['', '[UNK]', '.', '<sos>', '<eos>', ',', 'and', 'he', 'his', 'from']


## Model


### - Develop Model

In [None]:
""" Load Pre-Trained Word Embeddings """
vocab_dim = max_vocab  # default `vocab_len` of get_model
emb_dim = 300  # default `emb_len` of get_model
# Pre-trained vectors fetched through gensim's downloader API.
w2v_model = api.load('word2vec-google-news-300')



In [None]:
""" Improvements??? """
# Beam Search
# Hierarchical attention mechanism
# Reinforcement learning training
# Multi-task learning framework

In [None]:
class FeedForward(Layer):
  """Two stacked Dense layers: `latents` units with `activation`, then `latents * 4` linear units.

  Args:
    latents: width of the first Dense layer; the second one is four times wider.
    activation: activation of the first Dense layer.
    **kwargs: standard Keras `Layer` options (e.g. `name`), forwarded to the base class.
  """

  def __init__(self, latents, activation='relu', **kwargs):
    super().__init__(**kwargs)

    self.lat = latents
    self.act = activation

    self.dense_0 = Dense(self.lat, self.act)
    self.dense_1 = Dense(self.lat*4)

  def call(self, inputs):
    """Apply both Dense layers in sequence and return the result."""
    hidden = self.dense_0(inputs)
    return self.dense_1(hidden)

  def get_config(self):
    """Return the constructor arguments so a saved model can rebuild this layer."""
    config = super().get_config()
    config.update({'latents': self.lat, 'activation': self.act})
    return config

In [None]:
def _pretrained_embedding_matrix(vocabulary, vectors, emb_len, seed=0):
  """Build a (len(vocabulary), emb_len) float32 matrix from pre-trained vectors.

  Row i holds `vectors[vocabulary[i]]` when that token is known to `vectors`.
  Row 0 (the empty padding token of the tokenizers) stays all-zero; every
  other unknown token gets small seeded random values so that special tokens
  such as <sos>/<eos> remain distinguishable from padding.
  """
  rng = np.random.default_rng(seed)
  matrix = rng.normal(scale=0.1, size=(len(vocabulary), emb_len)).astype('float32')
  if len(vocabulary):
    matrix[0] = 0.0
  for idx, token in enumerate(vocabulary):
    if idx and token in vectors:
      matrix[idx] = vectors[token]
  return matrix


def get_model(att_0=3, att_1=3,
              lat_0=512, lat_1=512,
              dr_0=0.1, dr_1=0.1,
              vocab_len=vocab_dim, emb_len=emb_dim):
  """Build the encoder-decoder attention model for summarization.

  Args:
    att_0: number of heads for the encoder self-attention and the cross-attention.
    att_1: number of heads for the decoder self-attention.
    lat_0: `latents` of the encoder FeedForward block.
    lat_1: `latents` of the decoder FeedForward block.
    dr_0: dropout rate of the encoder self-attention.
    dr_1: dropout rate of the decoder self-attention.
    vocab_len: size of the softmax output layer.
    emb_len: embedding width; must equal the vector size of `w2v_model`.

  Returns:
    A Keras `Model` taking `[encoder_text, decoder_text]` string inputs and
    returning per-position probabilities over `vocab_len` tokens.

  NOTE(review): the tokenizers are created with `ragged=True` — confirm the
  attention layers accept ragged input, or switch the tokenizers to padded output.
  NOTE(review): `PositionalEncoding` is imported from `tfm.vision.layers` —
  confirm that import resolves and that the layer takes no constructor arguments.
  """

  def embed(tokenizer, tokens):
    # Frozen embedding initialised from the pre-trained vectors, with one row
    # per entry of this tokenizer's vocabulary.
    weights = _pretrained_embedding_matrix(tokenizer.get_vocabulary(), w2v_model, emb_len)
    layer = Embedding(weights.shape[0], emb_len,
                      embeddings_initializer=keras.initializers.Constant(weights),
                      mask_zero=True, trainable=False)
    return layer(tokens)

  # Encoder
  # The tokenizer layers consume raw text, so the inputs are string tensors
  # with one string per sample.
  enc_input = Input(shape=(1,), dtype=tf.string)
  enc_token = x_tokenizer(enc_input)
  enc_emb   = embed(x_tokenizer, enc_token)
  enc_pe    = PositionalEncoding()(enc_emb)

  # Self-attention: the same tensor is passed as query and value.
  enc_att   = MultiHeadAttention(num_heads=att_0, key_dim=emb_len, dropout=dr_0)(enc_pe, enc_pe)
  enc_add   = Add()([enc_att, enc_emb])
  enc_norm  = LayerNormalization()(enc_add)

  enc_ffn   = FeedForward(lat_0)(enc_norm)
  # FeedForward returns lat_0*4 features; project back to emb_len so the
  # residual addition has matching shapes.
  enc_ffn   = Dense(emb_len)(enc_ffn)
  enc_add   = Add()([enc_ffn, enc_norm])
  enc_norm  = LayerNormalization()(enc_add)


  # Decoder
  dec_input = Input(shape=(1,), dtype=tf.string)
  dec_token = y_tokenizer(dec_input)
  dec_emb   = embed(y_tokenizer, dec_token)
  dec_pe    = PositionalEncoding()(dec_emb)

  # `use_causal_mask` is a call-time argument: each position may only attend
  # to itself and earlier positions.
  dec_att   = MultiHeadAttention(num_heads=att_1, key_dim=emb_len, dropout=dr_1)(
      dec_pe, dec_pe, use_causal_mask=True)
  dec_add   = Add()([dec_att, dec_emb])
  dec_norm  = LayerNormalization()(dec_add)

  # Cross-attention: decoder states query the encoder output.
  att       = MultiHeadAttention(num_heads=att_0, key_dim=emb_len)(dec_norm, enc_norm)
  add       = Add()([att, dec_norm])
  norm      = LayerNormalization()(add)

  ffn       = FeedForward(lat_1)(norm)
  ffn       = Dense(emb_len)(ffn)
  # Residual and normalisation over the *decoder* tensors.
  add       = Add()([ffn, norm])
  norm      = LayerNormalization()(add)

  output    = Dense(vocab_len, activation='softmax')(norm)

  # Create model
  model = Model([enc_input, dec_input], output, name='Text_Summarization_Model')

  return model

### - Develop Metrics/Callbacks

In [None]:
""" Create rouge score metric """
class RougeMetric(Metric):
  """ROUGE-L F1 between reference and predicted token sequences.

  Scores are computed per sample and aggregated over all batches seen since
  the last reset as the running minimum, mean or maximum.

  Args:
    method: 'avg', 'min' or 'max'.
    vocabulary: optional sequence mapping token id -> token string. When
      omitted, the vocabulary of the global `y_tokenizer` is used.

  Raises:
    ValueError: if `method` is not one of the three supported values.
  """

  def __init__(self, method='avg', vocabulary=None):
    super().__init__(name='f1_rs')

    if method not in {'avg', 'min', 'max'}:
      raise ValueError("Invalid score method, expected 'min', 'avg' or 'max' (str)")
    self.method = method
    self.rouge_scoring = rs.RougeScorer(['rougeL'])

    if vocabulary is None:
      vocabulary = y_tokenizer.get_vocabulary()
    # Object array so a whole id sequence can be looked up with one index op.
    self._vocab = np.asarray(vocabulary, dtype=object)

    # 'min' starts from the largest possible F1, the other modes from zero.
    if self.method == 'min':
      self.f1_score = tf.Variable(1.0, dtype=tf.float32, trainable=False)
    else:
      self.f1_score = tf.Variable(0.0, dtype=tf.float32, trainable=False)
    # Number of samples accumulated; only used by 'avg'.
    self.co = tf.Variable(0.0, dtype=tf.float32, trainable=False)

  def sequences_to_texts(self, sequence):
    """Decode one sequence of token ids (eager tensor) into a space-joined string."""
    ids = np.asarray(sequence.numpy()).reshape(-1).astype(np.int64)
    # Skip padding (id 0) and any id outside the vocabulary.
    ids = ids[(ids > 0) & (ids < len(self._vocab))]
    return ' '.join(self._vocab[ids])

  def tf_sequences_to_texts(self, sequence):
    """Graph-compatible wrapper around `sequences_to_texts`; returns a scalar string."""
    text = tf.py_function(self.sequences_to_texts, [sequence], tf.string)
    text.set_shape([])
    return text

  def get_f1(self, ref, hyp):
    """Return the ROUGE-L F-measure for one (reference, hypothesis) pair of scalar strings."""
    ref_text = ref.numpy().decode('utf-8')
    hyp_text = hyp.numpy().decode('utf-8')
    score = self.rouge_scoring.score(ref_text, hyp_text)
    return score['rougeL'].fmeasure

  def get_rouge(self, vals):
    """Graph-compatible wrapper around `get_f1`; `vals` is `(reference, hypothesis)`."""
    score = tf.py_function(self.get_f1, [vals[0], vals[1]], tf.float32)
    score.set_shape([])
    return score

  def update_state(self, y_true, y_preds, sample_weight=None):
    """Accumulate per-sample ROUGE-L F1 for one batch.

    Assumes `y_preds` carries per-token class scores on its last axis and
    `y_true` carries token ids — TODO confirm against the model output.
    `sample_weight` is accepted for API compatibility and ignored.
    """
    pred_ids = tf.argmax(y_preds, axis=-1)
    true_ids = tf.cast(y_true, pred_ids.dtype)

    text_preds = tf.map_fn(self.tf_sequences_to_texts, pred_ids, fn_output_signature=tf.string)
    text_true = tf.map_fn(self.tf_sequences_to_texts, true_ids, fn_output_signature=tf.string)

    scores_f1 = tf.map_fn(self.get_rouge, (text_true, text_preds), fn_output_signature=tf.float32)

    if self.method == 'min':
      self.f1_score.assign(tf.minimum(self.f1_score, tf.reduce_min(scores_f1)))
    elif self.method == 'avg':
      self.f1_score.assign_add(tf.reduce_sum(scores_f1))
      self.co.assign_add(tf.cast(tf.shape(y_true)[0], dtype=tf.float32))
    else:  # 'max' — the method was validated in __init__
      self.f1_score.assign(tf.maximum(self.f1_score, tf.reduce_max(scores_f1)))

  def result(self):
    """Return the aggregated score rounded to four decimals."""
    if self.method == 'avg':
      # divide_no_nan yields 0 instead of NaN before the first update.
      avg = tf.math.divide_no_nan(self.f1_score, self.co)
      return tf.round(avg * 10_000) / 10_000
    else:
      return tf.round(self.f1_score * 10_000) / 10_000

  def reset_state(self):
    """Restore the accumulators to their initial values."""
    if self.method == 'min':
        self.f1_score.assign(1.0)
    else:
        self.f1_score.assign(0.0)
    self.co.assign(0.0)


In [None]:
""" Create Callbacks """
def get_callbacks(cp_path,
                  rlr_factor=0.1, rlr_patience=3,
                  es_patience=6,
                  monitor='loss'):
  """Create the early-stopping, checkpoint and learning-rate callbacks.

  Args:
    cp_path: file path handed to ModelCheckpoint.
    rlr_factor: factor by which ReduceLROnPlateau scales the learning rate.
    rlr_patience: epochs without improvement before the learning rate is reduced.
    es_patience: epochs without improvement before training stops.
    monitor: quantity watched by all three callbacks (e.g. 'loss', 'val_loss',
      'val_f1_rs'). Sharing one value keeps them consistent; left unset,
      ReduceLROnPlateau would fall back to its own default of 'val_loss'.

  Returns:
    Tuple `(early_stop, model_cp, reduce_lr)`.
  """
  early_stop = EarlyStopping(monitor=monitor,
                             patience=es_patience)

  reduce_lr = ReduceLROnPlateau(monitor=monitor,
                                factor=rlr_factor,
                                patience=rlr_patience)

  model_cp = ModelCheckpoint(filepath=cp_path,
                             monitor=monitor,
                             save_best_only=True,
                             save_freq='epoch',
                             verbose=1)

  return early_stop, model_cp, reduce_lr

In [None]:
""" Compile Model """
def compile_model(model, lr=0.001, rm_metric='avg'):
  """Compile `model` in place with Adam, sparse categorical cross-entropy and ROUGE-L.

  Args:
    model: the Keras model to compile.
    lr: learning rate passed to Adam.
    rm_metric: aggregation method forwarded to RougeMetric ('avg', 'min' or 'max').
  """
  # `metrics` takes metric objects; callbacks belong in `model.fit(callbacks=...)`.
  model.compile(optimizer=Adam(lr),
                loss='sparse_categorical_crossentropy',
                metrics=[RougeMetric(rm_metric)])

## Training


### - Set Up Cloud

In [None]:
""" Create paths """
# Google Cloud project, bucket and region, exported to the environment by the
# authorization cell.
GCP_PROJECT_ID = 'model-training-383203'
GCS_BUCKET  = 'model_sum'
REGION = 'us-central1'
# Job name derived from the current model counter.
JOB_NAME = f'model_{co_model}'
# Local path of the service-account key file.
AUTH_JSON = '/content/model-training-383203-38e4420de909.json'
REQUIRE = '/content/model-require.txt'
# Advance the counter so re-running this cell yields a new JOB_NAME.
co_model += 1

In [None]:
""" Define storage paths """
# gs:// URIs always use '/' as the separator, so they are built with explicit
# slashes rather than os.path.join, whose separator depends on the host OS.
GCS_BASE_PATH = f'gs://{GCS_BUCKET}/{JOB_NAME}'
TENSORBOARD_LOGS = f'{GCS_BASE_PATH}/logs'
MODEL_CP = f'{GCS_BASE_PATH}/checkpoints'
SAVED_MODEL_DIR = f'{GCS_BASE_PATH}/saved_model'
TOKENIZE_DIR = f'{GCS_BASE_PATH}/tokenizer'

In [None]:
""" Authorize user and Set storage paths """
if not tfc.remote() and ("google.colab" in sys.modules):
  if not pre_auth:
    !gcloud auth login
    !gcloud config set project 136963608278
    auth.authenticate_user()
    pre_auth = True

  if pre_auth:
    os.environ["GOOGLE_CLOUD_PROJECT"] = GCP_PROJECT_ID
    os.environ["GCS_BUCKET"] = GCS_BUCKET
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = AUTH_JSON
    os.environ['REGION'] = REGION

### - Train Model

In [None]:
""" Get model """
model = get_model()
compile_model(model, rm_metric='avg')
# get_callbacks takes the checkpoint path as its only path argument; a second
# positional value would be bound to `rlr_factor`. list() gives model.fit the
# list form that its `callbacks` argument is documented to take.
callbacks = list(get_callbacks(MODEL_CP, es_patience=10))

In [None]:
# Write a diagram of the model, including tensor shapes and layer names, to model_arch.png.
plot_model(model, to_file='model_arch.png', show_shapes=True, show_layer_names=True)

In [None]:
""" Define hyper-parameters"""
# `tfc` only exists when the tensorflow_cloud import is enabled; otherwise
# the local (smaller) settings are used.
if 'tfc' in globals() and tfc.remote():
  val_split = 0.20
  num_batch = 32
  num_epoch = 1024
else:
  val_split = 0.15
  num_batch = 8
  num_epoch = 250

In [None]:
""" Train model """
# Teacher-forcing layout: the decoder input drops the last position of the
# target and the label drops the first.
# NOTE(review): x_train / y_train are created by get_gen(), which returns
# tf.data.Dataset objects — confirm that the `[:, :-1]` / `[:, 1:]` slicing and
# the `validation_split` / `batch_size` arguments are valid for that input type.
history = model.fit([x_train, y_train[:,:-1]], y_train[:,1:],
                     validation_split=val_split,
                     batch_size=num_batch,
                     epochs=num_epoch,
                     callbacks=callbacks,
                     verbose=2)

In [None]:
""" Save extra model data """
# Persist the trained model under the job's saved_model path.
model.save(SAVED_MODEL_DIR)

# Serialize the tokenizer to a local JSON file and upload it to the bucket.
# NOTE(review): `tokenizer` is not defined in the visible source (only
# x_tokenizer / y_tokenizer are) — confirm which object is meant here.
# NOTE(review): TOKENIZE_DIR already starts with gs://<bucket>/ and is used as
# the blob name — confirm that is the intended object path.
storage_client = storage.Client()
bucket = storage_client.bucket(GCS_BUCKET)
blob = bucket.blob(TOKENIZE_DIR)
token_json = tokenizer.to_json()

with open('tokenizer.json', 'w') as f:
  f.write(token_json)

blob.upload_from_filename('tokenizer.json')

In [None]:
# docker = DockerConfig(image_build_bucket=GCS_BUCKET)
# # entry_point = ...
# tfc.run(
#         requirements_txt=REQUIRE,
#         distribution_strategy="auto",
#         docker_config=docker
# )

In [None]:
# """ Uploading """
# Same tokenizer upload as at the end of the "Save extra model data" cell:
# write the tokenizer JSON locally, then upload it to the bucket.
# NOTE(review): `tokenizer` is not defined in the visible source — confirm
# which object is meant here.
storage_client = storage.Client()
bucket = storage_client.bucket(GCS_BUCKET)
blob = bucket.blob(TOKENIZE_DIR)
token_json = tokenizer.to_json()

with open('tokenizer.json', 'w') as f:
  f.write(token_json)

blob.upload_from_filename('tokenizer.json')

In [None]:
# """ Downloading """
# Fetch the tokenizer JSON from the bucket into a local file.
storage_client = storage.Client()
bucket = storage_client.bucket(GCS_BUCKET)
blob = bucket.blob(TOKENIZE_DIR)
blob.download_to_filename('tokenizer.json')

# Rebuild a tokenizer object from the downloaded JSON; the result is kept in `test`.
with open('tokenizer.json', 'r') as f:
  token_json = f.read()
  test = tokenizer_from_json(token_json)