# Abstractive LSTM Text Summarization


## Imports

In [None]:
# Runtime dependencies for this notebook (Colab shell magics).
!pip install datasets
!pip install tensorflow
# NOTE(review): the next two lines differ only in '-' vs '_'; pip treats
# these as the same project name, so one of them is redundant.
!pip install tensorflow-cloud
!pip install tensorflow_cloud
# !pip install tensorflow-addons
# !pip install keras-nlp
!pip install rouge-score
# !pip install rouge
# NOTE(review): tf-nightly is installed in addition to the stable
# `tensorflow` above; both presumably provide the `tensorflow` import --
# confirm which build is actually wanted.
!pip install tf-nightly
# NOTE(review): pyarrow is installed unpinned and then pinned; the pinned
# install on the second line is the one that ends up in the environment.
!pip install pyarrow
!pip install pyarrow==8.0.0

In [None]:
!pip install tensorflow_cloud

In [None]:
!pip install numpy



In [None]:
""" Datasets """
import pyarrow
from datasets import load_dataset, load_dataset_builder
# import gensim.downloader as api

""" Building model """
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from tensorflow.keras.layers import Attention, Bidirectional, Concatenate, Dense, Embedding, Flatten, Input, LayerNormalization, LSTM, MultiHeadAttention
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.regularizers import *
from tensorflow.keras.utils import plot_model

""" Training/Testing model """
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.initializers import Zeros
from tensorflow.keras.metrics import Metric, F1Score
# from keras_nlp.metrics import RougeL
from rouge_score import rouge_scorer as rs
# from rouge import Rouge
# from tensorflow_addons.seq2seq import BeamSearchDecoder
# from keras_nlp.utils import beam_search

""" TF Cloud Training """
import tensorflow_cloud as tfc
from tensorflow_cloud.core.docker_config import DockerConfig

""" Data processing/visualization """
# import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import numpy as np
import pandas as pd
import re

""" Cloud """
from google.colab import auth, files
from google.cloud import storage

""" Other """
import sys
import os
import time

TypeError: ignored

## Load Data


In [None]:
""" Check Data """
ds_name = 'cnn_dailymail' # 'GEM/wiki_lingua' 'ccdv/pubmed-summarization' 'scientific_papers'
ds_sub = '3.0.0'
builder = load_dataset_builder(ds_name, ds_sub)

In [None]:
""" Check Data cont. """
print(builder.info.description)
builder.info.features

In [None]:
""" Load Dataset """
dataset = load_dataset(ds_name, ds_sub)

# Row counts are read from the splits themselves; indexing the 'article'
# column first would materialise every article just to count them.
split_train = len(dataset['train'])
split_val = len(dataset['validation'])

# Outside a TF Cloud job only a small subset is used (never more rows than
# the split actually has).
if not tfc.remote():
  split_train = min(split_train, 1000)
  split_val = min(split_val, 1000)

# Notebook-wide settings read by later cells:
st = 30            # seconds passed to time.sleep() between heavy steps
pre_auth = False   # flipped to True once gcloud/Colab authentication has run
co_model = 7       # counter used to build the cloud JOB_NAME

## Data Preparation

### - Split Data

In [None]:
""" Split Data and add sos/eos tokens """
sos_token = '<sos>'
eos_token = '<eos>'


def _wrap_with_tokens(texts):
  """Return a NumPy string array with every text wrapped as '<sos> text <eos>'."""
  return np.array([f'{sos_token} {text} {eos_token}' for text in texts])


# Slice the rows first and pick the columns afterwards, so only the first
# split_train / split_val rows are pulled out of the dataset instead of the
# entire column.
train_rows = dataset['train'][:split_train]
val_rows = dataset['validation'][:split_val]

# NOTE(review): the time.sleep(st) pauses are kept from the original cell;
# their purpose is not evident from this file.
x_train = _wrap_with_tokens(train_rows['article'])
time.sleep(st)
y_train = _wrap_with_tokens(train_rows['highlights'])
time.sleep(st)

x_val = _wrap_with_tokens(val_rows['article'])
y_val = _wrap_with_tokens(val_rows['highlights'])

# The validation rows are appended to the training arrays; model.fit later
# carves its own validation set out via validation_split.
time.sleep(st)
x_train = np.concatenate([x_train, x_val], axis=0)
time.sleep(st)
y_train = np.concatenate([y_train, y_val], axis=0)

del x_val, y_val, train_rows, val_rows

x_test = _wrap_with_tokens(dataset['test']['article'])
y_test = _wrap_with_tokens(dataset['test']['highlights'])

print(f'x_train shape: {x_train.shape}, y_train shape: {y_train.shape}')

In [None]:
""" Data Samples """
df = pd.DataFrame({'Article':x_train[:5], 'Summary':y_train[:5]})
df.head()

In [None]:
del dataset, df

### - Clean Data

In [None]:
### optimize preprocessing ###
"""
tokenize data first
split data

get all stop word indexes from tokenizer.word_index
remove stop words??

pad data as normal
"""

In [None]:
# English stop words from NLTK plus two news-agency tokens that occur in the
# articles.
stop_words = stopwords.words('english') + ['cnn', 'reuters']

In [None]:
""" Lowercase all words """
# Element-wise lowercase of every string array (train and test alike).
x_train = np.char.lower(x_train)
y_train = np.char.lower(y_train)
x_test = np.char.lower(x_test)
y_test = np.char.lower(y_test)

In [None]:
""" Remove stop words """
# One alternation of all stop words, anchored on word boundaries.
# re.escape keeps every entry literal even if the list ever gains a word
# containing a regex metacharacter.
pattern = re.compile(r"\b(" + "|".join(map(re.escape, stop_words)) + r")\b")

# Applies the substitution to every string of an array.
vec_pattern = np.vectorize(lambda text: pattern.sub('', text))
time.sleep(st)
x_train = vec_pattern(x_train)
time.sleep(st)
y_train = vec_pattern(y_train)

# The test split is lowercased and tokenised exactly like the training data,
# so it has to go through the same stop-word removal to stay comparable.
x_test = vec_pattern(x_test)
y_test = vec_pattern(y_test)

del pattern, stop_words, vec_pattern

### - Transform Data

In [None]:
""" Tokenize data """
# '<' and '>' are not in the filter set, so the '<sos>' / '<eos>' markers are
# kept as tokens.
tokenizer = Tokenizer(filters='"#$%&(!?.)\'*+,-/:;=@[\\]^_`{|}~\t\n')  # , oov_token="<unk>")

# The vocabulary is fitted on the training articles and summaries only.
tokenizer.fit_on_texts(x_train)
tokenizer.fit_on_texts(y_train)

# Replace every text array by its list of token-id sequences.
x_train, y_train, x_test, y_test = [
  tokenizer.texts_to_sequences(texts)
  for texts in (x_train, y_train, x_test, y_test)
]

In [None]:
""" Pad tokenized data """
# Right-pad every split to the length of its own longest sequence.
x_train, y_train, x_test, y_test = [
  pad_sequences(seqs, padding='post')
  for seqs in (x_train, y_train, x_test, y_test)
]

In [None]:
""" After Processing Data Samples """
df = pd.DataFrame(x_train[:3])
df.head()

In [None]:
del df

## Model


### - Develop Model

In [None]:
""" Load Pre-Trained Word Embeddings """
# The gensim import in the imports cell is commented out, which leaves `api`
# undefined here; import it locally so this cell runs on its own.
# NOTE(review): gensim is not in the pip-install cell -- confirm it is
# available in the runtime.
import gensim.downloader as api

w2v_model = api.load('word2vec-google-news-300')



In [None]:
""" Create Embedding Matrix """
emb_dim = 300
# One extra row: id 0 is not assigned to any word here (the Embedding layers
# are built with mask_zero=True), so that row stays all zeros.
vocab_dim = 1 + len(tokenizer.word_index)
(x_row, x_col), (y_row, y_col) = x_train.shape, y_train.shape

emb_matrix = np.zeros(shape=(vocab_dim, emb_dim))

# Copy the pre-trained vector for every vocabulary word the word2vec model
# knows; unknown words keep their zero row.
for word, idx in tokenizer.word_index.items():
  if word not in w2v_model:
    continue
  emb_matrix[idx] = w2v_model[word]

del w2v_model

In [None]:
""" Develop Model """

"""
################ CREATING INITIAL STATES OF LSTM ##################
# Define the input layer

input_shape = (10, 20)  # 10 timesteps, 20 features
lstm_units = 128

inputs = Input(shape=input_shape)

# Define the LSTM layer with initial states
lstm_layer = LSTM(units=lstm_units, return_sequences=False, return_state=False)

# Specify the initial states
initial_cell_state = np.zeros((1, lstm_units))
initial_hidden_state = np.zeros((1, lstm_units))
initial_state = [initial_cell_state, initial_hidden_state]

# Call the LSTM layer with initial states
outputs, final_cell_state, final_hidden_state = lstm_layer(inputs, initial_state=initial_state)
#####################################
  Beam Search
  decoder_beam = BeamSearchDecoder(cell=decoder_att_out.cell,
                                   beam_width=beam_width,
                                   embedding_fn=decoder_emb_layer,
                                   output_layer=decoder_dense) #decoder_lstm.cell

  decoder_beam = beam_search(token_probability_fn,
                             prompt=
                             max_length=
                             num_beams=
                             )
##################
hierarchical attention mechanism
reinforcement learning training
multi-task learning framework
##################
"""

In [None]:
def get_model(lat_0=128, lat_1=128,
                  dr_0=0.35, dr_1=0.35, dr_2=0.35,
                  att_0=2, att_1=2,
                  beam_width=4,
                  vocab_len=vocab_dim, emb_len=emb_dim):
  """Build the encoder-decoder summarization model (single attention stage).

  Reads the module-level `emb_matrix` for the frozen embedding weights.

  Args:
    lat_0: units per direction of the first encoder BiLSTM.
    lat_1: units per direction of the second encoder BiLSTM. The decoder
      LSTM uses lat_1*2 units so it can be initialised with the concatenated
      forward/backward encoder states.
    dr_0: dropout of the first encoder BiLSTM.
    dr_1: dropout of the second encoder BiLSTM.
    dr_2: dropout of the decoder LSTM.
    att_0: number of heads of the encoder self-attention.
    att_1: number of heads of the decoder-to-encoder attention.
    beam_width: not used by this function; kept so existing calls still work.
    vocab_len: number of rows of the embedding table and size of the softmax
      output; has to match emb_matrix.shape[0].
    emb_len: embedding width (has to match emb_matrix.shape[1]); also used as
      key_dim of the attention layers.

  Returns:
    An uncompiled Model mapping [encoder token ids, decoder token ids] to a
    softmax over the vocabulary for every decoder step.
  """

  # Encoder
  encoder_input     = Input(shape=(None,), name='Input_0')

  # Embedding requires input_dim/output_dim; the pre-trained matrix is loaded
  # as fixed weights.
  encoder_emb_layer = Embedding(vocab_len, emb_len, weights=[emb_matrix], mask_zero=True, trainable=False, name='Embedding_0')
  encoder_emb_input = encoder_emb_layer(encoder_input)

  encoder_lstm_0    = Bidirectional(LSTM(units=lat_0, dropout=dr_0, return_sequences=True), name='Bidirectional_LSTM_0')
  encoder_lstm_out  = encoder_lstm_0(encoder_emb_input)

  encoder_lay_norm  = LayerNormalization(name='Layer_Norm_0')
  encoder_lstm_norm = encoder_lay_norm(encoder_lstm_out)

  # Self-attention: the same tensor is passed as query and value
  # (MultiHeadAttention has no `query_key_shared` argument and needs a value).
  encoder_self_att  = MultiHeadAttention(num_heads=att_0, key_dim=emb_len, name='Attention_0')
  encoder_att_out   = encoder_self_att(encoder_lstm_norm, encoder_lstm_norm)

  encoder_lstm_1    = Bidirectional(LSTM(units=lat_1, dropout=dr_1, return_sequences=True, return_state=True), name='Bidirectional_LSTM_1')
  encoder_out, forward_h, forward_c, backward_h, backward_c = encoder_lstm_1(encoder_att_out)

  # Final states of both directions are concatenated to seed the decoder.
  forward_h_concat  = Concatenate(name='Concat_0')([forward_h, backward_h])
  forward_c_concat  = Concatenate(name='Concat_1')([forward_c, backward_c])
  encoder_states    = [forward_h_concat, forward_c_concat]


  # Decoder
  decoder_input     = Input(shape=(None,), name='Input_1')

  decoder_emb_layer = Embedding(vocab_len, emb_len, weights=[emb_matrix], mask_zero=True, trainable=False, name='Embedding_1')
  decoder_emb_out   = decoder_emb_layer(decoder_input)

  decoder_lstm      = LSTM(units=lat_1*2, dropout=dr_2, return_sequences=True, name='LSTM_0')
  decoder_lstm_out  = decoder_lstm(decoder_emb_out, initial_state=encoder_states)

  decoder_lay_norm  = LayerNormalization(name='Layer_Norm_1')
  decoder_lstm_norm = decoder_lay_norm(decoder_lstm_out)

  # Decoder steps attend over the encoder outputs; `value` is mandatory and
  # doubles as the key when no key is given.
  decoder_multi_att = MultiHeadAttention(num_heads=att_1, key_dim=emb_len, name='Attention_1')
  decoder_att_out   = decoder_multi_att(query=decoder_lstm_norm, value=encoder_out)

  decoder_dense     = Dense(vocab_len, activation='softmax', name='Dense_0')
  decoder_dense_out = decoder_dense(decoder_att_out)

  # Create model: the inputs are the two Input tensors themselves.
  model = Model([encoder_input, decoder_input], decoder_dense_out, name='Text_Summarization_Model')

  return model

In [None]:

def get_model(lat_0=128, lat_1=128,
                  dr_0=0.35, dr_1=0.35, dr_2=0.35,
                  att_0=2, att_1=2,
                  beam_width=4,
                  vocab_len=vocab_dim, emb_len=emb_dim):
  """Build the encoder-decoder summarization model (two attention stages).

  Reads the module-level `emb_matrix` for the frozen embedding weights.

  Args:
    lat_0: units per direction of the first encoder BiLSTM.
    lat_1: units per direction of the second encoder BiLSTM. Both decoder
      LSTMs use lat_1*2 units; the first one is initialised with the
      concatenated forward/backward encoder states.
    dr_0: dropout of the first encoder BiLSTM.
    dr_1: dropout of the second encoder BiLSTM.
    dr_2: dropout of both decoder LSTMs.
    att_0: number of heads of the two encoder self-attention layers.
    att_1: number of heads of the two decoder-to-encoder attention layers.
    beam_width: not used by this function; kept so existing calls still work.
    vocab_len: number of rows of the embedding table and size of the softmax
      output; has to match emb_matrix.shape[0].
    emb_len: embedding width (has to match emb_matrix.shape[1]); also used as
      key_dim of every attention layer.

  Returns:
    An uncompiled Model mapping [encoder token ids, decoder token ids] to a
    softmax over the vocabulary for every decoder step.
  """

  # Encoder
  encoder_input     = Input(shape=(None,), name='Input_0')

  # Embedding requires input_dim/output_dim; the pre-trained matrix is loaded
  # as fixed weights.
  encoder_emb_layer = Embedding(vocab_len, emb_len, weights=[emb_matrix], mask_zero=True, trainable=False, name='Embedding_0')
  encoder_emb_input = encoder_emb_layer(encoder_input)

  encoder_lstm_0    = Bidirectional(LSTM(units=lat_0, dropout=dr_0, return_sequences=True), name='Bidirectional_LSTM_0')
  encoder_lstm_out  = encoder_lstm_0(encoder_emb_input)

  encoder_lay_norm  = LayerNormalization(name='Layer_Norm_0')
  encoder_lstm_norm = encoder_lay_norm(encoder_lstm_out)

  # First self-attention stage (query and value are the same tensor).
  encoder_att       = MultiHeadAttention(num_heads=att_0, key_dim=emb_len, name='Attention_0')
  encoder_att_out   = encoder_att(encoder_lstm_norm, encoder_lstm_norm)

  encoder_lstm_1    = Bidirectional(LSTM(units=lat_1, dropout=dr_1, return_sequences=True, return_state=True), name='Bidirectional_LSTM_1')
  encoder_out, forward_h, forward_c, backward_h, backward_c = encoder_lstm_1(encoder_att_out)

  encoder_lay_norm_1  = LayerNormalization(name='Layer_Norm_1')
  encoder_lstm_norm_1 = encoder_lay_norm_1(encoder_out)

  # Second self-attention stage; its output is what the decoder attends over.
  encoder_att_1     = MultiHeadAttention(num_heads=att_0, key_dim=emb_len, name='Attention_1')
  encoder_att_out_1 = encoder_att_1(encoder_lstm_norm_1, encoder_lstm_norm_1)

  # Final states of both directions are concatenated to seed the decoder.
  forward_h_concat  = Concatenate(name='Concat_0')([forward_h, backward_h])
  forward_c_concat  = Concatenate(name='Concat_1')([forward_c, backward_c])
  encoder_states    = [forward_h_concat, forward_c_concat]


  # Decoder
  decoder_input = Input(shape=(None,), name='Input_1')

  decoder_emb_layer = Embedding(vocab_len, emb_len, weights=[emb_matrix], mask_zero=True, trainable=False, name='Embedding_1')
  decoder_emb_out = decoder_emb_layer(decoder_input)

  decoder_lstm = LSTM(units=lat_1*2, dropout=dr_2, return_sequences=True, name='LSTM_0')
  decoder_lstm_out = decoder_lstm(decoder_emb_out, initial_state=encoder_states)

  decoder_lay_norm = LayerNormalization(name='Layer_Norm_2')
  decoder_lstm_norm = decoder_lay_norm(decoder_lstm_out)

  # First decoder-to-encoder attention.
  decoder_multi_att = MultiHeadAttention(num_heads=att_1, key_dim=emb_len, name='Attention_2')
  decoder_att_out = decoder_multi_att(query=decoder_lstm_norm, value=encoder_att_out_1)

  decoder_lstm_1 = LSTM(units=lat_1*2, dropout=dr_2, return_sequences=True, name='LSTM_1')
  decoder_lstm_out_1 = decoder_lstm_1(decoder_att_out)

  decoder_lay_norm_1 = LayerNormalization(name='Layer_Norm_3')
  decoder_lstm_norm_1 = decoder_lay_norm_1(decoder_lstm_out_1)

  # Second decoder-to-encoder attention over the same encoder representation.
  decoder_multi_att_1 = MultiHeadAttention(num_heads=att_1, key_dim=emb_len, name='Attention_3')
  decoder_att_out_1 = decoder_multi_att_1(query=decoder_lstm_norm_1, value=encoder_att_out_1)

  decoder_dense = Dense(vocab_len, activation='softmax', name='Dense_0')
  decoder_dense_out = decoder_dense(decoder_att_out_1)

  # Create model
  model = Model([encoder_input, decoder_input], decoder_dense_out, name='Text_Summarization_Model')

  return model

### - Develop Metrics/Callbacks

In [None]:
""" Create rouge score metric """
class RougeMetric(Metric):
  """Streaming ROUGE-L F-measure between reference and predicted token ids.

  Token ids are turned back into text with the module-level `tokenizer` and
  scored with rouge_score's RougeScorer. Depending on `method`, the metric
  reports the running minimum, maximum or average of the per-sample
  F-measure, rounded to four decimals.

  Args:
    method: 'avg', 'min' or 'max'.

  Raises:
    ValueError: if `method` is not one of the three supported values.
  """

  def __init__(self, method='avg'):
    super().__init__(name='f1_rs')

    if method not in {'avg', 'min', 'max'}:
      raise ValueError("Invalid score method, expected 'min', 'avg' or 'max' (str)")
    self.method = method
    self.rouge_scoring = rs.RougeScorer(['rougeL'])

    # A running minimum has to start at the top of the score range.
    initial_score = 1.0 if self.method == 'min' else 0.0
    self.f1_score = tf.Variable(initial_score, dtype=tf.float32, trainable=False)
    # Number of samples seen; only used by the 'avg' method.
    self.co = tf.Variable(0.0, dtype=tf.float32, trainable=False)

  def sequences_to_texts(self, sequence):
    """Convert one id sequence (eager tensor) into a single text string."""
    ids = sequence.numpy().reshape(1, -1)
    # sequences_to_texts returns a one-element list for the single row; hand
    # back the string itself so the scorer later receives plain text.
    return tokenizer.sequences_to_texts(ids)[0]

  def tf_sequences_to_texts(self, sequence):
    """Graph-compatible wrapper of sequences_to_texts; returns a scalar string."""
    text = tf.py_function(self.sequences_to_texts, [sequence], tf.string)
    # py_function loses the static shape; pin it to a scalar.
    return tf.ensure_shape(text, [])

  def get_f1(self, ref, hyp):
    """Return the ROUGE-L F-measure of two scalar string tensors."""
    # .numpy() on a string tensor yields bytes; decode before scoring.
    ref_text = ref.numpy().decode('utf-8')
    hyp_text = hyp.numpy().decode('utf-8')
    score = self.rouge_scoring.score(ref_text, hyp_text)
    return score['rougeL'].fmeasure

  def get_rouge(self, vals):
    """Graph-compatible wrapper of get_f1 for a (reference, hypothesis) pair."""
    f1 = tf.py_function(self.get_f1, [vals[0], vals[1]], tf.float32)
    return tf.ensure_shape(f1, [])

  def update_state(self, y_true, y_preds, sample_weight=None):
    """Accumulate the ROUGE-L scores of one batch.

    Args:
      y_true: reference token ids, one row per sample (0 = padding).
      y_preds: per-step vocabulary probabilities; argmax over the last axis
        gives the predicted ids.
      sample_weight: accepted for API compatibility; not used.
    """
    max_preds = tf.argmax(y_preds, axis=-1)
    # Bring the references to the same integer type/shape as the predictions.
    y_true = tf.reshape(tf.cast(y_true, max_preds.dtype), tf.shape(max_preds))

    # Positions that are padding in the reference are zeroed in the
    # prediction too, so tokens emitted past the reference end are not
    # scored (id 0 maps to no word when converting back to text).
    max_preds = tf.where(tf.not_equal(y_true, 0), max_preds, tf.zeros_like(max_preds))

    text_preds = tf.map_fn(self.tf_sequences_to_texts, max_preds, fn_output_signature=tf.string)
    text_true = tf.map_fn(self.tf_sequences_to_texts, y_true, fn_output_signature=tf.string)

    scores_f1 = tf.map_fn(self.get_rouge, (text_true, text_preds), fn_output_signature=tf.float32)

    if self.method == 'min':
      self.f1_score.assign(tf.minimum(self.f1_score, tf.reduce_min(scores_f1)))
    elif self.method == 'avg':
      self.f1_score.assign_add(tf.reduce_sum(scores_f1))
      self.co.assign_add(tf.cast(tf.shape(y_true)[0], dtype=tf.float32))
    else:
      # 'max' -- the constructor rejects every other value.
      self.f1_score.assign(tf.maximum(self.f1_score, tf.reduce_max(scores_f1)))

  def result(self):
    """Return the current score rounded to four decimals."""
    if self.method == 'avg':
      # divide_no_nan yields 0 instead of NaN before the first batch.
      value = tf.math.divide_no_nan(self.f1_score, self.co)
    else:
      value = self.f1_score
    return tf.round(value * 10_000) / 10_000

  def reset_state(self):
    """Reset the accumulators to their initial values."""
    self.f1_score.assign(1.0 if self.method == 'min' else 0.0)
    self.co.assign(0.0)


In [None]:
""" Create Callbacks """
def get_callbacks(tb_path, cp_path,
                  rlr_factor=0.1, rlr_patience=3,
                  es_patience=6):
  """Create the training callbacks.

  Args:
    tb_path: log directory handed to TensorBoard.
    cp_path: file path handed to ModelCheckpoint.
    rlr_factor: factor of the ReduceLROnPlateau callback.
    rlr_patience: patience of the ReduceLROnPlateau callback.
    es_patience: patience of the EarlyStopping callback.

  Returns:
    Tuple (EarlyStopping, ModelCheckpoint, TensorBoard).
  """
  # Alternatives noted by the author: val_loss, val_f1_rs
  stopper = EarlyStopping(patience=es_patience, monitor='loss')

  # Constructed here but not part of the returned tuple.
  lr_on_plateau = ReduceLROnPlateau(patience=rlr_patience, factor=rlr_factor)

  board = TensorBoard(log_dir=tb_path)

  # Keeps only the best checkpoint (by training loss), checked once per epoch.
  checkpoint = ModelCheckpoint(
      filepath=cp_path,
      monitor='loss',
      save_best_only=True,
      save_freq='epoch',
      verbose=1,
  )

  return stopper, checkpoint, board  # , lr_on_plateau

In [None]:
""" Compile Model """
def compile_model(model, lr=0.001, rm_metric='avg'):
  """Compile `model` in place with Adam, sparse cross-entropy and RougeMetric.

  Args:
    model: the Keras model to compile.
    lr: value passed to the Adam optimizer.
    rm_metric: aggregation method passed to RougeMetric.
  """
  optimizer = Adam(lr)
  rouge = RougeMetric(rm_metric)

  model.compile(
      optimizer=optimizer,
      loss='sparse_categorical_crossentropy',
      metrics=[rouge],
  )

## Train/Test

### - Set Up Cloud

In [None]:
""" Create paths """
# Project, bucket and region used for the cloud job and for storage.
GCP_PROJECT_ID = 'model-training-383203'
GCS_BUCKET  = 'model_sum'
REGION = 'us-central1'
# Job name carries the current value of the co_model counter.
JOB_NAME = f'model_{co_model}'
# Local (Colab) paths of the credentials file and the requirements file.
AUTH_JSON = '/content/model-training-383203-38e4420de909.json'
REQUIRE = '/content/model-require.txt'
# Incremented after use, so re-running this cell produces a new JOB_NAME.
co_model += 1

In [None]:
""" Define storage paths """
GCS_BASE_PATH = f'gs://{GCS_BUCKET}/{JOB_NAME}'
# gs:// URIs always use '/' as separator, so they are built with plain string
# formatting; os.path.join would insert the host OS separator instead.
TENSORBOARD_LOGS = f'{GCS_BASE_PATH}/logs'
MODEL_CP = f'{GCS_BASE_PATH}/checkpoints'
SAVED_MODEL_DIR = f'{GCS_BASE_PATH}/saved_model'
TOKENIZE_DIR = f'{GCS_BASE_PATH}/tokenizer'

In [None]:
""" Authorize user and Set storage paths """
# Interactive authentication only happens when this runs outside a TF Cloud
# job and the google.colab module is loaded.
if not tfc.remote() and ("google.colab" in sys.modules):
  # pre_auth guards against repeating the login when the cell is re-run.
  if not pre_auth:
    !gcloud auth login
    # NOTE(review): the project is set by a numeric value here, while
    # GCP_PROJECT_ID is a string id -- presumably the same project; confirm.
    !gcloud config set project 136963608278
    auth.authenticate_user()
    pre_auth = True

  # NOTE(review): pre_auth is always True at this point (it is either already
  # True or was just set above), so this branch always runs.
  if pre_auth:
    # Export project/bucket/credentials/region as environment variables.
    os.environ["GOOGLE_CLOUD_PROJECT"] = GCP_PROJECT_ID
    os.environ["GCS_BUCKET"] = GCS_BUCKET
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = AUTH_JSON
    os.environ['REGION'] = REGION

Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=9OWwLsMBg76F9UftSml4wPbBizUR1G&prompt=consent&access_type=offline&code_challenge=oyZZhbfIg5lkXMrQHtte249uMuO3hHRWRL6uQOPsgVo&code_challenge_method=S256

Enter authorization code: [REDACTED]

You are now logged in as [jetheat2002@gmail.com].
Your current project is [None].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID
Upd

### - Train Model

In [None]:
""" Get model """
model = get_model()
compile_model(model, rm_metric='avg')
callbacks = get_callbacks(TENSORBOARD_LOGS, MODEL_CP, es_patience=10)

In [None]:
plot_model(model, to_file='model_arch.png', show_shapes=True, show_layer_names=True)

In [None]:
""" Define hyper-parameters"""
# (validation fraction, batch size, epochs): the first triple is used inside
# a TF Cloud job, the second for local runs.
val_split, num_batch, num_epoch = (0.20, 32, 1024) if tfc.remote() else (0.15, 8, 250)

In [None]:
""" Train model """
# Inputs are [articles, summaries without their last column]; the labels are
# the summaries without their first column, i.e. the decoder input shifted
# one step to the left.
history = model.fit([x_train, y_train[:,:-1]], y_train[:,1:],
                     validation_split=val_split,
                     batch_size=num_batch,
                     epochs=num_epoch,
                     callbacks=callbacks,
                     verbose=2)

In [None]:
""" Save extra model data """
model.save(SAVED_MODEL_DIR)

# Write the fitted tokenizer to a local JSON file first.
token_json = tokenizer.to_json()

with open('tokenizer.json', 'w', encoding='utf-8') as f:
  f.write(token_json)

storage_client = storage.Client()
bucket = storage_client.bucket(GCS_BUCKET)

# bucket.blob() expects an object name relative to the bucket, whereas
# TOKENIZE_DIR is a full 'gs://<bucket>/...' URI; drop the bucket prefix so
# the file is stored under the job folder instead of under a literal
# 'gs://...' object name.
bucket_prefix = f'gs://{GCS_BUCKET}/'
if TOKENIZE_DIR.startswith(bucket_prefix):
  blob_name = TOKENIZE_DIR[len(bucket_prefix):]
else:
  blob_name = TOKENIZE_DIR

blob = bucket.blob(blob_name)
blob.upload_from_filename('tokenizer.json')

In [None]:
# docker = DockerConfig(image_build_bucket=GCS_BUCKET)
# # entry_point = ...
# tfc.run(
#         requirements_txt=REQUIRE,
#         distribution_strategy="auto",
#         docker_config=docker
# )

In [None]:
# """ Uploading """
# storage_client = storage.Client()
# bucket = storage_client.bucket(GCS_BUCKET)
# blob = bucket.blob(TOKENIZE_DIR)
# token_json = tokenizer.to_json()

# with open('tokenizer.json', 'w') as f:
#   f.write(token_json)

# blob.upload_from_filename('tokenizer.json')

In [None]:
# """ Downloading """
# storage_client = storage.Client()
# bucket = storage_client.bucket(GCS_BUCKET)
# blob = bucket.blob(TOKENIZE_DIR)
# blob.download_to_filename('tokenizer.json')

# with open('tokenizer.json', 'r') as f:
#   token_json = f.read()
#   test = tokenizer_from_json(token_json)