In [1]:
import numpy as np
import tensorflow as tf
import pickle
import json
from preprocess_v2 import *
from nltk.translate.bleu_score import sentence_bleu


In [2]:
np.random.seed(2470)
train_content, train_title, test_content, test_title = train_test_split()
(content_vocab, content_word_index, content_index_word, 
 title_vocab, title_word_index, title_index_word) = vectorize_data(train_content, train_title)

train_content_vec = CONTENT_VECTORIZER(train_content)
train_title_vec = TITLE_VECTORIZER(train_title)
test_content_vec = CONTENT_VECTORIZER(test_content)
test_title_vec = TITLE_VECTORIZER(test_title)

print(train_content_vec.shape, train_title_vec.shape, test_content_vec.shape, test_title_vec.shape)

glove_index = build_glove_embed_index()
title_embedding_init, title_vocab_size = build_embedding_init(title_word_index, glove_index)
content_embedding_init, content_vocab_size = build_embedding_init(content_word_index, glove_index)


2023-05-10 15:04:25.819332: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


(25184, 256) (25184, 16) (1325, 256) (1325, 16)
Unique words in glove: 400003
Hits: 14315; Misses: 685
Hits: 68712; Misses: 21651


In [3]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_size, window_size, initializer, trainable=False):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.window_size = window_size
        self.initializer = initializer
        self.trainable = trainable
        
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size, mask_zero=True,
                                                   embeddings_initializer=self.initializer,
                                                   trainable=self.trainable)    
        self.positional_encoding = positional_encoding(window_size, embedding_size)
    
    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)
        
    def call(self, x, add_positional_embedding=True):
        length = tf.shape(x)[1]
        if add_positional_embedding:
            return self.embedding(x)+positional_encoding(length, self.embedding_size)
        else:
            return self.embedding(x)

class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()
        
class CrossAttention(BaseAttention):
    def call(self, x, context):
        attn_output = self.mha(query=x, key=context, value=context)
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x

class GlobalSelfAttention(BaseAttention):
    def call(self, x, attention_mask=None):
        attn_output = self.mha(query=x, value=x, key=x, attention_mask=attention_mask)
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x

class CausalSelfAttention(BaseAttention):
    def call(self, x, attention_mask=None):
        attn_output = self.mha(query=x, value=x, key=x, use_causal_mask=True, attention_mask=attention_mask)
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x
    
class FeedForward(tf.keras.layers.Layer):
    def __init__(self, embedding_size, ff_dim, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([tf.keras.layers.Dense(ff_dim, activation='relu'),
                                        tf.keras.layers.Dense(embedding_size),
                                        tf.keras.layers.Dropout(dropout_rate)])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        x = self.add([x, self.seq(x)])
        x = self.layer_norm(x) 
        return x

In [4]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, num_heads, embedding_size, ff_dim, dropout_rate=0.1):
        super().__init__()
        self.num_heads = num_heads
        self.embedding_size = embedding_size
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate
        
        self.self_attention = GlobalSelfAttention(num_heads=num_heads, key_dim=embedding_size,
                                                  dropout=dropout_rate)
        self.ffn = FeedForward(embedding_size, ff_dim)

    def call(self, x, attention_mask=None):
        x = self.self_attention(x, attention_mask=attention_mask)
        x = self.ffn(x)
        return x

class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, num_heads, ff_dim, vocab_size, embedding_size, 
                 window_size, embedding_initializer, embedding_trainability=False,  dropout_rate=0.1):
        super().__init__()

        self.num_layers = num_layers
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.window_size = window_size
        self.embedding_initializer = embedding_initializer
        self.embedding_trainability = embedding_trainability
        
        self.pos_embedding = PositionalEmbedding(self.vocab_size, self.embedding_size, self.window_size,
                                                 self.embedding_initializer, self.embedding_trainability)
        
        self.enc_layers = [EncoderLayer(num_heads, embedding_size, ff_dim, dropout_rate) for i in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x):
        # x is tokenized numerical values
        mask = self.pos_embedding.compute_mask(x)
        mask = mask[:,tf.newaxis,:]
        x = self.pos_embedding(x)
        
        # Add dropout.
        x = self.dropout(x)
        
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, attention_mask=mask)
        return x


In [5]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, num_heads, embedding_size, ff_dim, dropout_rate=0.1):
        super(DecoderLayer, self).__init__()

        self.causal_self_attention = CausalSelfAttention(num_heads=num_heads, key_dim=embedding_size, 
                                                         dropout=dropout_rate)
        self.cross_attention = CrossAttention(num_heads=num_heads, key_dim=embedding_size, 
                                              dropout=dropout_rate)
        self.ffn = FeedForward(embedding_size, ff_dim)

    def call(self, x, context, attention_mask=None):
        x = self.causal_self_attention(x=x, attention_mask=attention_mask)
        x = self.cross_attention(x=x, context=context)
        x = self.ffn(x)
        return x
    
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, num_heads, ff_dim, vocab_size, embedding_size, window_size,
                 embedding_initializer, embedding_trainability=False, dropout_rate=0.1):
        
        super(Decoder, self).__init__()
        
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.window_size = window_size
        
        self.embedding_initializer = embedding_initializer
        self.embedding_trainability = embedding_trainability


        self.pos_embedding = PositionalEmbedding(self.vocab_size, self.embedding_size, self.window_size,
                                                 self.embedding_initializer, self.embedding_trainability)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        
        
        self.dec_layers = [DecoderLayer(num_heads, embedding_size, ff_dim,  dropout_rate=dropout_rate) 
                           for i in range(num_layers)]
        
    def call(self, x, context):
        # `x` is token-IDs shape (batch, target_seq_len)
        mask = self.pos_embedding.compute_mask(x)
        mask = mask[:,tf.newaxis,:]
        x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)
        x = self.dropout(x)

        for i in range(self.num_layers):
            x  = self.dec_layers[i](x, context, attention_mask=mask)

        # The shape of x is (batch_size, target_seq_len, d_model).
        return x

In [6]:
class TransformerModel(tf.keras.Model):
    def __init__(self, num_layers, num_heads, ff_dim, embedding_size, 
                 content_vocab_size, title_vocab_size, content_window_size, title_window_size,
                 content_embedding_initializer, title_embedding_initializer,
                 content_embedding_trainability, title_embedding_trainability, 
                 dropout_rate=0.1):
        
        super().__init__()
        self.encoder = Encoder(num_layers, num_heads, ff_dim, content_vocab_size, embedding_size, 
                               content_window_size, content_embedding_initializer, content_embedding_trainability,
                               dropout_rate)
        self.decoder = Decoder(num_layers, num_heads, ff_dim, title_vocab_size, embedding_size,
                               title_window_size, title_embedding_initializer, title_embedding_trainability,
                               dropout_rate)
        
        self.dense_layer = tf.keras.layers.Dense(title_vocab_size)
    
    def call(self, inputs):
        content, title = inputs        
        context = self.encoder(content)
        output = self.decoder(title, context)
        logits = self.dense_layer(output)
        return logits

In [7]:
num_layers = 2
num_heads = 8
ff_dim = 256
embedding_size = GLOVE_EMBED_SZ
content_window_size = CONTENT_SEQ_LEN
title_window_size = TITLE_SEQ_LEN
content_embedding_initializer = tf.keras.initializers.Constant(content_embedding_init)
title_embedding_initializer = tf.keras.initializers.Constant(title_embedding_init)
content_embedding_trainability = True
title_embedding_trainability = True
dropout_rate = 0.1

train_title_labels = train_title_vec[:,:,tf.newaxis]
test_title_labels = test_title_vec[:,:,tf.newaxis]

model = TransformerModel(num_layers, num_heads, ff_dim, embedding_size, content_vocab_size, title_vocab_size,
                         content_window_size, title_window_size, content_embedding_initializer, title_embedding_initializer,
                         content_embedding_trainability, title_embedding_trainability, dropout_rate)



In [8]:
model_name = 'modelv2-2blocks-8heads-256ffdim-trainableemb'

def masked_loss(label, pred):
    mask = label != 0
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = tf.expand_dims(loss_object(label, pred),axis=2)
    
    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask
    
    loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
    return loss


def masked_accuracy(label, pred):
    pred = tf.expand_dims(tf.argmax(pred, axis=2), axis=2)
    label = tf.cast(label, pred.dtype)
    match = label == pred

    mask = label != 0

    match = match & mask
    match = tf.cast(match, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(match)/tf.reduce_sum(mask)

model.compile(optimizer='Adam', loss=masked_loss, metrics=[masked_accuracy])

model.fit(x=(train_content_vec, train_title_vec[:,:-1]), y=train_title_labels[:,1:], 
          batch_size=200, epochs=30)

model_weights_path = f'../models/weights/{model_name}'

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [9]:
import os

def save_model_weights(filepath):
    if os.path.isfile(filepath):
        confirmation = input('File exists; hit y to override: ')
        if confirmation.lower()=='y':
            model.save_weights(filepath)
        else:
            print('Not saving; try saving with different filename')
    else:
        model.save_weights(filepath)

save_model_weights(model_weights_path)

# LOAD WEIGHTS USING:
# model.load_weights('../models/weights/modelv2-2blocks-5heads-256ffdim-trainableemb')



In [10]:
def sentence_from_ind(indexes, index_word_dict=title_index_word):
    sentence = ""
    
    for index in indexes:
   
        sentence += index_word_dict[index]
        sentence += " "
    
    return sentence

## NOT NECESSARY ANYMORE- DISCUSS AND REMOVE
# predictions = model.predict(x=(test_content_vec[:100], test_title_vec[:100][:,:-1]))

# for i in range(0,10):
#     tokens = np.argmax(predictions[i],axis=1)
#     true = test_title_labels[i].numpy().reshape((16,))
#     count = 0
#     for num in tokens:
#         if num == 0:
#             count +1

#     # if count < 10:
#     # print(tokens)
#     print(f'Predicted Sentence {i}:',sentence_from_ind(tokens))
#     print(f'True Sentence {i}:',sentence_from_ind(true))
#     print()

In [13]:
import pandas as pd

def text_to_title(content, model=model, output_len=TITLE_SEQ_LEN):
    """Converts vectorized text to title
    Arguments:
        content - vectorized text"""
    
    start, end = tf.constant(title_word_index['<start>'], dtype=tf.int64), tf.constant(title_word_index['<end>'], dtype=tf.int64)
    start = start[tf.newaxis]
    end = end[tf.newaxis]
    
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(output_len):
        output = tf.transpose(output_array.stack())
        predictions = model([content[tf.newaxis], output], training=False)
        
        # Select the last token from the `seq_len` dimension.
        predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.
        predicted_id = tf.argmax(predictions, axis=2)

        # Concatenate the `predicted_id` to the output which is given to the
        # decoder as its input.
        output_array = output_array.write(i+1, predicted_id[0])

        if predicted_id == end:
            break
        
    output = output_array.stack().numpy().reshape(1,-1)
    predicted_title = sentence_from_ind(output[0].tolist())
    return predicted_title

true_titles = []
predicted_titles = []
BLEU_scores = []

for index in range(200):
    content_vec, true_title = test_content_vec[index], test_title[index]
    predicted_title = text_to_title(content_vec)
    true_titles.append(true_title)
    predicted_titles.append(predicted_title)
    BLEU_scores.append(sentence_bleu([true_title.split()], predicted_title.split(), weights=(1, 0, 0, 0)))
    
    if index%50==0:
        print(index)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


50
100
150


In [15]:
df = pd.DataFrame(data=[true_titles, predicted_titles, BLEU_scores]).T
df.columns = ['true_title','predicted_title','BLEU_score']
df.to_csv(f'../results/{model_name}-results.csv', index=False)

In [16]:
# model.save('../models/modelv2-2blocks-8heads-256ffdim-trainableemb')



INFO:tensorflow:Assets written to: ../models/modelv2-2blocks-8heads-256ffdim-trainableemb/assets


INFO:tensorflow:Assets written to: ../models/modelv2-2blocks-8heads-256ffdim-trainableemb/assets
  return serialization.serialize_keras_object(obj)


In [19]:
# model_reload = tf.keras.models.load_model("../models/modelv2-2blocks-8heads-256ffdim-trainableemb", custom_objects={'masked_loss': masked_loss, 'masked_accuracy': masked_accuracy}) #
# model_reload.summary()

Model: "transformer_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder (Encoder)           multiple                  9785212   
                                                                 
 decoder (Decoder)           multiple                  2894312   
                                                                 
 dense_8 (Dense)             multiple                  1515000   
                                                                 
Total params: 14,194,524
Trainable params: 14,194,524
Non-trainable params: 0
_________________________________________________________________


In [20]:
true_titles = []
predicted_titles = []
BLEU_scores = []

for index in range(10):
    content_vec, true_title = test_content_vec[index], test_title[index]
    predicted_title = text_to_title(content_vec,model=model_reload)
    true_titles.append(true_title)
    predicted_titles.append(predicted_title)
    BLEU_scores.append(sentence_bleu([true_title.split()], predicted_title.split(), weights=(1, 0, 0, 0)))
    
    if index%50==0:
        print(index)

ValueError: Exception encountered when calling layer 'transformer_model' (type TransformerModel).

Could not find matching concrete function to call loaded from the SavedModel. Got:
  Positional arguments (1 total):
    * [<tf.Tensor 'inputs:0' shape=(1, 256) dtype=int64>,
 <tf.Tensor 'inputs_1:0' shape=(1, 1) dtype=int64>]
  Keyword arguments: {'training': False}

 Expected these arguments to match one of the following 4 option(s):

Option 1:
  Positional arguments (1 total):
    * (TensorSpec(shape=(None, 256), dtype=tf.int64, name='inputs_0'),
 TensorSpec(shape=(None, 15), dtype=tf.int64, name='inputs_1'))
  Keyword arguments: {'training': False}

Option 2:
  Positional arguments (1 total):
    * (TensorSpec(shape=(None, 256), dtype=tf.int64, name='inputs_0'),
 TensorSpec(shape=(None, 15), dtype=tf.int64, name='inputs_1'))
  Keyword arguments: {'training': True}

Option 3:
  Positional arguments (1 total):
    * (TensorSpec(shape=(None, 256), dtype=tf.int64, name='input_1'),
 TensorSpec(shape=(None, 15), dtype=tf.int64, name='input_2'))
  Keyword arguments: {'training': False}

Option 4:
  Positional arguments (1 total):
    * (TensorSpec(shape=(None, 256), dtype=tf.int64, name='input_1'),
 TensorSpec(shape=(None, 15), dtype=tf.int64, name='input_2'))
  Keyword arguments: {'training': True}

Call arguments received by layer 'transformer_model' (type TransformerModel):
  • args=(['tf.Tensor(shape=(1, 256), dtype=int64)', 'tf.Tensor(shape=(1, 1), dtype=int64)'],)
  • kwargs=<class 'inspect._empty'>