In [None]:
# import necessary packages 
import tensorflow as tf
import pickle
import numpy as np

# Load every pre-computed artifact from disk.
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# article content embeddings (train, then test)
train_content_emb = _load_pickle('train_content_embeddings.pkl')
test_content_emb = _load_pickle('test_content_embeddings.pkl')

# title embeddings (train, then test)
train_title_emb = _load_pickle('train_title_embeddings.pkl')
test_title_emb = _load_pickle('test_title_embeddings.pkl')

# title labels (test, then train)
test_title_tokens = _load_pickle('test_title_labels.pkl')
train_title_tokens = _load_pickle('train_title_labels.pkl')

# dictionary used by sentence_from_ind to map an index to its word
word_dict = _load_pickle('index_to_word.pkl')


In [None]:
# function to convert indexes to words using defined dict 
def sentence_from_ind(indexes):
    """Turn an iterable of indexes into a sentence using the global `word_dict`.

    Every index is looked up in `word_dict` and each word is followed by a
    single space, so a non-empty result ends with a trailing space. An empty
    iterable yields the empty string.
    """
    return "".join(word_dict[position] + " " for position in indexes)
        

In [None]:
# Build the training and testing tensors from the leading examples of each split.
N_TRAIN_EXAMPLES = 3000
N_TEST_EXAMPLES = 600

# content embeddings, title embeddings and title labels for training
train_content_data, train_title_data, train_title_labels = (
    tf.convert_to_tensor(split[:N_TRAIN_EXAMPLES])
    for split in (train_content_emb, train_title_emb, train_title_tokens)
)

# the same three tensors for testing
test_content_data, test_title_data, test_title_labels = (
    tf.convert_to_tensor(split[:N_TEST_EXAMPLES])
    for split in (test_content_emb, test_title_emb, test_title_tokens)
)

In [None]:
# Encoder block class for transformer model 
class EncoderBlock(tf.keras.layers.Layer):
    """Transformer encoder block.

    Applies multi-head self-attention followed by a two-layer feed-forward
    network; each sub-layer is wrapped in a residual addition and a layer
    normalization.

    Args:
        emb_sz: output width of the feed-forward network. It must equal the
            last dimension of the input, otherwise the residual additions in
            `call` fail.
        num_heads: number of attention heads.
        key_dim: size of each attention head for query and key.
        ff_dim: width of the hidden feed-forward layer. Defaults to 128, the
            value that was previously hard-coded.
        **kwargs: forwarded to `tf.keras.layers.Layer`.
    """

    def __init__(self, emb_sz, num_heads, key_dim, ff_dim=128, **kwargs):
        super(EncoderBlock, self).__init__(**kwargs)

        # feed-forward network: ff_dim hidden units (ReLU) -> emb_sz outputs
        self.ff_layer = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation="relu"),
            tf.keras.layers.Dense(emb_sz)
        ])

        # self attention layer
        self.self_atten = tf.keras.layers.MultiHeadAttention(num_heads, key_dim)

        # normalization layers, one per residual connection
        self.layer_norm_1 = tf.keras.layers.LayerNormalization()
        self.layer_norm_2 = tf.keras.layers.LayerNormalization()

    def call(self, embedded_articles):
        """Encode a batch of embedded articles.

        Args:
            embedded_articles: tensor assumed to be
                (batch_size, window_size, emb_sz) -- the original note mentions
                a 512 x 768 window/embedding; TODO confirm against the data.

        Returns:
            Tensor with the same shape as `embedded_articles`.
        """
        # self attention: the articles are used as both query and value, and
        # the layer projects its output back to the query's feature size
        z_matrix = self.self_atten(embedded_articles, embedded_articles)

        # first Add & Normalize: residual connection around the attention
        residuals = embedded_articles + z_matrix
        normalized_resid = self.layer_norm_1(residuals)

        # feed forward the normalized output
        ff_output = self.ff_layer(normalized_resid)

        # second Add & Normalize: residual connection around the feed-forward
        normalized_resid2 = normalized_resid + ff_output
        encoder_output = self.layer_norm_2(normalized_resid2)

        return encoder_output

In [None]:
# decoder block for the transformer model 
class DecoderBlock(tf.keras.layers.Layer):
    """Transformer decoder block.

    Applies causal multi-head self-attention over the decoder input,
    cross-attention over the encoder output, and a two-layer feed-forward
    network; each sub-layer is wrapped in a residual addition and a layer
    normalization.

    Args:
        emb_sz: output width of the feed-forward network. It must equal the
            last dimension of the decoder input, otherwise the residual
            additions in `call` fail.
        num_heads: number of attention heads (used by both attention layers).
        key_dim: size of each attention head for query and key.
        ff_dim: width of the hidden feed-forward layer. Defaults to 128, the
            value that was previously hard-coded.
        **kwargs: forwarded to `tf.keras.layers.Layer`.
    """

    def __init__(self, emb_sz, num_heads, key_dim, ff_dim=128, **kwargs):
        super(DecoderBlock, self).__init__(**kwargs)

        # feed-forward network: ff_dim hidden units (ReLU) -> emb_sz outputs
        self.ff_layer = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation="relu"),
            tf.keras.layers.Dense(emb_sz)
        ])

        # self attention layer
        self.self_atten = tf.keras.layers.MultiHeadAttention(num_heads, key_dim)

        # cross attention layer
        self.cross_atten = tf.keras.layers.MultiHeadAttention(num_heads, key_dim)

        # normalization layers, one per residual connection
        self.layer_norm_1 = tf.keras.layers.LayerNormalization()
        self.layer_norm_2 = tf.keras.layers.LayerNormalization()
        self.layer_norm_3 = tf.keras.layers.LayerNormalization()

    def call(self, encoder_output, decoder_input):
        """Decode one step of the stack.

        Args:
            encoder_output: encoder context, assumed to be
                (batch_size, window_size, emb_sz) -- TODO confirm.
            decoder_input: embedded titles (or the previous decoder block's
                output), assumed to be (batch_size, title_len, emb_sz).

        Returns:
            Tensor with the same shape as `decoder_input`.
        """
        # Self attention on the decoder inputs (titles). The causal mask stops
        # position i from attending to later positions; without it the block
        # sees the very tokens it is trained to predict when the inputs are
        # the teacher-forced, shifted titles.
        # (`use_causal_mask` requires TensorFlow >= 2.10.)
        z_matrix = self.self_atten(
            decoder_input, decoder_input, use_causal_mask=True
        )

        # add and normalize the residuals from the self attention mechanism
        residuals = decoder_input + z_matrix
        normalized_resid = self.layer_norm_1(residuals)

        # cross attention: normalized self-attention output is the query,
        # the encoder output supplies the values (and keys)
        cross_atten_matrix = self.cross_atten(normalized_resid, encoder_output)

        # add and normalize around the cross attention
        residual_2 = normalized_resid + cross_atten_matrix
        normalized_resid2 = self.layer_norm_2(residual_2)

        # feed forward the normalized output
        ff_output = self.ff_layer(normalized_resid2)

        # add and normalize around the feed-forward network
        residual_3 = ff_output + normalized_resid2
        decoder_output = self.layer_norm_3(residual_3)

        return decoder_output

In [None]:
# model with two stacked encoder blocks and two stacked decoder blocks
class TransformerModel(tf.keras.Model):
    """Encoder-decoder transformer with two encoder and two decoder blocks.

    Args:
        emb_sz: embedding size handed to every encoder/decoder block.
        num_heads: number of attention heads in every block.
        key_dim: size of each attention head for query and key.
        vocab_size: number of units in the final softmax layer.
        **kwargs: forwarded to `tf.keras.Model` (e.g. `name`).
    """

    def __init__(self, emb_sz, num_heads, key_dim, vocab_size, **kwargs):
        # forward kwargs so options such as `name=` are not silently dropped
        super().__init__(**kwargs)

        # create encoder and decoder blocks from classes
        self.encoder_block1 = EncoderBlock(emb_sz, num_heads, key_dim)
        self.encoder_block2 = EncoderBlock(emb_sz, num_heads, key_dim)

        self.decoder_block1 = DecoderBlock(emb_sz, num_heads, key_dim)
        self.decoder_block2 = DecoderBlock(emb_sz, num_heads, key_dim)

        # dense layer for final output; softmax makes the outputs
        # probabilities over the vocabulary rather than raw logits
        self.dense_layer = tf.keras.layers.Dense(vocab_size, activation='softmax')

    def call(self, inputs):
        """Run the full encoder-decoder stack.

        Args:
            inputs: pair whose first element is fed to the encoder and whose
                second element is fed to the first decoder block.

        Returns:
            Softmax probabilities over the vocabulary for every position of
            the decoder input.
        """
        # encoder blocks
        encoder_output1 = self.encoder_block1(inputs[0])
        encoder_output2 = self.encoder_block2(encoder_output1)

        # decoder blocks; both attend to the final encoder output
        decoder_output1 = self.decoder_block1(encoder_output2, inputs[1])
        decoder_output2 = self.decoder_block2(encoder_output2, decoder_output1)

        # final per-position vocabulary distribution
        token_probs = self.dense_layer(decoder_output2)

        return token_probs


In [None]:
# initialize the model
model = TransformerModel(
    emb_sz=100,
    num_heads=5,
    key_dim=64,
    vocab_size=15000,
)

# compile with the Adam optimizer and sparse categorical cross-entropy loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='Adam')

# teacher forcing: the decoder is fed each title without its last position
# and is trained against the title labels without their first position
decoder_inputs = train_title_data[:, :-1]
decoder_targets = train_title_labels[:, 1:]

model.fit(
    x=(train_content_data, decoder_inputs),
    y=decoder_targets,
    batch_size=200,
    epochs=15,
)

In [None]:
# run the trained model on the test split; the decoder is fed the test title
# embeddings alongside the article content
test_inputs = (test_content_data, test_title_data)
pred = model.predict(test_inputs)
# train = model.predict((train_content_data,train_title_data))

In [None]:
# print predicted vs. reference titles for the first few test examples
# NOTE(review): training pairs decoder position j with label j+1, so the
# predicted sentence is probably shifted one token left of the reference --
# confirm before comparing them word by word.
num_examples = min(20, len(pred))  # never index past the available predictions
for i in range(num_examples):
    # most probable vocabulary index at every title position
    tokens = np.argmax(pred[i], axis=1)
    # flatten the labels to 1-D whatever the title length is
    # (the length was previously hard-coded to 32)
    true = test_title_labels[i].numpy().reshape(-1)

    print(f'Predicted Sentence {i}:',sentence_from_ind(tokens))
    print(f'True Sentence {i}:',sentence_from_ind(true))
    print()

In [None]:
# # import necessary packages 
# import tensorflow as tf
# import pickle
# import numpy as np


# # get data from pickle 
# with open('encoded_articles.pkl','rb') as f:
#     articles_dict = pickle.load(f)

# content = []
# title_emb = []
# title_tokens = []

# test_content = []
# test_title_emb = []
# test_title_tokens = []


# # separate content from titles 
# for article in articles_dict:
    
    
#     content.append(article['content_embed'])
#     title_emb.append(article['title_embed'])
#     title_tokens.append(article['title'])

    
# # convert lists to tensors 
# content_data = tf.convert_to_tensor(content)
# title_emb_data = tf.convert_to_tensor(title_emb)
# title_token_data = tf.convert_to_tensor(title_tokens)

# content_data = tf.reshape(content_data,[22,512,768])
# title_emb_data = tf.reshape(title_emb_data,[22,32,768])
# title_token_data = tf.reshape(title_token_data,[22,32,1])

# # print shapes 
# print('Content shape:',content_data.shape)
# print('Title Embedding shape:', title_emb_data.shape)
# print('Title Token shape:', title_token_data.shape)