In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Reshape, TimeDistributed, Attention, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
from emoji import replace_emoji
import re

In [2]:
# Dictionary mapping English contractions to their expanded forms.
# text_preprocess lower-cases the text before looking tokens up here, so the
# capitalised keys ("I'd", "I'm", ...) are never matched; only the lower-case
# variants take effect.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

In [3]:
def text_preprocess(txt, sum):
    """Normalise one (dialogue, summary) pair before tokenisation.

    Each text is decoded from UTF-8 when given as bytes, stripped of emoji,
    lower-cased, and has every token found in ``contraction_mapping`` replaced
    by its expansion. The summary is additionally wrapped in the ``soseq`` /
    ``eoseq`` start and end markers.

    Args:
        txt: Dialogue text, as UTF-8 ``bytes`` or ``str``.
        sum: Summary text, as UTF-8 ``bytes`` or ``str``. The name shadows the
            built-in ``sum`` inside this function; it is kept so existing
            callers keep working.

    Returns:
        Tuple ``(dialogue, summary)`` of cleaned ``str`` values.
    """
    def clean(raw):
        # Accept already-decoded text as well as raw bytes.
        text = raw.decode('utf-8') if isinstance(raw, (bytes, bytearray)) else raw
        text = replace_emoji(text, '')
        text = text.lower()
        # Split on runs of ANY whitespace, not just a single space, so that a
        # contraction next to a line break or tab is still recognised.
        # The capturing group keeps the separators, so the original spacing
        # and line structure are reproduced exactly when the pieces are re-joined.
        pieces = re.split(r'(\s+)', text)
        return ''.join(contraction_mapping.get(piece, piece) for piece in pieces)

    txt = clean(txt)
    sum = "soseq " + clean(sum) + " eoseq"
    return txt, sum

In [4]:
# Load the first 10% of the samsum training split plus the full test split as
# (dialogue, summary) pairs, then clean every pair with text_preprocess.
ds_train, ds_test = tfds.load(name='samsum', split=['train[:10%]','test'], as_supervised=True)
ds_train = np.array([text_preprocess(*d) for d in ds_train.as_numpy_iterator()]) # ndarray shape: (n_train, 2); 1473 rows in the recorded run
ds_test = np.array([text_preprocess(*d) for d in ds_test.as_numpy_iterator()]) # ndarray shape: (n_test, 2)

In [5]:
def create_tokenizer(txt_to_fit, min_token_count=1):
    """Fit a Keras ``Tokenizer`` and cap its vocabulary at the frequent words.

    The tokenizer is fitted once on ``txt_to_fit``; its ``num_words`` is then
    set to the number of distinct words occurring at least
    ``min_token_count`` times.

    Args:
        txt_to_fit: Iterable of texts to fit the tokenizer on.
        min_token_count: Minimum number of occurrences for a word to count as
            "common". With the default of 1 every word qualifies.

    Returns:
        The fitted ``Tokenizer`` with ``num_words`` set to the common-word count.
    """
    # Apostrophes are deliberately absent from the filter so that any
    # remaining contractions stay single tokens.
    filt = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r'
    tokenizer = Tokenizer(filters=filt)
    tokenizer.fit_on_texts(txt_to_fit)

    # Number of distinct words that meet the frequency threshold.
    common_words = sum(
        1 for occurrences in tokenizer.word_counts.values()
        if occurrences >= min_token_count
    )

    # Fitting does not depend on num_words (the limit is applied only when
    # texts are converted), so setting it on the already-fitted tokenizer
    # gives the same result as fitting a second tokenizer, at half the cost.
    # NOTE(review): Keras documents num_words as keeping only the
    # num_words - 1 most frequent words, so one qualifying word may be
    # dropped. Left unchanged because the vocabulary size feeds the model
    # dimensions and any saved weights - confirm before adjusting.
    tokenizer.num_words = common_words
    return tokenizer

In [6]:
# Tokenize input and target texts in the train dataset.
# Only words seen at least 300 times (dialogues) / 20 times (summaries) count towards num_words.
tokenizer_input = create_tokenizer(ds_train[:,0], 300)
input_sequences = tokenizer_input.texts_to_sequences(ds_train[:,0]) # list with one entry per training example (1473 in the recorded run)

tokenizer_target = create_tokenizer(ds_train[:,1], 20)
target_sequences = tokenizer_target.texts_to_sequences(ds_train[:,1]) # list with one entry per training example (1473 in the recorded run)

# Pad sequences with trailing zeros up to the longest sequence
input_sequences = pad_sequences(input_sequences, padding='post', truncating='post') # ndarray shape: (1473, 266) in the recorded run
target_sequences = pad_sequences(target_sequences, padding='post', truncating='post') # ndarray shape: (1473, 45) in the recorded run

# Prepare target data: one-hot encode every target token
target_sequences_one_hot = tf.keras.utils.to_categorical(target_sequences, 
                                                         num_classes=tokenizer_target.num_words+1) # ndarray shape: (1473, 45, 193) in the recorded run

# Teacher forcing: the decoder is fed the target without its last step and
# must predict the same target shifted one step to the left.
x_train = [input_sequences, target_sequences[:,:-1]]
y_train = target_sequences_one_hot[:,1:]

In [7]:
# Sanity check: vocabulary limits, padded shapes and one decoded sample for
# both the dialogue (input) and summary (target) tokenizers.
print("input sequence nvocabs:", tokenizer_input.num_words)
print("input sequence shape:", input_sequences.shape)
print("sample:", tokenizer_input.sequences_to_texts([input_sequences[0]]))
print('\n')
print("target sequence nvocabs:", tokenizer_target.num_words)
print("target sequence shape:", target_sequences.shape)
print("target sequence shape (one-hotted):", target_sequences_one_hot.shape)
print("sample:", tokenizer_target.sequences_to_texts([target_sequences[0]]))

input sequence nvocabs: 72
input sequence shape: (1473, 266)
sample: ['i just to you know that i a really time with you yeah i really as well if you are up for it i would really like to see you i am but i have a really up yeah no i but if you to go just me know yeah of for sure have a']


target sequence nvocabs: 192
target sequence shape: (1473, 45)
target sequence shape (one-hotted): (1473, 45, 193)
sample: ['soseq and tonight would like to meet but is eoseq']


In [11]:
def define_model(ntokens_input, ntokens_target):  
    """Build the seq2seq training model plus standalone inference models.

    The encoder embeds the input tokens and passes them through three stacked
    LSTMs. The decoder embeds the target tokens, runs one LSTM initialised
    with the third encoder LSTM's final states, attends over the encoder
    output sequence with a Keras ``Attention`` layer, concatenates the LSTM
    and attention outputs and applies a time-distributed softmax.

    Args:
        ntokens_input: Size of the input vocabulary (encoder embedding rows).
        ntokens_target: Size of the target vocabulary (decoder embedding rows
            and softmax width).

    Returns:
        Tuple ``(train_model, encoder_model, decoder_model)``:
        ``train_model`` maps ``[encoder_input, decoder_input]`` to per-step
        token probabilities and is compiled with rmsprop and categorical
        cross-entropy. ``encoder_model`` maps the encoder input to
        ``[encoder_output3, state_h3, state_c3]``. ``decoder_model`` maps
        ``[decoder_input, encoder output sequence, state_h, state_c]`` to
        ``[token probabilities, state_h, state_c]``. The inference models
        share their layers with ``train_model``.
    """
    LATENT_DIM = 250
    EMBED_DIM = 90

    #----------- Encoder ---------------
    encoder_input = Input(shape=(None, ), name="encoder_input")
    # Embedding (padding index 0 is not masked: mask_zero=False)
    encoder_embed = Embedding(ntokens_input, EMBED_DIM, trainable=True, mask_zero=False, name="encoder_embeding")(encoder_input)
    # Encoder LSTM Block 1
    encoder_lstm1 = LSTM(LATENT_DIM, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4, name="enc_lstm_1")
    encoder_output1, state_h1, state_c1 = encoder_lstm1(encoder_embed)
    # Encoder LSTM Block 2
    encoder_lstm2 = LSTM(LATENT_DIM, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4, name="enc_lstm_2")
    encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)
    # Encoder LSTM Block 3
    encoder_lstm3 = LSTM(LATENT_DIM, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4, name="enc_lstm_3")
    encoder_output3, state_h3, state_c3 = encoder_lstm3(encoder_output2)

    #------------ Decoder --------------
    # Define the decoder input
    decoder_input = Input(shape=(None, ), name="decoder_input")
    # Embedding for the decoder (kept as a layer object so the inference decoder can reuse it)
    decoder_embed_layer = Embedding(ntokens_target, EMBED_DIM, trainable=True)
    decoder_embed = decoder_embed_layer(decoder_input)
    # Decoder LSTM, initialised with the final states of the last encoder LSTM
    decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4, name="decoder_lstm")
    decoder_lstm_output, decoder_h1, decoder_c1 = decoder_lstm(decoder_embed, initial_state=[state_h3, state_c3])
    # Alternative attention implementation (third-party layer), kept for reference:
    # from attention import AttentionLayer
    # attention_layer = AttentionLayer(name="attention_layer")
    # attention_output, attention_state = attention_layer([encoder_output3, decoder_lstm_output])
    # Attention used here: Keras Attention with the decoder output as query and the encoder output as value
    attention_layer = Attention(name="attention_layer")
    attention_output = attention_layer([decoder_lstm_output, encoder_output3])
    # Concatenate decoder and attention outputs
    decoder_concat = Concatenate(axis=-1, name="decoder_concat")([decoder_lstm_output, attention_output])
    # Time-distributed dense layer producing per-step token probabilities
    decoder_dense = TimeDistributed(Dense(ntokens_target, activation="softmax"))
    decoder_output = decoder_dense(decoder_concat)
    
    # Standalone encoder model used at inference time
    def standalone_encoder():
        return tf.keras.models.Model(encoder_input, outputs=[encoder_output3, state_h3, state_c3])

    # Standalone decoder model used at inference time (reuses the trained decoder layers)
    def standalone_decoder():
        # Inputs: previous LSTM states and the encoder output sequence to attend over
        decoder_input_state_h = Input(shape=(LATENT_DIM, ))
        decoder_input_state_c = Input(shape=(LATENT_DIM, ))
        decoder_input_state_hidden = Input(shape=(None, LATENT_DIM))
        
        # Embedding of the decoder 
        decoder_embedding_pred = decoder_embed_layer(decoder_input)
        # Decoder LSTM (for prediction, the decoder state must be set to the state of its previous time step)
        decoder_output_p, state_h_p, state_c_p = decoder_lstm(decoder_embedding_pred, initial_state=[decoder_input_state_h, decoder_input_state_c])
        
        # Attention over the supplied encoder output sequence
        attention_out_inference = attention_layer([decoder_output_p, decoder_input_state_hidden])
        decoder_concat = Concatenate(axis=-1, name="concat_attention")([decoder_output_p, attention_out_inference])
        
        # Final prediction
        decoder_final_output = decoder_dense(decoder_concat)
        
        # Build the final decoder model
        return tf.keras.models.Model([decoder_input, decoder_input_state_hidden, decoder_input_state_h, decoder_input_state_c], [decoder_final_output, state_h_p, state_c_p])

    train_model = tf.keras.models.Model([encoder_input, decoder_input], decoder_output)
    train_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=["accuracy"])

    return train_model, standalone_encoder(), standalone_decoder()

In [12]:
# Build the training model and the two inference models. Vocabulary sizes are
# num_words + 1, the same width used for the one-hot targets above.
train_model, encoder_model, decoder_model = define_model(tokenizer_input.num_words+1, tokenizer_target.num_words+1)
train_model.summary()

















Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 encoder_embeding (Embedding)   (None, None, 90)     6570        ['encoder_input[0][0]']          
                                                                                                  
 enc_lstm_1 (LSTM)              [(None, None, 250),  341000      ['encoder_embeding[0][0]']       
                                 (None, 250),                                                     
                                 (None, 250)]                                                     
                                                                                              

In [13]:
# Resume from previously saved weights when they exist. On a fresh checkout
# there is no w.h5 yet, so skip loading instead of failing and let the
# training cell below create it.
if tf.io.gfile.exists("w.h5"):
    train_model.load_weights("w.h5")
else:
    print("w.h5 not found - continuing with freshly initialised weights")

In [None]:
# Train the model built by define_model. The notebook never defines `model`;
# the training model is bound to `train_model`.
train_model.fit(x_train, y_train, batch_size=4, epochs=50)

In [12]:
# Save the trained weights (weights only, not the full model). The notebook
# never defines `model`; the training model is bound to `train_model`.
train_model.save_weights("w.h5")

In [14]:
def sequence_decoder(input_sequence, max_length=64):
    """
    Greedily decode a summary for a single encoded input sequence.

    The input is encoded once with ``encoder_model``; ``decoder_model`` is
    then run one token at a time, starting from the ``soseq`` token and
    feeding back the most probable token and the updated LSTM states.

    Decoding stops when ``eoseq`` is predicted, when the predicted index has
    no word in ``tokenizer_target.index_word`` (e.g. the padding index 0), or
    after ``max_length`` tokens, whichever comes first.

    Args:
        input_sequence: Array of input token ids with a leading batch
            dimension of 1, as passed to ``encoder_model.predict``.
        max_length: Upper bound on the number of generated tokens. This
            guarantees termination even if the model never emits ``eoseq``.

    Returns:
        The decoded words as one string, each word preceded by a space.
    """
    # Encode the input once; the encoder outputs are reused at every step.
    encoder_output, state_h, state_c = encoder_model.predict(input_sequence, verbose=0)

    # Seed the decoder with the start-of-sequence token.
    target_sequence = np.zeros((1, 1))
    target_sequence[0, 0] = tokenizer_target.word_index["soseq"]

    decoded_sentence = ""
    for _ in range(max_length):
        # One decoding step; the returned states feed the next step.
        output_token, state_h, state_c = decoder_model.predict(
            [target_sequence, encoder_output, state_h, state_c], verbose=0)

        # Greedy choice: most probable token at the last time step.
        predicted_token_index = int(np.argmax(output_token[0, -1, :]))
        # .get avoids a KeyError for indices with no word (index 0 is padding).
        predicted_token = tokenizer_target.index_word.get(predicted_token_index)

        # End of sequence, or padding predicted: nothing more to emit.
        if predicted_token is None or predicted_token == "eoseq":
            break

        decoded_sentence += " " + predicted_token

        # Feed the predicted token back in as the next decoder input.
        target_sequence = np.zeros((1, 1))
        target_sequence[0, 0] = predicted_token_index

    return decoded_sentence

In [15]:
# Compare reference and predicted summaries for the first 16 training examples.
# The texts are rebuilt from the token ids, so words outside the tokenizer
# vocabulary limit are missing from what is printed.
for i in range(0, 16):
    print("-Review:\n", tokenizer_input.sequences_to_texts([input_sequences[i]])[0])
    # [6:-6] strips the leading "soseq " and trailing " eoseq" markers
    print("-Original summary:\n", tokenizer_target.sequences_to_texts([target_sequences[i]])[0][6:-6])
    # reshape(1, -1) adds the batch dimension expected by the encoder
    print("-Predicted summary:\n", sequence_decoder(input_sequences[i].reshape(1,-1)))
    print("\n")

-Review:
 i just to you know that i a really time with you yeah i really as well if you are up for it i would really like to see you i am but i have a really up yeah no i but if you to go just me know yeah of for sure have a
-Original summary:
 and tonight would like to meet but is
-Predicted summary:
  and are to the


-Review:
 what was the of the i will you the i get
-Original summary:
 will a to the when he home
-Predicted summary:
  and are going to the


-Review:
 can i your sure it will on you is it i it to when is she it to you not sure how she it a of you her and if she it i do not to be you are not it is your all she just to it on i am going on a with this and we are going to the i have to good ok ok i will one you will me your you have a
-Original summary:
 would like to she is going to the with her date the to a of and will have to her to give it back
-Predicted summary:
  and are a and will meet to the


-Review:
 file the is that go to
-Original summary:
 sent
-Predicted 