In [1]:
import io
import os
import re

import string
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
data_path="../data/cornell_movie_dialogs_corpus/cornell movie-dialogs corpus"
with open(data_path+"/movie_lines.txt",'r') as file:
    raw_data=file.readlines()


In [3]:
with open(data_path+"/movie_conversations.txt") as convo:
    conversations=convo.readlines()
conversations=conversations

In [4]:
def align_conversations():
    converse=[]
    for line in conversations:
        find_object=re.findall(r'(L\d+)', line)
        converse.append(find_object)
    return converse

In [5]:
def filter_sentence(input_data):
    y=tf.strings.regex_replace(input_data,r"[\t\n]+",r" ")
    y=tf.strings.regex_replace(y,r"[^a-zA-Z0-9?!,']+",r" ")
    y=tf.strings.strip(y)  
    return y

In [6]:
def pre_processing_raw(input_data):
        y=" "+input_data+" "
        y = re.sub(r"([?.!,'] )", r" \1 ", y)
        y = re.sub(r"( [?.!,'])", r" \1 ", y)
        y = re.sub(r"( ?[?.!,'][\n\t])", r" \1 ", y)
        y=y.lower()
        return y

In [7]:
def preprocess_dataset():
    data_lookup={}
    
    for lines in raw_data:
        label=re.match(r"L\w+",lines).group(0)
        y=re.sub(r"\w+ (\+\+\+\$\+\+\+ \w+ )+\+\+\+\$\+\+\+ "," ",lines)
        y = re.sub(r"([?.!,'] )", r" \1 ", y)
        y = re.sub(r"( [?.!,'])", r" \1 ", y)
        y = re.sub(r"( ?[?.!,'][\n\t])", r" \1 ", y)
        y=filter_sentence(tf.constant(y))
        y=y.numpy()
        data_lookup[label]=y
        
    return data_lookup

In [8]:
def divide_data(conversation_lookups,conversation_lists):
    question=[]
    answer=[]
    for convo in conversation_lists:
        for i in range(len(convo)-1):
            question.append(conversation_lookups[convo[i]])
            answer.append(conversation_lookups[convo[i+1]])
    return tf.constant(question),tf.constant(answer)

In [9]:
conversation_lookups=preprocess_dataset()
conversation_lists=align_conversations()



In [10]:
questions,answers=divide_data(conversation_lookups,conversation_lists)
print(questions[20])
print(answers[20])
print(len(questions))

tf.Tensor(b"I really , really , really wanna go , but I can't Not unless my sister goes", shape=(), dtype=string)
tf.Tensor(b"I'm workin ' on it But she doesn't seem to be goin ' for him", shape=(), dtype=string)
221616


In [11]:

# Create a custom standardization function to strip HTML break tags '<br />'.
def custom_standardization(input_data):
    
    lowercase = tf.strings.lower(input_data)
    word_tokens=tf.strings.regex_replace(lowercase, r"[\w ]+\+\+\+\$\+\+\++", '')
    word_tokens=tf.strings.regex_replace(word_tokens, r"[\n\t]", '')
    #y=tf.strings.regex_replace(word_tokens,r'[%s]' % re.escape(string.punctuation), '')
    y=tf.strings.strip(word_tokens)
    y=" <start> "+y+" <end> "
    return y

# Vocabulary size and number of words in a sequence.
vocab_size = 15000
sequence_length = 40

# Use the text vectorization layer to normalize, split, and map strings to 
# integers. Note that the layer uses the custom standardization defined above. 
# Set maximum_sequence length as all samples are not of the same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
vectorize_layer.adapt(tf.constant(list(conversation_lookups.values())))

In [12]:
vocab = vectorize_layer.get_vocabulary()
print(vocab[:10])

['', '[UNK]', '<start>', '<end>', ',', 'you', '?', 'i', 'the', 'to']


In [13]:
VOCAB_SIZE=len(vocab)
print(VOCAB_SIZE)

15000


In [14]:
questions=vectorize_layer(questions)
answers=vectorize_layer(answers)


In [15]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# decoder inputs use the previous target as input
# remove START_TOKEN from targets
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'inputs': questions,
        'dec_inputs': answers[:, :-1]
    },
    {
        'outputs': answers[:, 1:]
    },
))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [16]:

def scaled_dot_product_attention(query, key, value, mask):
    """Calculate the attention weights. """
    matmul_qk = tf.matmul(query, key, transpose_b=True)

    # scale matmul_qk
    depth = tf.cast(tf.shape(key)[-1], tf.float32)
    logits = matmul_qk / tf.math.sqrt(depth)

    # add the mask to zero out padding tokens
    if mask is not None:
        logits += (mask * -1e9)

    # softmax is normalized on the last axis (seq_len_k)
    attention_weights = tf.nn.softmax(logits, axis=-1)

    output = tf.matmul(attention_weights, value)

    return output

In [17]:

class MultiHeadAttention(tf.keras.layers.Layer):

    def __init__(self, d_model, num_heads, name=None):
        super(MultiHeadAttention, self).__init__(name=name)
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.query_dense = tf.keras.layers.Dense(units=d_model)
        self.key_dense = tf.keras.layers.Dense(units=d_model)
        self.value_dense = tf.keras.layers.Dense(units=d_model)

        self.dense = tf.keras.layers.Dense(units=d_model)

    def split_heads(self, inputs, batch_size):
        inputs = tf.reshape(
            inputs, shape=(batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(inputs, perm=[0, 2, 1, 3])

    def call(self, inputs):
        query, key, value, mask = inputs['query'], inputs['key'], inputs[
            'value'], inputs['mask']
        batch_size = tf.shape(query)[0]
        
        #print(tf.shape(query))
        # linear layers
        query = self.query_dense(query)
        key = self.key_dense(key)
        value = self.value_dense(value)

        # split heads
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        # scaled dot-product attention
        
        scaled_attention = scaled_dot_product_attention(query, key, value, mask)
        #print(tf.shape(scaled_attention))
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        

        # concatenation of heads
        concat_attention = tf.reshape(scaled_attention,shape=(batch_size, -1, self.d_model))

        # final linear layer
        outputs = self.dense(concat_attention)

        return outputs
    
    def get_config(self):
        config = {
        'd_model': self.d_model,
        'num_heads':self.num_heads,
        
      
            
         }
        return config
    @classmethod
    def from_config(cls, config):
        return cls(**config)


In [18]:

def create_padding_mask(x):

    mask = tf.cast(tf.math.equal(x, 0), tf.float32)
    # (batch_size, 1, 1, sequence length)
    return mask[:, tf.newaxis, tf.newaxis, :]

In [19]:
def create_look_ahead_mask(x):

    seq_len = tf.shape(x)[1]
    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    padding_mask = create_padding_mask(x)
    return tf.maximum(look_ahead_mask, padding_mask)

In [20]:
class PositionalEncoding(tf.keras.layers.Layer):

    def __init__(self, position, d_model):
        
        super(PositionalEncoding, self).__init__()
        self.position=position
        self.pos_encoding = self.positional_encoding(self.position, d_model)
        self.d_model=d_model
        
    def get_angles(self, position, i, d_model):
        angles = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
        return position * angles

    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(
            position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],
            i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
            d_model=d_model)
        # apply sin to even index in the array
        sines = tf.math.sin(angle_rads[:, 0::2])
        # apply cos to odd index in the array
        cosines = tf.math.cos(angle_rads[:, 1::2])

        pos_encoding = tf.concat([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[tf.newaxis, ...]
        return tf.cast(pos_encoding, tf.float32)

    def call(self, inputs):
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]
    def get_config(self):
        config = {
        'd_model': self.d_model,
            'position':self.position,
            
         }
        return config
    @classmethod
    def from_config(cls, config):
        return cls(**config)


In [21]:
def encoder_layer(units, d_model, num_heads, dropout, name="encoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")
    #print(inputs)
    attention = MultiHeadAttention(
      d_model, num_heads, name="attention")({
          'query': inputs,
          'key': inputs,
          'value': inputs,
          'mask': padding_mask
      })
    attention = tf.keras.layers.Dropout(rate=dropout)(attention)
    attention = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs + attention)

    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention + outputs)

    return tf.keras.Model(inputs=[inputs, padding_mask], outputs=outputs, name=name)

In [22]:
def encoder(vocab_size,
            num_layers,
            units,
            d_model,
            num_heads,
            dropout,
            name="encoder"):
    inputs = tf.keras.Input(shape=(None,), name="inputs")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    embeddings = tf.keras.layers.Embedding(vocab_size, d_model , name="embeddings")(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)

    outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

    for i in range(num_layers):
        outputs = encoder_layer(
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
            name="encoder_layer_{}".format(i),
        )([outputs, padding_mask])

    return tf.keras.Model(
      inputs=[inputs, padding_mask], outputs=outputs, name=name)

In [23]:
def decoder_layer(units, d_model, num_heads, dropout, name="decoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    enc_outputs = tf.keras.Input(shape=(None, d_model), name="encoder_outputs")
    look_ahead_mask = tf.keras.Input(shape=(1, None, None), name="look_ahead_mask")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

    attention1 = MultiHeadAttention(
      d_model, num_heads, name="attention_1")(inputs={
          'query': inputs,
          'key': inputs,
          'value': inputs,
          'mask': look_ahead_mask
      })
    attention1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention1 + inputs)

    attention2 = MultiHeadAttention(
      d_model, num_heads, name="attention_2")(inputs={
          'query': attention1,
          'key': enc_outputs,
          'value': enc_outputs,
          'mask': padding_mask
      })
    attention2 = tf.keras.layers.Dropout(rate=dropout)(attention2)
    attention2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention2 + attention1)

    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention2)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(outputs + attention2)

    return tf.keras.Model(
      inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
      outputs=outputs,name=name)

In [24]:
def decoder(vocab_size,
            num_layers,
            units,
            d_model,
            num_heads,
            dropout,
            name='decoder'):
    inputs = tf.keras.Input(shape=(None,), name='inputs')
    enc_outputs = tf.keras.Input(shape=(None, d_model), name='encoder_outputs')
    look_ahead_mask = tf.keras.Input(
      shape=(1, None, None), name='look_ahead_mask')
    padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

    embeddings = tf.keras.layers.Embedding(vocab_size, d_model,name="embeddings")(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embeddings = PositionalEncoding(vocab_size, d_model)(embeddings)

    outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

    for i in range(num_layers):
        outputs = decoder_layer(
            units=units,
            d_model=d_model,
            num_heads=num_heads,
            dropout=dropout,
            name='decoder_layer_{}'.format(i),
        )(inputs=[outputs, enc_outputs, look_ahead_mask, padding_mask])

    return tf.keras.Model(
      inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
      outputs=outputs,
      name=name)

In [25]:

def transformer(vocab_size,
                num_layers,
                units,
                d_model,
                num_heads,
                dropout,
                name="transformer"):
    inputs = tf.keras.Input(shape=(None,), name="inputs")
    dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs")

    enc_padding_mask = tf.keras.layers.Lambda(create_padding_mask, output_shape=(1, 1, None), name='enc_padding_mask')(inputs)
    # mask the future tokens for decoder inputs at the 1st attention block
    look_ahead_mask = tf.keras.layers.Lambda(
      create_look_ahead_mask,
      output_shape=(1, None, None),
      name='look_ahead_mask')(dec_inputs)
    # mask the encoder outputs for the 2nd attention block
    dec_padding_mask = tf.keras.layers.Lambda(
      create_padding_mask, output_shape=(1, 1, None),
      name='dec_padding_mask')(inputs)

    enc_outputs = encoder(
      vocab_size=vocab_size,
      num_layers=num_layers,
      units=units,
      d_model=d_model,
      num_heads=num_heads,
      dropout=dropout,
    )(inputs=[inputs, enc_padding_mask])

    dec_outputs = decoder(
      vocab_size=vocab_size,
      num_layers=num_layers,
      units=units,
      d_model=d_model,
      num_heads=num_heads,
      dropout=dropout,
    )(inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])

    outputs = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)

    return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=outputs, name=name)

In [26]:


# Hyper-parameters
MAX_SAMPLES = 220000
NUM_LAYERS = 2
D_MODEL = 128
NUM_HEADS = 16
UNITS = 512
DROPOUT = 0.1

# Maximum sentence length
MAX_LENGTH = 40
BATCH_SIZE = 64
BUFFER_SIZE = 20000


model = transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    units=UNITS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout=DROPOUT)

# Retrieve the config
config = model.get_config()

# At loading time, register the custom objects with a `custom_object_scope`:
custom_objects={"MultiHeadAttention": MultiHeadAttention,"PositionalEncoding":PositionalEncoding}
with tf.keras.utils.custom_object_scope(custom_objects):
    new_model = tf.keras.Model.from_config(config)

json_config = model.to_json()


In [27]:
def loss_function(y_true, y_pred):
    y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))

    loss = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(y_true, y_pred)

    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
    loss = tf.multiply(loss, mask)

    return tf.reduce_mean(loss)

In [28]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
    def get_config(self):
        config = {
        'd_model': self.d_model,
        'warmup_steps': self.warmup_steps,

         }
        return config

In [29]:
learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
def accuracy(y_true, y_pred):
    # ensure labels have shape (batch_size, MAX_LENGTH - 1)
    y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
    return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

In [30]:
START_TOKEN=vectorize_layer(["<start>"])[0][1]
END_TOKEN=vectorize_layer(["<end>"])[0][1]

In [31]:
checkpoint_path = "../asset/"

ckpt = tf.train.Checkpoint(transformer=model,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

# if a checkpoint exists, restore the latest checkpoint.


In [None]:
EPOCHS = 10
for i in range(EPOCHS):
    
#change this dataset to original dataset before working
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print ('Latest checkpoint restored!!')
   
    model.fit(dataset, epochs=1)
    ckpt_save_path = ckpt_manager.save()

Latest checkpoint restored!!
Latest checkpoint restored!!
Latest checkpoint restored!!
Latest checkpoint restored!!
Latest checkpoint restored!!
Latest checkpoint restored!!

In [None]:
def evaluate(sentence):
    sentence = pre_processing_raw(sentence)
    sentence = filter_sentence(tf.constant(sentence))
    sentence=vectorize_layer([sentence])
    
    START_TOKEN=vectorize_layer(["<start>"])[0][1]
    END_TOKEN=vectorize_layer(["<end>"])[0][1]

    output =tf.expand_dims([START_TOKEN], 0)
    
    for i in range(MAX_LENGTH):
       
        predictions = model(inputs=[sentence,output], training=False)
       
        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]
        
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int64)

        # return the result if the predicted_id is equal to the end token
        if tf.equal(predicted_id, [END_TOKEN]):
            break

        # concatenated the predicted_id to the output which is given to the decoder
        # as its input.
        
        output = tf.concat([output, predicted_id], axis=-1)
        
    return tf.squeeze(output, axis=0)


def selfdecode(input_data):
    
    answer=[vocab[i] for i in input_data]
    if(len(answer)==0):
        return "I don't know the answer to this question!"
    answer=" ".join(answer)
    return answer


def predict(sentence):
    prediction = evaluate(sentence)
   
    answer=[i for i in prediction if ((i !=START_TOKEN)and(i!=END_TOKEN))]
    predicted_sentence = selfdecode(answer)
    
    print('Input: {}'.format(sentence))
    print('Output: {}'.format(predicted_sentence))

    return predicted_sentence

In [None]:
assets_path='../asset/'

In [None]:
weights = model.get_layer('encoder').get_layer("embeddings").get_weights()[0]
print(weights.shape) 

In [None]:
output=predict("how are you?")


In [None]:
model.save(assets_path+"my_model")

In [None]:

out_v = io.open(assets_path+'vecs.tsv', 'w', encoding='utf-8')
out_m = io.open(assets_path+'meta.tsv', 'w', encoding='utf-8')

for num, word in enumerate(vocab):
    if num == 0: continue # skip padding token from vocab
    vec = weights[num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
out_v.close()
out_m.close()

try:
    from assets_path import files
except ImportError: 
    pass
else:
    files.download('vecs.tsv')
    files.download('meta.tsv')