In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import pandas as pd
import csv
import re
import os
import pathlib
import random
import string
import unicodedata
import codecs
from io import open


In [2]:
# !unzip "./datasets/cornell_movie_dialogs_corpus.zip"
# !unzip -q "datasets/glove.6B.zip"

In [3]:
corpus_name = "cornell movie-dialogs corpus"
corpus = os.path.join("./datasets", corpus_name)

def printlines(file, n=10):
    with open(file, 'rb') as datafile:
        lines = datafile.readlines()

        for line in lines[:n]:
            print(line)


printlines(os.path.join(corpus, "movie_lines.txt"))


b'L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n'
b'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\n'
b'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\n'
b'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?\n'
b"L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.\n"
b'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow\n'
b"L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.\n"
b'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No\n'
b'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?\n'
b'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?\n'


In [4]:
# Splits lines into dictionary of fields
def loadLines(filename, fields):
    lines = {}
    with open(filename, 'r', encoding="iso-8859-1") as f:
        for line in f:
            values = line.split(" +++$+++ ")

            lineObj = {}

            for i, field in enumerate(fields):
                lineObj[field] = values[i]
      
            lines[lineObj["lineID"]] = lineObj

    return lines


# Group fields according to convertions or movie_conversations
def loadConversations(filename, fields, lines):
    conversations = []
    with open(filename, 'r', encoding="iso-8859-1") as f:
        for line in f:
            values = line.split(" +++$+++ ")

            conversationObj = {}

            for i, field in enumerate(fields):
                conversationObj[field] = values[i]

            lineIds = re.findall("L[0-9]+", conversationObj["utteranceIDs"])

            conversationObj["lines"] = []

            for lineId in lineIds:
                conversationObj["lines"].append(lines[lineId])

            conversations.append(conversationObj)

    return conversations

# Make a group pair of chats (ask-reply)
def extractSentencePairs(conversations):
    qa_pairs = []

    for conversationObj in conversations:
        for i in range(len(conversationObj["lines"]) - 1):
            input_line = conversationObj["lines"][i]["text"].strip()
            target_line = conversationObj["lines"][i+1]["text"].strip()

            qa_pairs.append([input_line, target_line])

  
    return qa_pairs


In [5]:
# Our new file
datafile = os.path.join(corpus, "formatted_movie_lines.csv")

# Tab delimiter for our new file
delimiter = "\t"
delimiter = str(codecs.decode(delimiter, "unicode-escape"))

MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
MOVIE_CONVERSATION_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]

# Load lines and conversations
print("\nProcessing lines...")
lines = loadLines(os.path.join(corpus, "movie_lines.txt"), MOVIE_LINES_FIELDS)
print("\nProcessing conversations")
conversations = loadConversations(os.path.join(corpus, "movie_conversations.txt"), MOVIE_CONVERSATION_FIELDS, lines)

print("\nCreate new formatted file")
pd.DataFrame(extractSentencePairs(conversations)).to_csv(datafile, header = False, index=False)

print("\nSample lines from file:")
printlines(datafile)


Processing lines...

Processing conversations

Create new formatted file

Sample lines from file:
b'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.,"Well, I thought we\'d start with pronunciation, if that\'s okay with you."\n'
b'"Well, I thought we\'d start with pronunciation, if that\'s okay with you.",Not the hacking and gagging and spitting part.  Please.\n'
b"Not the hacking and gagging and spitting part.  Please.,Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"
b"You're asking me out.  That's so cute. What's your name again?,Forget it.\n"
b'"No, no, it\'s my fault -- we didn\'t have a proper introduction ---",Cameron.\n'
b'Cameron.,"The thing is, Cameron -- I\'m at the mercy of a particularly hideous breed of loser.  My sister.  I can\'t date until she does."\n'
b'"The thing is, Cameron -- I\'m at the mercy of a particularly hideous breed of loser.  My sister.  I can\

In [19]:
qa_dataset = pd.read_csv(datafile, names=['q','a'])
qa_dataset['a'] = "[sos] " + qa_dataset['a'] + " [eos]"

qa_dataset = qa_dataset[(qa_dataset['q'].str.count(" ") < 10) & (qa_dataset['a'].str.count(" ") < 10)]

In [20]:
100 * qa_dataset.isnull().sum()/len(qa_dataset)

q    0.0
a    0.0
dtype: float64

In [21]:
vocab_size = 15000
sequence_length = 15
batch_size = 32

vectorization = TextVectorization(
    max_tokens=vocab_size, output_mode="int", output_sequence_length=sequence_length,
)

vectorization.adapt(qa_dataset['q'].to_list())
vectorization.adapt(qa_dataset['a'].to_list())

voc = vectorization.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [42]:
batch_size = 128
def format_dataset(que, ans):
    que = vectorization(que)[:, :-1]
    ans = vectorization(ans)
    return ({"encoder_inputs": que, "decoder_inputs": ans[:, :-1]}, ans[:, 1:])

def make_dataset(dataset):
    que, ans = dataset['q'].to_list(), dataset['a'].to_list()

    dataset = tf.data.Dataset.from_tensor_slices((que, ans))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    
    return dataset.shuffle(2048).prefetch(16).cache()

In [43]:
train_ds = make_dataset(qa_dataset)

In [24]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (32, 14)
inputs["decoder_inputs"].shape: (32, 14)
targets.shape: (32, 14)


In [25]:
path_to_glove_file = "datasets/glove.6B.300d.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [26]:
num_tokens = vocab_size
embedding_dim = 300
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 12243 words (2757 misses)


In [44]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim,
            embeddings_initializer=keras.initializers.Constant(embedding_matrix),
            trainable = True,
            mask_zero = True
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)
    
    def get_config(self):
        config = super(Linear, self).get_config()
        config.update({"units": self.units})
        return config


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

In [45]:
embed_dim = embedding_dim
latent_dim = 2048
num_heads = 8


encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)

encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")

x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)

x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [49]:
epochs = 40  # This should be at least 30 for convergence

transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)


transformer.fit(train_ds, epochs=epochs)

# Save the entire model as a SavedModel.
transformer.save('saved_model/my_model')
transformer.save_weights('saved_model/weights')

Model: "transformer"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
positional_embedding_6 (Positio (None, None, 300)    4504500     encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
transformer_encoder_4 (Transfor (None, None, 300)    4119848     positional_embedding_6[0][0]     
________________________________________________________________________________________



INFO:tensorflow:Assets written to: saved_model/my_model/assets


INFO:tensorflow:Assets written to: saved_model/my_model/assets


In [72]:
max_decoded_sentence_length = sequence_length - 1
# transformer = transformer.load_weights("saved_model/weights.index")

def decode_sequence(input_sentence):
    tokenized_input_sentence = vectorization([input_sentence])[:, 1:]
    decoded_sentence = "sos"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = list(word_index.keys())[sampled_token_index]
        decoded_sentence += " " + sampled_token
    
        if sampled_token == "eos":
            break
    decoded_sentence = decoded_sentence[4:][:-4]
    return decoded_sentence

for _ in range(2):
    input_sentence = input()
    answer = decode_sequence(input_sentence)
    print(answer)
    if input_sentence == 'byy':
        break

hello
what
what you know me?
i dont know


In [None]:
models = {
    "PositionalEmbedding" : PositionalEmbedding,
    "TransformerDecoder" :TransformerDecoder,
    "TransformerEncoder" : TransformerEncoder,
}

transformer = keras.models.model_from_json(transformer.to_json(), custom_objects = models)