<a href="https://colab.research.google.com/github/khldsqmr/Comparison-of-Chatbot-models/blob/main/Chatbot_Seq2Seq_LSTM_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Import Libraries
from __future__ import print_function
import os
import sys
import pandas as pd
import numpy as np
import re
import nltk
from keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense, Bidirectional
from keras.models import Model, load_model
import tensorflow as tf

def main(data_path):
    """Build, load and evaluate the attention-based Seq2Seq chatbot.

    Reads ``movie_lines.txt`` and ``movie_conversations.txt`` from
    *data_path*, pairs consecutive conversation lines into question/answer
    samples, cleans and tokenises them, derives the word <-> index
    dictionaries, builds the encoder/decoder graph with dot-product
    attention, loads the trained model from ``model_attention_60.h5`` and
    prints sample predictions plus the mean sentence-level BLEU score of the
    train and test splits.

    NOTE: word indices are derived from the cleaned text at run time, so a
    checkpoint trained with a different cleaning function may no longer line
    up with the dictionaries built here and should be re-validated/retrained.

    :param data_path: directory containing the two corpus text files
    """
    separator = ' +++$+++ '

    #Read the Files
    with open(os.path.join(data_path, 'movie_lines.txt'), encoding='utf-8', errors='ignore') as f:
        movieLines = f.read().split('\n')
    with open(os.path.join(data_path, 'movie_conversations.txt'), encoding='utf-8', errors='ignore') as f:
        movieConversations = f.read().split('\n')

    #Data Preprocessing
    #Map each line's id to its text; rows without exactly five fields are ignored
    id2line = {}
    for row in movieLines:
        fields = row.split(separator)
        if len(fields) == 5:
            id2line[fields[0]] = fields[4]

    #Create a list of all of the conversations' line ids.
    #The last element of movieConversations is skipped (the remainder after the final '\n').
    conv_ids = []
    for row in movieConversations[:-1]:
        ids = row.split(separator)[-1][1:-1].replace("'", "").replace(" ", "")
        conv_ids.append(ids.split(','))

    #Sort the sentences into questions (inputs) and answers (targets):
    #every line is the question for the line that follows it.
    questions = []
    answers = []
    for ids in conv_ids:
        for asked, replied in zip(ids, ids[1:]):
            #Skip pairs whose ids were dropped while building id2line instead of raising KeyError
            if asked in id2line and replied in id2line:
                questions.append(id2line[asked])
                answers.append(id2line[replied])

    #Print length of question set and answer set
    print('Total Number of questions: ', len(questions))
    print('Total Number of answers  :', len(answers))

    def removePuncAndClean(txt):
        """Lower-case *txt*, expand contractions, strip symbols, squeeze whitespace."""
        #Order matters: specific contractions come before the generic "n't" / "n'" rules
        replacements = (
            (r"i'm", "i am"),
            (r"he's", "he is"),
            (r"she's", "she is"),
            (r"it's", "it is"),
            (r"that's", "that is"),
            (r"what's", "what is"),  #was "that is", which rewrote the meaning of the sentence
            (r"where's", "where is"),
            (r"how's", "how is"),
            (r"\'ll", " will"),
            (r"\'ve", " have"),
            (r"\'re", " are"),
            (r"\'d", " would"),
            (r"won't", "will not"),
            (r"can't", "cannot"),
            (r"n't", " not"),
            (r"n'", "ng"),
            (r"'bout", "about"),
            (r"'til", "until"),
            (r"[-()\"#/@;:<>{}`+=~|]", ""),
        )
        txt = txt.lower()
        for pattern, replacement in replacements:
            txt = re.sub(pattern, replacement, txt)
        return " ".join(txt.split())

    #Cleaning the data
    cleanQuestions = [removePuncAndClean(q) for q in questions]
    cleanAnswers = [removePuncAndClean(a) for a in answers]

    def _lengthOk(sentence):
        """True when *sentence* has between 2 and 25 whitespace-separated words."""
        return 2 <= len(sentence.split()) <= 25

    #Keep only the pairs where both the question and the answer have 2..25 words
    keptPairs = [(q, a) for q, a in zip(cleanQuestions, cleanAnswers)
                 if _lengthOk(q) and _lengthOk(a)]

    #choosing number of samples
    SampleSize = 15000
    keptPairs = keptPairs[:SampleSize]
    cleanQuestions = [q for q, _ in keptPairs]
    cleanAnswers = [a for _, a in keptPairs]

    #tokenizing the questions and answers
    allInputWords = [nltk.word_tokenize(sent) for sent in cleanQuestions]
    allTargetWords = [nltk.word_tokenize(sent) for sent in cleanAnswers]

    #train-validation split: first 80% for training, the remainder for validation
    dataSize = len(allInputWords)
    splitAt = round(dataSize*(80/100))

    #Input token sequences are reversed for both splits
    X_train = [tokens[::-1] for tokens in allInputWords[:splitAt]]
    y_train = allTargetWords[:splitAt]
    X_test = [tokens[::-1] for tokens in allInputWords[splitAt:]]
    y_test = allTargetWords[splitAt:]

    print('Train data size is: ', len(X_train))
    print("Test data size is : ", len(X_test))

    #Actual (untokenised) train and test sentences, used for printing and BLEU
    XX_train = cleanQuestions[:splitAt]
    yy_train = cleanAnswers[:splitAt]
    XX_test = cleanQuestions[splitAt:]
    yy_test = cleanAnswers[splitAt:]

    #Convert to dataframe
    data = pd.DataFrame(list(zip(cleanQuestions, cleanAnswers)), columns=['input', 'target'])

    #Add start and end tokens to target sequences (only used for the sample print below)
    data.target = data.target.apply(lambda x: 'START ' + x + ' END')
    print('---')
    print('Random Sample Data: ')
    print(data.sample(6))
    print('---')

    #Create a dictionary for the frequency of the vocabulary.
    #Insertion order (questions first, then answers) defines the word indices below.
    vocabulary = {}
    for sentence in allInputWords + allTargetWords:
        for word in sentence:
            vocabulary[word] = vocabulary.get(word, 0) + 1

    #Words seen fewer than `threshold` times are later encoded as UNK
    threshold = 15

    print("Size of total vocabulary:", len(vocabulary))

    #Index 0 is padding and index 1 is the START tag for the decoder, so words start at 2
    word_num = 2
    encodingDict = {}
    decodingDict = {1: 'START'}
    for word, frequency in vocabulary.items():
        if frequency >= threshold:
            encodingDict[word] = word_num
            decodingDict[word_num] = word
            word_num += 1

    print("No. of vocabulary used:", word_num)

    #include unknown token for words not in dictionary (it takes index word_num)
    unkIndex = len(encodingDict) + 2
    decodingDict[unkIndex] = 'UNK'
    encodingDict['UNK'] = unkIndex

    dictSize = word_num + 1

    def modify(encodingDict, data, vector_size=20):
        """Encode token lists as a zero-padded (len(data), vector_size) float array.

        :param encodingDict: word -> index mapping containing an 'UNK' entry
        :param data: list of token sequences
        :param vector_size: size of an encoded vector; longer sequences are truncated
        """
        transformedData = np.zeros(shape=(len(data), vector_size))
        for i, tokens in enumerate(data):
            for j, token in enumerate(tokens[:vector_size]):
                try:
                    transformedData[i][j] = encodingDict[token]
                except KeyError:
                    #Only a missing word falls back to UNK; other errors surface
                    transformedData[i][j] = encodingDict['UNK']
        return transformedData

    #encoding training set
    inputLength = 25
    ouputLength = 25
    encodedTrainInput = modify(encodingDict, X_train, vector_size=inputLength)
    encodedTrainOutput = modify(encodingDict, y_train, vector_size=ouputLength)
    #encoding validation set
    encodedValInput = modify(encodingDict, X_test, vector_size=inputLength)
    encodedValOutput = modify(encodingDict, y_test, vector_size=ouputLength)

    print('Train data size is: ', encodedTrainInput.shape)
    print('Test data size is : ', encodedValInput.shape)

    #Building the Seq2Seq model
    tf.keras.backend.clear_session()

    encoderInput = Input(shape=(inputLength,))
    decoderInput = Input(shape=(ouputLength,))

    #Encoder
    neuronDim = 512
    encoderEmbeddings = Embedding(dictSize, 128, input_length=inputLength, mask_zero=True)(encoderInput)
    encoderLstm = LSTM(neuronDim, return_sequences=True, unroll=True)(encoderEmbeddings)
    #The output of the last encoder time step seeds the decoder
    encoderState = encoderLstm[:, -1, :]

    print('encoderLstm: ', encoderLstm)
    print('encoderState: ', encoderState)

    #Decoder, with encoderState used as both initial LSTM states
    decoderEmbeddings = Embedding(dictSize, 128, input_length=ouputLength, mask_zero=True)(decoderInput)
    decoderLstm = LSTM(neuronDim, return_sequences=True, unroll=True)(decoderEmbeddings, initial_state=[encoderState, encoderState])

    print('decoderLstm: ', decoderLstm)

    #Attention Mechanism
    from keras.layers import Activation, dot, concatenate

    #Dot-product score between every decoder step and every encoder step
    attention = dot([decoderLstm, encoderLstm], axes=[2, 2])
    attention = Activation('softmax', name='attention')(attention)
    print('attention: ', attention)

    #Attention-weighted sum of the encoder outputs
    contextVector = dot([attention, encoderLstm], axes=[2, 1])
    print('contextVector: ', contextVector)

    decoderCombinedContext = concatenate([contextVector, decoderLstm])
    print('decoderCombinedContext: ', decoderCombinedContext)

    #Another weight and tanh layer, then the per-step distribution over the vocabulary
    decoderOutput = TimeDistributed(Dense(neuronDim, activation="tanh"))(decoderCombinedContext)
    decoderOutput = TimeDistributed(Dense(dictSize, activation="softmax"))(decoderOutput)
    print('decoderOutput: ', decoderOutput)

    #Define the model
    model = Model(inputs=[encoderInput, decoderInput], outputs=[decoderOutput])
    #The output is a softmax over mutually exclusive words with one-hot targets,
    #so the matching loss is categorical (not binary) cross-entropy
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    model.summary()

    #preparing data for encoder and decoder:
    #decoder input is the target shifted right by one step with START (1) in front
    trainEncoderInput = encodedTrainInput
    trainDecoderInput = np.zeros_like(encodedTrainOutput)
    trainDecoderInput[:, 1:] = encodedTrainOutput[:, :-1]
    trainDecoderInput[:, 0] = 1
    trainDecoderOutput = np.eye(dictSize)[encodedTrainOutput.astype('int')]

    testEncoderInput = encodedValInput
    testDecoderInput = np.zeros_like(encodedValOutput)
    testDecoderInput[:, 1:] = encodedValOutput[:, :-1]
    testDecoderInput[:, 0] = 1
    testDecoderOutput = np.eye(dictSize)[encodedValOutput.astype('int')]

    '''
    #UNCOMMENT to TRAIN THE MODEL
    for i in range(10):
      model.fit(x=[trainEncoderInput, trainDecoderInput], y=[trainDecoderOutput],
                    validation_data=([testEncoderInput, testDecoderInput], [testDecoderOutput]),
                    #validation_split=0.05,
                    batch_size=64, epochs=10)

    model.save('model_attention_weights.h5')
    '''
    #Load the model (replaces the freshly compiled graph above)
    print('Loading the Model')
    model = load_model('model_attention_60.h5')

    def prediction(raw_input):
        """Decode a reply for one raw sentence; returns the argmax indices of the last pass."""
        cleanInput = removePuncAndClean(raw_input)
        inputToken = [nltk.word_tokenize(cleanInput)]
        inputToken = [inputToken[0][::-1]]  #reversing input seq, as done for the training data
        encoderInput = modify(encodingDict, inputToken, 25)
        decoderInput = np.zeros(shape=(len(encoderInput), ouputLength))
        decoderInput[:, 0] = 1
        #Re-run the model once per output position, feeding the argmax at position i back in
        for i in range(1, ouputLength):
            decoderOutput = model.predict([encoderInput, decoderInput]).argmax(axis=2)
            decoderInput[:, i] = decoderOutput[:, i]
        return decoderOutput

    def decodeSequence(decodingDict, vector):
        """Turn word indices back into text, stopping at the first padding index (0)."""
        txt = ''
        for i in vector:
            if i == 0:
                break
            txt += ' '
            txt += decodingDict[i]
        return txt

    def _showPredictions(questionSet, answerSet, howMany=5):
        """Print *howMany* randomly chosen question / actual / predicted triples."""
        for _ in range(howMany):
            #Lower bound 0 so the first sentence can be sampled too (upper bound is exclusive)
            seq_index = np.random.randint(0, len(questionSet))
            output = prediction(questionSet[seq_index])
            print('Question           :', questionSet[seq_index])
            print('Actual Response    : ', answerSet[seq_index])
            print('Predicted Response : ', decodeSequence(decodingDict, output[0]))
            print('----')

    print("FIVE EXAMPLES: TRAIN SENTENCE PREDICTIONS: ")
    print('---')
    _showPredictions(XX_train, yy_train)

    print('----')
    print("FIVE EXAMPLES: TEST SENTENCE PREDICTIONS: ")
    print('---')
    _showPredictions(XX_test, yy_test)
    print('----')
    print('Importing libraries to calculate Bleu score...')
    from nltk.translate.bleu_score import SmoothingFunction
    from nltk.translate.bleu_score import sentence_bleu

    smoother = SmoothingFunction()

    def _meanBleu(sentences, references):
        """Return the mean smoothed sentence-level BLEU of the model over the given pairs."""
        scores = []
        for sentence, reference in zip(sentences, references):
            output = prediction(sentence)
            #Tokenise the reference the same way the model's targets were tokenised,
            #and use split() so the leading space of the decoded text yields no empty token
            ref = nltk.word_tokenize(reference)
            pred = decodeSequence(decodingDict, output[0]).split()
            if not ref or not pred:
                scores.append(0.0)
                continue
            #Use the highest n-gram order (max 4) that both sequences can support
            order = min(len(ref), len(pred), 4)
            weights = tuple(1.0/order for _ in range(order))
            scores.append(sentence_bleu([ref], pred, weights=weights,
                                        smoothing_function=smoother.method2))
        return sum(scores)/float(len(scores)) if scores else 0.0

    print('---')
    print("Calculating Bleu Score for Train data ...")
    print('---')
    print("Bleu Score for Train data: ", _meanBleu(XX_train, yy_train))
    print('---')
    print("Calculating Bleu Score for Test data...")
    print('---')
    print("Bleu Score for Test data: ", _meanBleu(XX_test, yy_test))
    print('---')
    print("END!")


#MAIN FUNCTION
if __name__ == "__main__":
    print('Scanning through the corpus...')
    nltk.download('punkt')
    # Use the first command-line argument as the corpus directory when it names
    # an existing directory; otherwise fall back to the previous default.
    # The isdir() check matters because a notebook kernel passes its own
    # arguments (e.g. "-f <connection file>") in sys.argv.
    if len(sys.argv) > 1 and os.path.isdir(sys.argv[1]):
        data_path = sys.argv[1]
    else:
        data_path = '/content/'
    main(data_path)

Scanning through the corpus...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Total Number of questions:  221616
Total Number of answers  : 221616
Train data size is:  12000
Test data size is :  3000
---
Random Sample Data: 
                                                   input                                             target
821                             i am trying to tell you.                       START alright. go ahead. END
7710           i am dealing with what god put before me.  START you believe he wants this? a woman chain...
8128                                  this feels stupid.  START good for a smart girl to feel stupid. pa...
6307                         so what are you doing here?  START he wants to know who brought in the chal...
1920                                          about him?  START no. well, not exactly...it is that...i f...
14685  i cannot wait until i can drive next year. i w...       

In [None]:
# Load the data: each file is read fully and split into one string per line.
# Context managers make sure both file handles are closed after reading.
with open('movie_lines.txt', encoding='utf-8', errors='ignore') as f:
    movieLines = f.read().split('\n')
with open('movie_conversations.txt', encoding='utf-8', errors='ignore') as f:
    movieConversations = f.read().split('\n')

In [None]:
#Data Preprocessing
#Index every utterance by its line id. A row is used only when splitting it
#on the field separator yields exactly five fields (id first, text last).
id2line = {}
for rawRow in movieLines:
    fields = rawRow.split(' +++$+++ ')
    if len(fields) != 5:
        continue
    id2line[fields[0]] = fields[4]

In [None]:
# Collect, per conversation, the list of its line ids.
# The last field of a row is a bracketed, quoted list of ids; the brackets,
# quotes and spaces are stripped before splitting on commas. The final
# element of movieConversations is skipped.
conv_ids = [
    row.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "").split(',')
    for row in movieConversations[:-1]
]

In [None]:
'''
#id and conversation sample
for k in conv_ids[3]:
    print (k, id2line[k])
    '''

'\n#id and conversation sample\nfor k in conv_ids[3]:\n    print (k, id2line[k])\n    '

In [None]:
#Pair up consecutive lines of each conversation: a line is the question
#(input) and the line right after it is the answer (target).
questions = []
answers = []
for lineIds in conv_ids:
    for askedId, repliedId in zip(lineIds, lineIds[1:]):
        questions.append(id2line[askedId])
        answers.append(id2line[repliedId])


#Report how many pairs were produced
print('Total Number of questions: ', len(questions))
print('Total Number of answers  :', len(answers))

Total Number of questions:  221616
Total Number of answers  : 221616


In [None]:
#Removing the punctuations and cleaning
def removePuncAndClean(txt):
    """Normalise one sentence for training/inference.

    Lower-cases the text, expands common English contractions, removes a
    fixed set of symbols and collapses runs of whitespace to single spaces.

    NOTE: the vocabulary indices are built from the output of this function,
    so a model trained with a different cleaning rule set should be
    re-validated (or retrained) after changing it.

    :param txt: raw sentence
    :return: cleaned sentence
    """
    # Applied in order: the specific contractions must run before the
    # generic "n't" and "n'" rules.
    replacements = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        (r"what's", "what is"),  # previously "that is", which changed the sentence's meaning
        (r"where's", "where is"),
        (r"how's", "how is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        (r"n'", "ng"),
        (r"'bout", "about"),
        (r"'til", "until"),
        (r"[-()\"#/@;:<>{}`+=~|]", ""),
    )
    txt = txt.lower()
    for pattern, replacement in replacements:
        txt = re.sub(pattern, replacement, txt)
    # split()/join collapses any whitespace run and trims both ends
    return " ".join(txt.split())

In [None]:
#Run every question and every answer through the cleaning function
cleanQuestions = [removePuncAndClean(rawQuestion) for rawQuestion in questions]
cleanAnswers = [removePuncAndClean(rawAnswer) for rawAnswer in answers]

In [None]:
#Word count of every cleaned sentence: all questions first, then all answers
lengths = [len(sentence.split()) for sentence in cleanQuestions + cleanAnswers]

'''
# Create a dataframe so that the values can be inspected
lengths = pd.DataFrame(lengths, columns=['counts'])
print(np.percentile(lengths, 80))
print(np.percentile(lengths, 85))
print(np.percentile(lengths, 90))
print(np.percentile(lengths, 95))
'''

"\n# Create a dataframe so that the values can be inspected\nlengths = pd.DataFrame(lengths, columns=['counts'])\nprint(np.percentile(lengths, 80))\nprint(np.percentile(lengths, 85))\nprint(np.percentile(lengths, 90))\nprint(np.percentile(lengths, 95))\n"

In [None]:
# Keep only pairs whose question AND answer have between 2 and 25 words.

# Stage 1: filter on the question length, carrying the matching answer along
questionFiltered = [
    (question, answer)
    for question, answer in zip(cleanQuestions, cleanAnswers)
    if 2 <= len(question.split()) <= 25
]
smallQuestions = [question for question, _ in questionFiltered]
smallAnswers = [answer for _, answer in questionFiltered]

# Stage 2: filter the survivors on the answer length
answerFiltered = [
    (question, answer)
    for question, answer in zip(smallQuestions, smallAnswers)
    if 2 <= len(answer.split()) <= 25
]
cleanQuestions = [question for question, _ in answerFiltered]
cleanAnswers = [answer for _, answer in answerFiltered]


In [None]:
'''
r = np.random.randint(1,len(cleanQuestions))

for i in range(r, r+3):
    print(cleanQuestions[i])
    print(shortAnswers[i])
    print()

'''

'\nr = np.random.randint(1,len(cleanQuestions))\n\nfor i in range(r, r+3):\n    print(cleanQuestions[i])\n    print(shortAnswers[i])\n    print()\n\n'

In [None]:
# Download NLTK's 'punkt' package (tokenizer data) before tokenising below.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
#Limit the data set to the first SampleSize pairs
SampleSize = 15000  # Number of samples to train on.
cleanQuestions, cleanAnswers = cleanQuestions[:SampleSize], cleanAnswers[:SampleSize]
#Tokenise every question and answer into a list of word tokens
allInputWords = list(map(nltk.word_tokenize, cleanQuestions))
allTargetWords = list(map(nltk.word_tokenize, cleanAnswers))

In [None]:
#train-validation split
dataSize = len(allInputWords)
# Boundary between the first 80% (training) and the remainder (validation)
trainCount = round(dataSize*(80/100))

# Training part; the input token order is reversed
X_train = [list(reversed(tokens)) for tokens in allInputWords[:trainCount]]
y_train = allTargetWords[:trainCount]

# Validation part; the input token order is reversed in the same way
X_test = [list(reversed(tokens)) for tokens in allInputWords[trainCount:]]
y_test = allTargetWords[trainCount:]

print('Train data size is: ', len(X_train))
print("Test data size is : ", len(X_test))

Train data size is:  12000
Test data size is :  3000


In [None]:
# Untokenised sentences split at the same 80% boundary as the token lists
rawSplit = round(dataSize*(80/100))
XX_train, XX_test = cleanQuestions[:rawSplit], cleanQuestions[rawSplit:]
yy_train, yy_test = cleanAnswers[:rawSplit], cleanAnswers[rawSplit:]

# Show the first two training questions and the whole test question list
print(XX_train[:2])
print(XX_test)

['can we make this quick? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad. again.', 'well, i thought we would start with pronunciation, if that is okay with you.']


In [None]:
# Count how often each token occurs across all questions and then all answers.
# A plain dict is used, so keys stay in first-seen order.
vocabulary = {}
for tokens in allInputWords + allTargetWords:
    for token in tokens:
        vocabulary[token] = vocabulary.get(token, 0) + 1


Encoder token size is 15000 and decoder token size is 15000


In [None]:
# Dump the complete token -> frequency table for inspection.
print(vocabulary)



In [None]:
# Rare words are dropped from the vocabulary: only tokens seen at least
# `threshold` times are kept; the rest will be replaced with <UNK>.
# The aim is to replace fewer than 5% of words.
threshold = 15
# Number of distinct tokens that reach the threshold
count = sum(1 for frequency in vocabulary.values() if frequency >= threshold)

In [None]:
# Compare the number of distinct tokens with the number that reach `threshold`.
print("Size of total vocabulary:", len(vocabulary))
print("Size of vocabulary we will use:", count)

Size of total vocabulary: 11763
Size of vocabulary we will use: 1270


In [None]:
#Build the dictionaries that give every kept word a unique integer.
#Index 0 is padding (WORD_CODE_PADDING) and index 1 is the decoder START
#tag (WORD_CODE_START), so real words are numbered from 2.


word_num = 2
encodingDict = {}
decodingDict = {1: 'START'}
# The loop variable is named `frequency` so it no longer overwrites the
# global `count` computed by the threshold cell.
for word, frequency in vocabulary.items():
    if frequency >= threshold:  #get vocabularies that appear above threshold count
        encodingDict[word] = word_num
        decodingDict[word_num] = word
        word_num += 1

print("No. of vocabulary used:", word_num)

No. of vocabulary used: 1272


In [None]:
# Dump the index -> word mapping for inspection.
print(decodingDict)

{1: 'START', 2: 'can', 3: 'we', 4: 'make', 5: 'this', 6: '?', 7: 'and', 8: 'are', 9: 'having', 10: 'an', 11: 'break', 12: 'up', 13: 'on', 14: 'the', 15: '.', 16: 'again', 17: 'well', 18: ',', 19: 'i', 20: 'thought', 21: 'would', 22: 'start', 23: 'with', 24: 'if', 25: 'that', 26: 'is', 27: 'okay', 28: 'you', 29: 'not', 30: 'part', 31: 'please', 32: 'asking', 33: 'me', 34: 'out', 35: 'so', 36: 'your', 37: 'name', 38: 'thing', 39: 'am', 40: 'at', 41: 'of', 42: 'a', 43: 'my', 44: 'sister', 45: 'date', 46: 'until', 47: 'she', 48: 'does', 49: 'used', 50: 'to', 51: 'be', 52: 'really', 53: 'when', 54: 'started', 55: 'high', 56: 'school', 57: 'then', 58: 'it', 59: 'was', 60: 'just', 61: 'like', 62: 'got', 63: 'sick', 64: 'or', 65: 'something', 66: 'only', 67: 'could', 68: 'find', 69: 'boyfriend', 70: '...', 71: 'ma', 72: 'head', 73: 'because', 74: 'such', 75: 'nice', 76: 'one', 77: 'how', 78: 'our', 79: 'little', 80: 'plan', 81: 'mind', 82: 'have', 83: 'word', 84: 'as', 85: 'do', 86: 'get', 87:

In [None]:
#Register the unknown-word token under the next free index
#(the number of encoded words plus the two reserved indices)
unkIndex = len(encodingDict) + 2
decodingDict[unkIndex] = '<UNK>'
encodingDict['<UNK>'] = unkIndex


In [None]:
# Size of the index space passed to the Embedding layers, to the final Dense
# layer and to np.eye(): one more than word_num.
dictSize = word_num+1

In [None]:
def modify(encodingDict, data, vector_size=20):
    """Encode token sequences as a zero-padded 2-D array of word indices.

    Row ``i`` holds the indices of the first ``vector_size`` tokens of
    ``data[i]``; shorter sequences are padded with 0 and tokens missing from
    *encodingDict* are mapped to the index stored under ``'<UNK>'``.

    :param encodingDict: word -> index mapping that contains a '<UNK>' entry
    :param data: list of token sequences
    :param vector_size: size of each encoded vector
    :return: float array of shape (len(data), vector_size)
    :raises KeyError: if a token is unknown and *encodingDict* has no '<UNK>'
    """
    transformedData = np.zeros(shape=(len(data), vector_size))
    for i, tokens in enumerate(data):
        for j, token in enumerate(tokens[:vector_size]):
            try:
                transformedData[i][j] = encodingDict[token]
            except KeyError:
                # Only an unknown word falls back to <UNK>; any other error surfaces
                transformedData[i][j] = encodingDict['<UNK>']
    return transformedData

In [None]:
#encoding training set
# Both the encoder and the decoder sequences are padded/truncated to 25 tokens.
# NOTE: `ouputLength` (sic) is the name other cells refer to, so it is kept.
inputLength = 25
ouputLength = 25
encodedTrainInput = modify(encodingDict, X_train, vector_size=inputLength)
encodedTrainOutput = modify(encodingDict, y_train, vector_size=ouputLength)

# Sanity check of the encoded array shapes
print('encodedTrainInput: ', encodedTrainInput.shape)
print('encodedTrainOutput', encodedTrainOutput.shape)

encodedTrainInput:  (12000, 25)
encodedTrainOutput (12000, 25)


In [None]:
#encoding validation set
# Same dictionary and vector sizes as for the training set
encodedValInput = modify(encodingDict, X_test, vector_size=inputLength)
encodedValOutput = modify(encodingDict, y_test, vector_size=ouputLength)

# Sanity check of the encoded array shapes
print('encodedValInput', encodedValInput.shape)
print('encodedValOutput', encodedValOutput.shape)

encodedValInput (3000, 25)
encodedValOutput (3000, 25)


In [None]:
#2 Model Building
#2.1 Sequence-to-Sequence in Keras
import tensorflow as tf
# Reset the Keras backend session before the model graph is (re)built
tf.keras.backend.clear_session()

In [None]:
# Fixed sequence lengths of the encoder and decoder inputs
inputLength = 25
ouputLength = 25

# Symbolic inputs: one vector of inputLength / ouputLength word indices per sample.
# NOTE(review): the next cell creates both Input tensors again.
encoderInput = Input(shape=(inputLength,))
decoderInput = Input(shape=(ouputLength,))

In [None]:
# NOTE(review): SimpleRNN is imported but not used in this cell.
from keras.layers import SimpleRNN

# Symbolic inputs holding word indices for the encoder and the decoder
encoderInput = Input(shape=(inputLength,))
decoderInput = Input(shape=(ouputLength,))

# Encoder: 128-d embeddings (index 0 is masked as padding) fed to a 512-unit
# LSTM that returns its output for every time step.
neuronDim = 512
encoderEmbeddings = Embedding(dictSize, 128, input_length=inputLength, mask_zero=True)(encoderInput)
encoderLstm = LSTM(neuronDim, return_sequences=True, unroll=True)(encoderEmbeddings)
# Output of the last encoder time step; used below to initialise the decoder
encoderState = encoderLstm[:,-1,:]

print('encoderLstm: ', encoderLstm)
print('encoderState: ', encoderState)

# Decoder: its own embedding layer and LSTM; encoderState is passed as both
# entries of the LSTM's initial_state.
decoderEmbeddings = Embedding(dictSize, 128, input_length=ouputLength, mask_zero=True)(decoderInput)
decoderLstm = LSTM(neuronDim, return_sequences=True, unroll=True)(decoderEmbeddings, initial_state=[encoderState, encoderState])

print('decoderLstm: ', decoderLstm)

# For the plain Sequence-to-Sequence model, the output was produced directly from the decoder:
# output = TimeDistributed(Dense(output_dict_size, activation="softmax"))(decoder)

encoderLstm:  Tensor("lstm/transpose_2:0", shape=(None, 25, 512), dtype=float32)
encoderState:  Tensor("strided_slice:0", shape=(None, 512), dtype=float32)
decoderLstm:  Tensor("lstm_1/transpose_2:0", shape=(None, 25, 512), dtype=float32)


In [None]:
#2.2 Attention Mechanism
#Reference: Effective Approaches to Attention-based Neural Machine Translation's Global Attention with Dot-based scoring function (Section 3, 3.1) https://arxiv.org/pdf/1508.04025.pdf

from keras.layers import Activation, dot, concatenate

# Equation (7) with 'dot' score from Section 3.1 in the paper.
# Note that we reuse the Softmax-activation layer instead of writing the tensor calculation.
# The dot over the feature axes gives one score per (decoder step, encoder step) pair.
attention = dot([decoderLstm, encoderLstm], axes=[2, 2])
attention = Activation('softmax', name='attention')(attention)
print('attention: ', attention)

# Context vector: attention-weighted sum of the encoder outputs for every decoder step
contextVector = dot([attention, encoderLstm], axes=[2,1])
print('contextVector: ', contextVector)

# Concatenate the context with the decoder output along the feature axis (512 + 512 = 1024)
decoderCombinedContext = concatenate([contextVector, decoderLstm])
print('decoderCombinedContext: ', decoderCombinedContext)

# Has another weight + tanh layer as described in equation (5) of the paper,
# followed by a per-step softmax over the dictSize word indices
decoderOutput = TimeDistributed(Dense(neuronDim, activation="tanh"))(decoderCombinedContext)
decoderOutput = TimeDistributed(Dense(dictSize, activation="softmax"))(decoderOutput)
print('decoderOutput: ', decoderOutput)

attention:  Tensor("attention/truediv:0", shape=(None, 25, 25), dtype=float32)
contextVector:  Tensor("dot_1/MatMul:0", shape=(None, 25, 512), dtype=float32)
decoderCombinedContext:  Tensor("concatenate/concat:0", shape=(None, 25, 1024), dtype=float32)
decoderOutput:  Tensor("time_distributed_1/Reshape_1:0", shape=(None, 25, 1273), dtype=float32)


In [None]:
# Assemble the model from the two inputs to the per-step word distribution
model = Model(inputs=[encoderInput, decoderInput], outputs=[decoderOutput])
# The output layer is a softmax over mutually exclusive words and the targets
# are one-hot rows, so the matching loss is categorical (not binary)
# cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 25)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 25, 128)      162944      input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 25)]         0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 25, 512)      1312768     embedding[0][0]                  
_______________________________________________________________________________________

In [None]:
# Training arrays. The decoder input is the target sequence shifted one step
# to the right with the START index (1) in the first column; the decoder
# target is the one-hot encoding of the unshifted sequence.
trainEncoderInput = encodedTrainInput
trainDecoderInput = np.hstack([
    np.ones((len(encodedTrainOutput), 1)),
    encodedTrainOutput[:, :-1],
])
trainDecoderOutput = np.eye(dictSize)[encodedTrainOutput.astype('int')]

# Validation arrays, built the same way
testEncoderInput = encodedValInput
testDecoderInput = np.hstack([
    np.ones((len(encodedValOutput), 1)),
    encodedValOutput[:, :-1],
])
testDecoderOutput = np.eye(dictSize)[encodedValOutput.astype('int')]

In [None]:
'''
#UNCOMMENT to TRAIN THE MODEL
for i in range(10):
  model.fit(x=[trainEncoderInput, trainDecoderInput], y=[trainDecoderOutput],
                validation_data=([testEncoderInput, testDecoderInput], [testDecoderOutput]),
                #validation_split=0.05,
                batch_size=64, epochs=10)

  model.save('model_attention_weights.h5')
'''

"\n#UNCOMMENT to TRAIN THE MODEL\nfor i in range(10):\n  model.fit(x=[trainEncoderInput, trainDecoderInput], y=[trainDecoderOutput],\n                validation_data=([testEncoderInput, testDecoderInput], [testDecoderOutput]),\n                #validation_split=0.05,\n                batch_size=64, epochs=10)\n\n  model.save('model_attention_weights.h5')\n"

In [None]:
# Replace the freshly compiled model with the one saved in 'model_attention_60.h5'.
model = load_model('model_attention_60.h5')



In [None]:
# Load the saved model again (rebinds the global `model` used by prediction()).
model = load_model('model_attention_60.h5')
def prediction(raw_input):
    """Decode a reply for a single raw sentence.

    The sentence is cleaned, tokenised, reversed and encoded like the
    training inputs. The decoder input starts as START (1) followed by
    zeros and is filled in one position per model pass.

    :param raw_input: sentence as a plain string
    :return: array of argmax word indices from the last model pass,
             one row of ouputLength entries
    """
    cleanInput = removePuncAndClean(raw_input)
    inputToken = [nltk.word_tokenize(cleanInput)]
    inputToken = [inputToken[0][::-1]]  # reversing the input sequence, as done for the training data
    encoderInput = modify(encodingDict, inputToken, 25)
    decoderInput = np.zeros(shape=(len(encoderInput), ouputLength))
    decoderInput[:,0] = 1
    # One full model pass per output position: the argmax at position i is
    # copied into the decoder input at position i before the next pass.
    # NOTE(review): position i is fed from output[:, i] rather than
    # output[:, i-1]; presumably this relies on the zero-masked embedding -
    # confirm before changing.
    for i in range(1, ouputLength):
        decoderOutput = model.predict([encoderInput, decoderInput]).argmax(axis=2)
        decoderInput[:,i] = decoderOutput[:,i]
    return decoderOutput

In [None]:
def decodeSequence(decodingDict, vector):
    """
    Turn a sequence of word ids back into text.

    :param decodingDict: mapping from word id to word
    :param vector: iterable of word ids; decoding stops at the first 0
    :return: the decoded words, each preceded by a single space (so a
             non-empty result starts with a space); '' if nothing was decoded
    """
    pieces = []
    for wordId in vector:
        # Id 0 ends the sequence; anything after it is ignored.
        if wordId == 0:
            break
        pieces.append(' ' + decodingDict[wordId])
    return ''.join(pieces)

In [None]:
# Interactive demo, disabled by wrapping it in a string literal: none of the
# code inside the quotes is executed. If restored, it would read one line from
# the user, run prediction() on it and print the decoded first result.
# NOTE(review): the snippet calls `decode`, which is not defined in the visible
# part of this file; the decoding helper defined here is `decodeSequence` -
# confirm the name before re-enabling.
"""
#NEW CODE USER INPUT

output = prediction(input("User: "))
print('Outp: ',decode(decodingDict, output[0]))

"""

'\n#NEW CODE USER INPUT\n\noutput = prediction(input("User: "))\nprint(\'Outp: \',decode(decodingDict, output[0]))\n\n'

In [None]:
def _printRandomPredictions(questionSet, responseSet, count=5):
    """
    Print randomly chosen question / actual response / predicted response triples.

    :param questionSet: indexable collection of question sentences
    :param responseSet: collection of the matching actual responses, indexed
                        the same way as ``questionSet``
    :param count: how many random examples to print
    """
    for _ in range(count):
        # randint's upper bound is exclusive, so the lower bound must be 0 for
        # every sample (including the first one) to be eligible.
        seq_index = np.random.randint(0, len(questionSet))
        output = prediction(questionSet[seq_index])
        print('Question           :', questionSet[seq_index])
        print('Actual Response    : ', responseSet[seq_index])
        print('Predicted Response : ', decodeSequence(decodingDict, output[0]))
        print('----')


print("FIVE EXAMPLES: TRAIN SENTENCE PREDICTIONS: ")
print('---')
_printRandomPredictions(XX_train, yy_train)

print('----')
print("FIVE EXAMPLES: TEST SENTENCE PREDICTIONS: ")
print('---')
_printRandomPredictions(XX_test, yy_test)


FIVE EXAMPLES: TRAIN SENTENCE PREDICTIONS: 
---
Question           : that is not
Actual Response    :  i am not stupid enough to repeat your mistakes.
Predicted Response :   you interested in a little work ?
----
Question           : they hired you. you are like, a private detective?
Actual Response    :  that is exactly what i am.
Predicted Response :   that is exactly what i am .
----
Question           : so much for honor among thieves. you would have cut loose your friends, your girl...
Actual Response    :  i was doing it for them.
Predicted Response :   i was doing it for them .
----
Question           : he was hurt, but not seriously. he will be fine.
Actual Response    :  do you have the suspect in custody?
Predicted Response :   do you have the suspect in <UNK> ?
----
Question           : we cannot jump from here or at this speed. but if we could get a message out tell the refueling plane...
Actual Response    :  they have cut communication, and i spent a good bit of time look

In [None]:
print('Importing libraries to calculate bleu score...')
import nltk
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import sentence_bleu

c = SmoothingFunction()


def _smoothedSentenceBleu(actualOutput, predictedOutput):
    """
    Score one predicted sentence against its reference with smoothed BLEU.

    :param actualOutput: the reference response as text
    :param predictedOutput: the decoded model response as text
    :return: sentence-level BLEU (smoothing method2), using uniform weights
             over n-gram orders 1..k where k = min(4, len(ref), len(pred)),
             never below 1
    """
    # split() without an argument discards empty tokens. decodeSequence output
    # starts with a space, so split(' ') would put a bogus '' token at the
    # front of the hypothesis and distort both its length and its n-grams.
    ref = actualOutput.split()
    pred = predictedOutput.split()
    # NOTE(review): the recorded output shows references with punctuation
    # attached ("am.") while predictions carry it as a separate token
    # ("am ."); tokenizing the reference the same way would likely make the
    # comparison fairer - confirm before changing the metric.

    # Only use n-gram orders both sentences are long enough to contain.
    order = max(1, min(4, len(ref), len(pred)))
    weights = (1.0 / order,) * order
    return sentence_bleu([ref], pred, weights=weights, smoothing_function=c.method2)


def _bleuScores(questionSet, responseSet, limit=5):
    """
    Compute BLEU for the first ``limit`` question/response pairs.

    :param questionSet: sliceable collection of question sentences
    :param responseSet: sliceable collection of the matching actual responses
    :param limit: number of leading pairs to evaluate
    :return: list with one BLEU score per evaluated pair
    """
    scores = []
    for x, y in zip(questionSet[:limit], responseSet[:limit]):
        output = prediction(x)
        predictedOutput = decodeSequence(decodingDict, output[0])
        scores.append(_smoothedSentenceBleu(y, predictedOutput))
    return scores


def _meanScore(scores):
    """Return the arithmetic mean of ``scores``, or NaN when the list is empty."""
    if not scores:
        return float('nan')
    return sum(scores) / float(len(scores))


print('---')
print("Calculating Bleu Score for Train data...")
print('---')
bleuScoresTrain = _bleuScores(XX_train, yy_train)

print("Bleu Score for Train data: ", _meanScore(bleuScoresTrain))
print('---')
print("Calculating Bleu Score for Test data...")
print('---')
bleuScoresTest = _bleuScores(XX_test, yy_test)

print("Bleu Score for Test data: ", _meanScore(bleuScoresTest))
print('---')
print("END!")

Importing libraries to calculate bleu score...
---
Calculating Bleu Score for Train data...
---
Bleu Score for Train data:  0.34871464062534524
---
Calculating Bleu Score for Test data...
---
Bleu Score for Test data:  0.09274912611049727
---
END!
