In [1]:
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras import layers , activations , models , preprocessing
import os
import pandas as pd

In [2]:
r""" Build a word sequence-to-sequence training set """

from tensorflow.keras import preprocessing , utils

questions = list()
answers = list()

with open("movies-sequence-input.txt", "r") as file_input:
    movie_input = file_input.read()
df_input = pd.DataFrame(movie_input.split('\n'),columns=list('i'))
df_input = df_input.fillna(' ')

with open("movies-sequence-output.txt", "r") as file_output:
    movie_output = file_output.read()
df_output = pd.DataFrame(movie_output.split('\n'),columns=list('o'))
df_output = df_output.fillna(' ')

for input_text, target_text in zip(df_input.i, df_output.o):
    if(len(input_text)>400):
        questions.append(input_text[:400])
    else:
        questions.append(input_text)
    if(len(target_text)>400):
        answers.append(target_text[:400])
    else:
        answers.append(target_text)
answers_with_tags = list()
for i in range( len( answers ) ):
    if type( answers[i] ) == str:
        answers_with_tags.append( answers[i] )
    else:
        questions.pop( i )

answers = list()
for i in range( len( answers_with_tags ) ) :
    answers.append( '<START> ' + answers_with_tags[i] + ' <END>' )

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts( questions + answers )
VOCAB_SIZE = len( tokenizer.word_index )+1
print( 'VOCAB SIZE : {}'.format( VOCAB_SIZE ))


VOCAB SIZE : 7159


In [3]:
# encoder_input_data
tokenized_questions = tokenizer.texts_to_sequences( questions )
maxlen_questions = max( [ len(x) for x in tokenized_questions ] )
padded_questions = preprocessing.sequence.pad_sequences( tokenized_questions , maxlen=maxlen_questions , padding='post' )
encoder_input_data = np.array( padded_questions )
print( encoder_input_data.shape , maxlen_questions )

# decoder_input_data
tokenized_answers = tokenizer.texts_to_sequences( answers )
maxlen_answers = max( [ len(x) for x in tokenized_answers ] )
padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
decoder_input_data = np.array( padded_answers )
print( decoder_input_data.shape , maxlen_answers )

# decoder_output_data
tokenized_answers = tokenizer.texts_to_sequences( answers )
for i in range(len(tokenized_answers)) :
    tokenized_answers[i] = tokenized_answers[i][1:]
padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
onehot_answers = utils.to_categorical( padded_answers , VOCAB_SIZE )
decoder_output_data = np.array( onehot_answers )
print( decoder_output_data.shape )

# Saving all the arrays to storage
np.save( 'enc_in_data.npy' , encoder_input_data )
np.save( 'dec_in_data.npy' , decoder_input_data )
np.save( 'dec_tar_data.npy' , decoder_output_data )


(6236, 83) 83
(6236, 89) 89
(6236, 89, 7159)


In [4]:
encoder_inputs = tf.keras.layers.Input(shape=( None , ))
encoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 200 , return_state=True )( encoder_embedding )
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=( None ,  ))
decoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 200 , return_state=True , return_sequences=True )
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( VOCAB_SIZE , activation=tf.keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')

model.summary()


Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 200)    1431800     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 200)    1431800     input_2[0][0]                    
_____________________________________

In [5]:
model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=50, epochs=100 ) 
model.save( 'model14.h5' ) 

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100

Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [6]:
def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 200 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 200 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model


In [12]:
def str_to_tokens( sentence : str ):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        try:
            tokens_list.append( tokenizer.word_index[ word ] ) 
        except KeyError as e:
             print('Movie Bot: I am sorry I do not understand this word ' + word)
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=maxlen_questions , padding='post')


In [8]:

enc_model , dec_model = make_inference_models()


for _ in range(10):
    prompt = input( 'Movie bot prompt : ' )
    if(prompt == 'end'):
        break
    states_values = enc_model.predict( str_to_tokens( prompt) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
                       
        if sampled_word == "end" or prompt == 'end' or len(decoded_translation.split()) > maxlen_answers:
            print("End")
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( "Movie_bot: " + decoded_translation )


Movie bot prompt : Hi how are you
End
Movie_bot:  not gonna gonna have to see me end
Movie bot prompt : what do you mean
End
Movie_bot:  i mean is it is it end
Movie bot prompt : Is that really what you mean
End
Movie_bot:  no end
Movie bot prompt : excuse me
End
Movie_bot:  yes end
Movie bot prompt : that does not make sense
End
Movie_bot:  what end
Movie bot prompt : this is fun
End
Movie_bot:  yeah it sounds great end
Movie bot prompt : ran out of time
End
Movie_bot:  that's what you going to tell me who i didn't want to know i don't want to know what it will this is better than you end
Movie bot prompt : what
End
Movie_bot:  i can't end
Movie bot prompt : I guess this is good
End
Movie_bot:  you and you know what's the name end
Movie bot prompt : The wine guy
End
Movie_bot:  it got an actor so end


In [None]:
for _ in range(10):
    prompt = input( 'Movie bot prompt : ' )
    if(prompt == 'end'):
        break
    states_values = enc_model.predict( str_to_tokens( prompt) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
                       
        if sampled_word == "end" or prompt == 'end' or len(decoded_translation.split()) > maxlen_answers:
            print("End")
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( "Movie_bot: " + decoded_translation )


Movie bot prompt : Not from the real world
End
Movie_bot:  i know how it was supposed to be be writing adult world if you don't me me if i don't want to go if it's not feel my drop me a lot of mine i lead if you just were here i can i just will all come in my head and i knew you could she might be throw me again now end
Movie bot prompt : only in my imagination
End
Movie_bot:  he's in love need too know they don't even want back back end
Movie bot prompt : I am going to ask you to marry me
End
Movie_bot:  if you get it to you on the car before that you have most most of it for you end
Movie bot prompt : does that mena we can not be lovers
I am sorry I do not understand this word mena
End
Movie_bot:  then have you been in here and if them you could find you on the ass end
Movie bot prompt : does that mean we can not be lovers
End
Movie_bot:  then can you give you in my world you're about the she out and you a little hope in case something he's interested in hear for the father and you h

In [16]:
for _ in range(10):
    prompt = input( 'Movie bot prompt : ' )
    if(prompt == 'end'):
        break
    states_values = enc_model.predict( str_to_tokens( prompt) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
                       
        if sampled_word == "end" or prompt == 'end' or len(decoded_translation.split()) > maxlen_answers:
            print("End")
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( "Movie_bot: " + decoded_translation )


Movie bot prompt : do you like baseball
End
Movie_bot:  yes i do end
Movie bot prompt : is baseball fun
End
Movie_bot:  yeah well there's no you're no steve drink to know end
Movie bot prompt : how is your imagination
End
Movie_bot:  she's been thinking about your hair end
Movie bot prompt : what is it like being virtual
I am sorry I do not understand this word virtual
End
Movie_bot:  the one was he said a good live in his date was a special and he haven't got a wazoo in miss my kid and he just out of you home and things home and if you it called me even a lot well you don't know my show you and what's your name and i am not my name end
Movie bot prompt : virtual
I am sorry I do not understand this word virtual
End
Movie_bot:  you know no you're not end
Movie bot prompt : end
