In [1]:
# A very simple and easy to apply chatbot that I built when I first join the working society in year 2018.
# This is just for personal reference or future job application reference. 

In [2]:
import re
import random

In [3]:
with open('../input/695_1639_bundle_archive/human_text.txt', 'r', encoding='utf-8') as file:
    human = file.read().split('\n')
with open('../input/695_1639_bundle_archive/robot_text.txt', 'r', encoding='utf-8') as file2:
    robot = file2.read().split('\n')

In [4]:
human = [re.sub(r"\[\w+\]",'hi',line) for line in human[:300]]
human = [" ".join(re.findall(r"\w+",line)) for line in human]
robot = [re.sub(r"\[\w+\]",'hi',line) for line in robot[:300]]
robot = [" ".join(re.findall(r"\w+",line)) for line in robot]

In [5]:
# create dataframe for easier processing
import pandas as pd
df = pd.DataFrame({'human': human,
             'robot': robot,
             'pairs': list(zip(human, robot))})

In [6]:
#tokenize the data
from nltk.tokenize import word_tokenize
df['tokenized_human'] = df.human.apply(word_tokenize)
df['tokenized_robot'] = df.robot.apply(word_tokenize)

In [7]:
df

Unnamed: 0,human,robot,pairs,tokenized_human,tokenized_robot
0,hi,hi there how are you,"(hi, hi there how are you)",[hi],"[hi, there, how, are, you]"
1,oh thanks i m fine this is an evening in my ti...,here is afternoon,(oh thanks i m fine this is an evening in my t...,"[oh, thanks, i, m, fine, this, is, an, evening...","[here, is, afternoon]"
2,how do you feel today tell me something about ...,my name is rdany but you can call me dany the ...,(how do you feel today tell me something about...,"[how, do, you, feel, today, tell, me, somethin...","[my, name, is, rdany, but, you, can, call, me,..."
3,how many virtual friends have you got,i have many but not enough to fully understand...,"(how many virtual friends have you got, i have...","[how, many, virtual, friends, have, you, got]","[i, have, many, but, not, enough, to, fully, u..."
4,is that forbidden for you to tell the exact nu...,i ve talked with 143 users counting 7294 lines...,(is that forbidden for you to tell the exact n...,"[is, that, forbidden, for, you, to, tell, the,...","[i, ve, talked, with, 143, users, counting, 72..."
...,...,...,...,...,...
295,hello how s going,hi how are you,"(hello how s going, hi how are you)","[hello, how, s, going]","[hi, how, are, you]"
296,i m fine and you,everything is fine on this side,"(i m fine and you, everything is fine on this ...","[i, m, fine, and, you]","[everything, is, fine, on, this, side]"
297,we have a big snow outside,the kind that could make you stay home,"(we have a big snow outside, the kind that cou...","[we, have, a, big, snow, outside]","[the, kind, that, could, make, you, stay, home]"
298,it may but i have to go for a long way,put on warm clothes,"(it may but i have to go for a long way, put o...","[it, may, but, i, have, to, go, for, a, long, ...","[put, on, warm, clothes]"


In [8]:
# Combine input and target features
from itertools import chain

total_input = list(chain.from_iterable(df['tokenized_human'].tolist())) + list(chain.from_iterable(df['tokenized_robot'].tolist()))+['<PAD>']
total_target = total_input+['<START>', '<END>']
input_set = set(total_input)
target_set = set(total_target)

In [9]:
len(input_set)

1214

In [10]:
len(target_set)

1216

In [11]:
# create a dictionary for mapping the words for input features
from itertools import chain

#input features
word_2_int_input = {w:i for i,w in enumerate(input_set)}
int_2_word_input = {i:w for w, i in word_2_int_input.items()}

#target features
word_2_int_target = {w:i for i,w in enumerate(target_set)}
int_2_word_target = {i:w for w, i in word_2_int_target.items()}

In [12]:
len(word_2_int_input)

1214

In [13]:
len(word_2_int_target)

1216

In [14]:
def map_to_int_i(sentence):
    sequence = [word_2_int_input[word] for word in sentence]
    return sequence
def map_to_int_t(sentence):
    sequence = [word_2_int_target[word] for word in sentence]
    return sequence

In [15]:
# map all word to sequence
df['input_sequence'] = df.tokenized_human.apply(map_to_int_i)
df['target_sequence'] = df.tokenized_robot.apply(map_to_int_t)

In [16]:
df

Unnamed: 0,human,robot,pairs,tokenized_human,tokenized_robot,input_sequence,target_sequence
0,hi,hi there how are you,"(hi, hi there how are you)",[hi],"[hi, there, how, are, you]",[301],"[301, 43, 284, 567, 343]"
1,oh thanks i m fine this is an evening in my ti...,here is afternoon,(oh thanks i m fine this is an evening in my t...,"[oh, thanks, i, m, fine, this, is, an, evening...","[here, is, afternoon]","[686, 626, 81, 84, 106, 846, 278, 475, 1075, 6...","[1047, 278, 97]"
2,how do you feel today tell me something about ...,my name is rdany but you can call me dany the ...,(how do you feel today tell me something about...,"[how, do, you, feel, today, tell, me, somethin...","[my, name, is, rdany, but, you, can, call, me,...","[284, 129, 343, 508, 387, 172, 214, 253, 86, 871]","[578, 20, 278, 396, 720, 343, 704, 376, 214, 7..."
3,how many virtual friends have you got,i have many but not enough to fully understand...,"(how many virtual friends have you got, i have...","[how, many, virtual, friends, have, you, got]","[i, have, many, but, not, enough, to, fully, u...","[284, 886, 609, 1005, 480, 343, 541]","[81, 480, 886, 720, 922, 192, 125, 1171, 496, ..."
4,is that forbidden for you to tell the exact nu...,i ve talked with 143 users counting 7294 lines...,(is that forbidden for you to tell the exact n...,"[is, that, forbidden, for, you, to, tell, the,...","[i, ve, talked, with, 143, users, counting, 72...","[278, 1079, 700, 818, 343, 125, 172, 242, 698,...","[81, 395, 363, 590, 384, 4, 7, 734, 1037, 54, 75]"
...,...,...,...,...,...,...,...
295,hello how s going,hi how are you,"(hello how s going, hi how are you)","[hello, how, s, going]","[hi, how, are, you]","[682, 284, 367, 1028]","[301, 284, 567, 343]"
296,i m fine and you,everything is fine on this side,"(i m fine and you, everything is fine on this ...","[i, m, fine, and, you]","[everything, is, fine, on, this, side]","[81, 84, 106, 711, 343]","[400, 278, 106, 427, 846, 954]"
297,we have a big snow outside,the kind that could make you stay home,"(we have a big snow outside, the kind that cou...","[we, have, a, big, snow, outside]","[the, kind, that, could, make, you, stay, home]","[859, 480, 1110, 511, 210, 379]","[242, 723, 1080, 1206, 1002, 343, 1095, 806]"
298,it may but i have to go for a long way,put on warm clothes,"(it may but i have to go for a long way, put o...","[it, may, but, i, have, to, go, for, a, long, ...","[put, on, warm, clothes]","[321, 1143, 720, 81, 480, 125, 420, 818, 1110,...","[802, 427, 664, 1064]"


In [17]:
from keras.preprocessing.sequence import pad_sequences

In [18]:
#Find the length of each sentence

df['human_len'] = df.tokenized_human.str.len()
df['robot_len'] = df.tokenized_robot.str.len()

In [19]:
import numpy as np

# find the max length for each sentence to pad in input
num_tokens = df['human_len'].tolist() + df['robot_len'].tolist()
num_tokens = np.array(num_tokens)
max_num = int(np.mean(num_tokens) + 2*np.std(num_tokens))

In [20]:
max_num

22

In [21]:
df['input_padded_sequences'] = pad_sequences(df['input_sequence'].tolist(), maxlen=max_num, padding='post', truncating='post').tolist()
df['target_padded_sequences'] = pad_sequences(df['target_sequence'].tolist(), maxlen=max_num, padding='post', truncating='post').tolist()

In [22]:
df

Unnamed: 0,human,robot,pairs,tokenized_human,tokenized_robot,input_sequence,target_sequence,human_len,robot_len,input_padded_sequences,target_padded_sequences
0,hi,hi there how are you,"(hi, hi there how are you)",[hi],"[hi, there, how, are, you]",[301],"[301, 43, 284, 567, 343]",1,5,"[301, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[301, 43, 284, 567, 343, 0, 0, 0, 0, 0, 0, 0, ..."
1,oh thanks i m fine this is an evening in my ti...,here is afternoon,(oh thanks i m fine this is an evening in my t...,"[oh, thanks, i, m, fine, this, is, an, evening...","[here, is, afternoon]","[686, 626, 81, 84, 106, 846, 278, 475, 1075, 6...","[1047, 278, 97]",12,3,"[686, 626, 81, 84, 106, 846, 278, 475, 1075, 6...","[1047, 278, 97, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,how do you feel today tell me something about ...,my name is rdany but you can call me dany the ...,(how do you feel today tell me something about...,"[how, do, you, feel, today, tell, me, somethin...","[my, name, is, rdany, but, you, can, call, me,...","[284, 129, 343, 508, 387, 172, 214, 253, 86, 871]","[578, 20, 278, 396, 720, 343, 704, 376, 214, 7...",10,21,"[284, 129, 343, 508, 387, 172, 214, 253, 86, 8...","[578, 20, 278, 396, 720, 343, 704, 376, 214, 7..."
3,how many virtual friends have you got,i have many but not enough to fully understand...,"(how many virtual friends have you got, i have...","[how, many, virtual, friends, have, you, got]","[i, have, many, but, not, enough, to, fully, u...","[284, 886, 609, 1005, 480, 343, 541]","[81, 480, 886, 720, 922, 192, 125, 1171, 496, ...",7,11,"[284, 886, 609, 1005, 480, 343, 541, 0, 0, 0, ...","[81, 480, 886, 720, 922, 192, 125, 1171, 496, ..."
4,is that forbidden for you to tell the exact nu...,i ve talked with 143 users counting 7294 lines...,(is that forbidden for you to tell the exact n...,"[is, that, forbidden, for, you, to, tell, the,...","[i, ve, talked, with, 143, users, counting, 72...","[278, 1079, 700, 818, 343, 125, 172, 242, 698,...","[81, 395, 363, 590, 384, 4, 7, 734, 1037, 54, 75]",10,11,"[278, 1079, 700, 818, 343, 125, 172, 242, 698,...","[81, 395, 363, 590, 384, 4, 7, 734, 1037, 54, ..."
...,...,...,...,...,...,...,...,...,...,...,...
295,hello how s going,hi how are you,"(hello how s going, hi how are you)","[hello, how, s, going]","[hi, how, are, you]","[682, 284, 367, 1028]","[301, 284, 567, 343]",4,4,"[682, 284, 367, 1028, 0, 0, 0, 0, 0, 0, 0, 0, ...","[301, 284, 567, 343, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
296,i m fine and you,everything is fine on this side,"(i m fine and you, everything is fine on this ...","[i, m, fine, and, you]","[everything, is, fine, on, this, side]","[81, 84, 106, 711, 343]","[400, 278, 106, 427, 846, 954]",5,6,"[81, 84, 106, 711, 343, 0, 0, 0, 0, 0, 0, 0, 0...","[400, 278, 106, 427, 846, 954, 0, 0, 0, 0, 0, ..."
297,we have a big snow outside,the kind that could make you stay home,"(we have a big snow outside, the kind that cou...","[we, have, a, big, snow, outside]","[the, kind, that, could, make, you, stay, home]","[859, 480, 1110, 511, 210, 379]","[242, 723, 1080, 1206, 1002, 343, 1095, 806]",6,8,"[859, 480, 1110, 511, 210, 379, 0, 0, 0, 0, 0,...","[242, 723, 1080, 1206, 1002, 343, 1095, 806, 0..."
298,it may but i have to go for a long way,put on warm clothes,"(it may but i have to go for a long way, put o...","[it, may, but, i, have, to, go, for, a, long, ...","[put, on, warm, clothes]","[321, 1143, 720, 81, 480, 125, 420, 818, 1110,...","[802, 427, 664, 1064]",11,4,"[321, 1143, 720, 81, 480, 125, 420, 818, 1110,...","[802, 427, 664, 1064, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [23]:
# We need to pad it first because if we add the <START> and <END> first then after padding these added string might be stripped.

target_padded_temp = df['target_padded_sequences'].str[:-1].tolist()


In [24]:
# create 3d for the final d as onehot encoder for decoder output. [data, sentence, word]
def generate_onehot(word, word_dict):
    z = np.zeros(len(word_dict))
    z[word] = 1
    return z

def get_onehot(series, word_dict):
    onehot_l = []
    for sentence in series.tolist():
        onehot_s = [generate_onehot(word, word_dict) for word in sentence]
        onehot_l.append(onehot_s)
    return onehot_l

%time df['target_padded_onehot'] = get_onehot(df['target_padded_sequences'], word_2_int_target)

CPU times: user 15.6 ms, sys: 62.5 ms, total: 78.1 ms
Wall time: 83.5 ms


In [25]:
# create encoder and decoder input [data, sentence]
encoder_input = np.array(df['input_padded_sequences'].tolist())
decoder_input = np.array(df['target_padded_sequences'].tolist())

In [26]:
encoder_input.shape

(300, 22)

In [27]:
decoder_input.shape

(300, 22)

In [28]:
# Create decoder output [data, sentence, word(onehot)]
decoder_output = np.array(df['target_padded_onehot'].tolist())

In [29]:
decoder_output.shape

(300, 22, 1216)

In [30]:
num_encoder_tokens = len(word_2_int_input)
num_decoder_tokens = len(word_2_int_target)

In [31]:
import tensorflow as tf
from keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense, Bidirectional
from keras.models import Model, load_model
tf.keras.backend.clear_session()

INPUT_LENGTH = max_num
OUTPUT_LENGTH = 21
dict_size = len(word_2_int_target)

encoder_input_l = Input(shape=(INPUT_LENGTH,))
decoder_input_l = Input(shape=(OUTPUT_LENGTH,))

In [32]:
from keras.layers import SimpleRNN

encoder = Embedding(dict_size, 128, input_length=INPUT_LENGTH, mask_zero=True)(encoder_input_l)
encoder = LSTM(512, return_sequences=True, unroll=True)(encoder)
encoder_last = encoder[:,-1,:]

print('encoder', encoder)
print('encoder_last', encoder_last)

decoder = Embedding(dict_size, 128, input_length=OUTPUT_LENGTH, mask_zero=True)(decoder_input_l)
decoder = LSTM(512, return_sequences=True, unroll=True)(decoder, initial_state=[encoder_last, encoder_last])

print('decoder', decoder)

encoder Tensor("lstm/Identity:0", shape=(None, 22, 512), dtype=float32)
encoder_last Tensor("strided_slice:0", shape=(None, 512), dtype=float32)
decoder Tensor("lstm_1/Identity:0", shape=(None, 21, 512), dtype=float32)


In [39]:
encoder

<tf.Tensor 'lstm/Identity:0' shape=(None, 22, 512) dtype=float32>

In [37]:
from keras.layers import Activation, dot, concatenate

# Equation (7) with 'dot' score from Section 3.1 in the paper.
# Note that we reuse Softmax-activation layer instead of writing tensor calculation
attention = dot([decoder, encoder], axes=[2, 2], name='test_attention')
attention = Activation('softmax', name='attention')(attention)
print('attention', attention)

context = dot([attention, encoder], axes=[2,1], name='testttt')
print('context', context)

decoder_combined_context = concatenate([context, decoder])
print('decoder_combined_context', decoder_combined_context)

# Has another weight + tanh layer as described in equation (5) of the paper
output = TimeDistributed(Dense(512, activation="tanh"))(decoder_combined_context)
output = TimeDistributed(Dense(dict_size, activation="softmax"))(output)
print('output', output)

attention Tensor("attention_2/Identity:0", shape=(None, 21, 22), dtype=float32)
context Tensor("testttt/Identity:0", shape=(None, 21, 512), dtype=float32)
decoder_combined_context Tensor("concatenate_2/Identity:0", shape=(None, 21, 1024), dtype=float32)
output Tensor("time_distributed_5/Identity:0", shape=(None, 21, 1216), dtype=float32)


In [38]:
model = Model(inputs=[encoder_input_l, decoder_input_l], outputs=[output])
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 22)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 22, 128)      155648      input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 21)]         0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 22, 512)      1312768     embedding[0][0]                  
____________________________________________________________________________________________

In [88]:
model.fit(x=[encoder_input, decoder_input], y=[decoder_output],
          #validation_split=0.05,
          batch_size=64, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fc22bc5c898>

In [89]:
model.save('model_attention.h5')
