<a href="https://colab.research.google.com/github/jahirulRifat/spl3/blob/master/gru_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, GRU, Masking
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json
import time
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# import preprocessed data

with open('/content/drive/MyDrive/Colab Notebooks/chatbot/dataset/questions.json', 'r') as f:
    json_data = json.load(f)
    question_corpus = tokenizer_from_json(json_data)
    f.close()

with open('/content/drive/MyDrive/Colab Notebooks/chatbot/dataset/answers.json', 'r') as f:
    json_data = json.load(f)
    answer_corpus = tokenizer_from_json(json_data)
    f.close()

npzfile = np.load('/content/drive/MyDrive/Colab Notebooks/chatbot/dataset/data.npz') 

In [3]:
# define encoder

def create_encoder(inputdim, embeddingsize, inputlen, n_units):

    # encoder
    encoder_input = Input((inputlen,))
    encoder_embed = Embedding(inputdim+1, embeddingsize)(encoder_input)
    # we use embedding layer to vectorize the word. the inputdim indicates the input dimension
    # specifically, the inputdim is the length of vocabulary
    # inputdim+1 at here is because our word label start from 1
    encoder_mask = Masking()(encoder_embed)
    # add mask layer because we should ignore the input of those padding 0
    encoder = GRU(n_units, return_state = True)
    # n_units indicates the number of units 
    # the state is the vector which encoder maps input to
    # discard the output of encoder, only state vectors are we need
    _, encoder_state = encoder(encoder_mask)
    
    encoder=Model(encoder_input, encoder_state)
    
    return encoder

In [4]:
# define decoder. notice that this model is only used in training

def create_decoder(inputdim, embeddingsize,inputlen, n_units):
    # input of answers
    decoder_input = Input((inputlen,))
    # input of encoder state vectors   
    initial_state = Input((n_units,))

    # vectorize input answers
    decoder_embed = Embedding(inputdim+1, embeddingsize)(decoder_input)
    decoder_mask = Masking()(decoder_embed)
    decoder = GRU(n_units, return_sequences = True, return_state = True)
    # the state we dont need in training model
    decoder_output,_ = decoder(decoder_embed,initial_state = initial_state)

    decoder_dense = Dense(inputdim, activation = 'softmax')
    decoder_output_ = decoder_dense(decoder_output)
    
    decoder=Model([decoder_input,initial_state],decoder_output_)
    
    return decoder

In [5]:
# define hyperparameters

BatchSize = 32 # we choose a small size because of the oom issue
N_Unit = 256
EmbeddingSize = 128
VocabSize = 8000 
# theoretically, vocabulary size should be len(question_corpus.word_index)+1. 
# however, seems like the 'num_words' didnt filter the tokenizer. so we assign the number manually
QuestionLen = npzfile['arr_0'].shape[1]
AnswerLen = npzfile['arr_1'].shape[1]

In [6]:
encoder=create_encoder(VocabSize,EmbeddingSize,QuestionLen,N_Unit)

In [7]:
encoder.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 21)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 21, 128)           1024128   
_________________________________________________________________
masking (Masking)            (None, 21, 128)           0         
_________________________________________________________________
gru (GRU)                    [(None, 256), (None, 256) 296448    
Total params: 1,320,576
Trainable params: 1,320,576
Non-trainable params: 0
_________________________________________________________________


In [8]:
decoder=create_decoder(VocabSize,EmbeddingSize,AnswerLen,N_Unit)

In [9]:
decoder.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 22)]         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 22, 128)      1024128     input_2[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
gru_1 (GRU)                     [(None, 22, 256), (N 296448      embedding_1[0][0]                
                                                                 input_3[0][0]              

In [10]:
# define the optimizer and loss function
optimizer = tf.keras.optimizers.Adam(1e-3)
def loss_function(real, pred): 
    loss = tf.keras.losses.categorical_crossentropy(real,pred)
    return tf.reduce_mean(loss)

In [11]:
# define the training step

@tf.function
def train_step(enc_inp,dec_inp,targ):
    loss=0

    with tf.GradientTape() as tape:
        encoder_state=encoder(enc_inp)
        initial_state=encoder_state
        prediction=decoder([dec_inp,initial_state])
        loss=loss_function(targ,prediction)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    
    return loss

In [12]:
# define the validation loss

def validation_loss(enc_inp,dec_inp,targ):
    loss=0
    encoder_state=encoder(enc_inp)
    initial_state=encoder_state
    prediction=decoder([dec_inp,initial_state])
    loss=loss_function(targ,prediction)
    
    return loss

In [13]:
# define the parameter to split data
train_valid_split = int(len(npzfile['arr_0'])*0.9)

In [14]:
print(train_valid_split)

132964


In [15]:
# get the training data
inputq=npzfile['arr_0'][:train_valid_split]
inputa=npzfile['arr_1'][:train_valid_split] 
targa=np.zeros_like(inputa) # create target data to do teacher forcing training
targa[:,0:-1]=inputa[:,1:]  # the target is same as the input answers but 1 timestep shifted to the left

In [16]:
# get the validation data
validq=npzfile['arr_0'][train_valid_split:]
valida=npzfile['arr_1'][train_valid_split:]
validt=np.zeros_like(valida)
validt[:,0:-1]=valida[:,1:]

In [17]:
# use onehot encoding to vectorize the target data
def onehotencoding(matrix,dim):
    onehot=np.zeros((matrix.shape[0],matrix.shape[1],dim))
    for i,sequence in enumerate(matrix):
        for j,index in enumerate(sequence):
            if index>0:
                onehot[i][j][index-1]=1 # the index start from 1 so we minus 1
    return onehot

In [18]:
# create tensorflow dataset pipeline for faster processing
# training set
BufferSize = len(inputq)
dataset_train = tf.data.Dataset.from_tensor_slices((inputq,inputa,targa)).shuffle(BufferSize)
dataset_train = dataset_train.batch(BatchSize, drop_remainder=True)
# validation set
BufferSize1 = len(validq)
dataset_valid = tf.data.Dataset.from_tensor_slices((validq,valida,validt)).shuffle(BufferSize1)
dataset_valid = dataset_valid.batch(BatchSize, drop_remainder=True)

In [19]:
# train the model

Epochs = 20
trainstep_epoch = len(inputq)//BatchSize
validstep_epoch = len(validq)//BatchSize
overalltime=0

for epoch in range(Epochs):
    start=time.time()
    total_loss=0
    valid_loss=0
    
    for (batch, (inputq,inputa,targa)) in enumerate(dataset_train.take(trainstep_epoch)):
        targa_onehot=onehotencoding(targa,VocabSize)
        batch_loss = train_step(inputq,inputa,targa_onehot)
        total_loss += batch_loss
    
    for (batch, (validq,valida,validt)) in enumerate(dataset_valid.take(validstep_epoch)):
        validt_onehot=onehotencoding(validt,VocabSize)
        valid_batch_loss = validation_loss(validq,valida,validt_onehot)
        valid_loss+=valid_batch_loss
    print('Epoch {} Loss {:.3f} Valid_Loss {:.3f}'.format(epoch+1,total_loss/trainstep_epoch,valid_loss/validstep_epoch))
    
    stop=time.time()
    timetaken=stop-start
    print('Time taken for 1 epoch {} sec\n'.format(timetaken))
    
    overalltime+=timetaken
    
print('Overall time taken {} min\n'.format(overalltime/60))

Epoch 1 Loss 1.894 Valid_Loss 1.785
Time taken for 1 epoch 591.2352752685547 sec

Epoch 2 Loss 1.672 Valid_Loss 1.640
Time taken for 1 epoch 565.2383005619049 sec

Epoch 3 Loss 1.567 Valid_Loss 1.612
Time taken for 1 epoch 561.4942758083344 sec

Epoch 4 Loss 1.509 Valid_Loss 1.613
Time taken for 1 epoch 559.6403362751007 sec

Epoch 5 Loss 1.462 Valid_Loss 1.623
Time taken for 1 epoch 560.9627494812012 sec

Epoch 6 Loss 1.420 Valid_Loss 1.646
Time taken for 1 epoch 561.0874810218811 sec

Epoch 7 Loss 1.381 Valid_Loss 1.673
Time taken for 1 epoch 558.427976846695 sec

Epoch 8 Loss 1.345 Valid_Loss 1.706
Time taken for 1 epoch 562.7872128486633 sec

Epoch 9 Loss 1.312 Valid_Loss 1.745
Time taken for 1 epoch 558.5704684257507 sec

Epoch 10 Loss 1.281 Valid_Loss 1.781
Time taken for 1 epoch 556.254264831543 sec

Epoch 11 Loss 1.252 Valid_Loss 1.821
Time taken for 1 epoch 555.0458059310913 sec

Epoch 12 Loss 1.226 Valid_Loss 1.855
Time taken for 1 epoch 555.9962034225464 sec

Epoch 13 Loss 1

In [20]:
# save parameters after training
encoder.save_weights('/content/drive/MyDrive/Colab Notebooks/chatbot/dataset/gru_enc_test.h5')
decoder.save_weights('/content/drive/MyDrive/Colab Notebooks/chatbot/dataset/gru_dec_test.h5')