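Prerequisites: Upload the course_bachelors.yaml file to the top level of your Google Drive (MyDrive) before rerunning the code; the notebook reads it from /content/gdrive/MyDrive/course_bachelors.yaml.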

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers , activations , models , preprocessing , utils
import pickle
import re
from gensim.models import Word2Vec

In [None]:
# Mount Google Drive in Colab at /content/gdrive.
# The dataset is later read from the MyDrive folder under this mount point.
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import yaml

# Load the conversation dataset from the mounted Google Drive.
# yaml.safe_load is used instead of yaml.load: calling yaml.load without a
# Loader fails on PyYAML >= 6 and can construct arbitrary objects on older
# versions. The encoding is given explicitly because the file contains
# non-ASCII characters.
with open(r'/content/gdrive/MyDrive/course_bachelors.yaml', encoding='utf-8') as file:
  documents = yaml.safe_load(file)
documents

{'categories': ['course', "Bachelor's"],
 'conversations': [['How long is the duration of the course?', 'Three years.'],
  ['What are the entry requirements for the course?',
   'Minimum entry requirements are a grade H5 and above in two higher level subjects together with a minimum of O6/H7 in four other subjects. A minimum of grade O6/H7 must be obtained in English. A grade O5/H6 must be obtained in Mathematics.For applicants whose first language is not English, please note the English language entry requirements. Mature applicants, applicants with a disability or those applying through the DARE or HEAR access schemes can find out more information on the application process.'],
  ['How much is the tuition fees for the course?',
   'The fees for this course for international students is €10000 per year. For domestic students applying through the CAO, this course applies under the free fees initiative.']]}

In [None]:
# Split the dataset into two parallel lists: questions and answers.
# Only the 'conversations' section of the loaded document is used. The first
# entry of each conversation is the question; the remaining entries form the answer.

questions, answers = [], []

conversations = documents['conversations']

for conv in conversations:
  if len(conv) > 2:
    # Several reply lines: merge them into ONE answer so that exactly one
    # answer is stored per question and both lists stay aligned.
    questions.append(conv[0])
    answers.append(' '.join(conv[1:]))
  elif len(conv) > 1:
    questions.append(conv[0])
    answers.append(conv[1])
  # Conversations with fewer than two entries have no answer and are skipped.

In [None]:
# Display the extracted questions.
questions

['How long is the duration of the course?',
 'What are the entry requirements for the course?',
 'How much is the tuition fees for the course?']

In [None]:
# Display the extracted answers (same order as the questions).
answers

['Three years.',
 'Minimum entry requirements are a grade H5 and above in two higher level subjects together with a minimum of O6/H7 in four other subjects. A minimum of grade O6/H7 must be obtained in English. A grade O5/H6 must be obtained in Mathematics.For applicants whose first language is not English, please note the English language entry requirements. Mature applicants, applicants with a disability or those applying through the DARE or HEAR access schemes can find out more information on the application process.',
 'The fees for this course for international students is €10000 per year. For domestic students applying through the CAO, this course applies under the free fees initiative.']

In [None]:
# Data preprocessing for seq2seq learning
#
# A single vocabulary (one tokenizer) is shared by questions and answers.

# Keep only the pairs whose answer is a string. Questions and answers are
# filtered together so the two lists stay aligned; removing questions by index
# while scanning the answers would shift the indices after the first removal.
valid_pairs = [(question, answer)
               for question, answer in zip(questions, answers)
               if isinstance(answer, str)]

questions = [question for question, _ in valid_pairs]
answers_tags = [answer for _, answer in valid_pairs]

# Wrap every answer in start/end markers for the decoder.
# NOTE: `answers` is rebuilt from itself, so re-running this cell without
# re-running the extraction cell first would wrap the answers a second time.
answers = ['<START>' + answer + '<END>' for answer in answers_tags]

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(questions + answers)

# One more than the number of distinct tokens in the tokenizer's word index.
VOCAB_SIZE = len(tokenizer.word_index) + 1

In [None]:
# Display the vocabulary size (number of entries in tokenizer.word_index plus one).
VOCAB_SIZE

81

In [None]:
# Collect every word known to the tokenizer, in word_index order.
vocab = list(tokenizer.word_index)

def tokenize(sentences):
  """Split sentences into lower-case, letters-only tokens.

  Each sentence is lower-cased, every character that is not an ASCII letter
  is replaced by a space, and the result is split on whitespace.

  Args:
    sentences: iterable of strings.

  Returns:
    A tuple (tokens_list, vocabulary): tokens_list holds one token list per
    sentence; vocabulary is the flat list of all tokens in order of
    appearance (duplicates are kept).
  """
  non_letters = re.compile('[^a-zA-Z]')
  tokens_list = [non_letters.sub(' ', text.lower()).split() for text in sentences]
  vocabulary = [token for tokens in tokens_list for token in tokens]
  return tokens_list, vocabulary

In [None]:
# Encoder input data: integer-encode every question and pad each sequence
# at the end ('post') up to the length of the longest question.

tokenized_questions = tokenizer.texts_to_sequences(questions)
maxlen_questions = max(map(len, tokenized_questions))

encoder_input_data = preprocessing.sequence.pad_sequences(
    tokenized_questions,
    maxlen=maxlen_questions,
    padding='post',
)

print(encoder_input_data.shape)

(3, 9)


In [None]:
# Decoder input data: integer-encode every tagged answer and pad each sequence
# at the end ('post') up to the length of the longest answer.

tokenized_answers = tokenizer.texts_to_sequences(answers)
maxlen_answers = max(map(len, tokenized_answers))

decoder_input_data = preprocessing.sequence.pad_sequences(
    tokenized_answers,
    maxlen=maxlen_answers,
    padding='post',
)

print(decoder_input_data.shape)

(3, 87)


In [None]:
# Decoder output data: the answer sequences with their first token removed
# (i.e. shifted one step ahead of the decoder input), padded to the same
# length as the decoder input and one-hot encoded over the vocabulary.

tokenized_answers = [sequence[1:] for sequence in tokenizer.texts_to_sequences(answers)]

padded_answers = preprocessing.sequence.pad_sequences(
    tokenized_answers,
    maxlen=maxlen_answers,
    padding='post',
)

decoder_output_data = utils.to_categorical(padded_answers, VOCAB_SIZE)

print(decoder_output_data.shape)

(3, 87, 81)


In [None]:
# Building the model
# The architecture is assembled with the Keras functional API.
# It is a two-input model: the encoder input (question) and the decoder input (answer).
# Each branch is an Embedding layer followed by an LSTM; a Dense softmax layer
# on top of the decoder LSTM predicts a token at every time step.

# Encoder: only the final hidden and cell states are passed on to the decoder.
encoder_inputs = layers.Input(shape=(maxlen_questions,))
encoder_embedding = layers.Embedding(VOCAB_SIZE, 200, mask_zero=True)(encoder_inputs)
encoder_lstm = layers.LSTM(200, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder states and returns the full output sequence.
decoder_inputs = layers.Input(shape=(maxlen_answers,))
decoder_embedding = layers.Embedding(VOCAB_SIZE, 200, mask_zero=True)(decoder_inputs)
decoder_lstm = layers.LSTM(200, return_state=True, return_sequences=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = layers.Dense(VOCAB_SIZE, activation=activations.softmax)
output = decoder_dense(decoder_outputs)

# Training model: (question, answer) -> per-step distribution over the vocabulary.
model = models.Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the training model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 9)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 87)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 9, 200)       16200       ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 87, 200)      16200       ['input_2[0][0]']                
                                                                                              

In [None]:
# Train the model: inputs are the encoder and decoder input arrays, the target is
# the one-hot decoder output; the returned training history is kept in `history`.
history = model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=32, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Making inferences
#
# For making inferences, two separate models are built: an encoder inference
# model and a decoder inference model. Both reuse the inputs and layers of the
# training model, so no additional training is needed.

def inference():
  """Build the encoder and decoder inference models from the trained layers.

  The encoder model maps `encoder_inputs` to the encoder LSTM states. The
  decoder model takes the decoder input together with a hidden and a cell
  state and returns the softmax output plus the updated states, so the states
  can be fed back in on the next decoding step.

  Returns:
    (encoder_model, decoder_model) as a tuple of Keras models.
  """
  encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

  # Take the state size from the trained decoder LSTM instead of repeating the
  # literal used when the model was built, so the two cannot drift apart.
  state_size = decoder_lstm.units
  decoder_state_input_h = tf.keras.layers.Input(shape=(state_size,))
  decoder_state_input_c = tf.keras.layers.Input(shape=(state_size,))
  decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

  # Re-apply the trained decoder layers, this time starting from states that
  # are supplied as model inputs.
  decoder_outputs, state_h, state_c = decoder_lstm(
      decoder_embedding, initial_state=decoder_states_inputs)
  decoder_states = [state_h, state_c]
  decoder_outputs = decoder_dense(decoder_outputs)

  decoder_model = tf.keras.models.Model(
      [decoder_inputs] + decoder_states_inputs,
      [decoder_outputs] + decoder_states)

  return encoder_model , decoder_model

In [None]:
def preprocess_input(input_sentence):
    """Convert a raw question into a padded sequence of token ids for the encoder.

    The sentence is encoded with the fitted tokenizer itself, so it goes
    through the tokenizer's own text normalisation (e.g. a trailing '?' no
    longer breaks the lookup), and words that are not in the vocabulary are
    skipped instead of raising a KeyError.

    Args:
        input_sentence: the question as a plain string.

    Returns:
        Array of shape (1, maxlen_questions), padded at the end.
    """
    token_ids = tokenizer.texts_to_sequences([input_sentence])
    return preprocessing.sequence.pad_sequences(token_ids , maxlen=maxlen_questions , padding='post')

In [None]:
# Build the encoder and decoder inference models once; they are reused for every test question.
enc_model , dec_model = inference()

In [None]:
tests = ['How long is the duration of the course', 'What are the entry requirements for the course', 'How much is the tuition fees for the course']

# Upper bound on decoding steps. Bounding the number of steps (rather than the
# number of decoded words) guarantees termination even when the model keeps
# predicting an index that has no word attached.
max_decoding_steps = maxlen_answers + 1

for test_sentence in tests:
    # Encode the question; the encoder states initialise the decoder.
    states_values = enc_model.predict(preprocess_input(test_sentence))

    # Decoding starts from the 'start' token.
    target_seq = np.zeros((1 , 1))
    target_seq[0 , 0] = tokenizer.word_index['start']
    decoded_words = []

    for _ in range(max_decoding_steps):
        dec_outputs , h , c = dec_model.predict([target_seq] + states_values)
        sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))
        # Direct index -> word lookup; None when the index has no word.
        sampled_word = tokenizer.index_word.get(sampled_word_index)

        # Stop on the end marker without adding it to the reply, so the reply
        # never has to be cut afterwards at a substring that could also match
        # ordinary words beginning with "end".
        if sampled_word == 'end':
            break
        if sampled_word is not None:
            decoded_words.append(sampled_word)

        # Feed the prediction and the updated states back in for the next step.
        target_seq = np.zeros((1 , 1))
        target_seq[0 , 0] = sampled_word_index
        states_values = [h , c]

    # Every word is preceded by a space, matching the previous output format.
    decoded_translation = ''.join(f' {word}' for word in decoded_words)

    print(f'Human: {test_sentence}')
    print()
    print(f'Bot: {decoded_translation}')
    print('-'*25)


Human: How long is the duration of the course

Bot:  three years
-------------------------
Human: What are the entry requirements for the course

Bot:  minimum entry requirements are a grade h5 and above in two higher level subjects together with a minimum of o6 h7 in four other subjects a minimum of grade o6 h7 must be obtained in a grade o5 o5 h6 must be obtained in mathematics for applicants whose first language is not english please note the english language entry requirements mature applicants applicants with a disability or those applying through the dare or hear access schemes can find out more information on the application process
-------------------------
Human: How much is the tuition fees for the course

Bot:  the fees for this course for international students is €10000 per year for domestic students applying through the cao this course applies under the free fees initiative
-------------------------


Code snippet sources: Kaggle and Stack Overflow