<a href="https://colab.research.google.com/github/john-decker/Cultural_Heritage/blob/main/Intent_Based_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
'''Based on the tutorial at https://data-flair.training/blogs/python-chatbot-project/
I have updated the code to reflect changes in several libraries and to replace and
refactor deprecated expressions. I have unpacked various areas of the code to make
it easier for beginners to read and understand. In addition, I have used best practices
by splitting the data into training and testing data and adjusting the model to use
this split to properly validate -- the original tutorial did not do this (though it
still obtained a good working outcome).
'''

#import libraries needed for NLP work
import nltk
nltk.download('punkt') #specifically required to tokenize words
nltk.download('wordnet') #specifically required to lemmatize properly
from nltk.stem import WordNetLemmatizer
import json
import pickle

#import libraries needed to train a model
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD

#use random for shuffling data and returning responses
import random

#initialize lemmatizer and necessary lists
lemmatizer = WordNetLemmatizer()
words = [] #receives every token produced from the intent patterns
classes = [] #receives each distinct intent label (tag)
documents = [] #receives one (tokens, tag) pair per pattern
ignore_words = ['?', ',', '!'] #punctuation tokens dropped when the word list is lemmatized

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
#open the intents json file to use as labeled training data
#NOTE(review): this absolute path looks Colab-specific -- adjust it when running elsewhere
data_path = "/content/intents.json"
#name the encoding explicitly so the read does not depend on the platform default
with open(data_path, 'r', encoding='utf-8') as data_file:
  data = data_file.read()

#parse the raw json text into python objects
intents = json.loads(data)
print('data loaded')

data loaded


In [None]:
#walk through every intent, tokenizing its patterns and recording its label (tag)
for intent in intents['intents']:
  for pattern in intent['patterns']:
    #split the pattern sentence into tokens and grow the running word list
    tokens = nltk.word_tokenize(pattern)
    words += tokens

    #remember which tag these tokens belong to: tokens at index 0, tag at index 1
    tag = intent['tag']
    documents.append((tokens, tag))

    #register the tag the first time it is seen
    if tag in classes:
      continue
    classes.append(tag)

In [None]:
# lemmatize every token (lower-cased), skipping the punctuation listed in ignore_words
lemmatized_words = [lemmatizer.lemmatize(word.lower()) for word in words if word not in ignore_words]

# de-duplicate the lemmatized words and put them in a stable, sorted order
sorted_words = sorted(set(lemmatized_words))

# `words` is the name the later cells read (pickling, bag-of-words encoding), so it
# has to hold the de-duplicated vocabulary; otherwise every repeated token becomes
# a redundant column in each bag-of-words vector
words = list(sorted_words)

In [None]:
#build an ordered list of the distinct classes (tags)
sorted_classes = sorted(set(classes))


In [None]:
#check shape of data by outputting lengths
print(f'{len(documents)} documents')
print(f'{len(classes)} unique classes (tags)')
#report the de-duplicated vocabulary; `words` may still contain repeated tokens here
print(f'{len(sorted_words)} unique lemmatized words')

47 documents
9 unique classes (tags)
185 unique lemmatized words


In [None]:
#use pickle to serialize data as byte objects for later retrieval using the wb (write binary) mode
#the with-blocks close each file as soon as it is written, so the data is flushed to disk
with open('words.pkl', 'wb') as words_file:
  pickle.dump(words, words_file)
with open('classes.pkl', 'wb') as classes_file:
  pickle.dump(classes, classes_file)

In [None]:
#assemble the (bag-of-words, one-hot tag) pairs the model will learn from
training = []

#template row of zeroes, copied for every document's one-hot tag
output_empty = [0] * len(classes)

#each document is a (tokens, tag) pair
for tokens, tag in documents:
  #reduce every token of the pattern to its lower-cased base form
  pattern_words = [lemmatizer.lemmatize(token.lower()) for token in tokens]
  #mark, for each vocabulary entry, whether it occurs in this pattern
  bag = [1 if vocab_word in pattern_words else 0 for vocab_word in words]
  #copy the zero template and switch on the position of this document's tag
  output_row = output_empty.copy()
  output_row[classes.index(tag)] = 1

  #store the pair: bag-of-words at index 0, one-hot tag at index 1
  training.append((bag, output_row))


In [None]:
print(len(training)) #sanity check: one pair is appended per document, so this should equal len(documents)

47


In [None]:
#prepare the data for training
#shuffle in place so the split below does not follow the order of the intents file
random.shuffle(training)

#separate each (bag-of-words, tag) pair into two parallel lists
word_vecs = [pair[0] for pair in training]
tag_vecs = [pair[1] for pair in training]

#fraction of the data to use for training
train_percent = 0.8
#helper function to calculate training size for vectors
def train_size(vec, percentage):
  """Return how many leading items of `vec` make up `percentage` of it.

  The product is truncated to an int so it can be used as a slice index.
  """
  total = len(vec)
  return int(total * percentage)

#cut-off index for each list (both lists are built from the same pairs, so they share a length)
word_train = train_size(word_vecs, train_percent)
tag_train = train_size(tag_vecs, train_percent)


#everything before the cut-off becomes the training set (80% of data), as np arrays
train_x, train_y = np.array(word_vecs[:word_train]), np.array(tag_vecs[:tag_train])

#everything from the cut-off onward is held back as the test set (20% of data)
X_test, y_test = np.array(word_vecs[word_train:]), np.array(tag_vecs[tag_train:])

print("Training Data Created")

Training Data Created


In [None]:
#create model using keras
model = Sequential()
#create first dense layer; its input width is the length of one bag-of-words vector
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
#use drop out to avoid overfitting
model.add(Dropout(0.5))
#create second layer, with reduced dimensions
model.add(Dense(64, activation='relu'))
#use dropout again
model.add(Dropout(0.5))
#create output layer of network: one softmax unit per class (tag)
model.add(Dense(len(train_y[0]), activation='softmax'))

#initialize the optimizer function (in this case, Stochastic Gradient Descent)
sgd = SGD(learning_rate=0.01, weight_decay=1e-6, momentum=0.9, nesterov=True)
#set the loss function and specify which metric is most important
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

#train the model on the training data and use the testing data to validate
#hist keeps the per-epoch training history returned by fit
hist = model.fit(train_x, train_y, epochs=200, batch_size=5, verbose=1, validation_data=(X_test, y_test))

#save the trained model; the second positional parameter of save() is `overwrite`,
#so the history object must not be passed there (it is not stored with the model)
#NOTE(review): newer Keras releases appear to require a .keras or .h5 extension -- confirm before upgrading
model.save('chatbot_model.v1')

print("Model Created")


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [None]:
#load the trained model
#the path must match the one passed to model.save when the model was trained
from keras.models import load_model
model = load_model('chatbot_model.v1')

#will need to unpickle resources, load json, and reimport key libraries
#if using this section as a separate script (which is advisable)

### Use this if treating the next section as a standalone script and importing the model.

import nltk <br/>
from nltk.stem import WordNetLemmatizer<br/>
lemmatizer = WordNetLemmatizer()<br/>
import pickle<br/>
import numpy as np<br/>

from keras.models import load_model<br/>
model = load_model('chatbot_model.v1')<br/>
import json<br/>
import random<br/>
intents = json.loads(open('intents.json').read())<br/>
words = pickle.load(open('words.pkl','rb'))<br/>
classes = pickle.load(open('classes.pkl','rb'))

In [None]:
#create helper functions to preprocess the user's input
def clean_up_text(text):
  """Tokenize `text` and return its tokens lower-cased and lemmatized.

  Relies on the module-level `nltk` import and `lemmatizer` instance.
  """
  processed_words = []
  for raw_token in nltk.word_tokenize(text):
    processed_words.append(lemmatizer.lemmatize(raw_token.lower()))
  return processed_words


In [None]:
def bag_of_words(sentence, words, show_details=True):
  """Encode `sentence` as a bag-of-words vector over the vocabulary `words`.

  Args:
    sentence: raw user text; it is tokenized and lemmatized by clean_up_text.
    words: the vocabulary list; position i of the result corresponds to words[i].
    show_details: when True, print a debug line for each token that is in the vocabulary.

  Returns:
    A numpy array of 0/1 values, one per vocabulary entry, with 1 wherever the
    entry equals one of the sentence's processed tokens.
  """
  #NOTE: this helper function depends on the existence of the words file
  sentence_words = clean_up_text(sentence)
  bag = [0]*len(words)
  for token in sentence_words:
    found = False
    for index, word in enumerate(words):
      if word == token:
        #mark every position holding this token (the vocabulary may repeat a word)
        bag[index] = 1
        found = True
    #use show details to help debug if needed; report only tokens that really matched
    if show_details and found:
      print(f"{token} found in bag.")
  return np.array(bag)


In [None]:
def predict_class(sentence, model):
  """Return the intents `model` predicts for `sentence`, most probable first.

  The sentence is encoded with bag_of_words over the module-level `words`
  vocabulary; only classes whose score is strictly above the threshold are kept.
  Each entry is a dict with the keys "intent" (a name from the module-level
  `classes` list) and "probability" (the score as a string).
  """
  #score a class must exceed to count as a valid answer
  ERROR_THRESHOLD = 0.25
  #encode the user input and ask the model for one score per class
  encoded = bag_of_words(sentence, words, show_details=False)
  scores = model.predict(np.array([encoded]))[0]
  #keep only the (class index, score) pairs that clear the threshold
  candidates = []
  for class_index, score in enumerate(scores):
    if score > ERROR_THRESHOLD:
      candidates.append((class_index, score))
  #order by score, biggest first, in case more than one class qualifies
  ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
  return [{"intent": classes[class_index], "probability": str(score)} for class_index, score in ranked]



In [None]:
def getResponse(predicted_tag, intents_list, fallback="Sorry, I didn't understand that. Could you rephrase?"):
  """Pick a random response for the top predicted intent.

  Args:
    predicted_tag: list of prediction dicts as built by predict_class; only the
      first entry's 'intent' value is used.
    intents_list: the parsed intents data, a dict whose 'intents' entry is a list
      of dicts carrying 'tag' and 'responses'.
    fallback: text returned when there is no prediction, no intent carries the
      predicted tag, or the matching intent has no responses.

  Returns:
    One of the matching intent's responses, or `fallback`.
  """
  #predict_class returns an empty list when no class clears its threshold
  if not predicted_tag:
    return fallback
  predicted = predicted_tag[0]['intent'] #uses dictionary structure from predict_class function
  for intent in intents_list['intents']:
    if intent['tag'] == predicted:
      responses = intent['responses']
      #random.choice raises on an empty sequence, so guard against it
      return random.choice(responses) if responses else fallback
  #no intent carries the predicted tag
  return fallback

In [None]:
def get_chatbot_response(text):
  """Return the chatbot's reply to `text`.

  Classifies the text with the module-level `model`, then looks up a response
  in the module-level `intents` data.
  """
  return getResponse(predict_class(text, model), intents)

In [None]:
#smoke test: send one sample question through the whole pipeline and print the reply
sentence = "Can you help me with blood pressure tracking?"

print(get_chatbot_response(sentence))

Navigating to Blood Pressure module


In [None]:

#interactive chat session: keep answering questions until the user types an exit word
bot_type = "medical help"
print(f'Welcome to the {bot_type} chatbot. Please let us know how we can assist you.\n')
print("Please type 'Quit' or 'Bye' to exit chat.\n\n")
exit_word_1 = 'Quit'
exit_word_2 = "Bye"
#compare in lower case so any capitalization of an exit word ends the session
exit_words = {exit_word_1.lower(), exit_word_2.lower()}
continue_session = True

while continue_session:
  #strip surrounding whitespace so "bye " is treated the same as "bye"
  user_input = input("type your question here. ").strip()
  if user_input.lower() in exit_words:
    print('\nThank you for visiting, we hope to chat with you again soon!')
    continue_session = False

  elif not user_input:
    #nothing was typed, so there is nothing to classify; ask again
    continue

  else:
    answer = get_chatbot_response(user_input)
    print(f'{answer}\n')


Welcome to the medical help chatbot. Please let us know how we can assist you.

Please type 'Quit' or 'Bye' to exit chat.


type your question here. hi there!
Good to see you again

type your question here. Can you help me?
I can guide you through Adverse drug reaction list, Blood pressure tracking, Hospitals and Pharmacies

type your question here. can you help me with blood pressure tracking?
Navigating to Blood Pressure module

type your question here. bye

Thank you for visiting, we hope to chat with you again soon!
