In [17]:
#https://medium.com/@rr_42830/build-your-own-chatbot-using-deep-learning-23a022638067
import os
import json
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Download the NLTK resources this notebook relies on. NER() below calls
# nltk.word_tokenize, nltk.pos_tag and nltk.ne_chunk; presumably these four
# packages back those calls (tokenizer, POS tagger, NE chunker and its word list).
# Each call is a no-op when the package is already up to date (see cell output).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hfyhc5\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hfyhc5\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\hfyhc5\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\hfyhc5\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:

def model(data):
    """Train a small LSTM intent classifier on the patterns found in ``data``.

    Parameters
    ----------
    data : dict
        Intent definitions. ``data['intents']`` is iterated; every intent must
        provide a ``'tag'`` and a list of ``'patterns'`` (example sentences).

    Returns
    -------
    tuple
        ``(classifier, lbl_encoder, tokenizer)``: the fitted Keras model, the
        fitted ``LabelEncoder`` that maps class indices back to tags, and the
        fitted ``Tokenizer`` that must be used to encode queries for the model.

    Raises
    ------
    ValueError
        If ``data`` contains no training patterns at all.
    """
    # Flatten the intents into parallel lists: one sentence, one tag.
    training_sentences = []
    training_tags = []
    for intent in data['intents']:
        for pattern in intent['patterns']:
            training_sentences.append(pattern)
            training_tags.append(intent['tag'])

    if not training_sentences:
        raise ValueError("data['intents'] does not contain any training patterns")

    # Encode tags as integers ("greeting", "bye", "thanks") -> (0, 1, 2) and
    # then as one-hot rows, which is what categorical_crossentropy expects.
    lbl_encoder = LabelEncoder()
    encoded_tags = lbl_encoder.fit_transform(training_tags)
    # Size the output layer from the encoder itself so that every index the
    # network can predict is one that inverse_transform() is able to decode.
    num_classes = len(lbl_encoder.classes_)
    one_hot_labels = keras.utils.to_categorical(encoded_tags, num_classes=num_classes)

    vocab_size = 1000      # the tokenizer keeps at most this many distinct word ids
    embedding_dim = 16     # embedding width, also used as the LSTM unit count
    max_len = 20           # every sequence is padded / truncated to this length
    oov_token = "<OOV>"    # placeholder for out-of-vocabulary words

    # Tokenizer lower-cases, strips punctuation and maps words to integer ids.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
    tokenizer.fit_on_texts(training_sentences)
    word_index = tokenizer.word_index
    sequences = tokenizer.texts_to_sequences(training_sentences)
    padded_sequences = pad_sequences(sequences, truncating='post', maxlen=max_len)

    # Embedding -> LSTM -> softmax over the intent tags.
    # The local is deliberately not called ``model`` to avoid shadowing this function.
    classifier = keras.Sequential([
        Embedding(len(word_index)+1, embedding_dim, input_length=max_len),
        LSTM(embedding_dim),
        Dense(num_classes, activation="softmax")
    ])

    classifier.compile(
        optimizer="rmsprop",
        loss="categorical_crossentropy",
        metrics=["accuracy"]
    )

    classifier.fit(padded_sequences, one_hot_labels, epochs=150, verbose=0)

    return classifier, lbl_encoder, tokenizer

In [3]:
# #Train model

# model = Sequential()
# model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
# model.add(GlobalAveragePooling1D())
# model.add(Dense(16, activation='relu'))
# model.add(Dense(16, activation='relu'))
# model.add(Dense(num_classes, activation='softmax'))

# model.compile(loss='sparse_categorical_crossentropy', 
#               optimizer='adam', metrics=['accuracy'])

# # model.summary()


# epochs = 500
# history = model.fit(padded_sequences, np.array(training_labels), epochs=epochs)

# # to save the trained model
# model.save("chat_model")

# import pickle

# # to save the fitted tokenizer
# with open('tokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# # to save the fitted label encoder
# with open('label_encoder.pickle', 'wb') as ecn_file:
#     pickle.dump(lbl_encoder, ecn_file, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
# def lstmModel(padded_sequences, training_labels, word_index, embedding_dim):
#     train_data = padded_sequences
#     train_labels = training_labels

#     model = keras.Sequential([
#         Embedding(len(word_index)+1, embedding_dim, input_length=max_len),
#         LSTM(embedding_dim),
#         Dense(num_classes, activation="softmax")
#     ])

#     model.compile(
#         optimizer="rmsprop",
#         loss="categorical_crossentropy",
#         metrics=["accuracy"]
#     )

#     model.fit(train_data, train_labels, epochs=150, verbose=0)
#     return model

In [5]:
def openMenu():
    """Print the contents of ``Beverage_Menu.txt`` to standard output.

    Every line is printed with its own trailing newline left in place, so the
    menu appears double-spaced (one blank line after each entry).
    """
    with open('Beverage_Menu.txt', 'r', encoding='utf-8') as menu_file:
        menu_lines = menu_file.readlines()

    for menu_line in menu_lines:
        print(menu_line)

In [18]:
# Named Entity Recognition (NER)
def NER(query):
    """Extract the words of ``query`` that look like a name.

    The query is tokenized, POS-tagged and run through NLTK's named-entity
    chunker. A word is kept when it belongs to a named-entity subtree of the
    chunk tree, or when it is a top-level token tagged ``NNP`` (proper noun).

    Parameters
    ----------
    query : str
        Raw user input.

    Returns
    -------
    str
        The kept words joined by single spaces and prefixed with one space, so
        callers can concatenate the result directly after a word such as
        ``"Hi"``. A lone ``' '`` is returned when nothing is recognised.
    """
    words = nltk.word_tokenize(query)
    tagged = nltk.pos_tag(words)
    chunked = nltk.ne_chunk(tagged, binary=False)

    name_entity = []
    # Walk the tree's direct children instead of chunked.leaves(): leaves are
    # plain (word, tag) tuples, so a hasattr(..., 'label') test on them can
    # never succeed and the chunker's entities would be ignored.
    for node in chunked:
        if hasattr(node, 'label'):
            # Named-entity subtree (e.g. PERSON): keep every word inside it.
            name_entity.extend(word for word, _tag in node.leaves())
        elif node[1] == 'NNP':
            name_entity.append(node[0])

    return ' ' + ' '.join(name_entity)

In [6]:
def detectMisspelledWord(query, menuPrice):
    """Find menu items mentioned in ``query``, tolerating misspellings.

    Each whitespace-separated word of the lower-cased query is fuzzy-matched
    against every key of ``menuPrice``; a score above 80 counts as a match.

    Parameters
    ----------
    query : str
        User input.
    menuPrice : dict
        Mapping whose keys are the menu item names.

    Returns
    -------
    list of str
        Title-cased names of the matched items, without duplicates, in the
        order in which they were first matched while scanning the query.
    """
    # Requires the third-party package: pip install fuzzywuzzy
    from fuzzywuzzy import fuzz

    matching_items = []
    for word in query.lower().split():
        for item in menuPrice:
            # Fuzzy match score between one query word and one menu item.
            if fuzz.token_set_ratio(word, item.lower()) > 80:
                # De-duplicate on the same (title-cased) form that is stored,
                # otherwise a multi-word item could be appended once per word.
                display_name = item.title()
                if display_name not in matching_items:
                    matching_items.append(display_name)
    return matching_items

In [7]:
# detect number in user query
def detectNumber(query):
    """Collect the quantities mentioned in ``query``.

    Recognises decimal digit strings (``"2"``) and the number words
    ``"one"`` .. ``"ten"``. Matching ignores case and any leading/trailing
    ``. , ; : ! ?`` attached to a word, so ``"Two"`` and ``"3."`` are found.

    Parameters
    ----------
    query : str
        User input.

    Returns
    -------
    list of int
        The quantities in the order they appear; empty if none are found.
    """
    num_dict = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5, "six":6, "seven":7, "eight":8, "nine":9, "ten":10} # number dictionary

    quantity = []
    for raw_word in query.split():
        word = raw_word.strip(".,;:!?").lower()
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits, which int() rejects with ValueError.
        if word.isdecimal():
            quantity.append(int(word))
        elif word in num_dict:
            quantity.append(num_dict[word])
    return quantity

In [23]:
def reorder(query, model2, tokenizer2, lbl_encoder2):
    """Decide whether ``query`` is a positive answer (the user wants to reorder).

    Parameters
    ----------
    query : str
        The user's reply.
    model2 : object
        Classifier exposing ``predict``; used to score the encoded reply.
    tokenizer2 : object
        Tokenizer exposing ``texts_to_sequences``, fitted together with ``model2``.
    lbl_encoder2 : object
        Label encoder exposing ``inverse_transform`` to turn a class index into its tag.

    Returns
    -------
    bool
        ``True`` when the predicted tag is ``"positive"``, otherwise ``False``.
    """
    # Encode the reply; sequences longer than 20 tokens are truncated at the end.
    encoded = pad_sequences(tokenizer2.texts_to_sequences([query]), truncating='post', maxlen=20)
    prediction = model2.predict(encoded, verbose=0)
    # Map the most probable class index back to its tag (e.g. 0 -> "negative").
    tag = lbl_encoder2.inverse_transform([np.argmax(prediction)])
    # Compare the single decoded label itself and hand back a plain bool.
    return bool(tag[0] == "positive")

def transaction(queryList, model2, tokenizer2, lbl_encoder2):
    """Run the interactive ordering dialogue for the user's latest queries.

    Beverages are looked for in the current query first and in the previous
    query as a fallback; quantities are read from the current query only. If
    no quantity is given, one of each beverage is assumed. If the number of
    quantities and beverages differ, the user is asked to restate the order.
    The order summary is printed and the user is asked to confirm it.

    Parameters
    ----------
    queryList : list of str
        ``[previous_query, current_query]``.
    model2, tokenizer2, lbl_encoder2 : object
        Classifier, tokenizer and label encoder used to interpret the user's
        confirmation as ``"positive"`` or ``"negative"``.

    Returns
    -------
    bool
        ``True`` when nothing in the queries matched the menu, or when the
        user rejected the summary and agreed to reorder; ``False`` when the
        order was confirmed, the user typed ``quit``, or declined to reorder.
    """
    menuPrice = {"Espresso": 1.50, "Americano": 1.50, "Cappuccino": 1.80, "Latte": 1.80, "Macchiato": 1.80, "Flat White": 2.10, "Mocha": 2.10, "Black Tea": 1.50}

    lastQuery = queryList[0]
    currentQuery = queryList[1]

    while True:
        # Quantities come from the current query only.
        quantity = detectNumber(currentQuery)

        # Beverages: current query first, previous query as a fallback.
        beverages = detectMisspelledWord(currentQuery, menuPrice)
        if len(beverages) == 0:
            beverages = detectMisspelledWord(lastQuery, menuPrice)

        if len(beverages) == 0:
            print('\nSorry, your order does not appear in our menu.')
            return True

        # Assume 1 for every beverage if no number was detected.
        if len(quantity) == 0:
            quantity = [1] * len(beverages)

        if len(quantity) != len(beverages):
            # Cannot pair quantities with beverages; ask the user to restate.
            query = input("Please specify the quantity of each item (e.g. 2 mocha and 1 americano). If you want to exit transaction, type quit. :").lower()
            if "quit" in query:
                return False
            lastQuery = currentQuery
            currentQuery = query
            continue

        # Look each price up by beverage so that price[i] belongs to
        # beverages[i] / quantity[i]. (Collecting prices in menu order would
        # mis-pair them whenever the user orders in a different order.)
        # detectMisspelledWord returns title-cased menu keys, which is how
        # every key of menuPrice is spelled.
        price = [menuPrice[beverage] for beverage in beverages]

        totalPrice = sum(p * q for p, q in zip(price, quantity))

        # Order summary.
        print("{:<10} {:<15} {:<10}".format('Quantity', 'Beverage', 'Price'))
        print("----------------------------------\n")
        for item_quantity, beverage, item_price in zip(quantity, beverages, price):
            print("{:<10} {:<15} £{:<10.2f}".format(item_quantity, beverage, item_price))
        print('\nTotal price is: £', "{:.2f}".format(totalPrice))

        # Keep asking until the answer is quit / positive / negative.
        while True:
            confirmation = input('\nGreat! I am Groovy. Please confirm your item, quantity and total price (e.g. yes/no). If you want to exit transaction, type quit. :').lower()
            if 'quit' in confirmation:
                return False

            # Sequences longer than 20 tokens are truncated at the end.
            prediction = model2.predict(pad_sequences(tokenizer2.texts_to_sequences([confirmation]), truncating='post', maxlen=20), verbose=0)
            # Map the most probable class index back to its tag.
            tag = lbl_encoder2.inverse_transform([np.argmax(prediction)])[0]

            if tag == "positive":
                print("Thank you! Your order has been confirmed.")
                return False
            if tag == "negative":
                reorder_query = input('\nWould you like to reorder?')
                return reorder(reorder_query, model2, tokenizer2, lbl_encoder2)

            print('\nSorry I do not understand. Please type in "yes" or "no" to confirm your order.')
            
# i=0
# queryList = []
# while i<2:
#     query = input()
#     if len(queryList) < 2:
#         queryList.append(query.lower())
#     i = i+1
# flag = transaction(queryList)
# print(flag)

In [9]:
# Feedback function to detect positive and negative

# Reference:
# ****************************************************************************************************************
#    Title: COMP3074 Human-AI Interaction Lab 3: Text classification
#    Author: Clos, J
#    Date: 2022
#    Availability: https://moodle.nottingham.ac.uk/pluginfile.php/8612037/mod_resource/content/5/COMP3074_Lab3.pdf
#
# ****************************************************************************************************************

def Feedback(query):
    """Classify ``query`` as positive or negative feedback.

    A TF-IDF + logistic-regression classifier is trained on the text files in
    ``Dataset/positive`` and ``Dataset/negative`` (each file is one labelled
    document) every time this function is called, and is then applied to the
    query after informal contractions in it have been expanded.

    Parameters
    ----------
    query : str
        The user's feedback text.

    Returns
    -------
    The result of ``classifier.predict`` for the single query, i.e. a
    one-element sequence holding ``'positive'`` or ``'negative'``.
    """
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    # replace short form to standard words
    contractions = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "I'll": "I will",
        "I'll've": "I will have",
        "I'm": "i am",
        "I've": "i have",
        "i'd": "i would",
        "i'd've": "i would have",
        "isn't": "is not",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "mustn't": "must not",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "she'd": "she would",
        "she'll": "she will",
        "she's": "she is",
        "shouldn't": "should not",
        "wasn't": "was not",
        "we'd": "we would",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "won't": "will not",
        "wouldn't": "would not",
        "you'll": "you will",
        "you'll've": "you will have",
        "you're": "you are",
        "u":"you",
        "ur":" your ",
        "n":" and ",
        'bout':'about',
        "cn":"can",
        "hve":"have"
    }

    # Expand contractions one whole word at a time. A substring replace on the
    # full query would also rewrite letters inside other words (the keys "u"
    # and "n" would turn "cute" into "cyoute"). Matching ignores case.
    lookup = {key.lower(): value for key, value in contractions.items()}
    query = ' '.join(lookup.get(word.lower(), word) for word in query.split())

    # Read training data which contains label
    label_dir = {
        "positive": "Dataset/positive",
        "negative": "Dataset/negative"
    }

    data = []
    labels = []

    # Read all files in folders. The listing is sorted because os.listdir()
    # returns names in arbitrary order, which would make the seeded split
    # below differ from one machine to another.
    for label, folder in label_dir.items():
        for file in sorted(os.listdir(folder)):
            filepath = os.path.join(folder, file)
            with open(filepath, encoding='utf8', errors='ignore', mode='r') as review:
                data.append(review.read())
                labels.append(label)

    # Train on a stratified 75% of the documents; the remaining 25% is a
    # held-out portion that this function does not use.
    X_train, _X_test, y_train, _y_test = train_test_split(data, labels, stratify=labels,
                                                          test_size=0.25, random_state=1)

    # stop word list
    my_list = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
              'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
              'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
              'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but',
              'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
              'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
              'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
              'such', 'nor', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should',
              "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'ma']

    # Bag-of-words counts: lower-cased tokens with the stop words removed.
    # NOTE(review): no stemming is applied; a Porter-stemming analyzer used to
    # be built here but was never passed to the vectorizer, so it was removed.
    count_vect = CountVectorizer(lowercase=True, stop_words = my_list)
    X_train_counts = count_vect.fit_transform(X_train)

    # Weighting
    tfidf_transformer = TfidfTransformer(use_idf=True, sublinear_tf=True).fit(X_train_counts)
    X_train_tf = tfidf_transformer.transform(X_train_counts)

    # Training a classifier
    classifier = LogisticRegression(random_state=0).fit(X_train_tf, y_train)

    # Push the query through the same vectorizer and weighting, then classify.
    processed_query = count_vect.transform([query])
    processed_query = tfidf_transformer.transform(processed_query)
    prediction = classifier.predict(processed_query)
    return prediction

In [27]:
import random

def chat():
    """Run the interactive Chin's Coffee House chatbot until the user quits.

    Two classifiers are trained on start-up with ``model()``: one on
    ``intentTest.json`` to recognise the intent of each query, and one on
    ``TrueFalseData.json`` that ``transaction()`` uses to interpret yes/no
    answers. The user's name is extracted with ``NER()``; on ``quit`` the user
    is asked for feedback, which is classified with ``Feedback()``.
    """
    with open('intentTest.json', encoding='utf-8') as file:
        data1 = json.load(file)

    with open('TrueFalseData.json', encoding='utf-8') as file1:
        data2 = json.load(file1)

    model1, lbl_encoder1, tokenizer1 = model(data1)
    model2, lbl_encoder2, tokenizer2 = model(data2)

    # parameters
    max_len = 20
    queryList = ["", ""]       # [previous query, current query]
    ask_for_order = False      # True -> the next prompt asks directly for an order

    query = input("Hello, welcome to Chin's Coffee House! My name is Groovy. What is your name? : ")
    # NER() returns ' <name>' or a lone ' ' when it finds nothing; rstrip()
    # turns the latter into '' so that no dangling space ends up in messages.
    name = NER(query).rstrip()

    while True:
        if ask_for_order:
            query = input('\nGroovy: What would you like to order? If you want to exit, type quit. :')
            ask_for_order = False
        else:
            query = input('\nHi' + name + ", I am Groovy. Welcome to Chin's Coffee House. How can I help you? If you want to exit, type quit. :")

        if query.strip().lower() == "quit":
            query = input("\nGroovy: Thank you for visiting Chin's Coffee House. What do you think of my service? :")
            prediction = Feedback(query)
            if prediction[0] == 'positive':
                print('\nThank you for your feedback, I am glad to hear that!')
            else:
                print('\nI am sorry to hear that, I will feedback to my company.')
            print("Bye, take care" + name)
            break

        queryList[0] = queryList[1]
        queryList[1] = query.lower()

        # Sequences longer than max_len tokens are truncated at the end.
        result = model1.predict(pad_sequences(tokenizer1.texts_to_sequences([query]), truncating='post', maxlen=max_len), verbose=0)
        # Map the most probable class index back to its tag (0 -> "greeting").
        tag = lbl_encoder1.inverse_transform([np.argmax(result)])[0]

        for intent in data1['intents']:
            if intent['tag'] != tag:
                continue
            response = np.random.choice(intent['responses'])
            # Some responses are keywords that trigger an action instead of text.
            if response == "Menu":
                openMenu()
                ask_for_order = True
            elif response == "transaction":
                ask_for_order = transaction(queryList, model2, tokenizer2, lbl_encoder2)
            elif response == "username":
                print("Your name is" + name + ".")
            elif response == "Change name":
                new_name = NER(query).rstrip()
                # Keep the current name when no new one could be extracted.
                if new_name:
                    name = new_name
                print("Noted, your name is" + name + ".")
            else:
                print("Groovy:", response)
    
# Start the interactive session; chat() keeps prompting until the user types "quit".
chat()

Hello, welcome to Chin's Coffee House! My name is Groovy. What is your name? :  Hong Shen

Hi Hong Shen, I am Groovy. Welcome to Chin's Coffee House. How can I help you? If you want to exit, type quit. : quit

Groovy: Thank you for visiting Chin's Coffee House. What do you think of my service? : good



Thank you for your feedback, I am glad to hear that!
Bye, take care Hong Shen
