# Title : Mov Bee Chatbot
## Name : Jagriti Kumari
### Program :  Data Science

#### Loading necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
import re
from nltk.corpus import wordnet
import nltk
import gensim
import numpy as np
import pandas as pd
import os
import json
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import pickle
import colorama
from tensorflow import keras
from colorama import Fore, Style

stop_words = set(nltk.corpus.stopwords.words('english'))
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
wnl = nltk.stem.wordnet.WordNetLemmatizer()


Read movie lines from input file

In [2]:
def readFile():
  data = open("C:/Users/Jagriti/Documents/NLP/Chatbot/cornell_movie_dialogs_corpus/cornell movie-dialogs corpus/movie_lines.txt")
  inputArray = []
  for line in data:
    inputArray.append([words.strip() for words in line.split("+++$+++")])
  return inputArray

* Select only those movie lines for topic modeling which has more than 10 words. For Faster processing only consider first 20000 lines.

## Data preprocessing

In [3]:
def extract_movie_lines(input):
    movie_lines = []
    count = 0
    for line in input[:10000]:
        count +=1
        words = line[4].split(" ")
        if len(words) > 10:
            movie_lines.append(line[4])
    print(len(movie_lines))
    print(count)
    return movie_lines

* Pre-process the movie lines. Convert each movie dialogue to list of words. Cleanup dialogues like strip white spaces, remove stop words. Tokenize them and then lemmatize each words.

In [4]:
def text_wrangling(extractedmovielines):
    wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
    wnl = nltk.stem.wordnet.WordNetLemmatizer()
    normalized_movielines = []
    for line in extractedmovielines:
        line = line.lower()
        movieline_tokens = [token.strip() for token in wtk.tokenize(line) if len(token.strip()) > 3]
        movieline_tokens = [wnl.lemmatize(token) for token in movieline_tokens if not token.isnumeric()]
        #movieline_tokens = [token for token in movieline_tokens if len(token) > 1]
        movieline_tokens = [token for token in movieline_tokens if token not in stop_words]
        movieline_tokens = list(filter(None, movieline_tokens))
        if movieline_tokens:
            normalized_movielines.append(movieline_tokens)
    return normalized_movielines

* Create uni-gram and bi-gram words

## Feature Engineering

In [5]:
def featureEngineering(norm_movielines):
    cv = CountVectorizer(min_df=2, max_df=0.5, ngram_range=(1,2),token_pattern=None, tokenizer=lambda doc: doc,preprocessor=lambda doc: doc)
    cv_features = cv.fit_transform(norm_movielines)
    # validating vocabulary size
    vocabulary = np.array(cv.get_feature_names())
    print(cv_features.shape)
    return cv_features,vocabulary

* Perform topic modelling and get topics (uni-gram and bi-gram)

In [6]:
def topic_model(cv_features,vocabulary):
    wordset = []
    lda_model = LatentDirichletAllocation(n_components =100,max_iter=500, max_doc_update_iter=50, learning_method='online',batch_size=1740, learning_offset=50., random_state=42, n_jobs=16)
    movielines_topics = lda_model.fit_transform(cv_features)
    topic_terms = lda_model.components_
    topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:,:20]
    topic_keyterms = vocabulary[topic_key_term_idxs]
    topics = [', '.join(topic) for topic in topic_keyterms]
    for sentence in topics:
        wordset.append(sentence.split(", "))
    words = []
    for i in range(len(wordset)):
        for elements in wordset[i]:
            words.append(elements)
    return words

**Get similar words as that of topics**

In [7]:
def getSynonyms(list_words):
    list_syn = {}
    threshold = 7
    for word in list_words:
        synonyms = []
        for syn in wordnet.synsets(word):
            for lem in syn.lemmas():
                if len(synonyms) > threshold:
                    break
                # Remove any special characters from synonym strings
                lem_name = re.sub('[^a-zA-Z0-9 \n\.]', ' ', lem.name())
                synonyms.append(lem_name)

        list_syn[word] = list(set(synonyms))
    return list_syn

**Write Topic to a txt file**

In [8]:
def writeTopic(words):
    with open('words.txt', 'w') as f:
        wordList = json.dumps(words)
        f.writelines(wordList)
    f.close()

In [9]:
def writeSeedWords(list_syn):
    with open('synonyms.txt', 'w') as f:
        symList = json.dumps(list_syn)
        f.writelines(symList)
        #print(symList)
    f.close()

In [10]:
def readTopics():
    wordFile = open('words.txt')
    wordList = json.load(wordFile, strict=False)
    return [word for word in wordList if len(word) > 3]

In [11]:
input = readFile()
extractedmovielines = extract_movie_lines(input)
normalized_movielines = text_wrangling(extractedmovielines)
cv_features,vocabulary = featureEngineering(normalized_movielines)
words = topic_model(cv_features,vocabulary)
writeTopic(words)

3450
10000




(3450, 3992)


In [12]:
topicList = readTopics()
print("Total topic detected %d"%len(topicList))
list_syn = getSynonyms(topicList)
writeSeedWords(list_syn)

Total topic detected 1969


## Training Intent Creator (TIRC)

**Class to model intent, will be stored in json format and used for training.**

In [13]:
from json import JSONEncoder
from collections import namedtuple

class Intent:
    def __init__(self, tag:str, seedWords:list, pattern:dict, response:dict):
        self.tag = tag
        self.seedWords = seedWords
        self.pattern = pattern
        self.response = response

    def __str__(self):
        return 'tag: {t} \n seedWord: {s} \n pattern: {p} \n response: {r}'.format(t= self.tag,
                                                                                   s= self.seedWords,
                                                                                   p= self.pattern,
                                                                                   r= self.response)

class IntentEncoder(JSONEncoder):
    def default(self, o):
        return o.__dict__

def customIntentDecoder(intentDict):
    return namedtuple('X', intentDict.keys())(*intentDict.values())



**Class to represent dialogues**

In [14]:
class Dialogues:
    def __init__(self, dialogue:str, cleanedDialogue:list, lineNo:int):
        self.dialogue = dialogue
        self.cleanedDialogue = cleanedDialogue
        self.lineNo = lineNo

**Get set of manually created list of topic that needs to be filtered out. These are topic which represent intent like
stuff , line, cause etc. which are very general**


In [15]:
def getSetofTopicToBeFilteredOut():
    file = open("C:/kaggle/NLP Project/TopicFilter.txt",'r')
    topicSet = set()
    for line in file:
        topic = line.strip().replace("\"","").replace(":","")
        topicSet.add(topic)
    print("Number of topic to be filtered out %d"%len(topicSet))
    return topicSet

In [16]:
def cleanDialogue(dialogue:str)->list:
    dialogue_words = [token.strip() for token in wtk.tokenize(dialogue.lower()) if len(token.strip()) > 3]
    dialogue_words_lemmatized = [wnl.lemmatize(token) for token in dialogue_words if token not in stop_words]
    return dialogue_words_lemmatized

**Read each dialogue line and convert it in Dialogues object. Store this object in a dictionary where key is dialogue no and value is Dialogues object. This map speds up creation of Intent**

In [17]:
def readDialogues():
    file = open("C:/Users/Jagriti/Documents/NLP/Chatbot/cornell_movie_dialogs_corpus/cornell movie-dialogs corpus/movie_lines.txt")
    dilogueMap = {}
    for line in file:
        components = line.split("+++$+++")
        lineNo = int(components[0].strip().replace("L",''))
        dialogue = components[4].strip()
        wordList = cleanDialogue(dialogue)
        dialogueObject = Dialogues(dialogue, wordList, lineNo)
        dilogueMap[lineNo] = dialogueObject
    file.close()
    return dilogueMap

In [18]:
def getTopicSynonyms()->dict:
    filePath = open('synonyms.txt')
    data = json.load(filePath, strict=False)
    return data

**Calculate Jaccard similarity between two dialogues/sentences which are cleanded , tokenized and lemmatized**

In [19]:
def getJaccardSimilarity(wordList1:list, wordList2:list):
    set1 = set(wordList1)
    set2 = set(wordList2)
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return float(len(intersection)) / len(union)

In [20]:
def getIntent(tag:str, seedWords:list, dialogueMap:dict, jaquardThreshold):
    intent = Intent(tag, seedWords, {}, {})
    for lineNo in dialogueMap:
        dialogue = dialogueMap[lineNo]
        if dialogue.lineNo + 1 in dialogueMap and len(dialogue.cleanedDialogue) > 0:
            jacSim = getJaccardSimilarity(seedWords, dialogue.cleanedDialogue)
            if jacSim > jaquardThreshold:
                intent.pattern.update({dialogue.lineNo : dialogue.dialogue})
                intent.response.update({dialogue.lineNo: dialogueMap[dialogue.lineNo + 1].dialogue})

    return intent

In [21]:
def persistIntents(intentDict:dict):
    intentJson = json.dumps(intentDict, indent=2, cls=IntentEncoder)
    fileToWrite = open("intent.json",'w')
    fileToWrite.writelines(intentJson)
    fileToWrite.close()

**Create intent using movie dialogues and seed words. Use a JaccardSimilarity threshold of 0.1**

In [22]:

def generateIntentFromTopic(similarityCofficient = 0.1):
    dialogueMap = readDialogues()
    synonyms = getTopicSynonyms()
    topicFilterSet = getSetofTopicToBeFilteredOut()
    intentDict = {}
    print("creating intent .... this will take some time")
    count = 0
    for tag in synonyms:
        if(len(synonyms[tag]) > 0 and tag not in topicFilterSet):
            count += 1
            seedWords = synonyms[tag]
            seedWords.append(tag)
            intent = getIntent(tag, seedWords, dialogueMap, similarityCofficient)
            intentDict[tag] = intent
            if(count % 100 == 0):
                print("%d intent created."%count)
    print()
    print('Total intent created %d'%len(intentDict))
    return intentDict


In [23]:
intentList = generateIntentFromTopic()
persistIntents(intentList)

Number of topic to be filtered out 485
creating intent .... this will take some time
100 intent created.
200 intent created.
300 intent created.
400 intent created.
500 intent created.
600 intent created.
700 intent created.
800 intent created.

Total intent created 857


### Intent Detector , Matcher and Response Selector (IDMRS)

In [24]:
defaultAnswerList = ["Humm! I do not know that", "I am sorry I do not know that"]
def getToken(sentense:str)->list:
    dialogue_words = [token.strip() for token in wtk.tokenize(sentense.lower())]
    dialogue_words_lemmatized = [wnl.lemmatize(token) for token in dialogue_words]
    return dialogue_words_lemmatized

In [25]:
def readTrainingIntents():
    intentFile = open('intent.json', 'r')
    intentDict = json.load(intentFile)
    intentFile.close()
    return intentDict

In [26]:
def getPatternWithHighestSimilarity(userInputTokenList:list, patternDict:dict):
    closestMatchList = []
    highestJackScore = 0.0
    for lineNo in patternDict:
        pattern = patternDict[lineNo]
        patternToken = getToken(pattern)
        jackScore = getJaccardSimilarity(userInputTokenList, patternToken)
        if(jackScore > highestJackScore):
            closestMatchList = []
            highestJackScore = jackScore
            closestMatchList.append(lineNo)
        elif(jackScore == highestJackScore):
            closestMatchList.append(lineNo)
    return closestMatchList

In [27]:
'''
Sample intent dict
"christmas": {
    "tag": "christmas",
    "seedWords": [
      "Noel"
    ],
    "pattern": {
      "122179": "Merry Christmas eve."
    },
    "response": {
      "122179": "Not if you work for FedEx."
    }
  }
'''

def createUserResponse(userInput:str, userIntent:str, trainingIntent:dict):
    userInputTokenList =  getToken(userInput)
    intentDict = trainingIntent[userIntent]
    patternDict = intentDict["pattern"]
    answerList = getPatternWithHighestSimilarity(userInputTokenList, patternDict)
    if len(answerList) == 0:
        return random.choice(defaultAnswerList)
    else:
        lineNo = random.choice(answerList)
        return intentDict["response"][lineNo]

In [28]:
myInput = [
           {'userInput': "is there anything bothering you?", 'tag':'explanation'},
           {'userInput': "he is waiting downstairs", 'tag':'downstairs'},
           {'userInput': "Hey! when is christmas?", 'tag':'christmas'},
           {'userInput': "what is weather outside?", 'tag':'holiday'},
           {'userInput': "I am very excited about my vacation!", 'tag':'holiday'}]

def test():
    print("Brining Mov-bee online ......")
    print()
    trainingIntentDict = readTrainingIntents()
    for inputDict in myInput:
        response = createUserResponse(inputDict['userInput'], inputDict['tag'], trainingIntentDict)
        print("User: %s"%inputDict['userInput'])
        print("Bee: %s"%response)
        print()
        print()

In [29]:
#test()

### ML model training: RNN sequence to sequence

In [30]:
def populateTrainingDataAndLabels(patternDict:dict, patternList:list ,label:list, tag:str):
    for key in patternDict:
        patternList.append(patternDict[key])
        label.append(tag)

def getTraininGDataAndLabel(training_intents):
    trainingPatternList = []
    labelList = []
    for intent in training_intents:
        if len(training_intents[intent]['pattern']) > 0:
            populateTrainingDataAndLabels(training_intents[intent]['pattern'],trainingPatternList, labelList, intent)
    return trainingPatternList, labelList

* Train Sequential model. For speeding up the training process I have kept only 20 epoches. With 20 epoch model has 34% accuracy. With epoch of 100 I was able to achieve accuracy of 45%. If we select topics more carefully i.e. only keep less ambigious and specific topics, instead of genric and ambigious topics then we can achieve even more accuracy.

In [31]:
def trainAndPickleModel():
    training_intents = readTrainingIntents()
    trainSentence, trainLabels = getTraininGDataAndLabel(training_intents)
    num_classes = len(training_intents)

    lbl_encoder = LabelEncoder()
    lbl_encoder.fit(trainLabels)
    trainLabels = lbl_encoder.transform(trainLabels)
    vocab_size = 10000
    embedding_dim = 16
    max_len = 100
    oov_token = "<OOV>"
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token) # adding out of vocabulary token
    tokenizer.fit_on_texts(trainSentence)
    word_index = tokenizer.word_index
    sequences = tokenizer.texts_to_sequences(trainSentence)
    padded_sequences = pad_sequences(sequences, truncating='post', maxlen=max_len)

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(16, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    epochs = 20
    history = model.fit(padded_sequences, np.array(trainLabels), epochs=epochs)
    print(history)

    model.save("chat_model")
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # to save the fitted label encoder
    with open('label_encoder.pickle', 'wb') as ecn_file:
        pickle.dump(lbl_encoder, ecn_file, protocol=pickle.HIGHEST_PROTOCOL)

In [32]:
trainAndPickleModel()

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
<keras.callbacks.History object at 0x000001D0422C3FD0>
INFO:tensorflow:Assets written to: chat_model\assets


### Mov-bee Chat Bot

In [33]:
def chat():
    print("Bringing Mov-Bee online ....")
    # load trained model
    model = keras.models.load_model('chat_model')

    # load tokenizer object
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)

    # load label encoder object
    with open('label_encoder.pickle', 'rb') as enc:
        lbl_encoder = pickle.load(enc)

    # parameters
    max_len = 20
    trainingIntentDict = readTrainingIntents()
    while True:
        print(Fore.LIGHTBLUE_EX + "User: " + Style.RESET_ALL, end="")
        userInput = input()
        if userInput.lower() == "quit" or userInput.lower() == "exit" or userInput.lower() == "end":
            break

        result = model.predict(keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences([userInput]),
                                                                          truncating='post', maxlen=max_len))
        tag = lbl_encoder.inverse_transform([np.argmax(result)])

        response = createUserResponse(userInput, tag[0], trainingIntentDict)
        print(Fore.GREEN + "ChatBot:" + Style.RESET_ALL, response)


In [34]:
print(Fore.YELLOW + "Start messaging with the bot (type quit, exit or end to stop)!" + Style.RESET_ALL)
chat()

[33mStart messaging with the bot (type quit, exit or end to stop)![0m
Bringing Mov-Bee online ....
[94mUser: [0m

TypeError: 'list' object is not callable

### Evaluation

In [35]:
test()

Brining Mov-bee online ......

User: is there anything bothering you?
Bee: Look, Dave, I know that you're sincere and that you're trying to do a competent job, and that you're trying to be helpful, but I can assure the problem is with the AO-units, and with your test gear.


User: he is waiting downstairs
Bee: Where is Pimenov? Where is Pimenov?




KeyError: 'christmas'

#### Note: 
##### I have implemented the code for Chatbot in another IDE(pyCharm). However, while integrating with jupyter notebook I was facing some issue. I have provided a sample chatbot that I have tested upon. But, I will include the snapshot from pycharm for the evaluation part in my report. 