# Bag of Words Chatbot Model

In [72]:
import pandas as pd
import nltk
import re
from nltk.stem import wordnet # to perform lemmitization
from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from nltk import pos_tag # for parts of speech
from sklearn.metrics import pairwise_distances # to perfrom cosine similarity
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words

In [73]:
# Download if required
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\letic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\letic\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\letic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\letic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Preprocess corpus

In [74]:
df = pd.read_excel('queries_and_responses.xlsx')
df

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just guess :)
1,I want to know you better,I am a bot. I am designed to keep conversation...
2,Define yourself,
3,Describe yourself,
4,tell me about yourself,
...,...,...
1610,Do you have family,
1611,Are there others like you,Of course. That's why we're here
1612,Do you want to take over the world,
1613,Who are we?,You don't need to know


In [75]:
# Fill in missing responses -> Replaces every null value with the previous row's response
df.ffill(axis = 0,inplace=True)
df

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just guess :)
1,I want to know you better,I am a bot. I am designed to keep conversation...
2,Define yourself,I am a bot. I am designed to keep conversation...
3,Describe yourself,I am a bot. I am designed to keep conversation...
4,tell me about yourself,I am a bot. I am designed to keep conversation...
...,...,...
1610,Do you have family,No
1611,Are there others like you,Of course. That's why we're here
1612,Do you want to take over the world,Of course. That's why we're here
1613,Who are we?,You don't need to know


In [76]:
# Assign syntactic attribute of each word
s = 'tell me about your personality'
pos_tag(nltk.word_tokenize(s),tagset = None) # returns the parts of speech of every word

[('tell', 'VB'),
 ('me', 'PRP'),
 ('about', 'IN'),
 ('your', 'PRP$'),
 ('personality', 'NN')]

In [77]:
# Initialize lemmatizer so each token is converted to its root
lemma = wordnet.WordNetLemmatizer()
lemma.lemmatize('went', pos = 'v')

'go'

In [78]:
# function that performs text normalization steps

def normalization(text):
    
    text = str(text).lower() # text to lower case
    clean_text = re.sub(r'[^ a-z]','',text) # removing special characters
    tokens = nltk.word_tokenize(clean_text) # word tokenizing
    
    lema = wordnet.WordNetLemmatizer() # intializing lemmatization
    lema_words = []
    
    tags_list = pos_tag(tokens,tagset=None) # parts of speech
    
    # Lemmatize all the words in given sentence by assigning correct category
    for token,syntactic_func in tags_list:
        if syntactic_func.startswith('V'):  # Verb
            pos_val = 'v'
        elif syntactic_func.startswith('J'): # Adj
            pos_val = 'a'
        elif syntactic_func.startswith('R'): # Adverb
            pos_val = 'r'
        else:
            pos_val = 'n' # Noun
        lemmatized_word = lema.lemmatize(token, pos_val) # lemmatize
        lema_words.append(lemmatized_word) # appending the lemmatized token
    
    return " ".join(lema_words) # returns the lemmatized tokens as a sentence 

In [79]:
# test of normalization function
normalization('i was thinking about what you would be telling me')

'i be think about what you would be tell me'

In [80]:
# normalize whole dataset (user input text)
df['Lemmatized_text'] = df['Context'].apply(normalization)
df.tail(15)

Unnamed: 0,Context,Text Response,Lemmatized_text
1600,I'll be back,All right. I'll be here.,ill be back
1601,I'll get back to you in a moment,Till next time.,ill get back to you in a moment
1602,I promise to come back,Okay. You know where to find me.,i promise to come back
1603,I promise to come back,Okay. You know where to find me.,i promise to come back
1604,How large is the moon,A pretty large number of miles,how large be the moon
1605,Big is moon,A pretty large number of miles,big be moon
1606,How far away is the sun,A pretty large number of miles,how far away be the sun
1607,Do you know where the sun is,I'm blind,do you know where the sun be
1608,Do you drive,No,do you drive
1609,Do you smoke,No,do you smoke


## Model

Intuition: with corpus of words and frequency of these, we can compare similarity between documents and learn information form these documents throught the presence of the 'target words'

In [81]:
# First Attempt of BoW model with small corpus of text (model intuition from https://machinelearningmastery.com/gentle-introduction-bag-words-model/)
c = "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness"
corpus = [i for i in c.split(', ')]
corpus

['It was the best of times',
 'it was the worst of times',
 'it was the age of wisdom',
 'it was the age of foolishness']

In [82]:
# create vector using target words
targets = ['it', 'was', 'the', 'best', 'of', 'times', 'worst', 'age', 'wisdom', 'foolishness']
vectors = []
for verse in corpus:
    vector = []
    verse = [i for i in verse.lower().split()]
    for word in targets:
        if word in verse:
            vector.append(1)
        else:
            vector.append(0)
    vectors.append(vector)
vectors

[[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
 [1, 1, 1, 0, 1, 1, 1, 0, 0, 0],
 [1, 1, 1, 0, 1, 0, 0, 1, 1, 0],
 [1, 1, 1, 0, 1, 0, 0, 1, 0, 1]]

Use NLTK and current corpus to apply BOW model to chatbot

In [83]:
# Use CountVectorizer method to improve performance
cv = CountVectorizer() # intializing the count vectorizer
X = cv.fit_transform(df['Lemmatized_text']).toarray()
# returns all the unique word from data 
features = cv.get_feature_names()
df_bow = pd.DataFrame(X, columns = features)
df_bow

Unnamed: 0,abort,about,absolutely,abysmal,actually,adore,advice,advise,affirmative,afraid,...,yeh,yep,yes,yet,you,your,youre,yours,yourself,yup
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1610,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1613,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
query = 'Are you a bot'
query1 ='Will you help me and tell me more about yourself' 
Q = []
a = query.split()
for i in a:
    if i in stopwords.words('english'):
        continue
    else:
        Q.append(i)
    b=" ".join(Q) 

In [85]:
query_lemma = normalization(b) # applying the function that we created for text normalizing
query_BOW = cv.transform([query_lemma]).toarray() # applying bow

In [86]:
query_BOW

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

# Measure of Similarity

Use trigonometric functions (cosine similarity) to determine 0-1 value of each vector, facilitating comparison

In [87]:
# cosine similarity for the test query
cosine_value = 1 - pairwise_distances(df_bow, query_BOW, metric = 'cosine' )
(cosine_value)

array([[0.        ],
       [0.        ],
       [0.        ],
       ...,
       [0.        ],
       [0.40824829],
       [0.40824829]])

In [88]:
df['similarity_bow'] = cosine_value

In [89]:
df_simi = pd.DataFrame(df, columns=['Text Response','similarity_bow']) # taking similarity value of responses for the question we took
df_simi 

Unnamed: 0,Text Response,similarity_bow
0,Just guess :),0.000000
1,I am a bot. I am designed to keep conversation...,0.000000
2,I am a bot. I am designed to keep conversation...,0.000000
3,I am a bot. I am designed to keep conversation...,0.000000
4,I am a bot. I am designed to keep conversation...,0.000000
...,...,...
1610,No,0.000000
1611,Of course. That's why we're here,0.316228
1612,Of course. That's why we're here,0.000000
1613,You don't need to know,0.408248


In [90]:
df_simi_sort = df_simi.sort_values(by='similarity_bow', ascending=False) # sorting the values
df_simi_sort.head()

Unnamed: 0,Text Response,similarity_bow
237,Indeed I am. I'll be here whenever you need me.,0.816497
239,Indeed I am. I'll be here whenever you need me.,0.816497
235,Indeed I am. I'll be here whenever you need me.,0.707107
1239,"Lovely, thanks.",0.534522
1374,Excellent! That's what I like to see.,0.5


In [91]:
threshold = 0.35 # considering the value of p=smiliarity > 0.35
df_threshold = df_simi_sort[df_simi_sort['similarity_bow'] > threshold] 
df_threshold

Unnamed: 0,Text Response,similarity_bow
237,Indeed I am. I'll be here whenever you need me.,0.816497
239,Indeed I am. I'll be here whenever you need me.,0.816497
235,Indeed I am. I'll be here whenever you need me.,0.707107
1239,"Lovely, thanks.",0.534522
1374,Excellent! That's what I like to see.,0.500000
...,...,...
261,Thank you.,0.353553
271,Thank you.,0.353553
273,Thank you.,0.353553
252,You're pretty smart yourself.,0.353553


In [92]:
index_value = cosine_value.argmax() # index in dataframe of of highest similarity
index_value 

237

In [93]:
query

'Are you a bot'

In [94]:
df['Text Response'].loc[index_value] # Get response for highest-similarity index

"Indeed I am. I'll be here whenever you need me."

# Model Using Bag of Words

In [95]:
# Function to perform text normalization
def classify(tags, words_to_remove):
    lema = wordnet.WordNetLemmatizer() # intializing lemmatization
    lema_words = []
    
    # Lemmatize all the words in given sentence by assigning correct category
    for token,syntactic_function in tags:
        if token in words_to_remove:
            continue
        if syntactic_function.startswith('V'):  # Verb
            pos_val = 'v'
        elif syntactic_function.startswith('R'): # Adverb
            pos_val = 'r'
        elif syntactic_function.startswith('J'): # Adj
            pos_val = 'a'
        else:
            pos_val = 'n' # Noun
        lemmatized_word = lema.lemmatize(token, pos_val) # lemmatize
        lema_words.append(lemmatized_word) # append the lemmatized token
    
    return " ".join(lema_words) # returns the lemmatized tokens as a sentence 

In [96]:
# Function that removes stop words and process the corpus
def stopword_(text):      
    lema = wordnet.WordNetLemmatizer() # intializing lemmatization
    lema_words = []
    
    tokens = nltk.word_tokenize(text) # word tokenizing
    tags = pos_tag(tokens,tagset=None) # parts of speech
    
    words_to_remove = stopwords.words('english')
    return classify(tags, words_to_remove)
stopword_(query1)

'Will help tell'

In [97]:
# Function that returns response to query using BOW model
def chat(text):
    s = stopword_(text)
    lemma = normalization(s)
    bow = cv.transform([lemma]).toarray() 
    cosine_value = 1- pairwise_distances(df_bow,bow, metric = 'cosine' )
    index_value = cosine_value.argmax() # getting index value 
    return df['Text Response'].loc[index_value]
chat(query1)

"I'm glad to help. What can I do for you?"

## Conversation Demo

In [98]:
chat('Hi')

'Hey!'

In [99]:
chat('How are you feeling?')

'Lovely, thanks.'

In [100]:
chat('What is your name?')

'I am a bot. I am designed to keep conversation with you'

In [101]:
chat('See you soon')

'Bye.'