In [1]:
#Import libraries and packages
import numpy as np
import nltk
import string 
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Load and read QA text
#The context manager closes the file handle as soon as the text has been read
#errors = 'ignore' skips any bytes that cannot be decoded instead of raising
with open('Chatbot Document.txt', 'r', errors = 'ignore') as f:
    text = f.read()

<b>PREPROCESSING</b>

In [3]:
#Tokenize the raw document two ways in a single step:
#  text_sentences -> sentence tokens (nltk.sent_tokenize)
#  text_words     -> word tokens (nltk.word_tokenize)
text_sentences, text_words = nltk.sent_tokenize(text), nltk.word_tokenize(text)

In [4]:
#Single WordNet lemmatizer instance shared by every call below
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatization(tokens):
    """Return a list holding the lemmatized (base) form of each token, in order."""
    return list(map(lemmatizer.lemmatize, tokens))

#Translation table for str.translate: every character in string.punctuation
#maps to None, which deletes that character from the text
punctuation_dictionary = {ord(punct): None for punct in string.punctuation}
def prepare_corpus(text):
    """Return lemmatized word tokens for text (lowercased, punctuation removed)."""
    cleaned_text = text.lower().translate(punctuation_dictionary)
    word_tokens = nltk.word_tokenize(cleaned_text)
    return lemmatization(word_tokens)

<b>QUIZBOT STUDY SESSION QUESTIONS</b>

In [5]:
#Start of study session; questions hard-coded
#Trigger words that ask the bot for a new question
#Input is matched one whitespace-separated word at a time, so the two-word
#entry "next question" only ever matches through its individual words
start_session = ["start", 
                 "begin", 
                 "question", 
                 "next question", 
                 "next"]

study_questions = ["What is logistic regression?", 
                "What is collaborative filtering?", 
                "What is one way to select K for K-means?", 
                "What is entropy?"]

def question(sentence):
    """Return a random study question if the sentence contains a trigger word.

    Each whitespace-separated word is lowercased and stripped of leading and
    trailing punctuation before being looked up in start_session, so inputs
    such as "Next?" or "start!" are recognised.

    Parameters:
        sentence: text typed by the user.

    Returns:
        A random element of study_questions when a trigger word is found,
        otherwise None.
    """
    for word in sentence.split():
        #Strip surrounding punctuation so "next?" matches the trigger "next"
        if word.strip(string.punctuation).lower() in start_session:
            return random.choice(study_questions)
    return None

<b>ANSWER CHECK GENERATION</b>

In [6]:
#Answer generation function accepts 1 parameter (i.e., user_input)
def generate_bot_answer(user_input):
    """Return the corpus sentence most similar to the user's input.

    The sentences in the global text_sentences list are vectorized with TF-IDF
    (tokenized by prepare_corpus, English stop words removed) and compared to
    the query with cosine similarity.

    The query has to be the last document in the matrix. If the caller already
    appended user_input to text_sentences, the list is used as is; otherwise
    the query is added to a temporary copy, so text_sentences is never
    modified here.

    Parameters:
        user_input: the user's answer/query text.

    Returns:
        The best matching corpus sentence, or an apology message when no
        corpus sentence shares any term with the query (or the corpus is
        empty).
    """
    no_match_answer = "I am sorry! I don't understand you!"

    #Make sure the query is the last document without mutating the global list
    if text_sentences and text_sentences[-1] == user_input:
        documents = text_sentences
    else:
        documents = text_sentences + [user_input]

    #Only the query itself is present: there is nothing to compare against
    if len(documents) < 2:
        return no_match_answer

    #Initialize vectorizer with prepare_corpus tokenizer and stop word removal
    vectorizer = TfidfVectorizer(tokenizer = prepare_corpus, stop_words = 'english')

    #Learn vocabulary and idf from the documents and build the
    #document-term matrix in one step
    matrix = vectorizer.fit_transform(documents)

    #Cosine similarity between the query (last row) and every document,
    #flattened from shape (1, n) to a 1D array
    similarities = cosine_similarity(matrix[-1], matrix).flatten()

    #Drop the last entry (the query compared with itself) explicitly, so a
    #corpus sentence that ties with the query can never lose to the query
    corpus_similarities = similarities[:-1]
    match_position = corpus_similarities.argmax()
    matched_value = corpus_similarities[match_position]

    #If cosine similarity equals 0, there are no similarities between the user input and corpus
    if matched_value == 0:
        return no_match_answer

    #Otherwise return the matching sentence in response to the query
    return documents[match_position]

<b>STUDY SESSION</b>

In [7]:
#Start of session
#Initialize the session variable as true to start the session.
session = True
print("QuizBot: Welcome! If you want to end the study session, just type bye!")

#Session
#The conversation will continue until session is false.
#If user input indicates start of session, respond with question
while session:
    #User input is stripped and converted to lowercase prior to any matching,
    #so "Bye" or "bye " also end the session
    user_input = input().strip().lower()

    if user_input != 'bye':
        #Ask for a question only once: question() picks at random, so a second
        #call would discard the first pick and draw again
        next_question = question(user_input)
        if next_question is not None:
            print("QuizBot: " + next_question)

        #User input appended to text_sentences and tokenized (i.e., preprocessed) for cosine similarity
        else:
            text_words = text_words + nltk.word_tokenize(user_input)
            all_words = list(set(text_words))

            text_sentences.append(user_input)
            try:
                #Bot answer is generated, user can compare answers
                print("QuizBot: ", end = "")
                print(generate_bot_answer(user_input))
            finally:
                #Always drop the entry appended above, even if answering fails.
                #pop() removes exactly that last entry, whereas remove() would
                #delete the first equal sentence found in the corpus
                text_sentences.pop()

    #End of Session
    #Session set to false if user enters 'bye'; the while condition then ends the loop.
    else:
        session = False
        print("QuizBot: Bye! See you later!") 

QuizBot: Welcome! If you want to end the study session, just type bye!
start
QuizBot: What is logistic regression?
Logistic regression is a statistical model used to find the relationships that exist between a dependent binary variable and one or more independent variables.
QuizBot: Logistic regression is a statistical model used to find the relationships that exist between a dependent binary variable and one or more independent variables.
next
QuizBot: What is one way to select K for K-means?
The most popular method for selecting k for the k-means algorithm is using the elbow method.
QuizBot: The most popular method for selecting k for the k-means algorithm is using the elbow method.
next
QuizBot: What is collaborative filtering?
Based on user activity
QuizBot: Collaborative filtering is a form of content filtering that uses similarities between different users to make recommendations.
next
QuizBot: What is entropy?
Entropy is a chemical term, measure of disorder.
QuizBot: Entropy is 

<b>APPENDIX</b>

In [8]:
#Answers
#What is logistic regression?
#Logistic regression is a statistical model used to find the relationships that exist between a dependent binary variable and one or more independent variables.
#What is collaborative filtering?
#Collaborative filtering is a form of content filtering that uses similarities between different users to make recommendations.
#What is one way to select K for K-means?
#The most popular method for selecting k for the k-means algorithm is using the elbow method.
#What is entropy?
#Entropy is a measure of the level of uncertainty or impurity that’s present in a dataset.