In [9]:
import nltk
import numpy as np
import random
import string
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
# Load the article as one long string, joining the stripped lines with single
# spaces. The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open('raft.txt', encoding='utf-8') as f:
    article_text = " ".join(line.strip() for line in f)

article_text = article_text.lower()
# replace bracketed numeric markers such as "[12]" with a space, then collapse
# every run of whitespace into a single space
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
article_text = re.sub(r'\s+', ' ', article_text)

# sentence-level corpus that user questions are later compared against
article_sentences = nltk.sent_tokenize(article_text)

In [12]:
def clean_input(text):
    """Pre-process a piece of text (user input or an article sentence).

    ASCII punctuation is deleted, the remainder is split into word tokens with
    ``nltk.word_tokenize`` and every token is lemmatized with WordNet.

    Args:
        text: the string to pre-process.

    Returns:
        A list with one lemmatized token per word token, in original order.
    """
    # delete every character listed in string.punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    wnl = nltk.stem.WordNetLemmatizer()

    # lemmatize() returns the lemma rather than modifying its argument, so the
    # results have to be collected and returned.
    return [wnl.lemmatize(token) for token in nltk.word_tokenize(text)]

In [13]:
greeting_inputs = ("hey", "good morning", "good evening", "morning", "evening", "hi", "hello")
greeting_responses = ["Hi", "Nice to meet you!", "*nods*", "Hello!", "Welcome!"]

def generate_greeting_response(greeting):
    """Return a random greeting if ``greeting`` contains a known salutation.

    The input is lower-cased and stripped of ASCII punctuation before matching,
    so "Hello!" and "hi," are recognised. A match is either the whole
    normalized phrase or any single word of it appearing in
    ``greeting_inputs``.

    Args:
        greeting: raw user input.

    Returns:
        A random element of ``greeting_responses``, or ``None`` when the input
        contains no known greeting.
    """
    normalized = greeting.lower().translate(str.maketrans('', '', string.punctuation))
    words = normalized.split()

    # whole-phrase check lets multi-word entries such as "good morning" match
    if " ".join(words) in greeting_inputs:
        return random.choice(greeting_responses)
    if any(word in greeting_inputs for word in words):
        return random.choice(greeting_responses)
    return None

In [14]:
def generate_response(user_input):
    """Return the article sentence most similar to ``user_input``.

    The input is appended to ``article_sentences``, the whole list is turned
    into TF-IDF vectors (tokenized by ``clean_input``, English stop words
    removed) and the cosine similarity between the input and every entry is
    computed.

    Side effect: ``user_input`` is appended to the module-level
    ``article_sentences`` list and is NOT removed here; the caller is expected
    to remove it afterwards.

    Args:
        user_input: the user's question.

    Returns:
        The best-matching article sentence, or "Sorry, I don't understand."
        when no article sentence has a non-zero similarity (including the case
        where there are no article sentences at all).
    """
    # add user input to list of sentences so it shares the fitted vocabulary
    article_sentences.append(user_input)
    word_vectorizer = TfidfVectorizer(tokenizer=clean_input, stop_words='english')
    # convert sentences into TF-IDF vectors
    all_word_vectors = word_vectorizer.fit_transform(article_sentences)
    # similarity of the user input (last row) with every row, as a flat array
    similarity_scores = cosine_similarity(all_word_vectors[-1], all_word_vectors).flatten()

    # Drop the last score, which is the input compared with itself. Ranking
    # only the article rows avoids relying on the self-match sorting last and
    # avoids indexing [-2] on a one-element result.
    article_scores = similarity_scores[:-1]

    if article_scores.size == 0 or article_scores.max() == 0:
        return "Sorry, I don't understand."
    return article_sentences[int(article_scores.argmax())]

In [16]:
continue_dialogue = True
print("Hi, my name is Bot and I'm an expert at Raft! \n"
      "Ask me anything about the Raft paper published in 2014.\n"
      "To end our conversation, type 'bye' or just thank me ;)")

# Read one line per turn until the user says goodbye or thanks the bot.
while continue_dialogue:
    # strip surrounding whitespace so inputs like "bye " are still recognised
    user_input = input().lower().strip()
    if user_input == 'bye':
        continue_dialogue = False
        print("Bot: Bye bye!")
    elif user_input in ('thanks', 'thank you'):
        continue_dialogue = False
        print("Bot: You're welcome!")
    else:
        # compute the greeting once; it is None when the input is not a greeting
        greeting_reply = generate_greeting_response(user_input)
        if greeting_reply is not None:
            print("Bot: " + greeting_reply)
        else:
            print("Bot: ", end="")
            try:
                print(generate_response(user_input))
            finally:
                # generate_response appends the query to article_sentences;
                # take it out again even if vectorizing or printing failed so
                # the corpus is unchanged for the next turn or a cell re-run
                article_sentences.remove(user_input)

Hi, my name is Bot and I'm an expert at Raft! 
Ask me anything about the Raft paper published in 2014.
To end our conversation, type 'bye' or just thank me ;)
How does a leader get elected?
Bot: 4 election safety: at most one leader can be elected in a given term.
Tell me about log replication
Bot: for example, in raft we separated leader election, log replication, safety, and membership changes.
What states can a server be in?
Bot: at any given time each server is in one of three states: leader, follower, or candidate.
thanks
Bot: You're welcome!
