### Install dependencies/libraries
* nltk
* numpy
* bs4 (BeautifulSoup4)
* sklearn (scikit-learn)

`pip install -r requirements.txt`

### Import required libraries

### Create Corpus

In [1]:
import urllib.request
import bs4 as bs

# Download the article. The context manager closes the HTTP response once the
# body has been read, even if reading raises.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Eiffel_Tower') as response:
    html = response.read()

# Parse the page and collect every <p> element.
article = bs.BeautifulSoup(html, 'lxml')
paragraphs = article.find_all('p')

# The corpus is the text of all paragraphs concatenated in document order
# (no separator is inserted between paragraphs), lower-cased.
text = ''.join(paragraph.text for paragraph in paragraphs).lower()

### Preprocess Data

In [2]:
import re
import nltk
import string

# Replace bracketed numeric markers such as "[12]" (and an empty "[]") with a
# space, then collapse every run of whitespace into a single space.
text = re.sub(r'\s+', ' ', re.sub(r'\[[0-9]*\]', ' ', text))

# Tokenise the cleaned text at word level and at sentence level.
words = nltk.word_tokenize(text)
sentences = nltk.sent_tokenize(text)

lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize(tokens):
    """Return a list holding ``lemmatizer.lemmatize(token)`` for each token.

    ``tokens`` is any iterable of strings; the order of the input is kept.
    """
    return list(map(lemmatizer.lemmatize, tokens))

# Translation table that maps the code point of every character in
# string.punctuation to None, so str.translate() deletes those characters.
punc_remover = str.maketrans('', '', string.punctuation)


def preprocess(document):
    """Turn a raw document string into a list of lemmatized word tokens.

    The document is lower-cased, stripped of the characters in
    ``string.punctuation``, split with ``nltk.word_tokenize`` and finally
    passed through ``lemmatize``.
    """
    without_punctuation = document.lower().translate(punc_remover)
    tokens = nltk.word_tokenize(without_punctuation)
    return lemmatize(tokens)

### Rules: Greetings

In [3]:
import random

# Words that mark the user's message as a greeting. Matching is done token by
# token, so the two-word entries can never equal a single token; "good morning"
# and "good evening" are recognised through the "morning" / "evening" entries.
greeting_inputs = ("hey", "good morning", "good evening", "morning", "evening", "hi", "whatsup")
greeting_responses = ["hey", "hello", "hello, there!", "how can I help you?"]


def greeting_response(greeting):
    """Return a random greeting reply if ``greeting`` contains a greeting word.

    The text is lower-cased and split on whitespace. Leading and trailing
    punctuation is stripped from each token before it is compared against
    ``greeting_inputs``, so inputs such as "Hi!" or "morning." are recognised.

    Returns:
        A string picked at random from ``greeting_responses``, or ``None``
        when no token is a known greeting.
    """
    for token in greeting.lower().split():
        if token.strip(string.punctuation) in greeting_inputs:
            return random.choice(greeting_responses)
    return None

### Rules: User Queries

The response is chosen by **cosine similarity**: the input sentence and every sentence in the corpus are converted to **TF-IDF vectors**, and the corpus sentence most similar to the input is returned.

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def query_response(query):
    """Return the corpus sentence most similar to ``query``.

    The query is appended to the module-level ``sentences`` list and is left
    there on return; the caller is responsible for removing it afterwards.
    All sentences are then TF-IDF vectorized (tokenized with ``preprocess``,
    English stop words removed) and the query vector is compared by cosine
    similarity with the vectors of the corpus sentences only.

    Returns:
        The best-matching sentence from ``sentences``, or an apology string
        when the corpus is empty or nothing has a positive similarity.
    """
    no_match_response = "I am sorry, I could not understand that"

    sentences.append(query)  # append user query to already existing sentences

    # With nothing but the query in the list there is nothing to match against.
    if len(sentences) == 1:
        return no_match_response

    vectorizer = TfidfVectorizer(tokenizer=preprocess, stop_words='english')
    vectors = vectorizer.fit_transform(sentences)  # vectorize all sentences

    # Compare the query (last row) with the corpus rows only. Leaving the
    # query's own row out means the best match can never be the query itself,
    # even when a corpus sentence has exactly the same similarity score.
    similarities = cosine_similarity(vectors[-1], vectors[:-1]).ravel()
    best_match = similarities.argmax()

    if similarities[best_match] <= 0:  # answer to query not found
        return no_match_response
    return sentences[best_match]

## Chatting with the Bot

In [5]:
# Interactive chat loop: read one line per turn until the user says
# 'bye', 'thanks' or 'thank you'.
continue_chat = True
print("BOT: Hello! You can ask me anything about the Eiffel Tower.")
while continue_chat:
    # Normalise the input so that stray whitespace or capitals do not defeat
    # the keyword checks below.
    user_text = input().strip().lower()

    if user_text == 'bye':
        print('BOT: Goodbye and take care!')
        continue_chat = False
    elif user_text in ('thanks', 'thank you'):
        print('BOT: Most welcome!')
        continue_chat = False
    else:
        # Call greeting_response once and reuse the reply; calling it again
        # would draw a second, unrelated random response.
        greeting = greeting_response(user_text)
        if greeting is not None:
            print('BOT: ', greeting)
        else:
            try:
                print('BOT: ', query_response(user_text))
            finally:
                # query_response appends the query to the corpus; remove it
                # again even if generating the response failed.
                sentences.remove(user_text)

BOT: Hello! You can ask me anything about the Eiffel Tower.
Hi
BOT:  how can I help you?
constructed


  'stop_words.' % sorted(inconsistent))


BOT:  prior to the eiffel tower's construction, no structure had ever been constructed to a height of at least 300 metres, and many people believed it impossible.
tall


  'stop_words.' % sorted(inconsistent))


BOT:  it opened in 1894 and is 158.1 metres (518 ft) tall.
levels


  'stop_words.' % sorted(inconsistent))


BOT:  the tower has three levels for visitors, with restaurants on the first and second levels.
bye
BOT: Goodbye and take care!
