In [3]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import bs4
import urllib.request
import re
import random
import numpy as np

In [4]:
url = 'https://en.wikipedia.org/wiki/Machine_learning'

In [5]:
raw_data = urllib.request.urlopen(url)
raw_data = raw_data.read()

In [6]:
article_html = bs4.BeautifulSoup(raw_data, 'html')

In [7]:
paragraphs = article_html.find_all("p")

In [8]:
article_text = " "
for para in paragraphs:
    article_text += para.text

In [9]:
article_text = re.sub(r"[[0-9]*]", "", article_text)
article_text = re.sub(r":\d*", "", article_text)

  """Entry point for launching an IPython kernel.


In [10]:
article_text = re.sub(r"\s+", " ", article_text)

In [11]:
article_text[:1000]

' Machine learning (ML) is the study of computer algorithms that improve automatically through experience. It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model based on sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to do so. Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult or infeasible to develop conventional algorithms to perform the needed tasks. Machine learning is closely related to computational statistics, which focuses on making predictions using computers. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a related field of study, focusing on exploratory data analysis through unsupervised learning. In its application across business problems, machine learning is also referred to as

In [12]:
article_sentences = nltk.sent_tokenize(article_text)
article_words = nltk.word_tokenize(article_text)

In [13]:
len(article_sentences)

262

In [14]:
wnlemmatizer = nltk.stem.WordNetLemmatizer()

def perform_lemmatization(tokens):
    return [wnlemmatizer.lemmatize(token) for token in tokens]

punctuation_removal = dict((ord(punctuation), None) for punctuation in string.punctuation)

def get_processed_text(document):
    return perform_lemmatization(nltk.word_tokenize(document.lower().translate(punctuation_removal)))

In [15]:
greeting_inputs = ("hey", "good morning", "good evening", "morning", "evening", "hi","hello", "whatsup")
greeting_responses = ["hey", "hey hows you?", "hello, how you doing", "hello", "Welcome, I am good and you"]

def generate_greeting_responce(greeting):
    for token in greeting.split():
        if token.lower() in greeting_inputs:
            return random.choice(greeting_responses).title()
        

In [16]:
word_vector = TfidfVectorizer(tokenizer=get_processed_text, stop_words="english")
all_words_vector = word_vector.fit_transform(article_sentences)

  'stop_words.' % sorted(inconsistent))


In [17]:
print("length of vocabulary", len(word_vector.get_feature_names()))
all_words_vector.shape

length of vocabulary 1326


(262, 1326)

In [18]:
def generate_response(user_input):
    robo_response = ""
    user_text = user_input.lower()
    user_text_vector = word_vector.transform([user_text])
    similarty_vector_value = cosine_similarity(user_text_vector, all_words_vector)
    similar_sentence_number = similarty_vector_value.argsort()[0][-1]
    
    similarty_score = similarty_vector_value.flatten()
    similarty_score.sort()
    similar_score = similarty_score[-1]
#     print(similarty_score)
    
    if similar_score == 0.:
        robo_response = robo_response + "I am sorry, I could not understand you."
        return robo_response
    else:
        robo_response = robo_response + article_sentences[similar_sentence_number]
        return robo_response


In [None]:
continue_dialog = True
print("Hello, I am your friend Robo. You can ask me any question regarding machine learning : ")
while continue_dialog:
    print("User : ")
    user_input = input()
    user_text = user_input.strip().lower()
    
    if user_text == "bye":
        continue_dialog = False
        print("Robo : Good bye and take care of yourself...")
    else:
        if user_text=="thanks" or user_text=="thank you very much" or user_text=="thank you" :
            continue_dialog = False
            print("Robo : Thank you, Most Welcome ")
        elif user_text == 'hmmmmmm' or user_text == 'hmm'or user_text == 'ok' or user_text == 'okay':
            print("Robo : Hmmm..")
        else:
            if generate_greeting_responce(user_text) != None:
                print("Robo : "+generate_greeting_responce(user_text))
            else:
                print("Robo : " + generate_response(user_text))

Hello, I am your friend Robo. You can ask me any question regarding machine learning : 
User : 


 hello


Robo : Hello, How You Doing
User : 


 tell me something about machine learning


Robo : A representative book of the machine learning research during the 1960s was the Nilsson's book on Learning Machines, dealing mostly with machine learning for pattern classification.
User : 


 pattern classification >


Robo : A representative book of the machine learning research during the 1960s was the Nilsson's book on Learning Machines, dealing mostly with machine learning for pattern classification.
User : 


 what is pattern classification?


Robo : A representative book of the machine learning research during the 1960s was the Nilsson's book on Learning Machines, dealing mostly with machine learning for pattern classification.
User : 
