### Importing Libraries

In [1]:
import numpy as np
import nltk
import string
import random

In [2]:
f = open('chatbot.txt', 'r', errors = 'ignore')
raw_doc = f.read()
raw_doc = raw_doc.lower()
nltk.download('punkt')# Using the Punkt tokenizer
nltk.download('wordnet')# Using the WordNet Dictionary
sent_tokens = nltk.sent_tokenize(raw_doc) #Converts dov to list of sentences
word_tokens = nltk.word_tokenize(raw_doc) #Converts doc to list of wc

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


### Example of sentence tokens

In [3]:
sent_tokens[:2]

['\ndata science\nfrom wikipedia, the free encyclopedia\njump to navigationjump to search\nnot to be confused with information science.',
 'the existence of comet neowise (here depicted as a series of red dots) was discovered by analyzing astronomical survey data acquired by a space telescope, the wide-field infrared survey explorer.']

### Example of word tokens

In [4]:
word_tokens[:3]

['data', 'science', 'from']

### Text Preprocessing

In [11]:
lemmer = nltk.stem.WordNetLemmatizer()
#Wordnet is a semantically-oriented dictionary of English included in NLTK
def LenTokens(tokens):
  return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
def LemNormalize(text):
   return LenTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict))) 

### Defining the greeting function

In [6]:
GREET_INPUTS = ("hello", "hi")
GREET_RESPONSES = ["hi", "hey", "*nods*", "hi there"]
def great(sentence):

  for word in sentence.split():
    if word.lower in GREET_INPUTS:
      return random.choice(GREET_RESPONSES)


### Response generation

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
def response(user_response):
  robo1_response = ''
  TfidfVec = TfidfVectorizer(tokenizer = LemNormalize, stop_words = 'english')
  tfidf =TfidfVec.fit_transform(sent_tokens)
  vals = cosine_similarity(tfidf[-1], tfidf)
  idx = vals.argsort()[0][-2]
  flat = vals.flatten()
  flat.sort()
  req_tfidf = flat[-2]
  if(req_tfidf == 0):
    robo1_response = robo1_response+"I am sorry! I don't understand you"
    return robo1_response
  else:
    robo1_response = robo1_response+sent_tokens[idx]
    return robo1_response



### Defining conversation start/end protocols

In [None]:
flag = True
print("BOT: My name is Stark. Let's have a conversation! Also, if you want to exit any time, just type")
while(flag == True):
  user_response = input()
  user_response = user_response.lower()
  if(user_response!= 'bye'):
    if(user_response == 'thanks' or user_response == 'thank you'):
      flag = False
      print("BOT: You are welcome .. ")
    else:
      if(great(user_response)!=None):
        print("BOT:" +greet(user_response))

      else:
        sent_tokens.append(user_response)
        word_tokens = word_tokens+nltk.word_tokenize(user_response)
        final_words = list(set(word_tokens))
        print("BOT:", end="")
        print(response(user_response))
        sent_tokens.remove(user_response)
  else:
    flag = False
    print("BOT: Goodbye! Take care")
      


BOT: My name is Stark. Let's have a conversation! Also, if you want to exit any time, just type
Hello
BOT:

  'stop_words.' % sorted(inconsistent))


hello
Data Science


  'stop_words.' % sorted(inconsistent))


BOT:"what is data science?
What is data science?


  'stop_words.' % sorted(inconsistent))


BOT:"what is data science?
contents


  'stop_words.' % sorted(inconsistent))


BOT:[4][5]


contents
1	foundations
1.1	relationship to statistics
2	etymology
2.1	early usage
2.2	modern usage
3	impact
4	technologies and techniques
4.1	techniques
5	see also
6	references
foundations
data science is an interdisciplinary field focused on extracting knowledge from data sets, which are typically large (see big data), and applying the knowledge and actionable insights from data to solve problems in a wide range of application domains.
relationship to statistics


  'stop_words.' % sorted(inconsistent))


BOT:[11]

relationship to statistics
many statisticians, including nate silver, have argued that data science is not a new field, but rather another name for statistics.
etymology


  'stop_words.' % sorted(inconsistent))


BOT:etymology
early usage
in 1962, john tukey described a field he called "data analysis", which resembles modern data science.
next


  'stop_words.' % sorted(inconsistent))


BOT:I am sorry! I don't understand you
impact


  'stop_words.' % sorted(inconsistent))


BOT:"the impacts of big data that you may not have heard of".
early usage


  'stop_words.' % sorted(inconsistent))


BOT:etymology
early usage
in 1962, john tukey described a field he called "data analysis", which resembles modern data science.
