Importing Libraries

In [21]:
import numpy as np
import nltk   # NLP tokenization, parsing, classification, stemming, tagging and semantic reasoning
import string # Ascii string constants
import random # generate pseudo-random values

Importing and reading the corpus

In [22]:
# Read the whole corpus; the context manager closes the file handle even if
# reading fails (the original left the handle open for the kernel's lifetime).
with open('chatbot.txt', 'r', errors = 'ignore') as f:
  raw_doc = f.read()
raw_doc = raw_doc.lower()  # lowercase the corpus
nltk.download('punkt')     # punkt tokenizer
nltk.download('wordnet')   # wordnet dictionary
sent_tokens = nltk.sent_tokenize(raw_doc)  # convert doc to list of sentences
word_tokens = nltk.word_tokenize(raw_doc)  # convert doc to list of words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Example of sentence tokens

In [23]:
# Preview the first two sentence tokens produced by nltk.sent_tokenize.
sent_tokens[:2]

['data science is an interdisciplinary academic field [1] that uses statistics, scientific computing, scientific methods, processes, algorithms and systems to extract or extrapolate knowledge and insights from noisy, structured, and unstructured data.',
 '[2]\n\ndata science also integrates domain knowledge from the underlying application domain (e.g., natural sciences, information technology, and medicine).']

In [24]:
# Preview the first two word tokens produced by nltk.word_tokenize.
word_tokens[:2]

['data', 'science']

Text Preprocessing

In [25]:
# WordNet is a semantically oriented dictionary of English included in NLTK.
lemmer = nltk.stem.WordNetLemmatizer()
def LemTokens(tokens):
  """Return a list holding the WordNet lemma of each token in `tokens`."""
  lemmas = []
  for tok in tokens:
    lemmas.append(lemmer.lemmatize(tok))
  return lemmas
# Translation table for str.translate: every character of string.punctuation
# maps to None, which deletes it from the translated text.
remove_punct_dict = {ord(ch): None for ch in string.punctuation}
def LemNormalize(text):
  """Lowercase `text`, delete punctuation, word-tokenize it and lemmatize the tokens."""
  cleaned = text.lower().translate(remove_punct_dict)
  words = nltk.word_tokenize(cleaned)
  return LemTokens(words)

Defining the greeting function

In [26]:
GREET_INPUTS = ('hello', 'hii','greetings','sup','hey',)
GREET_RESPONSES = ['hii','hey','*nods*','hi there','hello','I am glad!']
def greet(sentence):
  """Return a random greeting if `sentence` contains a greeting word.

  Each whitespace-separated word is compared case-insensitively against
  GREET_INPUTS after leading/trailing punctuation is stripped, so inputs such
  as "Hello!" or "hey," are recognised.

  Returns:
    A string drawn from GREET_RESPONSES, or None when no word is a greeting.
  """
  for word in sentence.split():
    # Strip surrounding punctuation so "hello!" still matches 'hello'.
    if word.strip(string.punctuation).lower() in GREET_INPUTS:
      return random.choice(GREET_RESPONSES)
  return None

Response generation

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
def response(user_response):
  """Return the corpus sentence most similar to the user's query.

  The query is taken to be the LAST entry of the global `sent_tokens` list
  (the chat loop appends it before calling); `user_response` itself is not
  read here and is kept only for interface compatibility.

  All of `sent_tokens` is vectorized with TF-IDF (using LemNormalize as the
  tokenizer) and the cosine similarity of the last row against every row is
  computed. The best match is the second-highest score, because the highest
  is the query compared with itself.

  Returns:
    The matching sentence, or an apology string when nothing overlaps with the
    query (similarity 0) or when there is no other sentence to compare against.
  """
  fallback = "I am sorry! I don't understand you"
  TfidfVec = TfidfVectorizer(tokenizer =  LemNormalize, stop_words = 'english')
  tfidf = TfidfVec.fit_transform(sent_tokens)
  # With fewer than two rows there is no "second best" entry to pick.
  if tfidf.shape[0] < 2:
    return fallback
  vals = cosine_similarity(tfidf[-1], tfidf).flatten()
  # Index of the second-largest similarity (the largest is the query itself).
  idx = vals.argsort()[-2]
  if vals[idx] == 0:
    return fallback
  # The original assigned the match to a misspelled name ('robo1_reponse') and
  # therefore always returned an empty string here.
  return sent_tokens[idx]


Defining conversation start/end protocol

In [29]:
# Conversation loop: reads user input until the user types 'bye' (or thanks
# the bot), answering with a greeting or with the closest corpus sentence.
flag = True
print('BOT: My name is Stark. Lets have a conversation! Also, if you want to exit any time, just type Bye')
while flag:
  # Normalize the input so "Bye", "BYE" and " bye " are all treated alike.
  user_response = input().strip().lower()
  if user_response == 'bye':
    flag = False
    print("BOT: Goodbye! Take Care <3")
  elif user_response in ('thanks', 'thank you'):
    flag = False
    print("BOT: You are Welcome!!")
  else:
    # Call greet() once: it draws a random reply, so testing and printing with
    # two separate calls made two independent draws.
    greeting = greet(user_response)
    if greeting is not None:
      print("BOT: "+greeting)
    else:
      # response() treats the last entry of sent_tokens as the query, so the
      # input is appended temporarily and removed again afterwards.
      sent_tokens.append(user_response)
      try:
        # NOTE(review): word_tokens/final_words are not read anywhere else in
        # the visible notebook; kept in case later cells rely on them.
        word_tokens = word_tokens+nltk.word_tokenize(user_response)
        final_words = list(set(word_tokens))
        print("BOT: "+response(user_response))
      finally:
        # pop() removes exactly the entry appended above, even if response()
        # raised; remove() would drop the first equal sentence, which may be a
        # genuine corpus sentence.
        sent_tokens.pop()


BOT: My name is Stark. Lets have a conversation! Also, if you want to exit any time, just type Bye
hey
BOT: hi there
bye
BOT: Goodbye! Take Care <3
