In [None]:
import nltk
import warnings
from nltk.stem import WordNetLemmatizer
import string
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
warnings.filterwarnings('ignore')

**Downloading necessary packages**

In [None]:
# NLTK resources needed by the cells below:
#   popular   - the bundle of commonly used corpora and models
#   punkt     - sentence/word tokenizer models used by older NLTK releases
#   punkt_tab - tokenizer tables that newer NLTK releases look up instead of punkt
#   wordnet   - lexical database used by WordNetLemmatizer
# Requesting a resource that is already installed is a cheap no-op.
for nltk_package in ('popular', 'punkt', 'punkt_tab', 'wordnet'):
  nltk.download(nltk_package, quiet=True)

True

**Reading text file and storing it in a string**

In [None]:
# Location of the plain-text knowledge base; adjust when running outside
# the environment this path was written for.
CORPUS_PATH = '/content/Virat_Kohli.txt'

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding. The text is lowercased once here so that every
# later comparison is case-insensitive.
with open(CORPUS_PATH, 'r', encoding='utf-8') as f:
  txt = f.read().lower()

# Tokenization

In [None]:
# Sentence-level split of the corpus; botresponse searches this list for
# the sentence closest to the user's message.
sent_tokens = nltk.sent_tokenize(text=txt)
# Word-level split of the corpus; only used below to report its size.
word_tokens = nltk.word_tokenize(text=txt)

In [None]:
# Report the number of sentence tokens, then the number of word tokens,
# one per line.
print(len(sent_tokens), len(word_tokens), sep='\n')

2310
36708


# Preprocessing

In [None]:
# Single shared lemmatizer instance reused by every call below.
lemmer = WordNetLemmatizer()

def lemmatization(tokens):
  """Return a new list holding lemmer.lemmatize(token) for each token, in order."""
  return list(map(lemmer.lemmatize, tokens))

**Dictionary that maps the ordinal (code point) of every punctuation character to `None`, so that `str.translate` deletes those characters**

In [None]:
# Translation table for str.translate: a code point mapped to None is deleted.
remove_punc_dict = {ord(symbol): None for symbol in string.punctuation}

**Removes all punctuation, then tokenizes the text into words and applies lemmatization to the resulting tokens**

In [None]:
def token_lemma(text):
  """Turn raw text into a list of lemmatized word tokens.

  Steps, in order: lowercase the text, delete every character listed in
  remove_punc_dict, split the result with nltk.word_tokenize, and pass the
  tokens through lemmatization.
  """
  cleaned = text.lower().translate(remove_punc_dict)
  words = nltk.word_tokenize(cleaned)
  return lemmatization(words)

In [None]:
# Words/phrases in the user's message that count as a greeting.
GREETING_INPUTS = ["hello", "hi", "greetings", "sup", "what's up","hey"]
# Replies the bot may give when it detects a greeting.
GREETING_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]

def greeting(user_response):
  """Return a random greeting reply if the message contains a greeting.

  Matching is case-insensitive and works on whole words, so "supper" does
  not trigger on "sup". Punctuation attached to the edges of a word is
  ignored ("hello!" matches "hello"), and multi-word entries such as
  "what's up" are matched as a phrase.

  Args:
    user_response: the raw text typed by the user.

  Returns:
    One entry of GREETING_RESPONSES chosen at random, or None when the
    message contains no entry of GREETING_INPUTS.
  """
  # Trim punctuation from word edges only, so the apostrophe inside
  # "what's" is preserved.
  words = [word.strip(string.punctuation) for word in user_response.lower().split()]
  # Pad with spaces so that a substring test only succeeds on whole-word
  # (or whole-phrase) boundaries.
  padded = ' ' + ' '.join(words) + ' '
  if any((' ' + phrase + ' ') in padded for phrase in GREETING_INPUTS):
    return random.choice(GREETING_RESPONSES)
  return None

In [None]:
def botresponse(user_response):
  """Return the corpus sentence most similar to the user's message.

  The message is appended to the module-level sent_tokens list so that it
  is vectorized together with the corpus; this function does not remove it
  again, so the caller is expected to do so after the call.

  Args:
    user_response: the text typed by the user.

  Returns:
    The entry of sent_tokens ranked second by cosine similarity to the
    message (the top rank is normally the message itself), or a fixed
    apology string when that similarity is zero.
  """
  sent_tokens.append(user_response)

  vectorizer = TfidfVectorizer(tokenizer=token_lemma, stop_words='english')
  matrix = vectorizer.fit_transform(sent_tokens)

  # Similarity of the last row (the message just appended) to every row.
  scores = cosine_similarity(matrix[-1], matrix).flatten()
  ranking = scores.argsort()

  # Highest rank is expected to be the message matched against itself, so
  # the runner-up is taken as the best corpus match.
  best_idx = ranking[-2]
  best_score = scores[best_idx]

  if best_score == 0:
    return "Sorry, I don't understand you!!"
  return sent_tokens[best_idx]

In [None]:
# Interactive chat loop. Reads one line per turn and ends on "bye",
# "thanks" or "thank you" (case-insensitive, surrounding spaces ignored).
flag = True

print('Bot : Hey, I am your bot. How can I help you? To exit the chat type bye')

while flag:
  user_response = input()
  # Normalize once so every command check behaves the same way.
  command = user_response.strip().lower()

  if command == 'bye':
    flag = False
    print('Ok, have a nice day!!')
  elif command in ('thanks', 'thank you'):
    flag = False
    print('Bot : You are welcome..')
  else:
    # Call greeting() only once: it picks a random reply, so a second call
    # would do redundant work and could yield a different value.
    greeting_reply = greeting(user_response)
    if greeting_reply is not None:
      print('Bot : ' + greeting_reply)
    else:
      # botresponse appends the message to sent_tokens. Remember the corpus
      # length and truncate back to it afterwards, even on error, so that
      # the corpus is never polluted and a genuine corpus sentence that
      # happens to equal the message is never removed by mistake.
      corpus_size = len(sent_tokens)
      try:
        print('Bot : ' + botresponse(user_response))
      finally:
        del sent_tokens[corpus_size:]