# **Create Your Own Chatbot using NLP**

## **Importing necessary modules**

In [1]:
import numpy as np # for numerical operations
import pandas as pd # for reading the dataset
 
import re   # regural expressions
 
import sklearn  #these have inbuilt models and algorithms
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import nltk  # Natural Language Toolkit

from nltk.corpus import stopwords  # stopword lists, used to filter common words out of sentences
nltk.download("stopwords")  # fetch the stopwords corpus so stopwords.words(...) can be read

import gensim  # supplies corpora.Dictionary, used below to build the vocabulary and bag-of-words vectors
from nltk.stem.porter import PorterStemmer  # reduces inflected word forms to a common stem
from gensim import corpora

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## **Data cleaning and preprocessing**

In [4]:
# Load the question/answer pairs. The relative path means "dataset.csv" must sit in
# the notebook's working directory.
df=pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,ques,ans
0,Hi,Hello! How can I help you?
1,Thank you,Your welcome
2,What is your name?,Chat-Bot
3,your name,Chat-Bot
4,hello,Hello! How can I help you?


In [5]:
df.info() #Print information about the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ques    9 non-null      object
 1   ans     9 non-null      object
dtypes: object(2)
memory usage: 272.0+ bytes


In [6]:
# Split the frame into its question and answer columns.
ques = df["ques"]
ans = df["ans"]

# Show each question next to the reply stored for it.
for question_text, answer_text in zip(ques, ans):
  print(question_text + "   --->   " + answer_text)

Hi   --->   Hello! How can I help you?
Thank you   --->   Your welcome
What is your name?   --->   Chat-Bot
your name   --->   Chat-Bot
hello   --->   Hello! How can I help you?
Bye   --->   Bye!
Thanks a lot   --->   Your welcome
Which language do you like most   --->   Python
mosk liked language   --->   Python


In [7]:
#Returns the cleaned sentences: non-letters removed and text lower-cased; optionally stopwords are dropped and the remaining words stemmed
stemmer=PorterStemmer()


def clean_sentences(sentence,stop_words=False):
  """Normalise a collection of sentences for bag-of-words matching.

  Every non-letter character is replaced by a space, the text is lower-cased
  and runs of whitespace are collapsed to single spaces.

  Args:
    sentence: iterable of strings, e.g. a list or a pandas Series. Items are
      visited in iteration order rather than looked up by integer label, so a
      Series whose index is not 0..n-1 (for example after filtering) works too.
    stop_words: when True, English stopwords are removed and the remaining
      words are Porter-stemmed. When False (default) all words are kept and no
      stemming is applied.

  Returns:
    A list with one cleaned string per input item, in input order.
  """
  # Read the stopword list once instead of once per word; a set gives O(1) lookups.
  english_stopwords=set(stopwords.words('english')) if stop_words else None
  cleaned_sentences=[]
  for text in sentence:
    words=re.sub('[^a-zA-Z]' ,' ',text).lower().split()
    if stop_words:
      # The stopword test runs on the un-stemmed word, then the survivors are stemmed.
      words=[stemmer.stem(word) for word in words if word not in english_stopwords]
    cleaned_sentences.append(' '.join(words))
  return cleaned_sentences



In [11]:
#Same cleaning as clean_sentences, but for a single question string instead of a collection of sentences.
 
def clean_sentences_Q(sentence,stop_words=False):
  """Normalise a single question string the same way the dataset questions are cleaned.

  Every non-letter character is replaced by a space, the text is lower-cased
  and runs of whitespace are collapsed to single spaces.

  Args:
    sentence: the question as one string. An empty string is accepted and
      yields an empty string.
    stop_words: when True, English stopwords are removed and the remaining
      words are Porter-stemmed. When False (default) all words are kept and no
      stemming is applied.

  Returns:
    The cleaned question as a single space-separated string.
  """
  # The text is processed exactly once; no per-character loop is needed.
  review=re.sub('[^a-zA-Z]' ,' ',sentence).lower().split()
  if stop_words:
    # Read the stopword list once instead of once per word.
    english_stopwords=set(stopwords.words('english'))
    review=[stemmer.stem(word) for word in review if word not in english_stopwords]
  return ' '.join(review)

In [12]:
#cleaning the sentences in the training dataset

cleaned_sent_with_stopwords = clean_sentences(ques,stop_words=False)


In [13]:
cleaned_sent_with_stopwords

['hi',
 'thank you',
 'what is your name',
 'your name',
 'hello',
 'bye',
 'thanks a lot',
 'which language do you like most',
 'mosk liked language']

In [14]:
# Stopword-free, stemmed variant of the dataset questions.
# NOTE(review): no later cell visible in this notebook reads `cleaned_sentences`; the
# vocabulary and bag-of-words corpus are built from cleaned_sent_with_stopwords instead.
cleaned_sentences=clean_sentences(ques,stop_words=True)

## **Pre-processing and Feature Engineering**

In [15]:
# Tokenise every cleaned question into its list of words, then display the result.
sentences = cleaned_sent_with_stopwords
sentence_words = [document.split() for document in sentences]

sentence_words

[['hi'],
 ['thank', 'you'],
 ['what', 'is', 'your', 'name'],
 ['your', 'name'],
 ['hello'],
 ['bye'],
 ['thanks', 'a', 'lot'],
 ['which', 'language', 'do', 'you', 'like', 'most'],
 ['mosk', 'liked', 'language']]

In [16]:
from gensim import corpora

# Give every distinct word in the tokenised questions an integer id.
dictionary = corpora.Dictionary(sentence_words)

# List the vocabulary, one "id : word" pair per line.
for token_id, token in dictionary.items():
  print(token_id, ':', token)

0 : hi
1 : thank
2 : you
3 : is
4 : name
5 : what
6 : your
7 : hello
8 : bye
9 : a
10 : lot
11 : thanks
12 : do
13 : language
14 : like
15 : most
16 : which
17 : liked
18 : mosk


## **Creating the bag-of-words model**

In [17]:
 # Encode each tokenised question as a bag of words: a list of (token_id, count)
 # pairs, with token ids taken from `dictionary`.

bow_corpus=[dictionary.doc2bow(text) for text in sentence_words]

In [18]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x7f3777ec8050>

In [19]:
bow_corpus

[[(0, 1)],
 [(1, 1), (2, 1)],
 [(3, 1), (4, 1), (5, 1), (6, 1)],
 [(4, 1), (6, 1)],
 [(7, 1)],
 [(8, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(2, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)],
 [(13, 1), (17, 1), (18, 1)]]

In [20]:
# Print each cleaned question followed by its bag-of-words encoding.
for text, bow in zip(sentences, bow_corpus):
  print(text, bow, sep='\n')
  print('\n-------------------------------------------------------')

hi
[(0, 1)]

-------------------------------------------------------
thank you
[(1, 1), (2, 1)]

-------------------------------------------------------
what is your name
[(3, 1), (4, 1), (5, 1), (6, 1)]

-------------------------------------------------------
your name
[(4, 1), (6, 1)]

-------------------------------------------------------
hello
[(7, 1)]

-------------------------------------------------------
bye
[(8, 1)]

-------------------------------------------------------
thanks a lot
[(9, 1), (10, 1), (11, 1)]

-------------------------------------------------------
which language do you like most
[(2, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)]

-------------------------------------------------------
mosk liked language
[(13, 1), (17, 1), (18, 1)]

-------------------------------------------------------


## **Using Cosine Similarity**

In [21]:
#using cosine similarity

def your_ans(ques_embedding,sent_embedding,df,sentences):
    """Print the stored answer whose question is most similar to the user's question.

    Similarity is the cosine between bag-of-words vectors. Each embedding is a
    sparse vector given as (token_id, count) pairs; the cosine is computed over
    those sparse vectors directly. The pair lists must not be handed to a dense
    routine as if they were rows of features, because that would compare token
    ids and counts as 2-D coordinates rather than comparing word counts.

    Args:
      ques_embedding: list of (token_id, count) pairs for the user's question.
      sent_embedding: sequence of such lists, one per dataset question, in the
        same row order as `df`.
      df: DataFrame whose second column (position 1) holds the answers.
      sentences: cleaned dataset questions; not used for the computation and
        kept so existing callers continue to work.

    Raises:
      ValueError: when no dataset question shares a word with the input. This
        covers an empty question embedding and an empty corpus.
    """
    ques_vec=dict(ques_embedding)
    ques_norm=sum(count*count for count in ques_vec.values())**0.5

    max_sim=0.0
    index_sim=-1
    for index,df_embedding in enumerate(sent_embedding):
        sent_vec=dict(df_embedding)
        sent_norm=sum(count*count for count in sent_vec.values())**0.5
        if not ques_norm or not sent_norm:
            # A zero vector has no direction, so its cosine is undefined; treat as no match.
            continue
        dot=sum(count*ques_vec.get(token_id,0) for token_id,count in sent_vec.items())
        sim=dot/(ques_norm*sent_norm)
        # Strict comparison keeps the earliest row when several rows tie.
        if sim>max_sim:
            max_sim=sim
            index_sim=index

    if index_sim<0:
        # Without this guard iloc[-1] would silently print the last answer in the table.
        raise ValueError("no known question matches the input")
    print('  ',df.iloc[index_sim,1])

## **Testing Our Chatbot**

In [23]:
# Interactive chat loop: read a question, clean it, encode it against the
# vocabulary and print the best matching answer. Typing "exit" ends the chat.
while True:
    try:
        question=input('>> ')
    except EOFError:
        # stdin is closed (e.g. a non-interactive run): stop instead of crashing.
        break
    # Accept "exit" regardless of case or surrounding whitespace.
    if question.strip().lower()=='exit':
        break
    try:
        question = clean_sentences_Q(question,stop_words=False)
        question_embedding=dictionary.doc2bow(question.split())

        your_ans(question_embedding,bow_corpus,df,sentences)
    except Exception:
        # Catch Exception rather than using a bare except, so that a kernel
        # interrupt (KeyboardInterrupt) can still stop this otherwise endless loop.
        print("Something went wrong!")

>> Hi
   Hello! How can I help you?
>> your name?
   Chat-Bot
>> which language do you like?
   Python
>> Thank you!
   Your welcome
>> exit


K HUNNURJI RAO