# Text classification using NLP: the core engine of a chatbot

- Natural language processing (NLP) helps computers communicate with humans in their own language and scales other language-related tasks. For example, NLP makes it possible for computers to read text, interpret it, measure sentiment and determine which parts are important. Understanding this will enable us to build the core component — the core engine — of any conversational chatbot.

** **
- Detecting patterns is a central part of Natural Language Processing. 
- Words ending in -ed tend to be past tense verbs. 
- Frequent use of the word "will" is indicative of news text.
- These observable patterns — word structure and word frequency — happen to correlate with particular aspects of meaning, such as tense and topic. 

** **
- But how did we know where to start looking, and which aspects of form to associate with which aspects of meaning? Through this exercise we will learn to create the core engine of a chatbot.
- We will also learn text classification using the techniques of natural language processing.



In [1]:
import nltk

In [2]:
# nltk.download('popular')

In [3]:
import os
import re
import numpy as np
import pandas as pd
import random
import csv

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize
from nltk.classify import SklearnClassifier

In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [5]:
from IPython.display import display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [6]:
# English stop-word list from the NLTK stopwords corpus; data_preprocessing
# filters tokens against it.
stop_words = stopwords.words('english')
# Show the container type of the stop-word collection.
type(stop_words)

list

In [7]:
def data_preprocessing(text):
    """Normalise raw text into a list of lowercase, stop-word-free tokens.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed (punctuation, apostrophes, '#', ...), and the
    result is tokenised with ``word_tokenize``. Tokens present in the
    module-level ``stop_words`` list are discarded.

    Returns the surviving tokens as a list, in their original order.
    """
    normalised = re.sub(r'[^\w\s]', '', text.lower())
    kept_tokens = []
    for token in word_tokenize(normalised):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

POS tag list:

- CC coordinating conjunction
- CD cardinal digit
- DT determiner
- EX existential there (like: "there is" ... think of it like "there exists")
- FW foreign word
- IN preposition/subordinating conjunction
- JJ adjective 'big'
- JJR adjective, comparative 'bigger'
- JJS adjective, superlative 'biggest'
- LS list marker 1)
- MD modal could, will
- NN noun, singular 'desk'
- NNS noun plural 'desks'
- NNP proper noun, singular 'Harrison'
- NNPS proper noun, plural 'Americans'
- PDT predeterminer 'all the kids'
- POS possessive ending parent's
- PRP personal pronoun I, he, she
- PRP$ possessive pronoun my, his, hers
- RB adverb very, silently,
- RBR adverb, comparative better
- RBS adverb, superlative best
- RP particle give up
- TO to go 'to' the store.
- UH interjection errrrrrrrm
- VB verb, base form take
- VBD verb, past tense took
- VBG verb, gerund/present participle taking
- VBN verb, past participle taken
- VBP verb, sing. present, non-3rd person take
- VBZ verb, 3rd person sing. present takes
- WDT wh-determiner which
- WP wh-pronoun who, what
- WP$ possessive wh-pronoun whose
- WRB wh-adverb where, when

In [8]:
def part_of_speech_tags(text):
    """Tag a sequence of tokens with part-of-speech labels.

    ``text`` is handed straight to ``nltk.pos_tag`` and the tagger's result
    is returned unchanged.
    """
    tagged_tokens = nltk.pos_tag(text)
    return tagged_tokens

In [9]:
def extract_pos_tags(sentences):
    """Keep only the words whose part-of-speech tag is used as a feature.

    ``sentences`` is an iterable of ``(word, tag)`` pairs. A word is kept
    when its tag is one of NN, VBN, NNS, VBP, RB, VBZ, VBG, PRP or JJ.

    Returns the kept words as a list, in input order.
    """
    wanted_tags = ('NN', 'VBN', 'NNS', 'VBP', 'RB', 'VBZ', 'VBG', 'PRP', 'JJ')
    return [token for token, pos in sentences if pos in wanted_tags]

In [10]:
def extract_feature(text):
    """Turn one raw sentence into a list of lemmatised feature words.

    Pipeline: ``data_preprocessing`` (clean and tokenise), then
    ``part_of_speech_tags`` (tag each token), then ``extract_pos_tags``
    (keep the words with selected tags), then lemmatise each kept word with
    ``WordNetLemmatizer``.

    Returns the lemmas as a list, in the order the words appeared.
    """
    tokens = data_preprocessing(text)
    tagged_tokens = part_of_speech_tags(tokens)
    candidate_words = extract_pos_tags(tagged_tokens)

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for candidate in candidate_words:
        lemmas.append(lemmatizer.lemmatize(candidate))
    return lemmas

In [11]:
def word_feats(words):
    """Build a bag-of-words feature dict that maps every word to ``True``.

    Duplicate words collapse into a single key.
    """
    return dict.fromkeys(words, True)

In [12]:
def extract_feature_from_doc(data):
    """Convert labelled training rows into classifier-ready structures.

    ``data`` is an iterable of ``(text, category, answer)`` triples.

    Returns a 3-tuple ``(result, corpus, answers)``:

    * ``result``  - list of ``(feature_dict, category)`` pairs, one per row,
      where ``feature_dict`` comes from ``word_feats(extract_feature(text))``.
    * ``corpus``  - flat list of every extracted feature word, in row order
      (duplicates kept).
    * ``answers`` - dict mapping each category to its chatbot response; when
      a category occurs on several rows the last row's answer wins.
    """
    result = []
    corpus = []
    # The responses of the chat bot
    answers = {}
    for text, category, answer in data:
        features = extract_feature(text)
        # Grow the flat corpus incrementally; the previous
        # ``sum(list_of_lists, [])`` re-copied the accumulated list for every
        # row, which is quadratic in the corpus size.
        corpus.extend(features)
        result.append((word_feats(features), category))
        answers[category] = answer

    return (result, corpus, answers)

In [13]:
extract_feature_from_doc([['this is the input text from the user','category','answer to give']])

([({'input': True, 'user': True}, 'category')],
 ['input', 'user'],
 {'category': 'answer to give'})

In [14]:
from sklearn.model_selection import train_test_split

def split_dataset(data, split_ratio=0.2):
    """Split ``data`` into training and test parts and save both with NumPy.

    ``split_ratio`` is forwarded to ``train_test_split`` as ``test_size``;
    the split uses the fixed ``random_state=123``. The sizes of both parts
    are printed, then each part is written with ``np.save`` under the names
    ``training_data`` and ``test_data``.

    Returns ``(training_dataset, test_dataset)``.
    """
    train_part, test_part = train_test_split(
        data, test_size=split_ratio, random_state=123
    )

    print('Shape of training dataset -', len(train_part))
    print('Shape of test dataset -', len(test_part))
    print('Saving Training dataset and Test dataset -')

    # save the data
    for target, part in (('training_data', train_part), ('test_data', test_part)):
        np.save(target, part)
    return train_part, test_part

In [15]:
def get_content(filename):
    """Read a pipe-delimited training file into a list of 3-field rows.

    Each line is parsed with ``csv.reader`` using ``'|'`` as the delimiter.
    Only rows with exactly three fields are kept; rows with any other field
    count are silently skipped.

    Returns a list of ``[field1, field2, field3]`` lists in file order.
    Raises whatever ``open`` raises (for example ``FileNotFoundError``) when
    the file cannot be read.
    """
    # newline='' lets the csv module do its own newline handling, so line
    # breaks embedded in quoted fields survive intact.
    with open(filename, 'r', newline='') as content_file:
        rows = csv.reader(content_file, delimiter='|')
        return [row for row in rows if len(row) == 3]

In [16]:
filename = 'leaves.txt'
data = get_content(filename)
data

[['Hello',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi hello',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi ',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi', 'Greetings', 'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi', 'Greetings', 'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hello, hi',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey, hi',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey, hello',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['Good morning',
  'Morning',
  'Good Morning. I am Dexter. I will serve your leave enquiries.'],
 ['Good afternoon',
  'Afternoon',
  

In [17]:
features_data, corpus, answers = extract_feature_from_doc(data)

In [18]:
features_data

[({'hello': True}, 'Greetings'),
 ({'hi': True, 'hello': True}, 'Greetings'),
 ({'hi': True}, 'Greetings'),
 ({'hi': True}, 'Greetings'),
 ({'hi': True}, 'Greetings'),
 ({'hey': True}, 'Greetings'),
 ({'hello': True, 'hi': True}, 'Greetings'),
 ({'hey': True}, 'Greetings'),
 ({'hey': True, 'hi': True}, 'Greetings'),
 ({'hey': True, 'hello': True}, 'Greetings'),
 ({'good': True, 'morning': True}, 'Morning'),
 ({'good': True, 'afternoon': True}, 'Afternoon'),
 ({'good': True, 'evening': True}, 'Evening'),
 ({'good': True, 'night': True}, 'Goodbye'),
 ({'today': True}, 'Opening'),
 ({'want': True, 'help': True}, 'Help'),
 ({'need': True, 'help': True}, 'Help'),
 ({'help': True}, 'Help'),
 ({'dont': True, 'want': True, 'help': True}, 'No-Help'),
 ({'dont': True, 'want': True, 'assistance': True}, 'No-Help'),
 ({'help': True}, 'No-Help'),
 ({'great': True, 'talking': True}, 'Closing'),
 ({'great': True}, 'Closing'),
 ({'thank': True, 'help': True}, 'Closing'),
 ({'thank': True}, 'Closing'),

In [19]:
corpus

['hello',
 'hi',
 'hello',
 'hi',
 'hi',
 'hi',
 'hey',
 'hello',
 'hi',
 'hey',
 'hey',
 'hi',
 'hey',
 'hello',
 'good',
 'morning',
 'good',
 'afternoon',
 'good',
 'evening',
 'good',
 'night',
 'today',
 'want',
 'help',
 'need',
 'help',
 'help',
 'dont',
 'want',
 'help',
 'dont',
 'want',
 'assistance',
 'help',
 'great',
 'talking',
 'great',
 'thank',
 'help',
 'thank',
 'thank',
 'much',
 'thanks',
 'thanks',
 'much',
 'many',
 'type',
 'leaf',
 'type',
 'leaf',
 'type',
 'leaf',
 'type',
 'leaf',
 'type',
 'many',
 'leaf',
 'taken',
 'many',
 'leaf',
 'already',
 'taken',
 'many',
 'annual',
 'leaf',
 'many',
 'annual',
 'leaf',
 'taken',
 'many',
 'annual',
 'leaf',
 'already',
 'taken',
 'annual',
 'leaf',
 'count',
 'taken',
 'many',
 'annual',
 'leaf',
 'taken',
 'number',
 'annual',
 'leaf',
 'taken',
 'annual',
 'leaf',
 'taken',
 'number',
 'annual',
 'leaf',
 'already',
 'taken',
 'annual',
 'leaf',
 'taken',
 'annual',
 'leaf',
 'already',
 'taken',
 'number',
 'an

In [20]:
answers

{'Greetings': 'Hello. I am Dexter. I will serve your leave enquiries.',
 'Morning': 'Good Morning. I am Dexter. I will serve your leave enquiries.',
 'Afternoon': 'Good afternoon. I am Dexter. I will serve your leave enquiries.',
 'Evening': 'Good evening. I am Dexter. I will serve your leave enquiries.',
 'Goodbye': 'Good night. Take care.',
 'Opening': "I'm fine! Thank you. How can I help you?",
 'Help': 'How can I help you?',
 'No-Help': 'Ok sir/madam. No problem. Have a nice day.',
 'Closing': "It's glad to know that I have been helpful. Have a good day!",
 'Leaves-Type': 'Currently I know about two: annual and optional leaves.',
 'Default-Utilized-Annual-Leaves': 'You have used 12 annual leaves.',
 'Utilized-Annual-Leaves': 'You have taken 12 annual leaves.',
 'Utilized-Optional-Leaves': 'You have taken 1 optional leaves.',
 'Default-Balance-Annual-Leaves': 'You have 25 annual leaves left.',
 'Balance-Annual-Leaves': 'You have 25 annual leaves remaining.',
 'Balance-Optional-Leave

In [21]:
training_data, test_data = split_dataset(features_data, split_ratio=0.2)

Shape of training dataset - 114
Shape of test dataset - 29
Saving Training dataset and Test dataset -


## Classification using Decision Tree

In [22]:
def train_using_decision_tree(training_data, test_data):
    """Train an NLTK decision-tree classifier and report its accuracy.

    The tree is trained on ``training_data`` with ``entropy_cutoff=0.6`` and
    ``support_cutoff=6``. Accuracy on the training and test sets is printed.

    Returns ``(classifier, classifier_name, test_set_accuracy,
    training_set_accuracy)`` where ``classifier_name`` is the classifier's
    class name.
    """
    tree = nltk.classify.DecisionTreeClassifier.train(
        training_data, entropy_cutoff=0.6, support_cutoff=6
    )

    train_accuracy = nltk.classify.accuracy(tree, training_data)
    print('training set accuracy: ', train_accuracy)

    held_out_accuracy = nltk.classify.accuracy(tree, test_data)
    print('test set accuracy: ', held_out_accuracy)

    return tree, type(tree).__name__, held_out_accuracy, train_accuracy

In [23]:
# Reload the two splits from the .npy files written by split_dataset.
# allow_pickle=True is passed to np.load — presumably required because the
# saved samples are Python objects (feature dict + label); confirm.
training_data = np.load('training_data.npy', allow_pickle=True)
test_data = np.load('test_data.npy' , allow_pickle=True)
# Train the decision tree; `dtclassifier` is the model later used by reply().
dtclassifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_decision_tree(training_data, test_data)

training set accuracy:  0.9210526315789473
test set accuracy:  0.7241379310344828


## Classification using Naive Bayes

In [24]:
def train_using_naive_bayes(training_data, test_data):
    """Train an NLTK Naive Bayes classifier and measure its accuracy.

    Unlike the decision-tree helper, nothing is printed here; the caller
    receives the accuracies in the return value.

    Returns ``(classifier, classifier_name, test_set_accuracy,
    training_set_accuracy)`` where ``classifier_name`` is the classifier's
    class name.
    """
    bayes = nltk.NaiveBayesClassifier.train(training_data)
    train_accuracy = nltk.classify.accuracy(bayes, training_data)
    held_out_accuracy = nltk.classify.accuracy(bayes, test_data)
    return bayes, type(bayes).__name__, held_out_accuracy, train_accuracy

In [25]:
# Train the Naive Bayes model on the same splits used for the decision tree.
# Note: this rebinds classifier_name / test_set_accuracy / training_set_accuracy.
classifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_naive_bayes(training_data, test_data)
# Training accuracy first, then test accuracy.
print(training_set_accuracy)
print(test_set_accuracy)
# Size of the list returned by most_informative_features().
print(len(classifier.most_informative_features()))
classifier.show_most_informative_features()

0.868421052631579
0.7241379310344828
74
Most Informative Features
                 already = True           Defaul : Utiliz =     12.0 : 1.0
                    leaf = None           Closin : Balanc =      7.1 : 1.0
                    many = True           Defaul : Balanc =      6.3 : 1.0
                   taken = None           Balanc : Utiliz =      4.3 : 1.0
                    help = True             Help : Closin =      3.9 : 1.0
                   carry = None           Utiliz : CF     =      3.4 : 1.0
                   count = True           Utiliz : CF     =      2.8 : 1.0
                      hi = None           Utiliz : Greeti =      2.7 : 1.0
               remaining = None           Utiliz : Balanc =      2.4 : 1.0
                   leave = True           Defaul : CF     =      2.2 : 1.0


### Question and Answers Sample

In [26]:
def reply(input_sentence):
    """Return the chatbot's answer for one user sentence.

    The sentence is converted to a feature dict with ``extract_feature`` and
    ``word_feats``, classified by the module-level decision tree
    ``dtclassifier``, and the predicted category is looked up in the
    module-level ``answers`` dict.
    """
    features = extract_feature(input_sentence)
    predicted_category = dtclassifier.classify(word_feats(features))
    return answers[predicted_category]

In [27]:
response = reply('Hi')
response
response2 = reply('How many annual leaves do I have left?')
response2
response3 = reply('Thanks')
response3

'Hello. I am Dexter. I will serve your leave enquiries.'

'You have 25 annual leaves remaining.'

"It's glad to know that I have been helpful. Have a good day!"

In [28]:
response = reply('Good Morning')
response
response2 = reply('I need your help')
response2
response3 = reply('What is my leave balance?')
response3

'Good afternoon. I am Dexter. I will serve your leave enquiries.'

'How can I help you?'

'You have 25 annual leaves left.'