# Text classification using NLP / Core engine of a chat bot.

Human language is astoundingly complex and diverse. When we write, we often misspell or abbreviate words, or omit punctuation, so much of the text around us is unstructured data. Natural language processing (NLP) helps computers communicate with humans in their own language and scales other language-related tasks. For example, NLP makes it possible for computers to read text, interpret it, measure sentiment and determine which parts are important. Understanding these steps will enable you to build the text classifier that is the core engine of any conversational chatbot.

# Import useful libraries

In [1]:
import nltk

In [2]:
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\jawhe\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\jawhe\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\jawhe\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\jawhe\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\jawhe\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]   

True

### Install NLTK components:
    
nltk.download_gui()

# The above will open a GUI.
Select the following:

    stopwords from Corpora
    averaged_perceptron_tagger from All Packages
    wordnet
    
OR you can download all the NLTK components with:
    nltk.download('all')
    
Please note: this can take a long time (30-60 minutes depending on Internet speed).

In [3]:
import re
import os
import csv
from nltk.stem.snowball import SnowballStemmer
import random
from nltk.classify import SklearnClassifier
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import pandas as pd

In [5]:
## Get multiple outputs in the same cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Ignore all warnings
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [6]:
## Display all rows and columns of a dataframe instead of a truncated version
from IPython.display import display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Preprocess

In [7]:
sentence = "The Big brown fox jumped over a lazy dog."
sentence2 = "This is particularly important in today's world where we are swamped with unstructured natural language data on the variety of social media platforms people engage in now-a-days (note -  now-a-days in the decade of 2010-2020)"

In [8]:
#convert sentence to lower case
'This' == 'this'
print('AbcdEFgH'.lower())
sentence.lower()
sentence2.lower()

False

abcdefgh


'the big brown fox jumped over a lazy dog.'

"this is particularly important in today's world where we are swamped with unstructured natural language data on the variety of social media platforms people engage in now-a-days (note -  now-a-days in the decade of 2010-2020)"

### Tokenize - extract individual words

In [9]:
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(sentence)
tokens
tokens2 = tokenizer.tokenize(sentence2)
tokens2

['The', 'Big', 'brown', 'fox', 'jumped', 'over', 'a', 'lazy', 'dog']

['This',
 'is',
 'particularly',
 'important',
 'in',
 'today',
 's',
 'world',
 'where',
 'we',
 'are',
 'swamped',
 'with',
 'unstructured',
 'natural',
 'language',
 'data',
 'on',
 'the',
 'variety',
 'of',
 'social',
 'media',
 'platforms',
 'people',
 'engage',
 'in',
 'now',
 'a',
 'days',
 'note',
 'now',
 'a',
 'days',
 'in',
 'the',
 'decade',
 'of',
 '2010',
 '2020']

### Stopwords : Filter words to remove non-useful words

In [10]:
# Read the stop-word list once and keep it as a set: the original re-read the
# corpus for every token and scanned a list for each membership test.
# Note the tokens are NOT lower-cased here, so capitalized stop words such as
# 'The' survive the filter -- preprocess() below lower-cases first.
english_stopwords = set(stopwords.words('english'))
filtered_words = [w for w in tokens if w not in english_stopwords]
filtered_words

['The', 'Big', 'brown', 'fox', 'jumped', 'lazy', 'dog']

In [11]:
# Same filter applied to the longer sentence; the stop-word list is loaded once
# into a set instead of being re-read for every token.
english_stopwords = set(stopwords.words('english'))
filtered_words = [w for w in tokens2 if w not in english_stopwords]
filtered_words

['This',
 'particularly',
 'important',
 'today',
 'world',
 'swamped',
 'unstructured',
 'natural',
 'language',
 'data',
 'variety',
 'social',
 'media',
 'platforms',
 'people',
 'engage',
 'days',
 'note',
 'days',
 'decade',
 '2010',
 '2020']

In [12]:
def preprocess(sentence):
    """Lower-case a sentence, tokenize it and drop English stop words.

    Args:
        sentence: Raw input text.

    Returns:
        The lower-cased word tokens (runs of word characters), in their
        original order, with NLTK's English stop words removed.
    """
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    # Load the stop-word list once per call and use a set for O(1) lookups;
    # previously the corpus was re-read for every single token.
    english_stopwords = set(stopwords.words('english'))
    filtered_words = [w for w in tokens if w not in english_stopwords]
    return filtered_words

In [13]:
preprocessed_sentence = preprocess(sentence)
print(preprocessed_sentence)

['big', 'brown', 'fox', 'jumped', 'lazy', 'dog']


In [14]:
preprocess(sentence2)

['particularly',
 'important',
 'today',
 'world',
 'swamped',
 'unstructured',
 'natural',
 'language',
 'data',
 'variety',
 'social',
 'media',
 'platforms',
 'people',
 'engage',
 'days',
 'note',
 'days',
 'decade',
 '2010',
 '2020']

## Tagging

In [15]:
tags = nltk.pos_tag(preprocessed_sentence)
print(tags)

[('big', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('lazy', 'JJ'), ('dog', 'NN')]


In [16]:
tags = nltk.pos_tag(preprocess(sentence2))
print(tags)

[('particularly', 'RB'), ('important', 'JJ'), ('today', 'NN'), ('world', 'NN'), ('swamped', 'VBD'), ('unstructured', 'JJ'), ('natural', 'JJ'), ('language', 'NN'), ('data', 'NNS'), ('variety', 'NN'), ('social', 'JJ'), ('media', 'NNS'), ('platforms', 'NNS'), ('people', 'NNS'), ('engage', 'VBP'), ('days', 'NNS'), ('note', 'VBP'), ('days', 'NNS'), ('decade', 'NN'), ('2010', 'CD'), ('2020', 'CD')]


## Extracting only selected parts of speech (nouns, verbs, adjectives, adverbs and pronouns)

In [17]:
def extract_tagged(sentences):
    """Return the words whose part-of-speech tag is one of the accepted tags.

    Args:
        sentences: Iterable of ``(word, tag)`` pairs, e.g. the output of
            ``nltk.pos_tag``.

    Returns:
        List of the words, in input order, tagged as NN, VBN, NNS, VBP, RB,
        VBZ, VBG, PRP or JJ. Every other tag is dropped.
    """
    accepted_tags = ('NN', 'VBN', 'NNS', 'VBP', 'RB', 'VBZ', 'VBG', 'PRP', 'JJ')
    return [token for token, pos in sentences if pos in accepted_tags]

In [18]:
extract_tagged(tags)

['particularly',
 'important',
 'today',
 'world',
 'unstructured',
 'natural',
 'language',
 'data',
 'variety',
 'social',
 'media',
 'platforms',
 'people',
 'engage',
 'days',
 'note',
 'days',
 'decade']

## Lemmatize words

In [19]:
lmtzr = WordNetLemmatizer()
print(lmtzr.lemmatize('cacti'))
print(lmtzr.lemmatize('willing'))
print(lmtzr.lemmatize('feet'))
print(lmtzr.lemmatize('stemmed'))

print(lmtzr.lemmatize('cactus'))

cactus
willing
foot
stemmed
cactus


## Stem words

In [20]:
words_for_stemming = ['stem', 'stemming', 'stemmed', 'stemmer', 'stems','feet','willing']

In [21]:
stemmer = SnowballStemmer("english")
[stemmer.stem(x) for x in words_for_stemming]

['stem', 'stem', 'stem', 'stemmer', 'stem', 'feet', 'will']

## Putting it all together

In [22]:
def extract_feature(text):
    """Turn raw text into a list of normalized feature words.

    Pipeline: ``preprocess`` (lower-case, tokenize, remove stop words) ->
    ``nltk.pos_tag`` -> ``extract_tagged`` (keep selected parts of speech) ->
    Snowball stemming -> WordNet lemmatization of each stem.

    Args:
        text: Raw input sentence.

    Returns:
        List of stemmed-then-lemmatized words, in sentence order.
    """
    tagged_tokens = nltk.pos_tag(preprocess(text))
    kept_words = extract_tagged(tagged_tokens)
    # Each word is stemmed first and the resulting stem is then lemmatized.
    return [lmtzr.lemmatize(stemmer.stem(token)) for token in kept_words]

In [23]:
sentence

'The Big brown fox jumped over a lazy dog.'

In [24]:
words = extract_feature(sentence)
print(words)

['big', 'brown', 'fox', 'lazi', 'dog']


In [25]:
words = extract_feature(sentence2)
print(words)

['particular', 'import', 'today', 'world', 'unstructur', 'natur', 'languag', 'data', 'varieti', 'social', 'medium', 'platform', 'peopl', 'engag', 'day', 'note', 'day', 'decad']


In [26]:
extract_feature("He hurt his right foot while he was wearing white shoes on his feet")

['hurt', 'right', 'foot', 'wear', 'white', 'shoe', 'foot']

## Implementing bag of words

In simple terms, it’s a collection of words to represent a sentence, disregarding the order in which they appear.

In [27]:
def word_feats(words):
    """Build a bag-of-words feature dict mapping every word to ``True``.

    Word order and repetition are discarded; duplicates collapse into a
    single key.
    """
    return {token: True for token in words}

In [28]:
word_feats(words)

{'particular': True,
 'import': True,
 'today': True,
 'world': True,
 'unstructur': True,
 'natur': True,
 'languag': True,
 'data': True,
 'varieti': True,
 'social': True,
 'medium': True,
 'platform': True,
 'peopl': True,
 'engag': True,
 'day': True,
 'note': True,
 'decad': True}

## Parsing the whole document

In [29]:
def extract_feature_from_doc(data):
    """Extract classifier features, the word corpus and the answers from rows.

    Args:
        data: Iterable of ``(text, category, answer)`` triples.

    Returns:
        A 3-tuple ``(result, corpus, answers)`` where
        * ``result`` is a list of ``(feature_dict, category)`` pairs, one per
          input row, in input order;
        * ``corpus`` is a flat list of every extracted feature word across
          all rows (duplicates kept);
        * ``answers`` maps each category to its answer. If a category occurs
          on several rows, the answer of the last such row wins.
    """
    result = []
    corpus = []
    # The responses of the chat bot
    answers = {}
    for (text, category, answer) in data:
        features = extract_feature(text)

        # Grow the flat corpus directly; the earlier sum(list_of_lists, [])
        # flattening copied the accumulated list for every row (quadratic).
        corpus.extend(features)
        result.append((word_feats(features), category))
        answers[category] = answer

    return (result, corpus, answers)

In [30]:
extract_feature_from_doc([['this is the input text from the user','category','answer to give']])

([({'input': True, 'user': True}, 'category')],
 ['input', 'user'],
 {'category': 'answer to give'})

In [34]:
def get_content(filename):
    """Read a pipe-delimited file and return its well-formed rows.

    Args:
        filename: Path of the file to read. Each useful line has the form
            ``text|category|answer``.

    Returns:
        List of rows (each a list of three strings). Rows that do not have
        exactly three fields are silently skipped.
    """
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are altered by universal-newline translation.
    with open(filename, 'r', newline='') as content_file:
        lines = csv.reader(content_file, delimiter='|')
        return [row for row in lines if len(row) == 3]

In [36]:
filename = 'leaves.txt'
data = get_content(filename)

In [37]:
data

[['Hello',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi hello',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi ',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi', 'Greetings', 'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi', 'Greetings', 'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hello, hi',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey, hi',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hey, hello',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['Good morning',
  'Morning',
  'Good Morning. I am Dexter. I will serve your leave enquiries.'],
 ['Good afternoon',
  'Afternoon',
  

In [38]:
features_data, corpus, answers = extract_feature_from_doc(data)

In [39]:
print(features_data[50])

({'mani': True, 'option': True, 'leav': True}, 'Utilized-Optional-Leaves')


In [40]:
corpus

['hello',
 'hi',
 'hello',
 'hi',
 'hi',
 'hi',
 'hey',
 'hello',
 'hi',
 'hey',
 'hey',
 'hi',
 'hey',
 'hello',
 'good',
 'morn',
 'good',
 'afternoon',
 'good',
 'even',
 'good',
 'night',
 'today',
 'want',
 'help',
 'need',
 'help',
 'help',
 'want',
 'help',
 'want',
 'assist',
 'help',
 'great',
 'talk',
 'great',
 'thank',
 'help',
 'thank',
 'thank',
 'much',
 'thank',
 'thank',
 'much',
 'mani',
 'type',
 'leav',
 'type',
 'leav',
 'type',
 'leav',
 'type',
 'leav',
 'type',
 'mani',
 'leav',
 'taken',
 'mani',
 'leav',
 'alreadi',
 'taken',
 'mani',
 'annual',
 'leav',
 'mani',
 'annual',
 'leav',
 'taken',
 'mani',
 'annual',
 'leav',
 'alreadi',
 'taken',
 'annual',
 'leav',
 'count',
 'taken',
 'mani',
 'annual',
 'leav',
 'taken',
 'number',
 'annual',
 'leav',
 'taken',
 'annual',
 'leav',
 'taken',
 'number',
 'annual',
 'leav',
 'alreadi',
 'taken',
 'annual',
 'leav',
 'taken',
 'annual',
 'leav',
 'alreadi',
 'taken',
 'number',
 'annual',
 'leav',
 'taken',
 'numbe

In [41]:
answers

{'Greetings': 'Hello. I am Dexter. I will serve your leave enquiries.',
 'Morning': 'Good Morning. I am Dexter. I will serve your leave enquiries.',
 'Afternoon': 'Good afternoon. I am Dexter. I will serve your leave enquiries.',
 'Evening': 'Good evening. I am Dexter. I will serve your leave enquiries.',
 'Goodbye': 'Good night. Take care.',
 'Opening': "I'm fine! Thank you. How can I help you?",
 'Help': 'How can I help you?',
 'No-Help': 'Ok sir/madam. No problem. Have a nice day.',
 'Closing': "It's glad to know that I have been helpful. Have a good day!",
 'Leaves-Type': 'Currently I know about two: annual and optional leaves.',
 'Default-Utilized-Annual-Leaves': 'You have used 12 annual leaves.',
 'Utilized-Annual-Leaves': 'You have taken 12 annual leaves.',
 'Utilized-Optional-Leaves': 'You have taken 1 optional leaves.',
 'Default-Balance-Annual-Leaves': 'You have 25 annual leaves left.',
 'Balance-Annual-Leaves': 'You have 25 annual leaves remaining.',
 'Balance-Optional-Leave

# Train a model using these features

In [42]:
## split data into train and test sets
split_ratio = 0.8

In [43]:
def split_dataset(data, split_ratio):
    """Randomly split ``data`` into a training part and a test part.

    Args:
        data: Sequence of samples. It is left untouched; a shuffled copy is
            split instead.
        split_ratio: Fraction of the samples that goes into the training
            part, e.g. ``0.8``. The training size is
            ``int(len(data) * split_ratio)``.

    Returns:
        A ``(train, test)`` tuple of lists.
    """
    # Shuffle a copy so the caller's list keeps its original order; the
    # shuffle consumes the random generator exactly as an in-place shuffle
    # would, so seeded runs produce the same split as before.
    shuffled = list(data)
    random.shuffle(shuffled)
    train_split = int(len(shuffled) * split_ratio)
    return shuffled[:train_split], shuffled[train_split:]

In [44]:
training_data, test_data = split_dataset(features_data, split_ratio)

In [45]:
training_data

[({'tell': True, 'carri': True, 'forward': True, 'leav': True}, 'CF'),
 ({'hi': True}, 'Greetings'),
 ({'today': True}, 'Opening'),
 ({'mani': True, 'option': True, 'leav': True}, 'Balance-Optional-Leaves'),
 ({'type': True, 'leav': True}, 'Leaves-Type'),
 ({'thank': True, 'much': True}, 'Closing'),
 ({'hello': True}, 'Greetings'),
 ({'mani': True, 'annual': True, 'leav': True}, 'Utilized-Annual-Leaves'),
 ({'hi': True}, 'Greetings'),
 ({'carri': True, 'forward': True}, 'CF'),
 ({'want': True, 'assist': True}, 'No-Help'),
 ({'option': True, 'leav': True, 'count': True, 'use': True},
  'Utilized-Optional-Leaves'),
 ({'number': True,
   'option': True,
   'leav': True,
   'alreadi': True,
   'taken': True},
  'Utilized-Optional-Leaves'),
 ({'annual': True, 'leav': True}, 'Balance-Annual-Leaves'),
 ({'option': True, 'leav': True, 'count': True, 'taken': True},
  'Utilized-Optional-Leaves'),
 ({'need': True, 'help': True}, 'Help'),
 ({'option': True, 'leav': True, 'taken': True}, 'Utilized

In [46]:
# save the data
np.save('training_data', training_data)
np.save('test_data', test_data)

## Classification using Decision tree

In [47]:
# Reload the train/test splits written by np.save above. The saved objects are
# Python (feature_dict, label) pairs, so they are stored as pickled object
# arrays and need allow_pickle=True to load. Unpickling can execute arbitrary
# code, so only load .npy files you created yourself.
# NOTE(review): the loaded values are NumPy object arrays rather than the
# original Python lists -- confirm the classifiers accept that.
training_data = np.load('training_data.npy', allow_pickle=True)
test_data = np.load('test_data.npy' , allow_pickle=True)

In [48]:
def train_using_decision_tree(training_data, test_data):
    """Train an NLTK decision-tree classifier and print its accuracies.

    Args:
        training_data: Labeled feature sets used to fit the tree and to
            measure training accuracy.
        test_data: Labeled feature sets used to measure test accuracy.

    Returns:
        ``(classifier, classifier_name, test_set_accuracy,
        training_set_accuracy)`` -- note that the test accuracy comes before
        the training accuracy.
    """
    tree = nltk.classify.DecisionTreeClassifier.train(
        training_data,
        entropy_cutoff=0.6,
        support_cutoff=6,
    )

    train_acc = nltk.classify.accuracy(tree, training_data)
    print('training set accuracy: ', train_acc)

    test_acc = nltk.classify.accuracy(tree, test_data)
    print('test set accuracy: ', test_acc)

    return tree, type(tree).__name__, test_acc, train_acc

In [67]:
dtclassifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_decision_tree(training_data, test_data)

training set accuracy:  0.9035087719298246
test set accuracy:  0.8620689655172413


## Classification using Naive Bayes

In [50]:
def train_using_naive_bayes(training_data, test_data):
    """Train an NLTK Naive Bayes classifier and measure its accuracies.

    Args:
        training_data: Labeled feature sets used to fit the model and to
            measure training accuracy.
        test_data: Labeled feature sets used to measure test accuracy.

    Returns:
        ``(classifier, classifier_name, test_set_accuracy,
        training_set_accuracy)`` -- note that the test accuracy comes before
        the training accuracy.
    """
    model = nltk.NaiveBayesClassifier.train(training_data)
    train_acc = nltk.classify.accuracy(model, training_data)
    test_acc = nltk.classify.accuracy(model, test_data)
    return model, type(model).__name__, test_acc, train_acc

In [62]:
classifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_naive_bayes(training_data, test_data)
print(training_set_accuracy)
print(test_set_accuracy)
print(len(classifier.most_informative_features()))
classifier.show_most_informative_features()

0.8333333333333334
0.8275862068965517
70
Most Informative Features
                    leav = None           Greeti : Balanc =     12.0 : 1.0
                    mani = True           Defaul : Balanc =      5.7 : 1.0
                 alreadi = True           Defaul : Utiliz =      4.6 : 1.0
                    help = True             Help : Closin =      3.9 : 1.0
                   carri = None           Utiliz : CF     =      3.6 : 1.0
                  remain = None           Utiliz : Balanc =      3.0 : 1.0
                   count = True           Utiliz : CF     =      2.8 : 1.0
                   taken = None           Balanc : Utiliz =      2.8 : 1.0
                   thank = None           Utiliz : Closin =      2.7 : 1.0
                  balanc = True           Defaul : Balanc =      2.1 : 1.0


In [52]:
classifier.classify(({'mani': True, 'option': True, 'leav': True}))

'Utilized-Optional-Leaves'

In [53]:
extract_feature("hello")

['hello']

In [54]:
word_feats(extract_feature("hello"))

{'hello': True}

In [55]:
input_sentence = "how many balanced leaves do I have?"
classifier.classify(word_feats(extract_feature(input_sentence)))

'Utilized-Optional-Leaves'

In [56]:
def reply(input_sentence):
    """Return the chatbot's answer for a user sentence.

    The sentence is converted to bag-of-words features, classified with the
    module-level decision-tree classifier ``dtclassifier`` and the predicted
    category is looked up in the module-level ``answers`` dict.
    """
    features = word_feats(extract_feature(input_sentence))
    predicted_category = dtclassifier.classify(features)
    return answers[predicted_category]
    
    

In [77]:
reply('hi')

'Hello. I am Dexter. I will serve your leave enquiries.'

In [58]:
reply('How many annual leaves do I have left?')

'You have 25 annual leaves remaining.'

In [59]:
reply('How many leaves have I taken?')

'You have used 12 annual leaves.'

In [63]:
reply('Thanks!')

"It's glad to know that I have been helpful. Have a good day!"

In [72]:
# Import here so this cell works when the notebook is run top to bottom; the
# original only imported pickle in a later cell.
import pickle

# Persist the trained decision-tree classifier for reuse by a chatbot front end.
with open('chatbot_model.pkl', 'wb') as model_file:
    pickle.dump(dtclassifier, model_file)

In [65]:
import pickle

# Conclusion:

Once the model has been developed using an algorithm that gives an acceptable accuracy, it can be called from any chatbot UI framework.

In [61]:
!pip freeze

absl-py==1.0.0
alabaster @ file:///home/ktietz/src/ci/alabaster_1611921544520/work
anaconda-client==1.7.2
anaconda-navigator==2.0.3
anaconda-project @ file:///tmp/build/80754af9/anaconda-project_1610472525955/work
anyio @ file:///C:/ci/anyio_1620153418380/work/dist
appdirs==1.4.4
argcomplete==2.0.0
argh==0.26.2
argon2-cffi @ file:///C:/ci/argon2-cffi_1613037959010/work
arrow==1.2.3
arviz==0.6.1
asgiref==3.5.2
asn1crypto @ file:///tmp/build/80754af9/asn1crypto_1596577642040/work
astroid @ file:///C:/ci/astroid_1613501047216/work
astropy @ file:///C:/ci/astropy_1617745647203/work
astunparse==1.6.3
async-generator @ file:///home/ktietz/src/ci/async_generator_1611927993394/work
atomicwrites==1.4.0
attrs @ file:///tmp/build/80754af9/attrs_1604765588209/work
audioread==2.1.9
autopep8 @ file:///tmp/build/80754af9/autopep8_1615918855173/work
Babel @ file:///tmp/build/80754af9/babel_1607110387436/work
backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
backports.functools-lru-cach