In [1]:
# reading and preprocessing
corpusFile=open('Corpus.txt','r', encoding='utf-8')
corpus = corpusFile.read()

In [2]:
# converts to lowercase
corpus = corpus.lower()

In [3]:
# converts to list of sentences
import nltk
sent_tokens = nltk.sent_tokenize(corpus)

In [4]:
# lemmatization
lemmer = nltk.stem.WordNetLemmatizer()

# lemmatize the tokens
def Lemmatize(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

In [5]:
# remove punctuation dictionary
##! The ord() function returns the number representing
##! the unicode code of a specified character.

##! The dict() function creates a dictionary.
##! keyword arguments as much as you like, separated by comma: 
##! key = value, key = value
import string
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

In [6]:
# remove stop words and punctuations
def normalize(text):
    stopEnglish = set(stopwords.words('english'))
    return Lemmatize([token for token in nltk.word_tokenize(text.lower().translate(remove_punct_dict)) if token not in stopEnglish])

In [7]:
# Handle of greetings

import random
##! The choice() method returns a randomly selected element
##! from the specified sequence.

def greet(sentence):
    GREETING_INPUTS = ("welcome","hello", "hi", "greetings", "sup", "what's up","hey",)
    GREETING_RESPONSES = ["Welcome","Hi", "Hey", "*nods*", "Hi there", "Hello", "I am glad! You are talking to me"] 
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)

In [35]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.tokenize import TweetTokenizer
import csv
import _pickle as cPickle
import re

class EmotionAnalyser:
    __instance = None

    @staticmethod
    def getInstance(base_path):
        """ Static access method. """
        if EmotionAnalyser.__instance == None:
            EmotionAnalyser(base_path)
        return EmotionAnalyser.__instance

    def __init__(self,base_path):
        if EmotionAnalyser.__instance != None:
            raise Exception("This class is a singleton!")
        else:
            EmotionAnalyser.__instance = self
        self.base_path = base_path
        self.stoplist = set(stopwords.words("english"))
        self.punctuation = ['.',',','\'','\"',':',';','...','-','–','—','(',')','[',']','«','»']
        ## read more about https://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.casual
        self.tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True) 
        # lemmatization
        self.lemmer = nltk.stem.WordNetLemmatizer()
        
    
    def extract_features(self,statement):
        statement = statement.lower()
        # remove all digits
        statement = re.sub(r'[0-9]', ' ', statement)
        # remove all (..)
        statement = re.sub(r'\.{2,}', ' ', statement)
        # remove all hashtags
        statement = re.sub(r'#.+ ', ' ', statement)        
        # tokenization, stopwords and punctuation removal
        word_list = [ word for word in self.tokenizer.tokenize(statement) if word not in self.stoplist and word not in string.punctuation]
        # lemmatize the tokens
        word_list = [self.lemmer.lemmatize(token) for token in word_list]
        # one hot encoding
        return dict([(word,True) for word in word_list])
        # ngrams
#         ngram_tubles = ngrams(word_list, 3)
#         return dict([(gram,True) for gram in ngram_tubles])
#         return dict([(gram,1) for gram in ngram_tubles])

    def train(self):        
        # load train data from csv file
        csv_file = open(self.base_path+'/text_emotion.csv')
        csv_reader = csv.reader(csv_file, delimiter=',')
        trainDataset = {}

        for index, row in enumerate(csv_reader):
            if index != 0:
                if row[0] not in trainDataset:
                    trainDataset[row[0]] = []
                    trainDataset[row[0]].append(row[1])
                else :
                    trainDataset[row[0]].append(row[1])

        # separate train data set into classes of emotion
        # Split the dataset into training and testing datasets (80/20)
        # build the train features 
        # build the test features  
    
        features = {}
        thresholds = {}        
        spilt_factor = 0.9
        features_train = []
        features_test = []       

        for emotion in trainDataset:
            features[emotion] = [(self.extract_features(statement), emotion) for statement in trainDataset[emotion]]
            thresholds[emotion] = int(spilt_factor * len(features[emotion]))       
            features_train.extend( features[emotion][:thresholds[emotion]] )
            features_test.extend( features[emotion][thresholds[emotion]:] )
        
        if __name__ == "__main__":
            print ("sentiments : ",features.keys())
            print ("Number of training records:", len(features_train))
            print ("Number of test records:", len(features_test))

        # joblib.dump(features_train, "classifier.sav")

        # use a Naive Bayes classifier and train it
        classifier = NaiveBayesClassifier.train(features_train)

        if __name__ == "__main__":
            print ("Accuracy of the classifier:", nltk.classify.util.accuracy(classifier, features_test))

            informative = classifier.most_informative_features(1000)
            print(informative)

        # # dump classifier into a file
        f = open(self.base_path+'/classifier.pickle', 'wb')        
        cPickle.dump(classifier, f)
        f.close()
        # joblib.dump(classifier, "classifier.save")
 

    def classify(self,statement,classifier =None): 
        if classifier == None :
            f = open(self.base_path+'/classifier.pickle', 'rb')  
            classifier = cPickle.load(f)
            f.close()
            # using joblib
            # classifier = joblib.load("classifier.save")
        probdist = classifier.prob_classify(self.extract_features(statement))
        predected_sentiment = probdist.max()
        probability = round(probdist.prob(predected_sentiment), 2)
        return predected_sentiment, probability

In [36]:
# example of bag of n-grams
sentence = 'this is a foo bar sentences and i want to ngramize it'

n = 3
grams = ngrams(sentence.split(), n)

for gram in grams:
    print(gram)

('this', 'is', 'a')
('is', 'a', 'foo')
('a', 'foo', 'bar')
('foo', 'bar', 'sentences')
('bar', 'sentences', 'and')
('sentences', 'and', 'i')
('and', 'i', 'want')
('i', 'want', 'to')
('want', 'to', 'ngramize')
('to', 'ngramize', 'it')


In [37]:
# instantiation
analyser = EmotionAnalyser.getInstance(".")

In [38]:
# training model
analyser.train()

sentiments :  dict_keys(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise', 'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'])
Number of training records: 35996
Number of test records: 4004
Accuracy of the classifier: 0.15034965034965034
[('hate', True), ('bored', True), ('california', True), ('edited', True), ('fifteen', True), ('finishing', True), ('fault', True), ('upset', True), ('aaa', True), ('ankle', True), ('blond', True), ('brightens', True), ('bullshit', True), ('cock', True), ('confusing', True), ('cooperating', True), ('cracked', True), ('damned', True), ('declined', True), ('demon', True), ('deserve', True), ('difficult', True), ('dirt', True), ('dope', True), ('dye', True), ('ea', True), ('eee', True), ('ffs', True), ('fishing', True), ('given', True), ('grabbing', True), ('historical', True), ('idiot', True), ('illegal', True), ('lag', True), ('ligament', True), ('mental', True), ('mgmt', True), ('migraine', True), ('mud', True), ('nin', Tr

In [41]:
# predict the sentiment with the model
sent = input("Enter a sentence : ")
# sent = "This movie is awesome"
sentiment , confidence = analyser.classify(sent)
print(sentiment , "with confidence :" , confidence)

Enter a sentence : I dislike batman
boredom with confidence : 0.39


In [42]:
# Handle of emotion
def handleEmotion(user_response,classfier = None):
    analyser = EmotionAnalyser.getInstance(".")
    sentiment , confidence = analyser.classify(user_response,classfier)        
    possitive = ['enthusiasm','fun','happiness','love','surprise','relief']
    negative = ['anger','boredom','hate','sadness','worry']
    nutral = ['empty','neutral'] 
    if sentiment in nutral:
        return "I am sorry! I don't understand you" , sentiment
    if sentiment in possitive:
        return random.choice(["I am happy for your "+sentiment,"Keep up your good feelings :)","Hooray!","Good for you","I am happy for you"]) , sentiment
    if sentiment in negative:
        return random.choice(["You aren't alone","Cheer up","I am sad for your "+sentiment,"It's ganna be okay","Sorry to hear that :(","It will be alright","It's bad for you to feel "+sentiment]) , sentiment

In [43]:
# Get the response from corpus to the user's questions

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def response(user_response):

    # add user_response to sent_tokens 
    sent_tokens.append(user_response) 

    # Apply TF-IDF
    TfidfVec = TfidfVectorizer(tokenizer=normalize)
    tfidf = TfidfVec.fit_transform(sent_tokens)

    # Apply cosine similarity to user_response and the corpus
    vals = cosine_similarity(tfidf[-1], tfidf)
    print('cosine_similarity:',vals)
    ##! argsort(): Returns the indices that would sort an array. 
    idx=vals.argsort()[0][-2]
    print('indices that would sort:',vals.argsort())
    ##! flatten(): Return a copy of the array collapsed into one dimension.
    flat = vals.flatten()
    print('flatten vals:',flat)
    flat.sort()
    print('flatten vals:',flat)
    req_tfidf = flat[-2]
    print('potential ans:',req_tfidf)
    
    if(req_tfidf==0):
        return "I am sorry! I don't understand you"
    else:
        result = sent_tokens[idx]
        sent_tokens.remove(user_response)
        return result

In [44]:
# build up the conversation along with sentiment
def generate_reply(user_response,classfier = None):
    user_response=user_response.lower()
    if(user_response!='bye'):
        if(user_response=='thanks' or user_response=='thank you' ):
            return "You are welcome.." , "relief"
        else:
            greeting = greet(user_response)
            if(greeting != None):
                return greeting , "happiness"
            else:     
                result = response(user_response)                
                if(result == "I am sorry! I don't understand you"):
                    return handleEmotion(user_response,classfier)
                else :
                    return result , "neutral"
    else:      
        return "Bye! take care.." , "relief"

In [50]:
# trial #1
generate_reply("Hi")

('*nods*', 'happiness')

In [51]:
generate_reply("What is a chatbot?")

cosine_similarity: [[0.26655127 0.         0.         0.         0.         0.
  0.14944342 0.         0.         0.         0.         0.
  0.         0.12386919 0.         0.         0.         0.
  0.12894911 0.16495713 0.13348603 0.         0.1596626  0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.26095977 0.         0.         0.20237935
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  1.        ]]
indices that would sort: [[24 25 26 27 28 29 30 31 33 47 34 37 38 39 40 41 42 43 44 36 45 23 10  1
   2  3  4  5  7  8  9 21 11 46 12 14 15 16 17 13 18 20  6 22 19 35 32  0
  48]]
flatten vals: [0.26655127 0.         0.         0.         0.         0.
 0.14944342 0.         0.         0.         0.         0.
 0.         0.12386919 0.         0.         0.         0.
 0.12894911 0.16495713 0.13348603 0.         0.1596626  0.
 0.         0.         0.         0.         0. 

('chatbot\nchatbot is a computer program or an artificial intelligence which conducts a conversation via auditory or textual methods.such programs are often designed to convincingly simulate how a human would behave as a conversational partner, thereby passing the turing test.',
 'neutral')

In [52]:
generate_reply("Who is Alan Turing?")

cosine_similarity: [[0.10463124 0.         0.         0.         0.         0.
  0.41584972 0.         0.11538435 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.20585627 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  1.        ]]
indices that would sort: [[24 47 25 26 27 28 29 30 31 23 44 34 35 36 37 38 39 40 41 33 22 21 20  1
   2  3  4  5 46  7 45  9 10 11 12 13 14 15 16 17 18 19 42 43  0  8 32  6
  48]]
flatten vals: [0.10463124 0.         0.         0.         0.         0.
 0.41584972 0.         0.11538435 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         

('background of chatbot\nin 1950, alan turing\'s famous article "computing machinery and intelligence" was published, which proposed what is now called the turing test as a criterion of intelligence.',
 'neutral')

In [53]:
generate_reply("What is intelligent virtual assistant?")

cosine_similarity: [[0.         0.         0.         0.         0.1282141  0.07600809
  0.         0.         0.         0.10785829 0.         0.13831769
  0.         0.         0.         0.13060283 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.61602487 0.21618184
  0.         0.24649887 0.06861413 0.18103829 0.         0.
  0.         0.07594456 0.         0.09616724 0.16614955 0.
  1.        ]]
indices that would sort: [[ 0 22 23 47 25 26 27 28 29 30 31 32 33 36 40 41 42 44 21 20 24 18  1  2
   3 19  7  8 10  6 12 13 14 17 16 38 43  5 45  9  4 15 11 46 39 35 37 34
  48]]
flatten vals: [0.         0.         0.         0.         0.1282141  0.07600809
 0.         0.         0.         0.10785829 0.         0.13831769
 0.         0.         0.         0.13060283 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.  

('virtual assistant\nintelligent virtual assistant (iva) or intelligent personal assistant (ipa) is a software agent that can perform tasks or services for an individual based on verbal commands[citation needed].',
 'neutral')

In [56]:
generate_reply("I am bored")

cosine_similarity: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 1.]]
indices that would sort: [[ 0 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 26 25
  24 23  1  2  3  4  5  6  7  8  9 10 11 13 14 15 16 17 18 19 20 21 22 12
  48 49]]
flatten vals: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 1.]
flatten vals: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 1.]
potential ans: 1.0


('i am bored', 'neutral')

In [198]:
generate_reply("I am watching this movie tomorrow night")

cosine_similarity: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1.]]
indices that would sort: [[ 0 27 28 29 30 31 32 33 34 35 36 26 37 39 40 41 42 43 44 45 46 47 48 38
  49 25 23  1  2  3  4  5  6  7  8  9 10 24 11 13 14 15 16 17 18 19 20 21
  22 12 50]]
flatten vals: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1.]
flatten vals: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1.]
potential ans: 0.0


('Hooray!', 'happiness')

In [57]:
# Take user query through input()
query = input()
generate_reply(query)

bye


('Bye! take care..', 'relief')