In [1]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import spacy

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

import random
import string

# Experiment: compare accuracy when stages are tagged vs. when they are not tagged.
# Result so far: not tagging stages seems to work acceptably.

In [None]:
# Full edition of prompts
# Read the whole prompt file as UTF-8 and replace every newline with a space.
# The `with` block closes the file handle as soon as the text has been read
# (the previous version left it open for the lifetime of the kernel).
with open('prompts_full.txt', 'r', encoding='utf-8') as f:
    raw_txt = f.read()
raw_txt = raw_txt.replace('\n', ' ')

In [None]:
# Reading in the prompts and response database
qandadb = pd.read_excel('q&adatabase.xlsx')

# Changing questions column to lowercase.
# The vectorised .str accessor leaves missing cells as NaN, whereas calling
# x.lower() on each value raises AttributeError on a blank (float NaN) cell.
qandadb['questions'] = qandadb['questions'].str.lower()

# Dropping all the duplicates that appear in each stage
# (rows are duplicates when stage, question and response are all identical;
# the first occurrence is kept).
qandadb = qandadb.drop_duplicates(subset=['stage', 'questions', 'resp'], keep='first')

In [None]:
# Show every row whose question contains the literal text 'what should i do?'.
# regex=False is required: with the default regex=True the trailing '?' is a
# quantifier that makes the preceding 'o' optional rather than matching a
# question mark. na=False treats missing questions as "no match".
qandadb[qandadb['questions'].str.contains('what should i do?', regex=False, na=False)]

In [None]:
# qandadb[(qandadb['questions'] == 'what should i do?')]
#  & (qandadb['stage'] == 'FC1– A1')

In [None]:
# Loop method
# for sentence in qandadb[qandadb['questions'] == 'Is there any late payment fee?']['resp']:
#    print(sentence)

# Index method (pretty dumb code but 1 line)
# qandadb[qandadb['questions'] == 'How to make payment?']['resp'][qandadb[qandadb['questions'] == 'How to make payment?']['resp'].index[0]]

In [None]:
# NLP Processing
"""
1. Convert all to lower case
2. Tokenization
3. Removing noise (rubbish like special characters)
4. Removing stop words
5. Stemming
6. Lemmatization
7. Bag of words/Tfidf
"""

In [None]:
# Let's start with just 3 sentences
# 1. How to pay sch fees
# 2. What is my progress
# 3. How do I contact mentor


# Nltk processing functions works better with strings
# Separate sentences by punctuation marks
# data = 'What should I do to pay my school fees? How can I check my course progress? If I want to contact my mentor, how can I go about it? What if I do not want to pay school fees?'     

# Response library
# library = {'What should I do to pay my school fees?' : 'You can pay with Paylah!',
#           'How can I check my course progress?' : 'You can check on eDX.',
#           'If I want to contact my mentor, how can I go about it?' : 'You can message them on Telegram.',
#           'What if I do not want to pay school fees?' : 'You must pay school fees or else you cannot continue!'}


# Changing to lower case
# (lower-cased copy of the prompt text in raw_txt; this is the string that
# gets tokenized into sentences and words further down)

data_lower = raw_txt.lower()

In [None]:
# Tokenize the lower-cased corpus twice: once per word, once per sentence.
# sent_token is the list the chatbot matches user input against.
word_token = word_tokenize(data_lower)
sent_token = sent_tokenize(data_lower)

In [None]:
# Single WordNet lemmatizer instance shared by the helpers below.
lemmer = WordNetLemmatizer()

def LemTokens(tokens):
    """Return a new list holding `lemmer.lemmatize(t)` for each `t` in `tokens`, in order."""
    lemmatized = []
    for tok in tokens:
        lemmatized.append(lemmer.lemmatize(tok))
    return lemmatized

# Translation table for str.translate: every character of string.punctuation
# maps to None, i.e. it is deleted.
remove_punct_dict = {ord(symbol): None for symbol in string.punctuation}

def LemNormalize(text):
    """Lower-case `text`, delete punctuation, word-tokenize it and lemmatize the tokens.

    Returns the list produced by `LemTokens`.
    """
    stripped = text.lower().translate(remove_punct_dict)
    words = nltk.word_tokenize(stripped)
    return LemTokens(words)

In [None]:
# Words/phrases recognised as a greeting (all lower case).
greet_input = ['hello', 'hi', 'greetings', 'sup',
               "what's up", 'hey', 'yo', 'henlo']
# Replies the bot picks from at random when it has been greeted.
# (An offensive slur in the first reply has been replaced with neutral wording.)
greet_reply = ['Hello there', 'Hi homie', 'Yo wassup', 'Ni hao',
               'Konichiwa', 'Annyeong haseyo', 'Sawadikap']

def greeting(sentence):
    """Return a random greeting reply if `sentence` contains a greeting, else None.

    The sentence is lower-cased and leading/trailing punctuation is stripped
    from each word, so "Hello!" or "hi," are recognised. Matching is done on
    whole words/phrases, which lets multi-word entries such as "what's up"
    match while "this" does not trigger on "hi".

    Parameters
    ----------
    sentence : str
        Raw user input.

    Returns
    -------
    str or None
        A random element of `greet_reply`, or None when no greeting is found.
    """
    # Local import keeps this cell self-contained for the punctuation table.
    import string

    words = [word.strip(string.punctuation) for word in sentence.lower().split()]
    # Pad with spaces so phrase boundaries line up with word boundaries.
    padded = ' ' + ' '.join(word for word in words if word) + ' '

    for phrase in greet_input:
        if ' ' + phrase + ' ' in padded:
            return random.choice(greet_reply)
    return None

In [None]:
def response(user_input):
    """Return the stored answer for the corpus sentence most similar to `user_input`.

    The sentences in the module-level `sent_token` list plus the query are
    TF-IDF vectorised (tokenised with `LemNormalize`, English stop words
    removed). The corpus sentence with the highest cosine similarity to the
    query is then looked up in `qandadb['questions']` and its `resp` value
    is returned.

    Side effect (kept for backward compatibility): `user_input` is appended to
    `sent_token` and NOT removed here; the caller removes it afterwards.

    Parameters
    ----------
    user_input : str
        The (lower-cased) question typed by the user.

    Returns
    -------
    str
        The matched response, or an apology when nothing in the corpus is
        similar or the matched sentence has no entry in `qandadb`. (The
        previous version returned None in the no-match case because its
        `return` was nested inside the `else` branch.)
    """
    fallback = "I'm sorry, I do not understand you."

    sent_token.append(user_input)

    # Nothing to compare against if the corpus holds only the query itself.
    if len(sent_token) < 2:
        return fallback

    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(sent_token)

    # The query is the last row. Score it against the corpus rows only, so it
    # can never be chosen as its own best match (which could happen on ties
    # when picking "second best" from a list that includes the query).
    scores = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()
    index = scores.argmax()

    if scores[index] == 0:
        return fallback

    matched_question = sent_token[index]

    # Prefer an exact question match; fall back to a literal (non-regex)
    # substring match so characters like '?' are not treated as regex syntax.
    answers = qandadb.loc[qandadb['questions'] == matched_question, 'resp']
    if answers.empty:
        contains = qandadb['questions'].str.contains(matched_question, regex=False, na=False)
        answers = qandadb.loc[contains, 'resp']

    if answers.empty:
        return fallback

    return str(answers.iloc[0])

In [None]:
# Interactive chat loop: reads one line per turn until the user says
# 'thanks', 'thank you' or 'bye'.
flag = True
print("Hello, please ask me a question!")
while flag:
    user_input = input().lower().strip()

    # Call greeting() once: it picks a random reply, so calling it again for
    # printing could yield a different result from the one just tested.
    greeting_reply = greeting(user_input)

    if greeting_reply is not None:
        print("Bot: " + greeting_reply)

    elif user_input in ('thanks', 'thank you', 'bye'):
        flag = False
        print("Bot: Bye! take care..")

    else:
        # (The former `elif a != x or a != y ...` test was always true.)
        try:
            reply = response(user_input)
        finally:
            # response() appends the query to sent_token. Undo that even if it
            # raised, and pop the appended (last) entry rather than removing
            # the first equal sentence, which could be a genuine corpus item.
            if sent_token and sent_token[-1] == user_input:
                sent_token.pop()

        # Guard against a None result so the user never sees "None".
        if reply is None:
            reply = "I'm sorry, I do not understand you."

        print("Bot: ", end="")
        print(reply)