# Chatbot
by Muhammad Jaysy Ansharulloh
## Import datasets

In [208]:
import json

intents_filepath = 'datasets/intents.json'
# Read as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform locale encoding, which can mis-decode non-ASCII characters.
with open(intents_filepath, encoding='utf-8') as fi:
    intents = json.load(fi)

In [210]:
import pandas as pd

# Flatten the loaded JSON into a DataFrame (the later cells read its
# 'text' and 'intent' columns).
df_intents = pd.json_normalize(data=intents)

In [211]:
# Display the intents DataFrame to check that the load worked.
df_intents

Unnamed: 0,text,intent
0,"Sisca, please schedule a 20 minute meeting wit...",schedule creation
1,"Sisca, I want to plan a meeting on Thursday mo...",schedule creation
2,I would like to take this opportunity to invit...,schedule creation
3,Are you free to meet at Bali meeting room on T...,schedule creation
4,Are you free to chat at 4 PM to talk through h...,schedule creation
5,Want to book in a quick meeting this week to s...,schedule creation
6,I've got some free time next week to talk this...,schedule creation
7,Do you have five minutes this week to talk?,schedule creation
8,Please book a meeting room for tomorrow mornin...,schedule creation
9,Would you be interested in hopping on a quick ...,schedule creation


## Text Preprocessing
### download and import required libraries and dictionaries

In [212]:
import nltk

# Language used when looking up the stop-word list during preprocessing.
language = 'english'

# NLTK resources downloaded for the preprocessing cells below. Looping keeps
# the list in one place and avoids echoing the final `True` return value
# as cell output.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jaysy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jaysy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jaysy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [213]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
import re

### process text

In [214]:
# Raw utterances as a plain Python list, in DataFrame row order.
texts = df_intents['text'].to_list()

In [215]:
# Per-language cache of (lemmatizer, stop-word set) so the stop-word corpus is
# read once instead of on every preprocess_text() call.
_PREPROCESS_RESOURCES = {}


def _resources_for(lang):
    """Return the cached ``(lemmatizer, stop_words)`` pair for ``lang``."""
    if lang not in _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES[lang] = (
            WordNetLemmatizer(),
            frozenset(stopwords.words(lang)),
        )
    return _PREPROCESS_RESOURCES[lang]


def preprocess_text(text):
    """Normalize ``text`` into a space-separated string of lemmatized tokens.

    Steps: lowercase the text, replace every non-word character with a space,
    tokenize with NLTK, drop stop words for the module-level ``language``,
    and lemmatize each remaining token as a verb.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces (an empty
        string when no token survives).
    """
    lemmatizer, stop_words = _resources_for(language)
    cleaned = re.sub(r'[^\w]', ' ', text.lower())
    sentences = ' '.join(nltk.sent_tokenize(cleaned))
    words = [
        lemmatizer.lemmatize(word, wordnet.VERB)
        for word in nltk.word_tokenize(sentences)
        if word not in stop_words
    ]
    return ' '.join(words)

In [216]:
# One preprocessed document per raw utterance, in the same order as `texts`.
documents = [preprocess_text(raw_text) for raw_text in texts]

In [217]:
# Display the preprocessed documents to check the preprocessing by eye.
documents

['sisca please schedule 20 minute meet john week let deployment issue',
 'sisca want plan meet thursday morning 9 precisely',
 'would like take opportunity invite meet office 9 november 25 2020 discuss',
 'free meet bali meet room tuesday morning talk',
 'free chat 4 pm talk could help product release',
 'want book quick meet week see let know available',
 'get free time next week talk let create appointment',
 'five minutes week talk',
 'please book meet room tomorrow morning invite jays',
 'would interest hop quick 15 minute call learn',
 'sisca schedule tomorrow',
 'need check calendar see available',
 'please check schedule next week',
 'sisca free next two days',
 'let know appointment month',
 'want make sure take day tomorrow',
 'meet john today',
 'need cancel meet today',
 'please tell faiz meet tomorrow cancel',
 'guess need next day meet anymore',
 'sisca please tell participants project brief meet meet cancel',
 'sisca cancel appointment today',
 'want call next month meet'

## Scoring (TF-IDF vectorization)

In [218]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize(documents, vectorizer=None):
    """Fit ``vectorizer`` on ``documents`` and return the transformed matrix.

    Parameters
    ----------
    documents : iterable of str
        Documents to fit on and transform.
    vectorizer : object with a ``fit_transform`` method, optional
        Vectorizer to fit. When omitted, a fresh ``TfidfVectorizer`` is
        created for this call. (A ``TfidfVectorizer()`` default in the
        signature would be built once at definition time and then shared,
        and re-fitted, by every call that relies on the default.)

    Returns
    -------
    The value returned by ``vectorizer.fit_transform(documents)``.
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(documents)

## Create ML Model

In [219]:
# Keep a handle on the vectorizer: it is fitted here and queried for its
# feature names further down.
vectorizer = TfidfVectorizer()

# Sparse TF-IDF matrix of the preprocessed documents, plus a dense copy.
X_raw = vectorize(documents, vectorizer=vectorizer)
X = X_raw.toarray()

# Target labels, one per document.
y = df_intents.loc[:, 'intent']

In [220]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = list(vectorizer.get_feature_names())

# Reuse the dense matrix `X` computed earlier rather than densifying `X_raw` again.
learning_df = pd.DataFrame(X, columns=feature_names)

In [221]:
# Attach the intent label to each feature row under the 'CLASS' column.
learning_df = learning_df.assign(CLASS=y)

In [222]:
# Preview the first rows of the TF-IDF feature table with its CLASS label.
learning_df.head()

Unnamed: 0,15,20,2020,25,accept,aliyyah,another,anymore,appointment,available,...,today,tomorrow,tuesday,two,urgent,want,week,would,yes,CLASS
0,0.0,0.384587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.252766,0.0,0.0,schedule creation
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.373043,0.0,0.0,0.0,schedule creation
2,0.0,0.0,0.326879,0.326879,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.294109,0.0,schedule creation
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.443362,0.0,0.0,0.0,0.0,0.0,0.0,schedule creation
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,schedule creation


### train test split

In [223]:
from sklearn.model_selection import train_test_split

# Fix the seed so the split, and the accuracy reported below, is reproducible
# across kernel restarts. `train_size` is implied by `test_size`, so it is omitted.
# NOTE(review): consider `stratify=y` if every intent has enough examples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

### create classifier using k-nearest neighbors (KNN) model

In [231]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# fit() returns the estimator itself, so construction and training are chained.
classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# predict labels for the held-out rows
y_pred = classifier.predict(X_test)

# check accuracy: fraction of held-out rows whose predicted label matches
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

0.6153846153846154


## User Input Test

In [232]:
user_input = input('Your message:')

# Apply the same preprocessing that was applied to the training texts.
processed_input = preprocess_text(str(user_input)).split()
user_input_string = ' '.join(processed_input)

# Reuse the vectorizer fitted on the training documents. Fitting a second
# TfidfVectorizer on an ad-hoc two-document corpus yields different IDF
# weights, so the query vector would not live in the feature space the
# classifier was trained on. transform() ignores out-of-vocabulary tokens,
# so no manual filtering against the feature names is needed.
processed_input_array = vectorizer.transform([user_input_string]).toarray()

prediction = classifier.predict(processed_input_array)
print(prediction)

['schedule check']


## Entities Extractor

In [226]:
entities_filepath = 'datasets/entities.json'
# Read as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform locale encoding, which can mis-decode non-ASCII characters.
with open(entities_filepath, encoding='utf-8') as fe:
    entities = json.load(fe)

# One column per top-level key of the entities JSON.
df_entities = pd.json_normalize(entities)
col_entities = df_entities.columns

# Map each entity column to the input tokens that match one of its values.
ent_list = {}
for col in col_entities:
    # Preprocess this column's values once, not once per input token, so
    # they are compared in the same normalized form as `processed_input`.
    known_values = {preprocess_text(ent) for ent in df_entities[col][0]}
    # Every column gets a key, even when the user input has no tokens.
    # Matches keep their input order and any duplicates.
    ent_list[col] = [token for token in processed_input if token in known_values]

# Names of the entity columns that matched at least one input token.
output_ent = [name for name, matches in ent_list.items() if matches]

print('entity defined:', output_ent)
print('entity details:', json.dumps(ent_list, indent=2))

entity defined: ['RELATIVE_TIME', 'MEETING']
entity details: {
  "NUMBER": [],
  "RELATIVE_TIME": [
    "tomorrow"
  ],
  "DAY_TIME": [],
  "MONTH": [],
  "DAY": [],
  "PLACE": [],
  "PERSON": [],
  "DURATION": [],
  "MEETING": [
    "meet"
  ]
}
