# Chatbot
by Muhammad Jaysy Ansharulloh
## Import datasets

In [73]:
# Locations of the two JSON datasets, relative to the working directory.
intents_filepath = 'datasets/intents.json'
entities_filepath = 'datasets/entities.json'

In [74]:
import json

In [75]:
# Read both datasets with an explicit UTF-8 encoding. Without it, open() falls
# back to the platform's locale encoding, which can mis-decode or reject
# non-ASCII characters in the JSON on some systems (notably Windows).
with open(intents_filepath, encoding='utf-8') as fi:
    intents = json.load(fi)

with open(entities_filepath, encoding='utf-8') as fe:
    entities = json.load(fe)

In [76]:
# Print both parsed datasets, intents first, to confirm they loaded.
for dataset in (intents, entities):
    print(dataset)

[{'text': 'schedule a meeting', 'intent': 'schedule creation'}, {'text': 'write schedule', 'intent': 'schedule creation'}, {'text': 'create an appointment', 'intent': 'schedule creation'}, {'text': 'host a meeting', 'intent': 'schedule creation'}, {'text': 'create a meeting schedule', 'intent': 'schedule creation'}, {'text': 'help me schedule a meeting', 'intent': 'schedule creation'}, {'text': 'please write a meeting schedule', 'intent': 'schedule creation'}, {'text': 'check my schedule', 'intent': 'schedule check'}, {'text': 'open my calendar', 'intent': 'schedule check'}, {'text': 'what is today schedule', 'intent': 'schedule check'}, {'text': 'what is on today', 'intent': 'schedule check'}, {'text': 'how to check my schedule', 'intent': 'schedule check'}, {'text': 'today schedule', 'intent': 'schedule check'}, {'text': 'check calendar', 'intent': 'schedule check'}, {'text': 'please schedule for', 'intent': 'duration definition'}, {'text': 'for some minutes', 'intent': 'duration def

## JSON to dataframe

In [77]:
import pandas as pd

In [78]:
# Flatten the list of intent records into a table.
df_intents = pd.json_normalize(data=intents)

In [79]:
# Display the intents table (left as the cell's last expression so it renders).
df_intents

Unnamed: 0,text,intent
0,schedule a meeting,schedule creation
1,write schedule,schedule creation
2,create an appointment,schedule creation
3,host a meeting,schedule creation
4,create a meeting schedule,schedule creation
5,help me schedule a meeting,schedule creation
6,please write a meeting schedule,schedule creation
7,check my schedule,schedule check
8,open my calendar,schedule check
9,what is today schedule,schedule check


## Text Preprocessing
### download and import required libraries and dictionaries

In [80]:
import nltk

# Language name handed to stopwords.words() inside preprocess_text.
language = 'english'
# Fetch the NLTK data packages used below: 'punkt' (presumably the tokenizer
# data behind sent_tokenize/word_tokenize), the stop-word lists, and WordNet
# for the lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jaysy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jaysy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jaysy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [81]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

### process text

In [82]:
import re

In [83]:
# Pull the raw utterances out of the table as a plain Python list.
texts = df_intents.loc[:, 'text'].to_list()

In [84]:
def preprocess_text(text):
    """Normalise a message into a space-separated string of lemmatized tokens.

    The text is lower-cased, every non-word character is replaced by a space,
    the result is tokenized, stop words for the module-level ``language`` are
    dropped, and each remaining token is lemmatized as a verb.

    Parameters
    ----------
    text : str
        Raw message to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (empty if none survive).
    """
    lemmatize = WordNetLemmatizer().lemmatize
    ignored = set(stopwords.words(language))

    # Strip punctuation/symbols first, then run the sentence tokenizer and
    # glue the pieces back together with spaces.
    cleaned = re.sub(r'[^\w]', ' ', text.lower())
    joined = ' '.join(nltk.sent_tokenize(cleaned))

    kept = []
    for token in nltk.word_tokenize(joined):
        if token in ignored:
            continue
        kept.append(lemmatize(token, wordnet.VERB))
    return ' '.join(kept)

In [85]:
# Clean every utterance; order matches `texts`.
documents = [preprocess_text(raw_text) for raw_text in texts]

In [86]:
# Display the preprocessed utterances to check the cleaning step.
documents

['schedule meet',
 'write schedule',
 'create appointment',
 'host meet',
 'create meet schedule',
 'help schedule meet',
 'please write meet schedule',
 'check schedule',
 'open calendar',
 'today schedule',
 'today',
 'check schedule',
 'today schedule',
 'check calendar',
 'please schedule',
 'minutes',
 'hour',
 'hours',
 'please schedule hour',
 'create schedule duration',
 'create schedule time',
 'time time',
 'plan meet',
 'plan',
 'please invite people',
 'please tell',
 'bring somebody',
 'want meet',
 'forget tell',
 'please schedule',
 'schedule meet',
 'schedule meet',
 'schedule time',
 'meet tomorrow',
 'meet',
 'let us start',
 'let',
 'appointment',
 'need talk',
 'meet',
 'meet',
 'let us talk',
 'discuss']

## Scoring

In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [88]:
def vectorize(documents, vectorizer=None):
    """Fit a text vectorizer on ``documents`` and return their feature matrix.

    Parameters
    ----------
    documents : iterable of str
        Texts to fit on and transform.
    vectorizer : object with a ``fit_transform`` method, optional
        Vectorizer to fit. When omitted, a fresh ``TfidfVectorizer`` is
        created for this call.

    Returns
    -------
    The value returned by ``vectorizer.fit_transform(documents)``.

    Notes
    -----
    Every call re-fits the vectorizer, so the columns of the result are tied
    to the vocabulary of ``documents``. To encode new text for a model that
    was trained on an earlier result, pass in and keep your own vectorizer,
    then call its ``transform`` method instead of calling this function again.
    """
    if vectorizer is None:
        # A ``TfidfVectorizer()`` default in the signature is evaluated once at
        # definition time, so every call would share (and re-fit) one instance.
        vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(documents)

## Create ML Model

In [89]:
# Keep a handle on the fitted vectorizer: any new text must be encoded with
# this same instance (via .transform) for its columns to line up with X.
vectorizer = TfidfVectorizer()
X = vectorize(documents, vectorizer).toarray()
y = df_intents['intent']

In [90]:
# Inspect the feature matrix, then the labels.
for item in (X, y):
    print(item)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.88944694]
 [0.75766103 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.59235091 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
0       schedule creation
1       schedule creation
2       schedule creation
3       schedule creation
4       schedule creation
5       schedule creation
6       schedule creation
7          schedule check
8          schedule check
9          schedule check
10         schedule check
11         schedule check
12         schedule check
13         schedule check
14    duration definition
15    duration definition
16    duration definition
17    duration definition
18    duration definition
19    duration definition
20    duration definition
21    dura

### train test split

In [91]:
from sklearn.model_selection import train_test_split

In [92]:
# Fix the shuffle seed so the split, and every metric computed from it, is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [93]:
# Inspect each piece of the split: features first, then labels.
for part in (X_train, X_test, y_train, y_test):
    print(part)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.75766103 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.88944694]]
[[0.         0.         0.         0.         0.         1.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.66932979 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.74296543
  0.         0.         0.         0.      

### create classifier using a Gaussian Naive Bayes model

In [94]:
# Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB

# Build the model, then fit it on the training portion of the split.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

GaussianNB()

In [95]:
# Predict intents for the held-out samples.
y_pred = classifier.predict(X=X_test)

In [96]:
# check accuracy
from sklearn.metrics import accuracy_score
# Fraction of held-out samples whose predicted intent matches the label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [97]:
# Display the accuracy measured on the held-out split.
accuracy

0.6363636363636364

## User Input Test

In [105]:
user_input = input('Your message:')
processed_input = preprocess_text(str(user_input))
# Encode the message in the SAME feature space the classifier was trained on.
# Calling vectorize() here would re-fit TF-IDF on this single message, so the
# resulting columns (and their count) would not match the training features.
# A default TfidfVectorizer fitted on `documents` reproduces the training
# vocabulary and weights; .transform then maps the message onto them.
input_vectorizer = TfidfVectorizer().fit(documents)
input_data = input_vectorizer.transform([processed_input]).toarray()
prediction = classifier.predict(input_data)
print(prediction)

['schedule creation']
