# Chatbot
by Muhammad Jaysy Ansharulloh
## Import datasets

In [1]:
# Locations of the raw JSON datasets, relative to the notebook's working directory.
intents_filepath = 'datasets/intents.json'
entities_filepath = 'datasets/entities.json'

In [2]:
import json

In [3]:
# Load the intent and entity datasets from disk.
# The encoding is pinned to UTF-8: without it open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII JSON content.
with open(intents_filepath, encoding='utf-8') as fi:
    intents = json.load(fi)

with open(entities_filepath, encoding='utf-8') as fe:
    entities = json.load(fe)

In [4]:
# Dump both loaded datasets, one per line, for a quick sanity check.
print(intents, entities, sep='\n')

[{'text': 'schedule a meeting', 'intent': 'schedule creation'}, {'text': 'write schedule', 'intent': 'schedule creation'}, {'text': 'create an appointment', 'intent': 'schedule creation'}, {'text': 'host a meeting', 'intent': 'schedule creation'}, {'text': 'create a meeting schedule', 'intent': 'schedule creation'}, {'text': 'help me schedule a meeting', 'intent': 'schedule creation'}, {'text': 'please write a meeting schedule', 'intent': 'schedule creation'}, {'text': 'check my schedule', 'intent': 'schedule check'}, {'text': 'open my calendar', 'intent': 'schedule check'}, {'text': 'what is today schedule', 'intent': 'schedule check'}, {'text': 'what is on today', 'intent': 'schedule check'}, {'text': 'how to check my schedule', 'intent': 'schedule check'}, {'text': 'today schedule', 'intent': 'schedule check'}, {'text': 'check calendar', 'intent': 'schedule check'}, {'text': 'please schedule for', 'intent': 'duration definition'}, {'text': 'for some minutes', 'intent': 'duration def

## JSON to dataframe

In [5]:
import pandas as pd

In [6]:
# Flatten the loaded intent records into a tabular DataFrame.
df_intents = pd.json_normalize(data=intents)

In [7]:
# Display the intents DataFrame built from the loaded JSON records.
df_intents

Unnamed: 0,text,intent
0,schedule a meeting,schedule creation
1,write schedule,schedule creation
2,create an appointment,schedule creation
3,host a meeting,schedule creation
4,create a meeting schedule,schedule creation
5,help me schedule a meeting,schedule creation
6,please write a meeting schedule,schedule creation
7,check my schedule,schedule check
8,open my calendar,schedule check
9,what is today schedule,schedule check


## Text Preprocessing
### Download and import required libraries and dictionaries

In [8]:
import nltk

language = 'english'

# NLTK resources needed by the preprocessing cells:
#   punkt / punkt_tab - tokenizer models behind sent_tokenize / word_tokenize.
#                       Recent NLTK releases look up 'punkt_tab'; older ones
#                       use 'punkt', so both are fetched to keep a fresh
#                       Restart & Run All working on either.
#   stopwords         - stop-word lists (read via nltk.corpus.stopwords)
#   wordnet           - lexical database used by WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jaysy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jaysy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jaysy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

### Process text

In [10]:
import re

In [11]:
# Raw utterances as a plain Python list, in DataFrame row order.
texts = df_intents.loc[:, 'text'].to_list()

In [12]:
# Shared resources for the text-cleaning step.
stop_words = {*stopwords.words(language)}  # set for fast membership tests
lemmatizer = WordNetLemmatizer()
regex = r'[^\w]'  # matches any single character that is not a word character

In [13]:
def preprocess_text(text):
    """Normalise one utterance into a space-separated string of lemmas.

    Steps: lower-case the text, replace every character matched by ``regex``
    with a space, tokenise, drop tokens found in ``stop_words`` and lemmatise
    the remaining tokens as verbs.

    Args:
        text: Raw utterance string.

    Returns:
        The surviving lemmas joined by single spaces. This is an empty string
        when every token is a stop word.
    """
    cleaned = re.sub(regex, ' ', text.lower())
    # NOTE(review): punctuation has already been replaced by spaces here, so
    # sentence splitting likely has nothing left to split on; the round trip
    # is kept so the resulting token stream stays exactly as before.
    sentences = ' '.join(nltk.sent_tokenize(cleaned))
    return ' '.join(
        lemmatizer.lemmatize(word, wordnet.VERB)
        for word in nltk.word_tokenize(sentences)
        if word not in stop_words
    )


# One cleaned document per training utterance, in the same order as `texts`.
documents = [preprocess_text(text) for text in texts]

In [14]:
# Inspect the cleaned documents produced by the preprocessing loop.
documents

['schedule meet',
 'write schedule',
 'create appointment',
 'host meet',
 'create meet schedule',
 'help schedule meet',
 'please write meet schedule',
 'check schedule',
 'open calendar',
 'today schedule',
 'today',
 'check schedule',
 'today schedule',
 'check calendar',
 'please schedule',
 'minutes',
 'hour',
 'hours',
 'please schedule hour',
 'create schedule duration',
 'create schedule time',
 'time time',
 'plan meet',
 'plan',
 'please invite people',
 'please tell',
 'bring somebody',
 'want meet',
 'forget tell',
 'please schedule',
 'schedule meet',
 'schedule meet',
 'schedule time',
 'meet tomorrow',
 'meet',
 'let us start',
 'let',
 'appointment',
 'need talk',
 'meet',
 'meet',
 'let us talk',
 'discuss']

## Scoring (bag-of-words term counts)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# Learn the vocabulary from the cleaned documents and count each term per document.
count_vectorizer = CountVectorizer()
intent_values = count_vectorizer.fit_transform(raw_documents=documents)

In [17]:
# Vocabulary terms in column order of the count matrix.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. The result is converted to a list so `feature_names`
# keeps the type the old call returned, and the except branch keeps the
# notebook working on scikit-learn releases that predate the new method.
try:
    feature_names = count_vectorizer.get_feature_names_out().tolist()
except AttributeError:
    feature_names = count_vectorizer.get_feature_names()
pd.DataFrame(intent_values.toarray(), columns=feature_names)

Unnamed: 0,appointment,bring,calendar,check,create,discuss,duration,forget,help,host,...,somebody,start,talk,tell,time,today,tomorrow,us,want,write
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


## Create ML Model

In [18]:
# Labels: the intent column. Features: dense array form of the term-count matrix.
y = df_intents['intent']
X = intent_values.toarray()

In [19]:
# Show the feature matrix followed by the label series.
print(X, y, sep='\n')

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]]
0       schedule creation
1       schedule creation
2       schedule creation
3       schedule creation
4       schedule creation
5       schedule creation
6       schedule creation
7          schedule check
8          schedule check
9          schedule check
10         schedule check
11         schedule check
12         schedule check
13         schedule check
14    duration definition
15    duration definition
16    duration definition
17    duration definition
18    duration definition
19    duration definition
20    duration definition
21    duration definition
22     participant invite
23     participant invite
24     participant invite
25     participant invite
26     participant invite
27     participant invite
28     participant invite
29        time definition
30        time definition
31        time definition
32        time definition
33        time definit

### Train/test split

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Fixed seed so the shuffle - and therefore every downstream result - is
# identical after Restart & Run All. Test size is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [22]:
# Show the four pieces of the split: train/test features, then train/test labels.
print(X_train, X_test, y_train, y_test, sep='\n')

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 1 0]]
[[0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0]
 [0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0]
 [1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
23     participant invite
0       schedule creation
36     subject definition
28     participant invite
25     participant invite
20    durati