# Chatbot
by Muhammad Jaysy Ansharulloh
## Import datasets

In [309]:
import json

In [310]:
intents_filepath = 'datasets/intents.json'
with open(intents_filepath) as fi:
    intents = json.load(fi)

In [311]:
print(intents)

[{'text': 'schedule a meeting', 'intent': 'schedule creation'}, {'text': 'write schedule', 'intent': 'schedule creation'}, {'text': 'create an appointment', 'intent': 'schedule creation'}, {'text': 'host a meeting', 'intent': 'schedule creation'}, {'text': 'create a meeting schedule', 'intent': 'schedule creation'}, {'text': 'help me schedule a meeting', 'intent': 'schedule creation'}, {'text': 'please write a meeting schedule', 'intent': 'schedule creation'}, {'text': 'check my schedule', 'intent': 'schedule check'}, {'text': 'open my calendar', 'intent': 'schedule check'}, {'text': 'what is today schedule', 'intent': 'schedule check'}, {'text': 'what is on today', 'intent': 'schedule check'}, {'text': 'how to check my schedule', 'intent': 'schedule check'}, {'text': 'today schedule', 'intent': 'schedule check'}, {'text': 'check calendar', 'intent': 'schedule check'}, {'text': 'please schedule for', 'intent': 'duration definition'}, {'text': 'for some minutes', 'intent': 'duration def

## JSON to dataframe

In [312]:
import pandas as pd

In [313]:
df_intents = pd.json_normalize(intents)

In [314]:
df_intents

Unnamed: 0,text,intent
0,schedule a meeting,schedule creation
1,write schedule,schedule creation
2,create an appointment,schedule creation
3,host a meeting,schedule creation
4,create a meeting schedule,schedule creation
5,help me schedule a meeting,schedule creation
6,please write a meeting schedule,schedule creation
7,check my schedule,schedule check
8,open my calendar,schedule check
9,what is today schedule,schedule check


## Text Preprocessing
### download and import required libraries and dictionaries

In [315]:
import nltk

language = 'english'
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jaysy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jaysy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jaysy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [316]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

### process text

In [317]:
import re

In [318]:
texts = df_intents['text'].tolist()

In [319]:
def preprocess_text(text):
    regex = r'[^\w]'
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words(language))
    sentences = ' '.join(nltk.sent_tokenize(re.sub(regex, ' ', text.lower())))
    words = [lemmatizer.lemmatize(word, wordnet.VERB) for word in nltk.word_tokenize(sentences) if not word in stop_words]
    return ' '.join(words)

In [320]:
documents = []
for text in texts:
    documents.append(preprocess_text(text))

In [321]:
documents

['schedule meet',
 'write schedule',
 'create appointment',
 'host meet',
 'create meet schedule',
 'help schedule meet',
 'please write meet schedule',
 'check schedule',
 'open calendar',
 'today schedule',
 'today',
 'check schedule',
 'today schedule',
 'check calendar',
 'please schedule',
 'minutes',
 'hour',
 'hours',
 'please schedule hour',
 'create schedule duration',
 'create schedule time',
 'time time',
 'plan meet',
 'plan',
 'please invite people',
 'please tell',
 'bring somebody',
 'want meet',
 'forget tell',
 'please schedule',
 'schedule meet',
 'schedule meet',
 'schedule time',
 'meet tomorrow',
 'meet',
 'let us start',
 'let',
 'appointment',
 'need talk',
 'meet',
 'meet',
 'let us talk',
 'discuss']

## Scoring

In [322]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [323]:
def vectorize(documents, vectorizer=TfidfVectorizer()):
    intent_values = vectorizer.fit_transform(documents)
    return intent_values

## Create ML Model

In [324]:
vectorizer = TfidfVectorizer()

In [325]:
X_raw = vectorize(documents, vectorizer)
X = X_raw.toarray()
y = df_intents['intent']

In [326]:
y

0       schedule creation
1       schedule creation
2       schedule creation
3       schedule creation
4       schedule creation
5       schedule creation
6       schedule creation
7          schedule check
8          schedule check
9          schedule check
10         schedule check
11         schedule check
12         schedule check
13         schedule check
14    duration definition
15    duration definition
16    duration definition
17    duration definition
18    duration definition
19    duration definition
20    duration definition
21    duration definition
22     participant invite
23     participant invite
24     participant invite
25     participant invite
26     participant invite
27     participant invite
28     participant invite
29        time definition
30        time definition
31        time definition
32        time definition
33        time definition
34        time definition
35        time definition
36     subject definition
37     subject definition
38     subje

In [327]:
feature_names = vectorizer.get_feature_names()
learning_df = pd.DataFrame(X_raw.toarray(), columns = feature_names)

In [328]:
learning_df['CLASS'] = y

In [329]:
learning_df.head()

Unnamed: 0,appointment,bring,calendar,check,create,discuss,duration,forget,help,host,...,start,talk,tell,time,today,tomorrow,us,want,write,CLASS
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,schedule creation
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.889447,schedule creation
2,0.757661,0.0,0.0,0.0,0.652648,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,schedule creation
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.885635,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,schedule creation
4,0.0,0.0,0.0,0.0,0.742803,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,schedule creation


In [330]:
print(X)
print(y)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.88944694]
 [0.75766103 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.59235091 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
0       schedule creation
1       schedule creation
2       schedule creation
3       schedule creation
4       schedule creation
5       schedule creation
6       schedule creation
7          schedule check
8          schedule check
9          schedule check
10         schedule check
11         schedule check
12         schedule check
13         schedule check
14    duration definition
15    duration definition
16    duration definition
17    duration definition
18    duration definition
19    duration definition
20    duration definition
21    dura

### train test split

In [331]:
from sklearn.model_selection import train_test_split

In [332]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8)

In [333]:
print(X_train)
print(X_test)
print(y_train)
print(y_test)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.59235091 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.52514627 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.63227249
  0.         0.         0.         0.         0.         0.56960767
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.46438157 0.         0.         0.
  0.         0.         0.         

### create classifier using Naive Bayes Gaussian model

In [334]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [335]:
# predict
y_pred = classifier.predict(X_test)

In [336]:
# check accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

In [337]:
y_pred

array(['subject definition', 'subject definition', 'schedule creation',
       'time definition', 'subject definition', 'schedule check',
       'duration definition', 'duration definition', 'schedule creation'],
      dtype=object)

In [338]:
y_test

35        time definition
27     participant invite
42     subject definition
4       schedule creation
22     participant invite
9          schedule check
18    duration definition
29        time definition
15    duration definition
Name: intent, dtype: object

In [339]:
accuracy

0.2222222222222222

## User Input Test

In [375]:
user_input = input('Your message:')
processed_input = preprocess_text(str(user_input)).split()

In [376]:
input_row = [col for col in learning_df.columns if not col == 'CLASS']
user_input_row = [input for input in processed_input if input in input_row]

In [377]:
all_input_string = ' '.join(input_row)
user_input_string = ' '.join(user_input_row)

In [378]:
input_vectorizer = TfidfVectorizer()
input_array = vectorize([all_input_string, user_input_string], input_vectorizer)
processed_input_array = input_array[1]

In [379]:
print(processed_input_array)

  (0, 21)	0.5773502691896258
  (0, 20)	0.5773502691896258
  (0, 14)	0.5773502691896258


In [380]:
prediction = classifier.predict(processed_input_array)
print(prediction)

['time definition']


## Entities Extractor

In [381]:
entities_filepath = 'datasets/entities.json'
with open(entities_filepath) as fe:
    entities = json.load(fe)

In [382]:
df_entities = pd.json_normalize(entities)

In [383]:
col_entities = df_entities.columns

In [386]:
ent_list = {}
for col in col_entities:
    for processed in processed_input:
        if not col in ent_list:
            ent_list[col] = []
        if processed in df_entities[col][0]:
            ent_list[col].append(processed)

In [387]:
ent_list

{'TIME': ['friday'], 'PERSON': ['jays'], 'DURATION': [], 'MEETING': []}

In [390]:
output_ent = []
for key, value in ent_list.items():
    if len(value) > 0:
        output_ent.append(key)

In [391]:
output_ent

['TIME', 'PERSON']