In [1]:
# prepare dataset
import json
import pandas as pd

with open("intents.json") as data_file:
    data = json.load(data_file)

In [2]:
data['intents'][1]

{'tag': 'salam-day',
 'patterns': ['Selamat Pagi',
  'Selamat Siang',
  'Selamat Sore',
  'Selamat Malam'],
 'responses': ['Halo selamat juga, ada yang bisa saya bantu kah? silahkan']}

In [3]:
text_input = []
intents = []

for intent in data['intents']:
    for pattern in intent['patterns']:
        text_input.append(pattern)
        intents.append(intent['tag'])

df = pd.DataFrame({'text_input': text_input,
                    'intents': intents})

df.head(10)

Unnamed: 0,text_input,intents
0,Assalamualaikum,salam-formal
1,Assalamualaikum warahmatullahi wabarakatuh,salam-formal
2,Selamat Pagi,salam-day
3,Selamat Siang,salam-day
4,Selamat Sore,salam-day
5,Selamat Malam,salam-day
6,Hai,salam-usual
7,Hi,salam-usual
8,Halo,salam-usual
9,Permisi,salam-usual


In [4]:
df.tail(5)

Unnamed: 0,text_input,intents
53,Sudah,finish
54,Selesai,finish
55,Sampai jumpa,finish
56,Bye,finish
57,Terima Kasih,finish


In [5]:
df.intents.value_counts()

jumlah          12
pesan            9
salam-usual      8
menu             7
feedback2        7
finish           5
salam-day        4
feedback1        4
salam-formal     2
Name: intents, dtype: int64

In [6]:
# data cleansing
import string

# convert lowercase
df.text_input = df.text_input.apply(lambda x: x.lower())

# remove punctuation
exclude = set(string.punctuation)
df.text_input = df.text_input.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [7]:
# label encoding
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

le = LabelEncoder()
y_train = le.fit_transform(df.intents)
y_train = to_categorical(y_train)
y_train

array([[0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0.

In [8]:
all_vocab = []
length = []

for idx, row in df.iterrows():
    sent = row['text_input']
    [all_vocab.append(i) for i in sent.split()]
    length.append(len(sent.split()))

In [9]:
len(all_vocab)

75

In [10]:
max(length)

3

In [11]:
len(set(all_vocab))

66

In [12]:
from tensorflow.keras.layers import TextVectorization

max_vocab_length = 66
max_length = 3

text_vectorization = TextVectorization(max_tokens=max_vocab_length,
                                       standardize='lower_and_strip_punctuation',
                                       split='whitespace',
                                       ngrams=None,
                                       output_mode='int',
                                       output_sequence_length=max_length
                                       )

In [13]:
text_vectorization.adapt(df.text_input)

In [14]:
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'selamat',
 'sedap',
 'aja',
 'assalamualaikum',
 'apa',
 'warahmatullahi',
 'wabarakatuh',
 'tiga',
 'tidak',
 'test',
 'terima',
 'sudah',
 'sore',
 'siang',
 'seporsi',
 'sempurna',
 'selesai',
 'sedia',
 'satu',
 'sangat',
 'sampai',
 'salam',
 'saji',
 'rasanya',
 'porsi',
 'ping',
 'pilih',
 'pesen',
 'pesan',
 'permisi',
 'pengen',
 'pagi',
 'p',
 'order',
 'ngga',
 'mesen',
 'menyediakan',
 'menyajikan',
 'menu',
 'memesan',
 'mau',
 'malam',
 'luar',
 'kurang',
 'kasih',
 'jumpa',
 'itu',
 'ini',
 'ingin',
 'hidangan',
 'hi',
 'halo',
 'hai',
 'enak',
 'empat',
 'dua',
 'cukup',
 'bye',
 'buruk',
 'biasa',
 'baik',
 'ada',
 '4',
 '3']

In [15]:
text_vectorization.get_vocabulary()[5]

'assalamualaikum'

In [16]:
from tensorflow.keras.layers import Embedding
embedding = Embedding(input_dim=max_vocab_length,
                      output_dim=16,
                      embeddings_initializer="uniform",
                      input_length=max_length)

In [17]:
import numpy as np
# modelling
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, LSTM
inputs = Input(shape=(1,), dtype='string')
x = text_vectorization(inputs)
x = embedding(x)
x = LSTM(12)(x)
outputs = Dense(9, activation='softmax')(x)
model_lstm = Model(inputs, outputs, name="LSTM_model")

In [18]:
# compile model
model_lstm.compile(loss='categorical_crossentropy',
                   optimizer='adam',
                   metrics=["accuracy"])

In [19]:
model_lstm.fit(df.text_input,
               y_train,
               epochs=250,
               verbose=0)

<keras.callbacks.History at 0x2aad0ea9610>

In [20]:
model_lstm.evaluate(df.text_input, y_train)



[0.06253372132778168, 1.0]

In [21]:
model_lstm.save("bot_model.tf")



INFO:tensorflow:Assets written to: bot_model.tf\assets


INFO:tensorflow:Assets written to: bot_model.tf\assets


In [22]:
import pickle
le_filename = open("label_encoder.pickle", "wb")
pickle.dump(le, le_filename)
le_filename.close()