In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import json
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, GlobalAveragePooling1D, Flatten
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt

In [None]:
# Load the intent definitions for the chatbot.
# The encoding is given explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's default encoding, which is not UTF-8
# on every system and can garble or reject non-ASCII characters.
with open('artour.json', encoding='utf-8') as artour:
  data = json.load(artour)

In [None]:
# tag -> the intent's 'output' entry (a later duplicate tag overwrites an earlier one).
outputs = {intent['tag']: intent['output'] for intent in data['intents']}

# One (utterance, tag) pair per element of every intent's 'input' list,
# in file order.
labelled_utterances = [
    (utterance, intent['tag'])
    for intent in data['intents']
    for utterance in intent['input']
]

# Parallel lists: inputs[k] is an example utterance and tags[k] is its intent tag.
inputs = [utterance for utterance, _tag in labelled_utterances]
tags = [tag for _utterance, tag in labelled_utterances]

In [None]:
# Tabulate the parallel lists: one row per example utterance, with its intent tag.
my_data = pd.DataFrame(dict(inputs=inputs, tags=tags))

In [None]:
# Preview the first 20 (utterance, tag) rows.
my_data.head(n=20)

# Preprocessing Data

In [None]:
import string

# Normalise every utterance in a single pass: drop each character that appears
# in string.punctuation and lower-case the remaining ones, then glue the
# characters back into one string.
my_data['inputs'] = my_data['inputs'].apply(
    lambda sentence: ''.join(
        char.lower() for char in sentence if char not in string.punctuation
    )
)
my_data.head(20)

In [None]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

# Remove stop words from each utterance.
# remove() has to receive the whole sentence: at this point every value in
# the column is a plain string, so looping over it yields single characters,
# and feeding those to the remover one at a time can never match a stop word.
my_data['inputs'] = my_data['inputs'].apply(stopword.remove)
my_data.head(20)

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Reduce every word of each utterance to its root form.
# stem() has to receive the whole sentence: every value in the column is a
# plain string, so looping over it yields single characters. Stemming those
# one by one never sees a real word, and re-joining the per-character results
# appears to lose the spaces between words as well (NOTE(review): confirm
# against the Sastrawi text normaliser).
my_data['inputs'] = my_data['inputs'].apply(stemmer.stem)
my_data.head(20)

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build a stemmer instance.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Quick sanity check: stem one complete sample sentence and show the result.
sample_sentence = 'apa harimu menyenangkan'
print(stemmer.stem(sample_sentence))

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Features: fit the tokenizer on the cleaned utterances, convert each
# utterance to a sequence of word ids, and pad the sequences into one array.
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(my_data['inputs'])
x_train = pad_sequences(tokenizer.texts_to_sequences(my_data['inputs']))

# Labels: encode the intent tags; `le` is kept so predictions can later be
# mapped back to tag names with inverse_transform.
le = LabelEncoder()
y_train = le.fit_transform(my_data['tags'])

In [None]:
# Number of columns of the padded training array, i.e. the length every
# input sequence is padded to; reused for the model input and at prediction time.
input_shape = x_train.shape[-1]
print(input_shape)

In [None]:
# Sizes that determine the model dimensions:
#   vocabulary    - number of entries in the fitted tokenizer's word index
#   output_length - number of distinct intent tags known to the label encoder
vocabulary = len(tokenizer.word_index)
output_length = len(le.classes_)

print("number of unique words : ", vocabulary)
print("output length: ", output_length)

In [None]:
# Intent classifier: word ids -> embedding -> LSTM (one output per timestep)
# -> flatten -> softmax over the intent tags.
sequence_input = Input(shape=(input_shape,))
# vocabulary + 1 embedding rows - presumably to leave room for the padding id 0.
embedded = Embedding(vocabulary + 1, 10)(sequence_input)
recurrent = LSTM(10, return_sequences=True)(embedded)
flattened = Flatten()(recurrent)
tag_probabilities = Dense(output_length, activation="softmax")(flattened)

model = Model(sequence_input, tag_probabilities)

In [None]:
# Configure training: Adam optimiser, sparse categorical cross-entropy loss,
# and accuracy tracked as a metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 200 epochs on the full data set; the object returned by fit() is
# kept in `train` because its .history is plotted afterwards.
train = model.fit(x=x_train, y=y_train, epochs=200)

In [None]:
# Draw the per-epoch training accuracy and loss curves on the same axes.
for metric_name in ('accuracy', 'loss'):
  plt.plot(train.history[metric_name], label='training set ' + metric_name)
plt.legend()

In [None]:
import random


def _prepare_utterance(raw_text):
  """Clean a user utterance with the same steps the notebook applies to the training inputs.

  Lower-cases the text and drops every character found in
  string.punctuation, then runs the stop-word remover and the stemmer on the
  whole sentence. Skipping the last two steps (as the loop used to) means
  the words typed by the user are not prepared like the training text.

  Args:
    raw_text: the string typed by the user.

  Returns:
    The cleaned sentence as a single string.
  """
  cleaned = ''.join(char.lower() for char in raw_text if char not in string.punctuation)
  return stemmer.stem(stopword.remove(cleaned))


print("ARTour : Selamat Datang ARTourist!")

# Chat until the model predicts the 'keluar' intent; its reply is still
# printed before the loop ends.
while True:
  prediction_input = input('You : ')
  texts_p = [_prepare_utterance(prediction_input)]

  # texts_to_sequences already returns one id list per text, so it can be
  # padded directly to the training width (no array/reshape round trip needed).
  prediction_input = tokenizer.texts_to_sequences(texts_p)
  prediction_input = pad_sequences(prediction_input, input_shape)

  # verbose=0 keeps the Keras progress bar out of the conversation.
  output = model.predict(prediction_input, verbose=0)
  output = output.argmax()

  # Map the winning class index back to its tag and answer with one of the
  # replies registered for that tag.
  response_tag = le.inverse_transform([output])[0]
  print("ARTour : ", random.choice(outputs[response_tag]))
  if response_tag == 'keluar':
    break