In [None]:
# Clone the tutorial repository into ./data; the pickled story datasets read later live there.
!git clone https://github.com/irzaip/INDOSOAI-belajarLSTM data

# Video Penjelasan Konsep
* [Membuat Word2Vec](https://www.youtube.com/watch?v=Ae3GVw5nTYU)
* [Bermain dengan Word2vec](https://www.youtube.com/watch?v=bv_iVVrlfbU)
* [Contoh LSTM](https://www.youtube.com/watch?v=qN9hHlZKIL4)

# Contoh LSTM

End-to-End Memory Networks:

* Jason Weston, Antoine Bordes, Sumit Chopra, Tomas Mikolov, Alexander M. Rush, ["Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks"](http://arxiv.org/abs/1502.05698)
* Sainbayar Sukhbaatar, Arthur Szlam, Jason Weston, Rob Fergus, ["End-To-End Memory Networks"](http://arxiv.org/abs/1503.08895)

Link yang berhubungan

* [bAbI Datasets](https://research.fb.com/downloads/babi/)
* [Keras End-To-End Memory Networks](https://github.com/fchollet/keras/blob/master/examples/babi_memnn.py)
* [Online JavaScript Demo of End-to-End Memory Networks](http://yerevann.com/dmn-ui/#/)

#### Fungsi bantuan

Fungsi di bawah ini dipergunakan untuk membantu pembuatan vektor.

In [None]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from functools import reduce
import pickle
import tarfile
import numpy as np
import re
import os
import time


def tokenize(sent):
    '''
    Split a sentence into a list of word and punctuation tokens.

    The sentence is split on runs of non-word characters. The capturing
    group keeps those separators in the result, so punctuation survives as
    its own token while pieces that are only whitespace are dropped.

    Example:
        tokenize('Dimana Sandra ?') -> ['Dimana', 'Sandra', '?']
    '''
    # Raw string: '\W' inside a plain literal is an invalid escape sequence,
    # which current Python versions warn about.
    pieces = re.split(r'(\W+)', sent)
    return [piece.strip() for piece in pieces if piece.strip()]


def vectorize_stories(data):
    '''Convert tokenized (story, query, answer) triples into index arrays.

    Every token is looked up in the module-level ``word_idx`` table. Stories
    and queries are padded with ``pad_sequences`` to ``story_maxlen`` and
    ``query_maxlen`` respectively; answers become one index per sample.

    Example:
        x, y, z = vectorize_stories([(xx, yy, zz)])
    '''
    def encode(tokens):
        # Map each token to its vocabulary index (KeyError on unknown words).
        return [word_idx[token] for token in tokens]

    encoded = [(encode(story), encode(query), word_idx[answer])
               for story, query, answer in data]

    story_ids = [triple[0] for triple in encoded]
    query_ids = [triple[1] for triple in encoded]
    answer_ids = [triple[2] for triple in encoded]

    padded_stories = pad_sequences(story_ids, maxlen=story_maxlen)
    padded_queries = pad_sequences(query_ids, maxlen=query_maxlen)
    return (padded_stories, padded_queries, np.array(answer_ids))

## IMPORT DATASET INDONESIA

In [None]:
import pickle
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted
# source (here: the repository cloned into ./data).
# The context managers close the file handles, which bare open() calls never do.
with open('./data/id_train_stories.pickle', 'rb') as train_file:
    train_stories = pickle.load(train_file)
with open('./data/id_test_stories.pickle', 'rb') as test_file:
    test_stories = pickle.load(test_file)

In [None]:
# Inspect the data: show the first eight training samples as plain text.

for sample_no in range(8):
    sample = train_stories[sample_no]
    story_text = ' '.join(sample[0])
    print("Cerita: {}".format(story_text))
    question_text = ' '.join(sample[1])
    print("Pertanyaan: {}".format(question_text))
    print("Jawaban: {}".format(sample[2]))
    print("---")

#### Buat Daftar Vocabulary

Jaringan Saraf Tiruan hanya bisa memproses angka. Oleh karena itu,
data kita harus diubah menjadi sebuah vektor.

In [None]:
# Collect every distinct token that appears in any story, question, or answer
# of either split, then fix the ordering by sorting.
all_stories = train_stories + test_stories
vocab = set()
for story, q, answer in all_stories:
    vocab.update(story, q, [answer])
vocab = sorted(vocab)

# Index 0 is reserved for the padding produced by pad_sequences,
# so the vocabulary size is one larger than the number of words.
vocab_size = 1 + len(vocab)
story_maxlen = max(len(story_tokens) for story_tokens, _, _ in all_stories)
query_maxlen = max(len(query_tokens) for _, query_tokens, _ in all_stories)

# Summarize the vocabulary and dataset sizes computed above.
print('-')
print('Vocab total:', vocab_size, 'kata')
print('Panjang max cerita:', story_maxlen, 'kata')
print('Panjang max pertanyaan:', query_maxlen, 'kata')
# Label fixed: this value is the number of training stories, not a maximum length.
print('Jumlah cerita training:', len(train_stories))
print('Jumlah cerita test:', len(test_stories))
print('-')
print('Sperti ini tuplenya (input, query, answer):')
print(train_stories[0])
print('-')


# List every vocabulary word with its position. These positions are 0-based;
# the index used when vectorizing is position + 1, because 0 is kept for padding.
for s in enumerate(vocab):
    print(s)

#### Membuat data Training dan Validasi

Data training dimasukkan ke JST dalam bentuk vektor yang merepresentasikan kalimat. Setiap kata
dalam kalimat diganti dengan indeksnya di vocab. Selalu ada 2 input, yaitu cerita dan pertanyaan (story & query),
sedangkan jawaban berupa 1 angka yang dipetakan ke indeks dari vocab.

In [None]:
print('Vectorizing the word sequences...')
# Word -> index lookup. Indexes start at 1 because 0 is reserved for padding.
word_idx = {word: position + 1 for position, word in enumerate(vocab)}
inputs_train, queries_train, answers_train = vectorize_stories(train_stories)
inputs_test, queries_test, answers_test = vectorize_stories(test_stories)

print('-')
print('inputs: integer tensor of shape (samples, max_length)')
print('inputs_train shape:', inputs_train.shape)
print('inputs_test shape:', inputs_test.shape)
print('-')
print('queries: integer tensor of shape (samples, max_length)')
print('queries_train shape:', queries_train.shape)
print('queries_test shape:', queries_test.shape)
print('-')
# Description fixed: vectorize_stories returns one vocabulary index per sample,
# not a one-hot (samples, vocab_size) matrix.
print('answers: integer tensor of shape (samples,) holding one vocab index per answer')
print('answers_train shape:', answers_train.shape)
print('answers_test shape:', answers_test.shape)
print('-')


In [None]:
# Look at a single vectorized training sample.

sample_no = 100
print("Story (x): {}".format(inputs_train[sample_no]))
print("Question (x): {}".format(queries_train[sample_no]))
print("Answer: {}".format(answers_train[sample_no]))

In [None]:
# Display the first padded story vector together with its answer index.
inputs_train[0], answers_train[0]

#### Compile the Neural Network

In [None]:
print('Compiling...')

# Symbolic inputs: one padded story and one padded question per sample.
input_sequence = Input((story_maxlen,))
question = Input((query_maxlen,))

# encoders
# Story encoder "m": embeds each story token into a 64-dimensional vector.
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size,
                              output_dim=64))
input_encoder_m.add(Dropout(0.3))
# output: (samples, story_maxlen, embedding_dim)

# Story encoder "c": embeds each story token into a vector of size
# query_maxlen, so its output can be added element-wise to the match matrix below.
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size,
                              output_dim=query_maxlen))
input_encoder_c.add(Dropout(0.3))
# output: (samples, story_maxlen, query_maxlen)

# Question encoder: embeds each question token into a 64-dimensional vector.
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,
                               output_dim=64,
                               input_length=query_maxlen))
question_encoder.add(Dropout(0.3))
# output: (samples, query_maxlen, embedding_dim)

# encode input sequence and questions (which are indices)
# to sequences of dense vectors
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

# compute a 'match' between the first input vector sequence
# and the question vector sequence by contracting the embedding axis
# shape: `(samples, story_maxlen, query_maxlen)`
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
# NOTE(review): presumably the softmax normalizes over the last (query) axis -- confirm.
match = Activation('softmax')(match)

# add the match matrix with the second input vector sequence
response = add([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
response = Permute((2, 1))(response)  # (samples, query_maxlen, story_maxlen)

# concatenate the response (not the raw match matrix) with the question vector sequence
# NOTE(review): default concatenate axis is presumably the last one -- confirm.
answer = concatenate([response, question_encoded])

# the original paper uses a matrix multiplication for this reduction step.
# we choose to use a RNN instead.
answer = LSTM(32)(answer)  # (samples, 32)

# one regularization layer -- more would probably be needed.
answer = Dropout(0.3)(answer)
answer = Dense(vocab_size)(answer)  # (samples, vocab_size)
# we output a probability distribution over the vocabulary
answer = Activation('softmax')(answer)

# build the final model
model = Model([input_sequence, question], answer)
# The targets are integer word indices (one per sample, see vectorize_stories),
# hence the sparse variant of categorical cross-entropy.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
print("Done.")


In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

#### Training Jaringan Syaraf Tiruan

Kecepatan training bervariasi sesuai dengan komputer

In [None]:
# Remember when training started so the elapsed time can be computed afterwards.
start_time = time.time()

# Fit on the training split; the test split is used as validation data.
model.fit(
    x=[inputs_train, queries_train],
    y=answers_train,
    batch_size=32,
    epochs=120,
    validation_data=([inputs_test, queries_test], answers_test),
)

## SAVE DATA

In [None]:
# Persist the trained model and its vocabulary next to the dataset.
save_path = "./data/"
# save entire network to HDF5 (save everything, suggested)
model.save(os.path.join(save_path, "chatbot.h5"))
# Save the vocab too: word indexes must be the same when the model is reloaded.
# The context manager makes sure the file is flushed and closed before it is read back.
with open(os.path.join(save_path, "vocab.pkl"), "wb") as vocab_file:
    pickle.dump(vocab, vocab_file)

# Report the time spent since start_time was set in the training cell.
# (The old commented-out print relied on an undefined hms_string helper.)
elapsed_time = time.time() - start_time
print("Elapsed time: {:.1f} seconds".format(elapsed_time))

## LOADING DATA

In [None]:
# Load the model and vocabulary if they were saved in an earlier session.
save_path = "./data/"
model = load_model(os.path.join(save_path, "chatbot.h5"))
# NOTE: pickle.load can execute arbitrary code; only load a vocab.pkl you wrote yourself.
with open(os.path.join(save_path, "vocab.pkl"), "rb") as vocab_file:
    vocab = pickle.load(vocab_file)
# Rebuild the word -> index table from the loaded vocabulary so that
# vectorize_stories uses exactly the indexes the saved model was trained with.
word_idx = {word: position + 1 for position, word in enumerate(vocab)}

#### Evaluasi Keakuratan

Cara menilai keakuratan.

In [None]:
# Run the model on the whole test split.
test_inputs = [inputs_test, queries_test]
pred = model.predict(test_inputs)
# Raw predictions: one row of class probabilities per test sample.
print(pred)

In [None]:
# Use argmax so that every prediction becomes a single integer class index.

pred = pred.argmax(axis=1)
print(pred)

In [None]:
# Fraction of test questions whose predicted index equals the true answer index.
score = metrics.accuracy_score(y_true=answers_test, y_pred=pred)
accuracy_message = "Final accuracy: {}".format(score)
print(accuracy_message)

#### Cerita Sendiri

Anda bisa memasukkan cerita Anda sendiri sebagai input.

In [None]:
print("Remember, I only know these words: {}".format(vocab))
print()
story = "Mary pindah ke toilet. John pergi ke toilet. Daniel kembali ke lorong. Sandra pindah ke taman."
query = "Dimana Sandra ?"

# The answer slot only holds a placeholder ('?'): vectorize_stories needs a
# known token there, but the resulting index is not used for prediction.
adhoc_stories = (tokenize(story), tokenize(query), '?')

adhoc_train, adhoc_query, adhoc_answer = vectorize_stories([adhoc_stories])

pred = model.predict([adhoc_train, adhoc_query])
print(pred[0])
pred = np.argmax(pred, axis=1)
# Index 0 is reserved for padding and has no vocabulary entry. Without this
# guard, vocab[0 - 1] would silently report the last word of the vocabulary.
answer_idx = int(pred[0])
answer_word = vocab[answer_idx - 1] if answer_idx > 0 else '<padding>'
print("Answer: {}({})".format(answer_word, pred))