In [1]:
# Membangun POS tagger dengan Bidirectional-LSTM menggunakan Keras
# Sumber: https://nlpforhackers.io/lstm-pos-tagger-keras/

import nltk
# Download corpus 'treebank' dari NLTK (di run sekali saja)
nltk.download('treebank')
tagged_sentences = nltk.corpus.treebank.tagged_sents()
 
# Melihat sampel dan ukuran (length) dari corpus
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
Tagged sentences:  3914
Tagged words: 100676


In [2]:
# Restrukturisasi data. Memisahkan kata-kata (words) dari tag nya 

import numpy as np
 
sentences, sentence_tags =[], [] 
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))

# Melihat bagaimana isi sebuah sequence
 
print(sentences[5])
print(sentence_tags[5])

['Lorillard' 'Inc.' ',' 'the' 'unit' 'of' 'New' 'York-based' 'Loews'
 'Corp.' 'that' '*T*-2' 'makes' 'Kent' 'cigarettes' ',' 'stopped' 'using'
 'crocidolite' 'in' 'its' 'Micronite' 'cigarette' 'filters' 'in' '1956'
 '.']
['NNP' 'NNP' ',' 'DT' 'NN' 'IN' 'JJ' 'JJ' 'NNP' 'NNP' 'WDT' '-NONE-' 'VBZ'
 'NNP' 'NNS' ',' 'VBD' 'VBG' 'NN' 'IN' 'PRP$' 'NN' 'NN' 'NNS' 'IN' 'CD'
 '.']


In [0]:
# Memisahkan data menjadi data training dan test, sebelum train model nya

# Digunakan fungsi train_test_split dari Scikit-Learn
from sklearn.model_selection import train_test_split

# 80% data training, 20% data test
(train_sentences, 
 test_sentences, 
 train_tags, 
 test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2)

In [0]:
# Meng-assign integer unik untuk tiap word (dan tag), agar bisa diolah oleh Keras

# Kita mengkomputasi satu set words (dan tag) yang unik, lalu mengubahnya dalam satu list
#   dan mengindeks nya dalam sebuah kamus (dictionary)

# Kamus-kamus ini adalah word vocabulary dan tag vocabulary nya.

# Kita akan juga menambahkan value khusus untuk padding sequence nya 
#   dan satu lagi untuk kata-kata tak dikenal (OOV - Out of Vocabulary)

words, tags = set([]), set([])
 
for s in train_sentences:
    for w in s:
        words.add(w.lower())
 
for ts in train_tags:
    for t in ts:
        tags.add(t)
 
word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # Value khusus digunakan untuk padding
word2index['-OOV-'] = 1  # Value khusus digunakan untuk OOVs
 
tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0  # Value khusus digunakan untuk padding

In [5]:
# Sekarang kita konversi dataset word ke dataset integer, untuk words nya dan tag nya

train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []
 
for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    train_sentences_X.append(s_int)
 
for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    test_sentences_X.append(s_int)
 
for s in train_tags:
    train_tags_y.append([tag2index[t] for t in s])
 
for s in test_tags:
    test_tags_y.append([tag2index[t] for t in s])
 
print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[5539, 9075, 8767, 6954, 7160, 3165, 2974, 27, 1907, 3950, 3310, 2666, 2697, 8244, 2347, 7289, 90, 8183, 5731, 3727, 1952, 9012, 6446, 810, 8720]
[9311, 6205, 8151, 9941, 5297, 8877, 7901, 969, 810, 1172, 3947, 4326, 3970, 7453, 928, 1408, 8564, 362, 890, 255, 9555, 7453, 9705, 1, 7636, 7286, 7848, 1172, 1417, 6910, 6523, 1, 1, 1172, 4051, 8710, 3950, 2720, 3979, 6987, 7208, 8720]
[18, 22, 11, 34, 15, 6, 2, 44, 28, 44, 28, 29, 29, 15, 9, 29, 11, 44, 28, 29, 20, 45, 43, 20, 37]
[23, 31, 2, 29, 43, 44, 2, 29, 20, 8, 40, 43, 20, 10, 13, 28, 29, 20, 44, 29, 20, 10, 13, 11, 44, 22, 11, 8, 4, 43, 20, 25, 25, 8, 29, 29, 44, 28, 25, 25, 25, 37]


In [6]:
# Keras hanya bisa bekerja dengan ukuran sequence yang fix
# Kita akan pad ke kanan semua sequence dengan satu value khusus, dimana
# 0 sebagai index dan "-PAD-" sebagai word/tag yang yang bersesuaian ke panjang dari sequence terpanjang di dataset

# Hasil nya adalah panjang yang maksimum dari semua sequence

MAX_LENGTH = len(max(train_sentences_X, key=len))
print(MAX_LENGTH)  # 271

271


In [7]:
# Digunakan fungsi utility pad_sequences dari Keras 

from keras.preprocessing.sequence import pad_sequences
 
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')
 
print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

Using TensorFlow backend.


[5539 9075 8767 6954 7160 3165 2974   27 1907 3950 3310 2666 2697 8244
 2347 7289   90 8183 5731 3727 1952 9012 6446  810 8720    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [8]:
# Arsitektur Network, sekarang kita definisikan modelnya:

# Kita akan membutuhkan embedding layer untuk mengkomputasi vector model (word) untuk words kita

# Kita akan membutuhkan LSTM layer dengan sebuah modifier bidirectional,
#   modifier tsb sebagai input bagi LSTM nilai (values) selanjutnya di dalam sequence
#   bukan hanya yang sebelumnya

# Kita butuh untuk men-set parameter return_sequence=True, sehingga output LSTM adalah sequence,
#   bukan hanya value final nya

# Setelah LSTM layer kita butuh Dense layer (atau layer yang fully-connected)
#   yang memilih POS tag yang sesuai.

# Karena Dense layer butuh untuk dijalankan pada setiap elemen dari sequence, maka kita
#   perlu menambahkan modifier TimeDistributed

from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
 
 
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])
 
model.summary()






Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 271, 128)          1304448   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 271, 512)          788480    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 271, 47)           24111     
_________________________________________________________________
activation_1 (Activation)    (None, 271, 47)           0         
Total params: 2,117,039
Trainable params: 2,117,039
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Ada satu hal lagi yang perlu dilakukan sebelum training:
# Kita butuh untuk men-transform sequences dari tag ke sequences dari One-Hot Encoded tags
# Berikut adalah fungsi yang melakukan hal tersebut

def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [10]:
# Inilah tampilan dari One Hot Encoded Tags 

cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))
print(cat_train_tags_y[0])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [11]:
# Sekarang kita training model nya
# Epochs 40

model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=40, validation_split=0.2)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 2504 samples, validate on 627 samples
Epoch 1/40





Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7fd5352e29e8>

In [12]:
# Evaluasi model pada data testing:

scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}")   # acc: 99.09751977804825 (hasil asli, 40 epochs)

acc: 99.14087669146015


In [13]:
# Catatan: hasil akurasi yang amat tinggi ini karena banyak nya padding
#   kita kesempingkan dulu faktor tersebut, yang penting adalah tahapan nya

# Sekarang kita ambil dua kalimat:

test_samples = [
    "running is very important for me .".split(),
    "I was running every day for a month .".split()
]
print(test_samples)

[['running', 'is', 'very', 'important', 'for', 'me', '.'], ['I', 'was', 'running', 'every', 'day', 'for', 'a', 'month', '.']]


In [14]:
# Sekarang transform test_samples diatas ke dalam padded sequences dari id-id nya word:

test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)
 
test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
print(test_samples_X)

[[2628 8331 1267 4433  890 6851 8720    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0   

In [15]:
# Membuat prediksi-prediksi pertama kita:

predictions = model.predict(test_samples_X)
print(predictions, predictions.shape)

[[[1.3265696e-02 1.7446376e-03 3.2747224e-02 ... 3.8458791e-04
   5.2888278e-04 3.0618454e-03]
  [9.1930146e-05 5.7931338e-04 4.8240243e-05 ... 4.8970906e-03
   6.8551209e-03 7.4273138e-04]
  [1.2951572e-05 4.0897508e-03 4.5213923e-03 ... 4.0296405e-03
   7.7858378e-05 6.5452012e-04]
  ...
  [9.9994564e-01 2.0682047e-09 4.2286902e-10 ... 1.4072725e-10
   2.1951270e-09 1.5315843e-08]
  [9.9989986e-01 4.3015564e-09 5.0305715e-10 ... 2.2893756e-10
   6.3541603e-09 3.6072109e-08]
  [9.9980527e-01 9.4434576e-09 6.2562466e-10 ... 3.9498693e-10
   1.9466844e-08 8.5982556e-08]]

 [[2.5589390e-05 2.8136175e-03 4.7953095e-06 ... 5.7543558e-04
   9.0147287e-01 4.9354089e-04]
  [1.2442895e-06 2.4854023e-02 1.0375743e-03 ... 1.0678200e-04
   4.2358269e-03 8.4647720e-05]
  [1.4789445e-04 4.8128720e-03 3.1223379e-02 ... 1.7817457e-04
   2.3956534e-04 1.0269531e-03]
  ...
  [9.9994493e-01 1.9189839e-09 3.7345019e-10 ... 1.2985676e-10
   2.2362223e-09 1.5132430e-08]
  [9.9989748e-01 3.9912007e-09 4.442

In [0]:
# Cukup sulit untuk dibaca, kan? Kita butuh untuk melakukan “reverse” operation untuk to_categorical:

def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
 
        token_sequences.append(token_sequence)
 
    return token_sequences

In [17]:
# Dan berikut tampilan prediksi-prediksi nya:

print(logits_to_tokens(predictions, {i: t for t, i in tag2index.items()}))
 
# ['JJ', 'NNS', 'NN', 'NNP', 'NNP', 'NNS', '-NONE-', '-PAD-', ...
# ['VBP', 'CD', 'JJ', 'CD', 'NNS', 'NNP', 'POS', 'NN', '-NONE-', '-PAD-', ...

[['NNP', 'VBZ', 'RB', 'JJ', 'IN', 'PRP', '.', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', 

In [0]:
# Hasil tampilan di atas mayoritas diisi oleh "padding tokens", karena itu akurasinya tinggi.
# Sekarang kita tulis akurasi yang custom, yang tidak mengindahkan paddings:

from keras import backend as K
 
def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [19]:
# Sekarang kita train ulang, dengan menambahkan metric ignore_class_acuracy pada tahapan kompile 

from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
 
 
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy', ignore_class_accuracy(0)])
 
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 271, 128)          1304448   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 271, 512)          788480    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 271, 47)           24111     
_________________________________________________________________
activation_2 (Activation)    (None, 271, 47)           0         
Total params: 2,117,039
Trainable params: 2,117,039
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Sekarang kita train ulang:

model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=40, validation_split=0.2)

Train on 2504 samples, validate on 627 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7fd4e009be80>

In [21]:
#Sekarang kita lihat bagaimana performa model nya
predictions = model.predict(test_samples_X)
print(logits_to_tokens(predictions, {i: t for t, i in tag2index.items()}))

[['NNP', 'VBZ', 'RB', 'JJ', 'IN', 'PRP', '.', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', 

In [0]:
# Hasilnya sudah amat baik dan ada kemungkinan bisa lebih baik lagi
# Dengan beberapa strategi lain:

# 1. Gunakan pretrained vectors – Transfer Learning
# 2. Gunakan custom feature seperti pada POS Tagging klasik yang dikombinasikan dengan embeddings
# 3. Coba arsitektur yang berbeda