### POS tagging using Bidirectional LSTM

Data ingestion

In [1]:
import nltk
nltk.download('treebank')
nltk.download('brown')
nltk.download('conll2000')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\anasm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\treebank.zip.
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\anasm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\anasm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\conll2000.zip.


True

In [3]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\anasm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\universal_tagset.zip.


True

In [None]:
from nltk.corpus import treebank, brown, conll2000
tagged_sentences = treebank.tagged_sents(tagset='universal') +\
                   brown.tagged_sents(tagset='universal') +\
                   conll2000.tagged_sents(tagset='universal')

print(tagged_sentences[0])#list of list of tuples
len(tagged_sentences)

[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')]


72202

Our model will input text and output parts of speech label. Accordingly we will create the data

In [12]:
sentences=[]
sentence_tags=[]

for s in tagged_sentences:
    sentence=[]
    tags=[]
    for word, tag in s:
        sentence.append(word)
        tags.append(tag)
    
    sentences.append(sentence)
    sentence_tags.append(tags)

In [14]:
print(sentence_tags[0])
print(sentences[0])

['NOUN', 'NOUN', '.', 'NUM', 'NOUN', 'ADJ', '.', 'VERB', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', 'NUM', '.']
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']


In [15]:
# a sanity check whether we correctly tagged
print(len(sentences), len(sentence_tags))

72202 72202


In [17]:
#now we split the data into training and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(sentences, sentence_tags, test_size=0.2)
print(len(X_train), len(X_test), len(y_train), len(y_test), sep='\n')

57761
14441
57761
14441


Now we tokenize the training text using the keras tokenizer with an out of vocabulary token

In [20]:
from keras_preprocessing.text import Tokenizer
tok_sentence=Tokenizer(oov_token='<OOV>')
tok_sentence.fit_on_texts(X_train)

In [22]:
# we need another tokenizer for labels because they are also sequences
tok_label=Tokenizer()
tok_label.fit_on_texts(y_train)

In [23]:
tok_label.word_index

{'noun': 1,
 'verb': 2,
 '.': 3,
 'adp': 4,
 'det': 5,
 'adj': 6,
 'adv': 7,
 'pron': 8,
 'conj': 9,
 'prt': 10,
 'num': 11,
 'x': 12}

Now we vecorize the sentences and corresponding tags since both are sequences

In [25]:
X_train_seq=tok_sentence.texts_to_sequences(X_train)
y_train_seq=tok_label.texts_to_sequences(y_train)

Although RNN can process variable input length we will pad the sequences to improve performance

In [27]:
from keras_preprocessing.sequence import pad_sequences
X_train_padded=pad_sequences(X_train_seq, maxlen=170, padding='pre')

In [32]:
y_train_padded=pad_sequences(y_train_seq, maxlen=170, padding='pre')
y_train_padded[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  3,  2,  8,  2,  8,  7, 10,  2,  7,  4,  5,  1,  3,  3])

In [36]:
from keras.utils import to_categorical
y_train_categorical=to_categorical(y_train_padded)
y_train_categorical.shape

(57761, 170, 13)

In [38]:
print(f'a single label containing 170 vectors {y_train_categorical[0]}')
print(f'a single tag one hot encoded {y_train_categorical[0][0]}')

a single label containing 170 vectors [[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
a single tag one hot encoded [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


To get the POS tag from the one hot encoded labels we can query the dictionary

In [40]:
import numpy as np
idx=np.argmax(y_train_categorical[0][1])+1
print(f'index={idx}')
print(f'tag: {tok_label.index_word[idx]}')

index=1
tag: noun


Model Building

In [43]:
num_tokens=len(tok_sentence.word_index)+1
embedding_dim=128

#for output layer, number of classes=nuber of possible tags
num_classes=len(tok_label.word_index)+1

In [51]:
from keras import Sequential
from keras.layers import Dense, Embedding, Bidirectional, LSTM

model=Sequential()
model.add(Embedding(input_dim=num_tokens, output_dim=embedding_dim, input_length=170, mask_zero=True))
#by setting mask_zero=True, we make sure the pre padded zeros are ignored by subsequent layers

model.add(Bidirectional(LSTM(128, return_sequences=True)))#since it is a many to many RNN we need to set return seq=True
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [52]:
model.fit(X_train_padded, y_train_categorical, epochs=20)

Epoch 1/20
[1m1806/1806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 128ms/step - accuracy: 0.1079 - loss: 0.5227
Epoch 2/20
[1m1806/1806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 107ms/step - accuracy: 0.1216 - loss: 0.0648
Epoch 3/20
[1m1806/1806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 107ms/step - accuracy: 0.1219 - loss: 0.0411
Epoch 4/20
[1m1806/1806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 107ms/step - accuracy: 0.1231 - loss: 0.0286
Epoch 5/20
[1m1806/1806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 107ms/step - accuracy: 0.1227 - loss: 0.0211
Epoch 6/20
[1m1806/1806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 108ms/step - accuracy: 0.1239 - loss: 0.0143
Epoch 7/20
[1m1806/1806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 107ms/step - accuracy: 0.1239 - loss: 0.0092
Epoch 8/20
[1m1806/1806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 106ms/step - accuracy: 0.1237 - loss:

<keras.src.callbacks.history.History at 0x1fd950408c0>

In [74]:
def tag_sentences(sentences: list):
    sentences_seq=tok_sentence.texts_to_sequences(sentences)
    sentences_seq_padded=pad_sequences(sentences_seq, maxlen=170, padding='pre')

    tag_pred=model.predict(sentences_seq_padded)
    sentence_tags=[]

    for i, pred in enumerate(tag_pred):
        tags_seq=[np.argmax(p) for p in pred[170-len(sentences_seq[i]): 170]]

        words = [tok_sentence.index_word[w] for w in sentences_seq[i]]
        tags = [tok_label.index_word[t] for t in tags_seq]
        sentence_tags.append(list(zip(words, tags)))

    return sentence_tags

In [79]:
samples = [
    "Brown refused to testify.",
    "Come as you are",
]

In [80]:
tagged_sentences=tag_sentences(samples)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step


In [81]:
print(tagged_sentences[0])
print(tagged_sentences[1])

[('brown', 'noun'), ('refused', 'verb'), ('to', 'prt'), ('testify', 'verb')]
[('come', 'verb'), ('as', 'adp'), ('you', 'pron'), ('are', 'verb')]
