In [None]:
import nltk
# Fetch the NLTK resources requested by this notebook: the Brown corpus,
# the 'punkt' package and the universal tagset mapping.
for resource in ('brown', 'punkt', 'universal_tagset'):
    nltk.download(resource)

# Flat stream of (word, tag) pairs from the 'news' category, with tags
# mapped to the universal tagset.
brownwords = nltk.corpus.brown.tagged_words(tagset='universal', categories='news')

In [None]:
# Same 'news' data as above, but grouped per sentence: each item is a list of
# (word, universal_tag) pairs. The sentence-level form is what the rest of the
# notebook (and nltk's tagger evaluation) works with.
brownsentences = nltk.corpus.brown.tagged_sents(tagset='universal', categories='news')

In [None]:
# Build the word vocabulary and the tag inventory, each followed by a trailing
# '<PAD>' entry that is used later as the padding symbol.
#
# sorted() makes the index assignment reproducible: iterating a bare set of
# strings yields a different order on every kernel start (string hash
# randomisation), which would silently change every word/tag index between
# runs and invalidate any previously trained weights.
vocab = sorted({w for sent in brownsentences for (w, t) in sent})
vocab.append('<PAD>')  # appended last, so its index is len(vocab) - 1
print(len(vocab))
tags = sorted({t for sent in brownsentences for (w, t) in sent})
tags.append('<PAD>')
print(tags)
print(brownsentences[0])

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_len = 60  # every sentence is padded (or truncated) to this many tokens
# Lookup tables from a word / tag string to its integer position in vocab / tags.
word2index = {w: i for i, w in enumerate(vocab)}
tag2index = {t: i for i, t in enumerate(tags)}
# NOTE: despite its name, `onehot` holds integer word indices per sentence,
# not one-hot vectors.
onehot = [[word2index[w[0]] for w in s] for s in brownsentences]
# Pad with the explicit '<PAD>' index (same value as len(vocab) - 1, because
# '<PAD>' is appended last) so the word padding is looked up the same way as
# the tag padding. Sentences longer than max_len are cut using pad_sequences'
# default `truncating` setting; the tag sequences must be padded with the same
# settings to stay aligned.
X = pad_sequences(maxlen=max_len, sequences=onehot, padding="post", value=word2index["<PAD>"])

In [None]:
from tensorflow.keras.utils import to_categorical
# Integer tag indices per sentence (not yet one-hot, despite the name).
onehot_y = [[tag2index[w[1]] for w in s] for s in brownsentences]
# Pad the tag sequences exactly like the word sequences, using the '<PAD>' tag.
y = pad_sequences(maxlen=max_len, sequences=onehot_y, padding="post", value=tag2index["<PAD>"])
# One-hot encode the whole padded matrix in a single call. This yields one
# ndarray of shape (n_sentences, max_len, len(tags)) instead of a Python list
# of 2-D arrays, which Keras would not accept as a single target tensor in
# model.fit without an extra numpy.array(...) conversion.
y = to_categorical(y, num_classes=len(tags))

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the sentences for testing. The fixed random_state makes the
# split reproducible, so an index into X_test refers to the same sentence on
# every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional 
# Sequence tagger: word-index embedding -> bidirectional LSTM -> per-timestep
# softmax over the tag inventory (which includes '<PAD>').
model = Sequential()
# One 50-dimensional vector per vocabulary entry; inputs are max_len indices long.
model.add(Embedding(input_dim=len(vocab), output_dim=50, input_length=max_len))
# return_sequences=True keeps an output for every token rather than only the last one.
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
# The same Dense softmax layer is applied independently at each time step.
model.add(TimeDistributed(Dense(len(tags), activation="softmax")))
# Targets are one-hot tag vectors, hence categorical cross-entropy.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the cell output.
model.summary()

In [None]:
import numpy
# Number of passes over the training data used by the fit call below.
n_epochs = 15
# Training is left disabled here; enable the next line when running on
# Colab or hopper.
#history = model.fit(X_train, y_train, batch_size=16, epochs=n_epochs, validation_split=0.1, verbose=1)

In [None]:
# Print the predicted tag for every real (non-padding) token of one test sentence.
# NOTE(review): the model.fit call in this notebook is commented out; unless the
# model was trained elsewhere, these predictions come from untrained weights.
test_sent = 41  # position of the sentence to inspect within X_test
# predict expects a batch, so wrap the single sentence in a length-1 array.
pred = model.predict(numpy.array([X_test[test_sent]]))
# Most probable tag index at each time step.
p = numpy.argmax(pred, axis=-1)
# Look the padding index up by name instead of repeating len(vocab) - 1, to
# match how the tag padding is looked up.
pad_index = word2index["<PAD>"]
for i, ix in enumerate(X_test[test_sent]):
    if ix == pad_index:
        # Padding is appended after the sentence, so the first pad ends it.
        break
    print("{:20} — {}".format(vocab[ix], tags[p[0][i]]))