In [1]:
from keras.layers import Input, Dense, LSTM, Dropout, RepeatVector
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import TimeDistributed
from keras.layers.recurrent import GRU
from keras.models import Model
import collections
import nltk
import numpy as np
import random

Using TensorFlow backend.


In [2]:
# Make sure the Penn Treebank sample is available locally (NLTK reports when
# the package is already up to date), then read it as tagged sentences, i.e.
# sequences of (word, POS-tag) pairs.
nltk.download('treebank')
treebank_corpus = nltk.corpus.treebank
sents = treebank_corpus.tagged_sents()

[nltk_data] Downloading package treebank to /home/jonki/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [3]:
# Split every tagged sentence into parallel word / tag lists and count how
# often each word occurs across the corpus.
#   word_freq : Counter mapping word -> number of occurrences
#   data      : list of (words, tags) tuples, one per sentence
word_freq = collections.Counter()
data = []
for tagged_sent in sents:
    # '-NONE-' marks entries that are skipped entirely: they are neither
    # counted nor kept in the sentence.
    kept_pairs = [(token, tag) for token, tag in tagged_sent if tag != '-NONE-']
    sentence = [token for token, _tag in kept_pairs]
    poss = [tag for _token, tag in kept_pairs]
    word_freq.update(sentence)
    data.append((sentence, poss))

In [9]:
# Build the word and POS-tag vocabularies together with their lookup tables.
#   raw_w2i / raw_i2w : word <-> integer id
#   pos_w2i / pos_i2w : POS tag <-> integer id
max_raw_vocab = 5000  # only the most frequent words get their own id
PAD = '<PADDING>'
UNK = '<UNK>'

# Ids 0 and 1 are reserved for PAD and UNK, so real words are numbered from 2.
raw_vocab_size = min(max_raw_vocab, len(word_freq)) + 2
raw_w2i = {w: i for i, (w, _) in enumerate(word_freq.most_common(max_raw_vocab), 2)}
raw_w2i[PAD], raw_w2i[UNK] = 0, 1
raw_i2w = {i: w for w, i in raw_w2i.items()}
print('raw_vocab', len(raw_w2i))

# Length of the longest sentence; used as the fixed sequence length.
sentence_maxlen = max(len(sentence) for sentence, _ in data)
print('sentence maxlen', sentence_maxlen)

# Every distinct POS tag that appears in the corpus.
pos_vocab = {pos for _, poss in data for pos in poss}

# Number the tags in sorted order: iterating a set of strings directly yields
# an order that can change between interpreter runs, which would reassign tag
# ids on every kernel restart. Id 0 is reserved for PAD.
pos_w2i = {p: i for i, p in enumerate(sorted(pos_vocab), 1)}
pos_w2i[PAD] = 0
# Invert tag -> id directly. Enumerating the items here would key the table
# by (tag, id) tuples instead of by id, making id -> tag lookups impossible.
pos_i2w = {i: p for p, i in pos_w2i.items()}
pos_vocab_size = len(pos_w2i)
print('pos_vocab', pos_vocab_size)

raw_vocab 5002
sentence maxlen 249
pos_vocab 46


In [10]:
# vectorize
# Turn the (words, tags) pairs into fixed-size arrays:
#   X[i, t]    : id of the t-th word of sentence i (0 = PAD beyond its end)
#   Y[i, t, k] : 1 where k is the tag id of that word (one-hot); timesteps
#                past the end of the sentence keep an all-zero row.

# Shuffle with a dedicated, seeded generator. Keras' validation_split takes
# the validation samples from the end of the arrays, so an unseeded shuffle
# would produce a different train/validation partition on every run. Using a
# private Random instance leaves the global `random` state untouched.
SHUFFLE_SEED = 42
N = len(data)
random.Random(SHUFFLE_SEED).shuffle(data)

X = np.zeros((N, sentence_maxlen), dtype=np.uint32)
# float32 instead of NumPy's float64 default halves the memory taken by the
# dense one-hot targets; the values are only ever 0 or 1.
Y = np.zeros((N, sentence_maxlen, pos_vocab_size), dtype=np.float32)
print('N', N)

unk_id = raw_w2i[UNK]  # fallback id for words outside the kept vocabulary
for i, (sentence, poss) in enumerate(data):
    for t, (word, pos) in enumerate(zip(sentence, poss)):
        X[i, t] = raw_w2i.get(word, unk_id)
        Y[i, t, pos_w2i[pos]] = 1 # one-hot

print(data[0])
print(X[0])
print(Y[0])

N 3914
(['The', 'latest', '10-year', 'notes', 'were', 'quoted', 'at', '100', '22\\/32', 'to', 'yield', '7.88', '%', 'compared', 'with', '100', '16\\/32', 'to', 'yield', '7.90', '%', '.'], ['DT', 'JJS', 'JJ', 'NNS', 'VBD', 'VBN', 'IN', 'CD', 'CD', 'TO', 'VB', 'CD', 'NN', 'VBN', 'IN', 'CD', 'CD', 'TO', 'VB', 'CD', 'NN', '.'])
[  14  482 2347  324   50 1670   23  247 3886    6  187 2865   21  305   24
  247    1    6  187 2501   21    4    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0 

In [11]:
# define model
# Encoder-decoder tagger: the first GRU reduces the whole sentence to a single
# vector, RepeatVector copies that vector to every timestep, and the second
# GRU followed by a per-timestep softmax emits one tag distribution per
# position.
embd_size = 128     # width of the word embeddings
latendt_size = 64   # hidden size of both GRU layers
sentence_input = Input(shape=(sentence_maxlen,))
embd_sentence = Embedding(input_dim=raw_vocab_size, output_dim=embd_size)(sentence_input)
rnn_out = GRU(latendt_size)(embd_sentence)
repeated_in = RepeatVector(sentence_maxlen)(rnn_out)
rnn_out2 = GRU(latendt_size, return_sequences=True)(repeated_in)
pos_out = TimeDistributed(Dense(pos_vocab_size, activation='softmax'))(rnn_out2)
model = Model(sentence_input, pos_out)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

# from keras.models import Sequential
# from keras.layers.core import Activation, Dense, Dropout, RepeatVector, SpatialDropout1D
# model = Sequential()
# model.add(Embedding(raw_vocab_size, embd_size,input_length=sentence_maxlen))
# # model.add(SpatialDropout1D(Dropout(0.2)))
# model.add(GRU(latendt_size, dropout=0.2, recurrent_dropout=0.2))
# model.add(RepeatVector(sentence_maxlen))
# model.add(GRU(latendt_size, return_sequences=True))
# model.add(TimeDistributed(Dense(len(pos_vocab))))
# model.add(Activation("softmax"))

# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 249)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 249, 128)          640256    
_________________________________________________________________
gru_3 (GRU)                  (None, 64)                37056     
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 249, 64)           0         
_________________________________________________________________
gru_4 (GRU)                  (None, 249, 64)           24768     
_________________________________________________________________
time_distributed_2 (TimeDist (None, 249, 46)           2990      
Total params: 705,070
Trainable params: 705,070
Non-trainable params: 0
_________________________________________________________________
None

In [12]:
# Train for 30 epochs in mini-batches of 64 samples, holding out 20% of the
# samples for validation.
model.fit(
    x=X,
    y=Y,
    batch_size=64,
    epochs=30,
    validation_split=0.2,
)

Train on 3131 samples, validate on 783 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f8b2ada3a20>