In [84]:
import tensorflow as tf
import numpy as np
import nltk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter, Iterable
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, AlphaDropout, Dropout, LSTM, Bidirectional, TimeDistributed, InputLayer, Embedding, Conv1D, Input, Flatten, concatenate

In [2]:
# Fetch the NLTK 'treebank' package (a no-op when it is already up to date).
nltk.download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/macbookpro/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [3]:
# Tagged sentences from the treebank corpus; each sentence is unpacked below
# as a sequence of (word, tag) pairs.
tagged_sentences = nltk.corpus.treebank.tagged_sents()

In [4]:
# Unzip every tagged sentence into a (words, tags) pair of tuples, then
# collect the words and the tags into two parallel lists.
split_pairs = [tuple(zip(*pairs)) for pairs in tagged_sentences]
sentences = [words for words, _ in split_pairs]
sentence_tags = [pos_tags for _, pos_tags in split_pairs]

In [7]:
# Hold out 20% of the sentences for evaluation; the fixed seed keeps the
# split identical between runs.
sent_train, sent_test, tag_train, tag_test = train_test_split(
    sentences,
    sentence_tags,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Frequency of every lowercased token in the training split.
vocab = Counter(word.lower() for sent in sent_train for word in sent)

In [11]:
# Keep only the words that occur more than five times in the training split.
filtered_vocab = {word for word, count in vocab.items() if count > 5}

In [12]:
len(filtered_vocab)

1679

In [13]:
# Ids 0 and 1 are reserved for padding and out-of-vocabulary words.
word2id = {'PAD': 0, 'UNK': 1}
# Enumerate in sorted order: iteration order of a set of strings changes
# between interpreter runs (hash randomisation), which would assign different
# ids after every kernel restart and make trained weights unusable.
for i, word in enumerate(sorted(filtered_vocab), start=2):
    word2id[word] = i

In [14]:
# Reverse lookup: integer id -> word.
id2word = {idx: token for token, idx in word2id.items()}

In [15]:
len(id2word)

1681

In [16]:
# 'UNK' and 'PAD' get fixed ids; every lowercased training tag receives the
# next free id the first time it is seen.
tag2id = {'UNK': 1, 'PAD': 0}
for lowered_tag in (t.lower() for tag_seq in tag_train for t in tag_seq):
    tag2id.setdefault(lowered_tag, len(tag2id))

In [17]:
# Reverse lookup: integer id -> tag.
id2tag = {idx: label for label, idx in tag2id.items()}

In [18]:
len(id2tag)

48

In [19]:
def data2ints(data, smth2id):
    """Convert every item of every sequence in ``data`` to an integer id.

    Each item is lowercased and looked up in ``smth2id``; items that are not
    in the mapping get the id stored under the ``'UNK'`` key.

    Returns a list holding one list of ids per input sequence.
    """
    def to_id(item):
        try:
            return smth2id[item.lower()]
        except KeyError:
            return smth2id['UNK']

    return [[to_id(item) for item in seq] for seq in data]

In [20]:
# Encode the words and the tags of both splits as integer ids.
X_train_ids = data2ints(sent_train, word2id)
X_test_ids = data2ints(sent_test, word2id)
y_train_ids = data2ints(tag_train, tag2id)
y_test_ids = data2ints(tag_test, tag2id)

In [21]:
# Length (in tokens) of the longest training sentence.
MAX_LEN = max(map(len, sent_train))

In [24]:
# Pad and, where necessary, truncate at the END of every sequence.
# pad_sequences truncates from the front by default, whereas char_padding keeps
# the first MAX_LEN words. MAX_LEN comes from the training split only, so a
# longer test sentence would otherwise have its word ids and tags shifted
# relative to its character tensor.
X_train = pad_sequences(X_train_ids, maxlen=MAX_LEN, padding='post', truncating='post')
X_test = pad_sequences(X_test_ids, maxlen=MAX_LEN, padding='post', truncating='post')
y_train_pad = pad_sequences(y_train_ids, maxlen=MAX_LEN, padding='post', truncating='post')
y_test_pad = pad_sequences(y_test_ids, maxlen=MAX_LEN, padding='post', truncating='post')

In [25]:
print(X_train.shape, y_train_pad.shape, X_test.shape, y_test_pad.shape)

(3131, 128) (3131, 128) (783, 128) (783, 128)


In [28]:
# One-hot encode the padded tag ids; one class per entry of tag2id.
y_train = to_categorical(y_train_pad, num_classes=len(tag2id))
y_test = to_categorical(y_test_pad, num_classes=len(tag2id))

In [30]:
print(y_train.shape, y_test.shape)

(3131, 128, 48) (783, 128, 48)


In [32]:
# Frequency of every lowercased character in the training split.
by_char = Counter(
    char.lower()
    for sent in sent_train
    for word in sent
    for char in word
)

In [33]:
len(by_char)

52

In [34]:
# Ids 0 and 1 are reserved; characters are numbered from 2 in the order in
# which they were first counted.
char2id = {'PAD': 0, 'UNK': 1}
char2id.update((char, idx) for idx, char in enumerate(by_char, start=2))

In [35]:
# Reverse lookup: integer id -> character.
id2char = {idx: symbol for symbol, idx in char2id.items()}

In [37]:
# A word is itself a sequence of characters, so running data2ints over one
# sentence yields a list of per-word character-id lists.
X_train_ids_char, X_test_ids_char = (
    [data2ints(sent, char2id) for sent in split]
    for split in (sent_train, sent_test)
)

In [38]:
# Longest word (in characters) of each training sentence, then the overall maximum.
longest_word_per_sentence = [max(map(len, sent)) for sent in X_train_ids_char]
MAX_LEN_CHAR = max(longest_word_per_sentence)

In [39]:
MAX_LEN_CHAR

24

In [42]:
def char_padding(data, MAX_LEN, MAX_LEN_CHAR):
    """Pack nested character ids into a zero-padded 3-D array.

    Parameters
    ----------
    data : iterable of sentences; each sentence is an iterable of words and
        each word an iterable of numeric character ids.
    MAX_LEN : number of word slots per sentence.
    MAX_LEN_CHAR : number of character slots per word.

    Returns
    -------
    numpy.ndarray of shape ``(len(data), MAX_LEN, MAX_LEN_CHAR)``. Unused
    slots stay 0. Words beyond ``MAX_LEN`` and characters beyond
    ``MAX_LEN_CHAR`` are dropped (truncation at the end).

    Raises
    ------
    ValueError / TypeError if a character id cannot be stored as a number.
    """
    char_pad = np.zeros((len(data), MAX_LEN, MAX_LEN_CHAR))
    for i, s in enumerate(data):
        for j, word in enumerate(s):
            if j >= MAX_LEN:
                # Explicit truncation instead of relying on a swallowed IndexError.
                break
            for k, char in enumerate(word):
                if k >= MAX_LEN_CHAR:
                    break
                # No blanket except: genuine errors (bad values, interrupts)
                # must surface rather than silently leave zeros behind.
                char_pad[i, j, k] = char
    return char_pad

In [47]:
# Character-level inputs: one MAX_LEN x MAX_LEN_CHAR grid of ids per sentence.
X_train_char = char_padding(
    data=X_train_ids_char, MAX_LEN=MAX_LEN, MAX_LEN_CHAR=MAX_LEN_CHAR
)
X_test_char = char_padding(
    data=X_test_ids_char, MAX_LEN=MAX_LEN, MAX_LEN_CHAR=MAX_LEN_CHAR
)

In [48]:
print(X_train_char.shape,X_train.shape, y_train.shape, X_test_char.shape, X_test.shape, y_test.shape)

(3131, 128, 24) (3131, 128) (3131, 128, 48) (783, 128, 24) (783, 128) (783, 128, 48)


In [67]:
# Word branch: word ids -> embeddings (id 0 is masked as padding) -> BiLSTM.
input_1 = Input(shape=(MAX_LEN,))
embeddings_1 = Embedding(len(word2id), 70, mask_zero=True)(input_1)
bilstm_1 = Bidirectional(LSTM(256, return_sequences=True))(embeddings_1)
drop_1 = Dropout(0.2)(bilstm_1)

# Character branch: per-word character ids -> embeddings -> 1-D convolution,
# flattened into one feature vector per word.
input_2 = Input(shape=(MAX_LEN, MAX_LEN_CHAR,))
embeddings_2 = TimeDistributed(Embedding(len(char2id), input_length=MAX_LEN_CHAR, output_dim=50))(input_2)
conv_1 = TimeDistributed(Conv1D(kernel_size=5, filters=40, strides=2))(embeddings_2)
flat_1 = TimeDistributed(Flatten())(conv_1)
drop_2 = AlphaDropout(0.2)(flat_1)

# Merge both views of every token and tag the sequence with a second BiLSTM.
concat = concatenate([drop_1, drop_2])
bilstm_2 = Bidirectional(LSTM(256, return_sequences=True))(concat)
# Each token carries exactly one tag, so the output must be a probability
# distribution over tags: softmax is the activation that matches
# categorical_crossentropy (sigmoid scores every tag independently).
outputs = TimeDistributed(Dense(len(tag2id), activation='softmax'))(bilstm_2)

model = tf.keras.Model(inputs=[input_1, input_2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           [(None, 128, 24)]    0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           [(None, 128)]        0                                            
__________________________________________________________________________________________________
time_distributed_19 (TimeDistri (None, 128, 24, 50)  2700        input_12[0][0]                   
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 128, 70)      117670      input_11[0][0]                   
____________________________________________________________________________________________

In [68]:
# Train on the word and character inputs jointly; the held-out split is used
# for validation after every epoch.
model.fit(
    x=[X_train, X_train_char],
    y=y_train,
    batch_size=128,
    epochs=32,
    validation_data=([X_test, X_test_char], y_test),
)

Train on 3131 samples, validate on 783 samples
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<tensorflow.python.keras.callbacks.History at 0x1549c9e50>

In [125]:
def predict_tag(sentence):
    """Predict a tag for every word of ``sentence``.

    Parameters
    ----------
    sentence : sequence of word strings.

    Returns
    -------
    list of ``(word, tag)`` tuples in sentence order. An empty sentence gives
    an empty list. Only the first ``MAX_LEN`` words are tagged; any further
    words are dropped from the result.
    """
    if len(sentence) == 0:
        return []

    word_ids = data2ints([sentence], word2id)
    # truncating='post' keeps the FIRST MAX_LEN words, the same ones
    # char_padding keeps; the pad_sequences default would keep the last
    # MAX_LEN words and misalign the two inputs for long sentences.
    words_padded = pad_sequences(word_ids, maxlen=MAX_LEN, padding='post', truncating='post')

    # Iterating a word string yields its characters, so data2ints over the
    # sentence gives one list of character ids per word.
    char_ids = [data2ints(sentence, char2id)]
    chars_padded = char_padding(char_ids, MAX_LEN, MAX_LEN_CHAR)

    probabilities = model.predict([words_padded, chars_padded])
    # Batch holds a single sentence; discard predictions for padding positions.
    tag_ids = np.argmax(probabilities, axis=2)[0, :len(sentence)].tolist()
    return [(word, id2tag[tag_id]) for word, tag_id in zip(sentence, tag_ids)]

In [126]:
predict_tag(sent_train[1])

[('Composer', 'nnp'),
 ('Marc', 'nnp'),
 ('Marder', 'nnp'),
 (',', ','),
 ('a', 'dt'),
 ('college', 'nn'),
 ('friend', 'nn'),
 ('of', 'in'),
 ('Mr.', 'nnp'),
 ('Lane', 'nnp'),
 ("'s", 'pos'),
 ('who', 'wp'),
 ('*T*-66', '-none-'),
 ('earns', 'vbz'),
 ('his', 'prp$'),
 ('living', 'nns'),
 ('*-1', '-none-'),
 ('playing', 'vbg'),
 ('the', 'dt'),
 ('double', 'jj'),
 ('bass', 'nn'),
 ('in', 'in'),
 ('classical', 'nnp'),
 ('music', 'nnp'),
 ('ensembles', 'nns'),
 (',', ','),
 ('has', 'vbz'),
 ('prepared', 'vbn'),
 ('an', 'dt'),
 ('exciting', 'nn'),
 (',', ','),
 ('eclectic', 'jj'),
 ('score', 'nn'),
 ('that', 'wdt'),
 ('*T*-67', '-none-'),
 ('tells', 'vbz'),
 ('you', 'prp'),
 ('what', 'wp'),
 ('the', 'dt'),
 ('characters', 'nns'),
 ('are', 'vbp'),
 ('thinking', 'vbg'),
 ('*T*-2', '-none-'),
 ('and', 'cc'),
 ('feeling', 'nns'),
 ('*T*-2', '-none-'),
 ('far', 'rb'),
 ('more', 'rbr'),
 ('precisely', 'jj'),
 ('than', 'in'),
 ('intertitles', 'nns'),
 (',', ','),
 ('or', 'cc'),
 ('even', 'rb'),
 (