## Portuguese POS Tagging

In [2]:
import nltk
from keras.preprocessing.sequence import pad_sequences
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np

In [3]:
# Paths of the corpus split files (train / test / dev); they are relative,
# so they are resolved against the notebook's working directory.
file_train = 'data/macmorpho-train.txt'
file_test = 'data/macmorpho-test.txt'
file_val = 'data/macmorpho-dev.txt'

In [4]:
def pre_processing(fname, train=False):
    """Read a corpus file of ``word_TAG`` tokens into parallel word/tag lists.

    Every non-blank line of ``fname`` is one sentence. Tokens are separated
    by spaces and each token is a word joined to its tag by an underscore;
    the tag is whatever follows the *last* underscore, so words that contain
    underscores themselves are kept intact.

    Args:
        fname: Path of the corpus file to read.
        train: When true, the parsed sentences are handed to
            ``create_dict_to_numbers`` and its result is returned instead.

    Returns:
        ``(sentences_words, sentences_tags)``: two lists of lists aligned
        token by token. When ``train`` is true, the return value of
        ``create_dict_to_numbers(sentences_words, sentences_tags)``.

    Raises:
        ValueError: If a token has no underscore separating word and tag.
    """
    sentences_words = []
    sentences_tags = []

    # The encoding is pinned so the result does not depend on the platform
    # default. NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(fname, 'r', encoding='utf-8') as text:
        for line_no, line in enumerate(text, start=1):
            # Split on single spaces as before, but drop the empty strings
            # produced by repeated/trailing spaces and by blank lines.
            tokens = [tok for tok in line.rstrip('\r\n').split(' ') if tok]
            if not tokens:
                continue

            words_token = []
            tags_token = []
            for token in tokens:
                word, separator, tag = token.rpartition('_')
                if not separator:
                    raise ValueError(
                        f"{fname}:{line_no}: token {token!r} has no '_' "
                        "separating word and tag"
                    )
                words_token.append(word)
                tags_token.append(tag)
            sentences_words.append(words_token)
            sentences_tags.append(tags_token)

    if train:
        return create_dict_to_numbers(sentences_words, sentences_tags)

    return sentences_words, sentences_tags

def create_dict_to_numbers(sentences_words, sentences_tags):
    """Build the word -> id and tag -> id lookup tables.

    Words are lower-cased and numbered from 2 in order of first appearance;
    ids 0 and 1 are then assigned to the special entries '--padding--' and
    '--not-exist--'. Tags are numbered from 1 in order of first appearance,
    with id 0 assigned to '--padding--'.

    Args:
        sentences_words: List of sentences, each a list of word strings.
        sentences_tags: List of sentences, each a list of tag strings.

    Returns:
        ``(sentences_words, sentences_tags, word2number, tag2number)``; the
        two input lists are passed through unchanged.
    """
    first_word_id = 2
    first_tag_id = 1

    word2number = {}
    for sentence in sentences_words:
        for token in sentence:
            # Each new key grows the dict by one, so the current size plus
            # the starting offset is always the next unused id.
            word2number.setdefault(token.lower(), len(word2number) + first_word_id)
    word2number['--padding--'] = 0
    word2number['--not-exist--'] = 1

    tag2number = {}
    for sentence in sentences_tags:
        for label in sentence:
            tag2number.setdefault(label, len(tag2number) + first_tag_id)
    tag2number['--padding--'] = 0

    return sentences_words, sentences_tags, word2number, tag2number

def one_hot_encoding_tags(sentences_tags, tag2number):
    """Turn every tag into a one-hot vector of length ``len(tag2number)``.

    Args:
        sentences_tags: List of sentences, each a list of tag strings.
        tag2number: Mapping from tag string to its integer index.

    Returns:
        A list with one entry per sentence; each entry is a list of NumPy
        vectors (one per tag) that are zero everywhere except for a 1 at
        the tag's index.

    Raises:
        KeyError: If a tag is missing from ``tag2number``.
    """
    n_classes = len(tag2number)

    def encode(tag):
        vector = np.zeros(n_classes)
        vector[tag2number[tag]] = 1
        return vector

    return [[encode(tag) for tag in sentence] for sentence in sentences_tags]

def convert_to_numbers(sentences_words, sentences_tags, word2number, tag2number):
    """Encode sentences as word-id lists and tags as one-hot vectors.

    Words are looked up lower-cased in ``word2number``; a word that is not
    in the dictionary is replaced by the id stored under '--not-exist--'.
    Only the missing-key case is handled: any other error (for example a
    token that is not a string) propagates instead of being silently
    mapped to the unknown-word id.

    Args:
        sentences_words: List of sentences, each a list of word strings.
        sentences_tags: List of sentences, each a list of tag strings.
        word2number: Mapping from lower-cased word to integer id.
        tag2number: Mapping from tag string to integer index.

    Returns:
        ``(sentences_X, sentences_Y)`` where ``sentences_X`` holds one list
        of word ids per sentence and ``sentences_Y`` is the result of
        ``one_hot_encoding_tags(sentences_tags, tag2number)``.
    """
    sentences_X = []

    for sentence in sentences_words:
        word_ids = []
        for w in sentence:
            key = w.lower()
            if key in word2number:
                word_ids.append(word2number[key])
            else:
                # Out-of-vocabulary word: fall back to the reserved id.
                word_ids.append(word2number['--not-exist--'])
        sentences_X.append(word_ids)

    return sentences_X, one_hot_encoding_tags(sentences_tags, tag2number)



In [5]:
# Parse the training split; train=True additionally builds the word/tag -> id
# dictionaries from it, and those dictionaries are reused for every split below.
train_words, train_tags, word2number, tag2number = pre_processing(file_train, train=True)
train_X, train_Y = convert_to_numbers(train_words, train_tags, word2number, tag2number)

print('End Pre-Processing Train')
# Test and validation splits are encoded with the training dictionaries, so
# words not seen during training map to the '--not-exist--' id.
test_words, test_tags = pre_processing(file_test)
test_X, test_Y = convert_to_numbers(test_words, test_tags, word2number, tag2number)
print('End Pre-Processing Test')
val_words, val_tags = pre_processing(file_val)
val_X, val_Y = convert_to_numbers(val_words, val_tags, word2number, tag2number)
print('End Pre-Processing Val')

End Pre-Processing Train
End Pre-Processing Test
End Pre-Processing Val


In [7]:
# Token count of the longest training sentence; used as the padded length.
MAX_LENGTH = max(len(sentence) for sentence in train_X)
print(MAX_LENGTH)

190


In [6]:
def padding_sequences(sequences_words, sequences_tags, MAX_LENGTH):
    """Pad word-id and one-hot tag sequences to ``MAX_LENGTH`` steps.

    Both inputs go through ``pad_sequences`` with zeros appended at the end
    of each sequence. A padded tag step is an all-zero vector, which is not
    a valid one-hot target, so every all-zero step gets a 1 at index 0 --
    the index ``create_dict_to_numbers`` assigns to '--padding--'.

    Args:
        sequences_words: One list of word ids per sentence.
        sequences_tags: One list of one-hot tag vectors per sentence.
        MAX_LENGTH: Number of steps every returned sequence has.

    Returns:
        ``(sequences_words, sequences_tags)`` as the arrays returned by
        ``pad_sequences``, with padded tag steps marked as described.
    """
    # NOTE(review): sequences longer than MAX_LENGTH are shortened by
    # pad_sequences using its default truncation side -- confirm that the
    # default is acceptable for the test/validation splits.
    sequences_words = pad_sequences(sequences_words, maxlen=MAX_LENGTH, padding='post')
    sequences_tags = pad_sequences(sequences_tags, maxlen=MAX_LENGTH, padding='post')

    # One vectorised pass instead of a Python loop over every time step:
    # find the steps whose tag vector has no non-zero entry and set class 0.
    is_padding_step = ~sequences_tags.any(axis=-1)
    sequences_tags[is_padding_step, 0] = 1

    return sequences_words, sequences_tags

In [8]:
# Pad the training data to MAX_LENGTH; the names are rebound to the padded results.
train_X, train_Y = padding_sequences(train_X, train_Y, MAX_LENGTH)

In [11]:
# Pad the test data to the same MAX_LENGTH that was derived from the training set.
test_X, test_Y = padding_sequences(test_X, test_Y, MAX_LENGTH)

In [13]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
from keras import backend as K

def single(INTERESTING_CLASS_ID):
    """Create a Keras metric that scores predictions of one class.

    The returned function takes ``(y_true, y_pred)``, reduces both to class
    ids with an argmax over the last axis, and returns the number of
    positions predicted as ``INTERESTING_CLASS_ID`` that match the true
    class, divided by the number of positions predicted as that class.
    The divisor is clamped to at least 1, so the result is 0 when the
    class is never predicted.

    Args:
        INTERESTING_CLASS_ID: Integer class index the metric is restricted to.

    Returns:
        The metric function, named ``single_class_accuracy``.
    """
    def single_class_accuracy(y_true, y_pred):
        true_ids = K.argmax(y_true, axis=-1)
        pred_ids = K.argmax(y_pred, axis=-1)
        # The selection below is on the predicted ids; selecting on true_ids
        # instead would turn this into recall for the class.
        predicted_as_class = K.cast(K.equal(pred_ids, INTERESTING_CLASS_ID), 'int32')
        is_correct = K.cast(K.equal(true_ids, pred_ids), 'int32')
        n_hits = K.sum(is_correct * predicted_as_class)
        n_predicted = K.maximum(K.sum(predicted_as_class), 1)
        return n_hits / n_predicted
    return single_class_accuracy

 

# Sequence tagger: word ids -> 128-d embeddings -> bidirectional LSTM that
# emits an output at every step -> per-step dense layer with one unit per
# tag -> softmax over the tags.
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2number), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2number))))
model.add(Activation('softmax'))

# One per-class metric for every tag index, derived from the tag dictionary
# so the list always matches the width of the output layer instead of
# relying on a hand-written run of class ids.
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'] + [single(class_id) for class_id in range(len(tag2number))])

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 190, 128)          6046464   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 190, 512)          788480    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 190, 27)           13851     
_________________________________________________________________
activation_1 (Activation)    (None, 190, 27)           0         
Total params: 6,848,795
Trainable params: 6,848,795
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Train for a single epoch in batches of 128; validation_split=0 holds out
# none of the training data for validation.
# NOTE(review): val_X / val_Y are built earlier but are not passed here in the
# visible cells -- confirm whether validation during training was intended.
model.fit(train_X, train_Y, batch_size=128, epochs=1, validation_split=0)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/1


<keras.callbacks.History at 0x7ffad0e4f8d0>

In [16]:
# Evaluate the trained model on the padded test split and keep the results.
scores = model.evaluate(test_X, test_Y)



In [18]:
# Display the evaluation results.
# NOTE(review): presumably the order is loss, accuracy, then one value per
# single(...) metric in the order given to compile -- confirm against
# model.metrics_names.
scores

[0.03487198840991995,
 0.9896428521252184,
 1.0,
 0.868517440912023,
 0.8502589772370378,
 0.9122992847331929,
 0.3732852708521077,
 0.7632375010398152,
 0.9423193529833024,
 0.7505708996938794,
 0.9995071591144277,
 0.942215660694664,
 0.8181098920660352,
 0.7470542458365096,
 0.9779134634227807,
 0.9439283291330504,
 0.5851208868716193,
 0.7197817124177733,
 0.800885547415469,
 0.9477818363473742,
 0.48436300524014547,
 0.7583916233961293,
 0.7244543447607432,
 0.0,
 0.15379993992189847,
 0.0,
 0.01922499249023731,
 0.0,
 0.0]

In [17]:
# Print the second entry of `scores` as a percentage, labelled with the
# matching name from model.metrics_names.
print(f"{model.metrics_names[1]}: {scores[1] * 100}")

acc: 98.96428521252184


In [65]:
# NOTE(review): the output recorded for this cell has two entries, while the
# `scores` list displayed earlier has 29 -- it appears to come from a different
# run or model; re-run the notebook top to bottom to confirm.
print(scores)

[0.019485465943363096, 0.08829794530188391]
