## Portuguese POS Tagging

In [1]:
import nltk
import numpy as np
import os

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
from keras import backend as K
from tensorflow.keras.metrics import Precision

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Corpus splits used by main(): training, held-out test and validation (the "dev" file).
# The paths are relative, so they resolve against the notebook's working directory.
file_train = 'data/macmorpho-train.txt'
file_test = 'data/macmorpho-test.txt'
file_val = 'data/macmorpho-dev.txt'

In [3]:
def pre_processing(fname, train=False):
    """Read a tagged corpus file into parallel lists of words and tags.

    Every non-blank line is one sentence made of space-separated ``word_TAG``
    tokens. The tag is whatever follows the *last* underscore, so a word that
    itself contains ``_`` keeps its full spelling.

    Args:
        fname: Path of the corpus file.
        train: When True, the parsed sentences are handed to
            ``create_dict_to_numbers`` and its result is returned instead.

    Returns:
        ``(sentences_words, sentences_tags)`` -- two lists of lists of strings,
        aligned sentence by sentence and token by token. With ``train=True``,
        the 4-tuple produced by ``create_dict_to_numbers``.

    Raises:
        ValueError: If a token contains no ``_`` separator.
    """
    sentences_words = []
    sentences_tags = []

    # Explicit encoding so accented characters do not depend on the platform
    # default. NOTE(review): UTF-8 is assumed for the corpus files -- confirm.
    with open(fname, 'r', encoding='utf-8') as text:
        for line_number, line in enumerate(text, start=1):
            # Drop empty tokens produced by blank lines or repeated spaces.
            tokens = [tok for tok in line.rstrip('\r\n').split(' ') if tok]
            if not tokens:
                continue

            words_token = []
            tags_token = []
            for token in tokens:
                word, separator, tag = token.rpartition('_')
                if not separator:
                    raise ValueError(
                        'token {!r} on line {} of {} has no "_" separator'.format(
                            token, line_number, fname))
                words_token.append(word)
                tags_token.append(tag)
            sentences_words.append(words_token)
            sentences_tags.append(tags_token)

    if train:
        return create_dict_to_numbers(sentences_words, sentences_tags)

    return sentences_words, sentences_tags

def create_dict_to_numbers(sentences_words, sentences_tags):
    """Build the word and tag vocabularies from the training sentences.

    Words are lower-cased and numbered from 2 in order of first appearance;
    ids 0 and 1 are reserved for ``'--padding--'`` and ``'--not-exist--'``.
    Tags are numbered from 1 in order of first appearance, with 0 reserved
    for ``'--padding--'``.

    Args:
        sentences_words: List of sentences, each a list of word strings.
        sentences_tags: List of sentences, each a list of tag strings.

    Returns:
        ``(sentences_words, sentences_tags, word2number, tag2number)`` -- the
        two inputs unchanged, followed by the two vocab dictionaries.
    """
    word2number = {}
    for sentence in sentences_words:
        for token in sentence:
            # setdefault leaves already-known words alone; the dict size plus
            # the two reserved slots is the next free id.
            word2number.setdefault(token.lower(), len(word2number) + 2)
    word2number.update({'--padding--': 0, '--not-exist--': 1})

    tag2number = {}
    for sentence in sentences_tags:
        for label in sentence:
            tag2number.setdefault(label, len(tag2number) + 1)
    tag2number['--padding--'] = 0

    return sentences_words, sentences_tags, word2number, tag2number

def convert_to_numbers(sentences_words, sentences_tags, word2number, tag2number):
    """Translate word and tag strings into their integer ids.

    Args:
        sentences_words: List of sentences, each a list of word strings.
        sentences_tags: List of sentences, each a list of tag strings.
        word2number: Vocabulary mapping lower-cased words to ids; must contain
            ``'--not-exist--'`` whenever an out-of-vocabulary word occurs.
        tag2number: Mapping from tag string to id.

    Returns:
        ``(sentences_X, sentences_Y)`` -- lists of lists of ints, shaped like
        the inputs.

    Raises:
        KeyError: If a tag is missing from ``tag2number``.
    """
    def word_id(word):
        # Only a vocabulary miss falls back to the unknown-word id; any other
        # failure (e.g. a non-string token) is a data problem and must surface.
        try:
            return word2number[word.lower()]
        except KeyError:
            return word2number['--not-exist--']

    sentences_X = [[word_id(w) for w in s] for s in sentences_words]
    sentences_Y = [[tag2number[t] for t in s] for s in sentences_tags]

    return sentences_X, sentences_Y

def one_hot_encoding_tags(sentences_tags, tag2number):
    """One-hot encode every tag id of every sentence.

    Args:
        sentences_tags: Iterable of sentences, each an iterable of integer
            tag ids.
        tag2number: Tag vocabulary; only its size is used, as the length of
            each one-hot vector.

    Returns:
        A list (one entry per sentence) of lists (one entry per tag) of 1-D
        numpy arrays holding a single 1 at the tag's index.
    """
    def as_one_hot(tag_id):
        vector = np.zeros(len(tag2number))
        vector[tag_id] = 1
        return vector

    return [[as_one_hot(tag_id) for tag_id in sentence]
            for sentence in sentences_tags]

def padding_sequences(sequences_words, sequences_tags, MAX_LENGTH, tag2number):
    """Pad word/tag id sequences to a common length and one-hot encode the tags.

    Both inputs are padded at the end with 0 (the ``'--padding--'`` id) up to
    ``MAX_LENGTH`` positions.

    Args:
        sequences_words: List of sentences as lists of word ids.
        sequences_tags: List of sentences as lists of tag ids.
        MAX_LENGTH: Number of positions every sentence is padded to.
        tag2number: Tag vocabulary; its size is the one-hot depth.

    Returns:
        ``(sequences_words, sequences_tags)`` -- the padded word-id array from
        ``pad_sequences`` and a single numpy array of one-hot tag vectors with
        one row per sentence, one entry per position and one column per tag.
    """
    sequences_words = pad_sequences(sequences_words, maxlen=MAX_LENGTH, padding='post')
    sequences_tags = pad_sequences(sequences_tags, maxlen=MAX_LENGTH, padding='post')
    # Keras interprets a Python *list* passed as ``y`` as "one array per model
    # output", so the nested list from one_hot_encoding_tags has to be stacked
    # into one ndarray before it can be used as the target of fit()/evaluate().
    sequences_tags = np.asarray(one_hot_encoding_tags(sequences_tags, tag2number))
    return sequences_words, sequences_tags


def ignore_pad_acc(to_ignore=0):
    """Build a Keras metric: token accuracy that skips one gold class.

    By default the skipped class is 0, the padding id, so the metric reports
    accuracy over real tokens only.

    Args:
        to_ignore: Class id whose positions are excluded from the accuracy.

    Returns:
        A function ``(y_true, y_pred) -> scalar`` usable in ``metrics=[...]``.
        Both arguments are one-hot / probability tensors whose last axis is the
        class axis. The result is 0 when every position is ignored.
    """
    def ignore_padding_accuracy(y_true, y_pred):
        y_true_ids = K.argmax(y_true, axis=-1)
        y_pred_ids = K.argmax(y_pred, axis=-1)

        # The mask must come from the *gold* labels. Masking on predictions
        # would drop real tokens that the model mislabels as padding (inflating
        # the score) and keep padded positions predicted as a real tag.
        keep_mask = K.cast(K.not_equal(y_true_ids, to_ignore), 'int32')
        matches_without_padding = K.cast(K.equal(y_true_ids, y_pred_ids), 'int32') * keep_mask

        # Guard against division by zero when a batch holds only ignored positions.
        kept_positions = K.maximum(K.sum(keep_mask), 1)
        accuracy = (K.cast(K.sum(matches_without_padding), K.floatx())
                    / K.cast(kept_positions, K.floatx()))

        return accuracy

    return ignore_padding_accuracy

def precision(id_of_interest):
    """Build a Keras metric computing the precision of a single class.

    Args:
        id_of_interest: Class id the precision is measured for.

    Returns:
        A function ``(y_true, y_pred) -> scalar`` giving, among the positions
        predicted as ``id_of_interest``, the fraction whose gold label matches.
        The result is 0 when the class is never predicted.
    """
    def class_precision(y_true, y_pred):
        gold_ids = K.argmax(y_true, axis=-1)
        guessed_ids = K.argmax(y_pred, axis=-1)

        # Positions the model assigned to this class: tp + fp.
        predicted_as_class = K.cast(K.equal(guessed_ids, id_of_interest), 'int32')
        # Of those, the ones that are actually right: tp.
        hits = K.cast(K.equal(gold_ids, guessed_ids), 'int32') * predicted_as_class

        true_positives = K.sum(hits)
        retrieved = K.maximum(K.sum(predicted_as_class), 1)  # avoid 0 / 0
        return true_positives / retrieved

    return class_precision

def recall(id_of_interest):
    """Build a Keras metric computing the recall of a single class.

    Args:
        id_of_interest: Class id the recall is measured for.

    Returns:
        A function ``(y_true, y_pred) -> scalar`` giving, among the positions
        whose gold label is ``id_of_interest``, the fraction predicted
        correctly. The result is 0 when the class never occurs in ``y_true``.
    """
    def class_recall(y_true, y_pred):
        gold_ids = K.argmax(y_true, axis=-1)
        guessed_ids = K.argmax(y_pred, axis=-1)

        # Positions that truly belong to this class: tp + fn.
        belongs_to_class = K.cast(K.equal(gold_ids, id_of_interest), 'int32')
        # Of those, the ones the model got right: tp.
        hits = K.cast(K.equal(gold_ids, guessed_ids), 'int32') * belongs_to_class

        true_positives = K.sum(hits)
        relevant = K.maximum(K.sum(belongs_to_class), 1)  # avoid 0 / 0
        return true_positives / relevant

    return class_recall


def accuracy(id_of_interest):
    """Build a Keras metric computing the one-vs-rest accuracy of a class.

    For the chosen class every position is either a true positive (gold and
    prediction are both the class), a true negative (neither is the class), or
    an error. The metric is ``(tp + tn) / all positions``.

    A true negative does NOT require the prediction to equal the gold label:
    a token of class A predicted as class B is still a true negative for any
    third class C. Requiring equality makes the metric collapse to the overall
    accuracy, identical for every ``id_of_interest``.

    Args:
        id_of_interest: Class id the accuracy is measured for.

    Returns:
        A function ``(y_true, y_pred) -> scalar`` usable in ``metrics=[...]``.
    """
    def class_accuracy(y_true, y_pred):
        y_true_ids = K.argmax(y_true, axis=-1)
        y_preds_ids = K.argmax(y_pred, axis=-1)

        true_is_class = K.cast(K.equal(y_true_ids, id_of_interest), 'int32')
        pred_is_class = K.cast(K.equal(y_preds_ids, id_of_interest), 'int32')

        # Membership agrees -> tp (both 1) or tn (both 0); disagrees -> fp or fn.
        agree = K.cast(K.equal(true_is_class, pred_is_class), 'int32')
        disagree = K.cast(K.not_equal(true_is_class, pred_is_class), 'int32')

        tp_sum_tn = K.sum(agree)
        sum_all = K.maximum(K.sum(agree) + K.sum(disagree), 1)  # avoid 0 / 0

        return K.cast(tp_sum_tn, K.floatx()) / K.cast(sum_all, K.floatx())

    return class_accuracy

def create_model(MAX_LENGTH, tag2number, word2number):
    """Build and compile the bidirectional-LSTM tagger.

    Architecture: embedding (128 dims) -> BiLSTM (256 units per direction,
    one output per time step) -> per-time-step dense layer with one unit per
    tag -> softmax.

    Args:
        MAX_LENGTH: Number of time steps of the (padded) input sentences.
        tag2number: Tag vocabulary; its size sets the output layer width.
        word2number: Word vocabulary; its size sets the embedding input size.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # Metric layout: overall accuracy, padding-free accuracy, then per-class
    # accuracy for ids 0..26 followed by per-class precision for ids 0..26.
    # save_results indexes the evaluation scores by position, so this order
    # and count must not change. Per-class recall is available through
    # recall() but is not attached here.
    class_ids = range(27)
    tracked_metrics = ['accuracy', ignore_pad_acc(0)]
    tracked_metrics.extend(accuracy(class_id) for class_id in class_ids)
    tracked_metrics.extend(precision(class_id) for class_id in class_ids)

    layers = [
        InputLayer(input_shape=(MAX_LENGTH, )),
        Embedding(len(word2number), 128),
        Bidirectional(LSTM(256, return_sequences=True)),
        TimeDistributed(Dense(len(tag2number))),
        Activation('softmax'),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=tracked_metrics)

    return model

In [4]:
def get_key_from_value(d, val):
    """Return every key of ``d`` whose value equals ``val``, in iteration order.

    An empty list is returned when no value matches.
    """
    matching_keys = []
    for key, value in d.items():
        if value == val:
            matching_keys.append(key)
    return matching_keys

def save_results(scores, batch_size, epochs, tag2number=None):
    """Write the evaluation scores of one training run to CSV files.

    Three files are (re)written under ``results/``, all suffixed with
    ``_<batch_size>_<epochs>.csv``:

    * ``geral_acc``       -- loss and padding-free accuracy of the run;
    * ``acc_class``       -- one row per class with its accuracy;
    * ``precision_class`` -- one row per class with its precision.

    Args:
        scores: Sequence laid out as the metrics compiled in ``create_model``:
            index 0 is the loss, 1 the plain accuracy, 2 the padding-free
            accuracy, 3..29 the per-class accuracies for ids 0..26 and 30
            onwards the per-class precisions starting at id 0.
        batch_size: Batch size of the run (used in file names and rows).
        epochs: Number of epochs of the run (used in file names and rows).
        tag2number: Optional mapping from tag name to class id, used to label
            the per-class rows. When omitted, a module-level ``tag2number`` is
            used if one exists; classes without a known name are labelled with
            their numeric id.
    """
    if tag2number is None:
        # Earlier versions read a module-level ``tag2number``; keep honouring it.
        tag2number = globals().get('tag2number')

    # Invert the vocabulary once; the first tag wins if two share an id.
    number2tag = {}
    for tag, number in (tag2number or {}).items():
        number2tag.setdefault(number, tag)

    os.makedirs('results/', exist_ok=True)

    row_prefix = str(batch_size) + ',' + str(epochs) + ','
    file_suffix = '_' + str(batch_size) + '_' + str(epochs) + '.csv'

    with open('results/geral_acc' + file_suffix, 'w', encoding='utf-8') as f:
        f.write('batch_size,epochs,loss,acc\n')
        f.write(row_prefix + str(scores[0]) + ',' + str(scores[2]) + '\n')

    def write_class_file(stem, column, class_scores):
        # Header and rows go to the same per-run file, written in one pass.
        with open('results/' + stem + file_suffix, 'w', encoding='utf-8') as f:
            f.write('batch_size,epochs,tag,' + column + '\n')
            for class_id, score in enumerate(class_scores):
                label = number2tag.get(class_id, class_id)
                f.write(row_prefix + str(label) + ',' + str(score) + '\n')

    write_class_file('acc_class', 'accuracy', scores[3:30])
    write_class_file('precision_class', 'precision', scores[30:])

In [5]:
def main():
    """Run the whole experiment: load data, train and evaluate a grid of runs.

    Steps:
      1. Parse the train/test/validation files and convert words and tags to
         ids using the vocabularies built from the training split.
      2. Pad every sentence to the longest sentence of the three splits and
         one-hot encode the tags.
      3. For every (epochs, batch size) combination, build a fresh model,
         train it, evaluate it on the test split and save the scores.
    """
    train_words, train_tags, word2number, tag2number = pre_processing(file_train, train=True)
    train_X, train_Y = convert_to_numbers(train_words, train_tags, word2number, tag2number)

    test_words, test_tags = pre_processing(file_test)
    test_X, test_Y = convert_to_numbers(test_words, test_tags, word2number, tag2number)

    val_words, val_tags = pre_processing(file_val)
    val_X, val_Y = convert_to_numbers(val_words, val_tags, word2number, tag2number)

    print('Pre-Processing Done.')

    MAX_LENGTH = max(len(sentence)
                     for split in (train_X, val_X, test_X)
                     for sentence in split)
    print('Max Length of sentences: {}'.format(MAX_LENGTH))

    train_X, train_Y = padding_sequences(train_X, train_Y, MAX_LENGTH, tag2number)
    test_X, test_Y = padding_sequences(test_X, test_Y, MAX_LENGTH, tag2number)
    val_X, val_Y = padding_sequences(val_X, val_Y, MAX_LENGTH, tag2number)

    print('Padding Done.')

    batch_sizes = [64, 128, 256]  # TODO: values still to be verified
    epochs = [1, 5, 10]

    for e in epochs:
        for bs in batch_sizes:
            # Each configuration must start from untrained weights; reusing one
            # model would make every result include the training of all the
            # previous configurations. Clearing the session first releases the
            # graph state of the previous model.
            K.clear_session()
            model = create_model(MAX_LENGTH, tag2number, word2number)
            model.fit(train_X, train_Y, verbose=1, batch_size=bs, epochs=e,
                      validation_data=(val_X, val_Y))
            scores = model.evaluate(test_X, test_Y)
            # Record this run's own settings, not the whole grid.
            save_results(scores, bs, e)
    

In [6]:
# Run the full pipeline: pre-processing, padding, then train/evaluate every configuration.
main()

Pre-Processing Done.
Max Length of sentences: 248
Padding Done.


In [28]:
# NOTE(review): in the cells shown here `model`, `train_X`, `train_Y`, `val_X` and `val_Y`
# are only assigned inside main(), i.e. they are locals of that function. This cell
# therefore appears to depend on kernel state from cells that are not part of the
# notebook as saved -- confirm it survives Restart Kernel -> Run All.
model.fit(train_X, train_Y, batch_size=128, epochs=1, validation_data=(val_X, val_Y))

Train on 37948 samples, validate on 1997 samples
Epoch 1/1


<keras.callbacks.History at 0x7feb38b10590>

In [32]:
# Evaluate on the held-out test split.
# NOTE(review): `model`, `test_X` and `test_Y` are not defined at notebook level by any
# cell shown here (main() keeps them local) -- confirm where they are expected to come from.
scores = model.evaluate(test_X, test_Y)

