## Portuguese POS Tagging

In [41]:
import nltk
from keras.preprocessing.sequence import pad_sequences
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [26]:
# Paths of the three corpus splits (train / test / validation), relative to
# the working directory. Presumably the Mac-Morpho corpus, judging by the
# file names -- confirm against the data folder.
file_train = 'data/macmorpho-train.txt'
file_test = 'data/macmorpho-test.txt'
file_val = 'data/macmorpho-dev.txt'

In [34]:
def pre_processing(fname, train=False):
    """Read a POS-tagged corpus file into parallel word and tag lists.

    Every non-empty line of the file is treated as one sentence made of
    space-separated ``word_TAG`` tokens.

    Args:
        fname: Path of the corpus file; it is decoded as UTF-8.
        train: When true, the parsed sentences are handed to
            ``create_dict_to_numbers`` and its result is returned instead.

    Returns:
        ``(sentences_words, sentences_tags)``: two lists of lists aligned
        token by token. When ``train`` is true, whatever
        ``create_dict_to_numbers`` returns for those two lists.

    Raises:
        ValueError: If a token contains no ``_`` separating word and tag.
    """
    sentences_words = []
    sentences_tags = []

    # Explicit encoding: the text is accented Portuguese, so relying on the
    # platform default can mis-decode it or fail outright.
    with open(fname, 'r', encoding='utf-8') as text:
        # Stream the file line by line instead of loading it whole.
        for line_no, line in enumerate(text, start=1):
            # Blank lines, trailing spaces and doubled spaces all produce
            # empty strings when splitting on ' '; drop them.
            tokens = [tok for tok in line.rstrip('\n').split(' ') if tok]
            if not tokens:
                continue

            words_token = []
            tags_token = []
            for token in tokens:
                # Split on the LAST underscore so that words which contain
                # '_' themselves keep their full text and correct tag.
                word, sep, tag = token.rpartition('_')
                if not sep:
                    raise ValueError(
                        f"{fname}:{line_no}: token {token!r} has no "
                        "'_' separating word and tag"
                    )
                words_token.append(word)
                tags_token.append(tag)
            sentences_words.append(words_token)
            sentences_tags.append(tags_token)

    if train:
        return create_dict_to_numbers(sentences_words, sentences_tags)

    return sentences_words, sentences_tags

def create_dict_to_numbers(sentences_words, sentences_tags):
    """Build the word -> id and tag -> id vocabularies from tokenised data.

    Words are lower-cased before being registered. Word ids start at 2
    because 0 is kept for ``'--padding--'`` and 1 for ``'--not-exist--'``;
    tag ids start at 1 because 0 is kept for ``'--padding--'``. Ids are
    handed out in order of first appearance.

    Args:
        sentences_words: List of sentences, each a list of word strings.
        sentences_tags: List of sentences, each a list of tag strings.

    Returns:
        ``(sentences_words, sentences_tags, word2number, tag2number)``; the
        two inputs are passed through untouched alongside the new dicts.
    """
    word2number = {}
    for sentence in sentences_words:
        for token in sentence:
            # Two ids are reserved, so the n-th distinct word (0-based)
            # receives n + 2; setdefault leaves known words alone.
            word2number.setdefault(token.lower(), len(word2number) + 2)
    word2number.update({'--padding--': 0, '--not-exist--': 1})

    tag2number = {}
    for sentence in sentences_tags:
        for label in sentence:
            # One id is reserved, so the n-th distinct tag receives n + 1.
            tag2number.setdefault(label, len(tag2number) + 1)
    tag2number['--padding--'] = 0

    return sentences_words, sentences_tags, word2number, tag2number

def convert_to_numbers(sentences_words, sentences_tags, word2number, tag2number):
    """Translate tokenised sentences and their tags into integer ids.

    Words are lower-cased before lookup; a word missing from
    ``word2number`` is mapped to the id stored under ``'--not-exist--'``.

    Args:
        sentences_words: List of sentences, each a list of word strings.
        sentences_tags: List of sentences, each a list of tag strings.
        word2number: Mapping from lower-cased word to id; must contain
            ``'--not-exist--'`` if any word may be unknown.
        tag2number: Mapping from tag to id.

    Returns:
        ``(sentences_X, sentences_Y)``: lists of lists of ids with the same
        nesting as the inputs.

    Raises:
        KeyError: If a tag is missing from ``tag2number`` (tags have no
            fallback id), or an unknown word is met while ``word2number``
            lacks ``'--not-exist--'``.
    """
    sentences_X = []
    for sentence in sentences_words:
        ids = []
        for word in sentence:
            key = word.lower()
            # Explicit membership test instead of a bare ``except:`` so that
            # only a missing vocabulary entry triggers the fallback; other
            # errors (e.g. a non-string token) are no longer hidden.
            if key in word2number:
                ids.append(word2number[key])
            else:
                ids.append(word2number['--not-exist--'])
        sentences_X.append(ids)

    sentences_Y = [[tag2number[tag] for tag in sentence]
                   for sentence in sentences_tags]

    return sentences_X, sentences_Y


In [35]:
# Parse the training split; train=True also builds the word/tag vocabularies,
# which are then reused unchanged for the test and validation splits.
train_words, train_tags, word2number, tag2number = pre_processing(file_train, train=True)
train_X, train_Y = convert_to_numbers(train_words, train_tags, word2number, tag2number)

print('End Pre-Processing Train')
# Test split: encoded with the training vocabularies, so words not present
# there are mapped to the '--not-exist--' id by convert_to_numbers.
test_words, test_tags = pre_processing(file_test)
test_X, test_Y = convert_to_numbers(test_words, test_tags, word2number, tag2number)
print('End Pre-Processing Test')
# Validation split: same treatment as the test split.
val_words, val_tags = pre_processing(file_val)
val_X, val_Y = convert_to_numbers(val_words, val_tags, word2number, tag2number)
print('End Pre-Processing Val')

End Pre-Processing Train
End Pre-Processing Test
End Pre-Processing Val
