In [161]:
# !pip install conllu
# !wget http://marlovss.work.gd:8080/tomorrow/aula2/bosque.conllu

"""
Implemente um classificador morfossintático através da estratégia de classificação de palavras. 
Você pode utilizar para tal implementação a classe ClassifierBasedTagger do NLTK, definindo uma 
função de representação (extração de features) e um classificador qualquer da API nltk.classify.
"""

import conllu
import itertools as it

class AttributeDict(dict):
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


class CoNLLU:
   def __init__(self, files):
      self.words = []
      self.sentences = []
      for f in files:
         parsed = conllu.parse(open(f, encoding="utf8").read())
         sents = [[AttributeDict(form = token['form'], lemma=token['lemma'],pos=token['upos'],feats=token['feats']) for token in tokenlist if token['upos']!='_'] for tokenlist in parsed]
         self.sentences.extend(sents)
         self.words.extend([word for sent in sents for word in sent])
      self.pos_tags = set([word.pos for word in self.words])
      self.feats_dict ={pos:set(it.chain.from_iterable([list(word.feats.keys()) for word in self.words if word.pos==pos and word.feats!= None])) for pos in self.pos_tags}

bosque = CoNLLU(files=["bosque.conllu"])

## Exemplo de função de representação.
**NOTE**: As funções de representação devem sempre receber três argumentos: uma lista de tokens, o índice do token sendo processado e um histórico das classificações já realizadas pelo tagger

In [162]:
# # Draft
# def repr(tokens,index,history):
#   feats ={}
  
#   feats['token'] = tokens[index][0].lower()
#   # feats['token_lower'] = tokens[index][0].lower()
#   feats['suffix'] = tokens[index][0].lower()[-5:]
#   # feats['suffix'] = tokens[index][0].lower()[len(stem):]
#   # feats['stem'] = stem

#   # feats['token_position'] = index
  
#   # feats['suffix1'] = tokens[index][0].lower()[-1:]
#   # feats['suffix2'] = tokens[index][0].lower()[-2:]
#   # feats['suffix3'] = tokens[index][0].lower()[-3:]
#   # feats['suffix4'] = tokens[index][0].lower()[-4:]
#   # feats['prefix1'] = tokens[index][0].lower()[:1]
#   # feats['prefix2'] = tokens[index][0].lower()[:2]
#   # feats['prefix3'] = tokens[index][0].lower()[:3]
#   # feats['prefix4'] = tokens[index][0].lower()[:4]

#   # feats['token_before1'] = tokens[index-1][0] if index > 0 else '<start>'
#   # feats['suf_before1'] = tokens[index-1][0].lower()[-4:] if index > 0 else '<start>'
#   feats['pos_before1'] = history[index-1][1] if index > 0 else '<start>'

#   # feats['token_before2'] = tokens[index-2][0] if index > 1 else '<start>'
#   # feats['suf_before2'] = tokens[index-2][0].lower()[-4:] if index > 1 else '<start>'
#   feats['pos_before2'] = history[index-2][1] if index > 1 else '<start>'

#   # feats['token_before3'] = tokens[index-3][0] if index > 2 else '<start>'
#   # feats['suf_before3'] = tokens[index-3][0].lower()[-4:] if index > 2 else '<start>'
#   # feats['pos_before3'] = history[index-3][1] if index > 2 else '<start>'

#   # feats['token_after'] = tokens[index+1][0] if index < len(tokens)-1 else '<end>'
#   feats['suf_after'] = tokens[index+1][0].lower()[-5:] if index < len(tokens)-1 else '<end>'

#   # feats['token_after2'] = tokens[index+2][0] if index < len(tokens)-2 else '<end>'
#   # feats['suf_after2'] = tokens[index+2][0].lower()[-4:] if index < len(tokens)-2 else '<end>'
  
#   # feats['token_after3'] = tokens[index+3][0] if index < len(tokens)-3 else '<end>'
#   # feats['suf_after3'] = tokens[index+3][0].lower()[-4:] if index < len(tokens)-3 else '<end>'
  
#   # feats['is_upper'] = tokens[index][0].isupper()
#   feats['is_punct'] = tokens[index][0] in ['.',',','!','?',';',':','(',')','[',']','{','}','"','`',"'",'-','_','/','\\','|','@','#','$','%','^','&','*','+','=','<','>','~']
#   feats['is_digit'] = tokens[index][0].isdigit()
#   feats['is_adp'] = tokens[index][0] in ["ante", "após", "até", "com", "contra", "de", "desde", "em", "entre", "para", "perante", "por", "sem", "sob", "sobre", "trás"]

#   return feats

In [163]:
# # 0.4419243666169896 
# # using decision tree

# def repr(tokens,index,history):
#   feats ={}
  
#   feats['token'] = tokens[index][0].lower()
#   feats['suffix'] = tokens[index][0].lower()[-4:]

#   feats['pos_before1'] = history[index-1][1] if index > 0 else '<start>'

#   feats['is_punct'] = tokens[index][0] in ['.',',','!','?',';',':','(',')','[',']','{','}','"','`',"'",'-','_','/','\\','|','@','#','$','%','^','&','*','+','=','<','>','~']
#   feats['is_digit'] = tokens[index][0].isdigit()
#   feats['is_adp'] = tokens[index][0] in ["ante", "após", "até", "com", "contra", "de", "desde", "em", "entre", "para", "perante", "por", "sem", "sob", "sobre", "trás"]

#   return feats

In [164]:
# # 0.4176485655737705
# def repr(tokens,index,history):
#   feats ={}
  
#   feats['token'] = tokens[index][0].lower()
#   feats['suffix'] = tokens[index][0].lower()[-4:]

#   feats['pos_before1'] = history[index-1][1] if index > 0 else '<start>'

#   feats['suf_after'] = tokens[index+1][0].lower()[-4:] if index < len(tokens)-1 else '<end>'

#   feats['is_punct'] = tokens[index][0] in ['.',',','!','?',';',':','(',')','[',']','{','}','"','`',"'",'-','_','/','\\','|','@','#','$','%','^','&','*','+','=','<','>','~']
#   feats['is_digit'] = tokens[index][0].isdigit()
#   feats['is_adp'] = tokens[index][0] in ["ante", "após", "até", "com", "contra", "de", "desde", "em", "entre", "para", "perante", "por", "sem", "sob", "sobre", "trás"]

#   return feats

In [165]:
# # 0.36805490871833085
# def repr(tokens,index,history):
#   feats ={}
  
#   feats['token'] = tokens[index][0].lower()
#   feats['suffix'] = tokens[index][0].lower()[-5:]

#   feats['pos_before1'] = history[index-1][1] if index > 0 else '<start>'
#   feats['pos_before2'] = history[index-2][1] if index > 1 else '<start>'

#   feats['suf_after'] = tokens[index+1][0].lower()[-5:] if index < len(tokens)-1 else '<end>'

#   feats['is_punct'] = tokens[index][0] in ['.',',','!','?',';',':','(',')','[',']','{','}','"','`',"'",'-','_','/','\\','|','@','#','$','%','^','&','*','+','=','<','>','~']
#   feats['is_digit'] = tokens[index][0].isdigit()
#   feats['is_adp'] = tokens[index][0] in ["ante", "após", "até", "com", "contra", "de", "desde", "em", "entre", "para", "perante", "por", "sem", "sob", "sobre", "trás"]

#   return feats

In [166]:

import nltk
stemmer = nltk.RSLPStemmer()

def repr(tokens,index,history):
  feats ={}

  stem = stemmer.stem(tokens[index][0].lower())
  
  feats['token'] = tokens[index][0].lower()
  feats['suffix'] = tokens[index][0].lower()[len(stem):]
  feats['token_length'] = len(tokens[index][0])

  # feats['suffix1'] = tokens[index][0].lower()[-1:]
  # feats['suffix2'] = tokens[index][0].lower()[-2:]
  # feats['suffix3'] = tokens[index][0].lower()[-3:]

  feats['pos_before1'] = history[index-1][1] if index > 0 else '<start>'
  feats['pos_before2'] = history[index-2][1] if index > 1 else '<start>'

  # feats['suf_after3'] = tokens[index+1][0].lower()[-3:] if index < len(tokens)-1 else '<end>'
  # feats['suf_after2'] = tokens[index+1][0].lower()[-2:] if index < len(tokens)-1 else '<end>'
  # feats['suf_after1'] = tokens[index+1][0].lower()[-1:] if index < len(tokens)-1 else '<end>'

  feats['suffix_after'] = tokens[index+1][0].lower()[len(stemmer.stem(tokens[index+1][0].lower())):] if index < len(tokens)-1 else '<end>'
  feats['token_after'] = tokens[index+1][0].lower() if index < len(tokens)-1 else '<end>'
  feats['token_length_after'] = len(tokens[index+1][0]) if index < len(tokens)-1 else '<end>'

  # feats['suf_after'] = tokens[index+1][0].lower()[-5:] if index < len(tokens)-1 else '<end>'

  # feats['is_punct'] = tokens[index][0] in ['.',',','!','?',';',':','(',')','[',']','{','}','"','`',"'",'-','_','/','\\','|','@','#','$','%','^','&','*','+','=','<','>','~']
  # feats['is_digit'] = tokens[index][0].isdigit()
  # feats['is_adp'] = tokens[index][0] in ["ante", "após", "até", "com", "contra", "de", "desde", "em", "entre", "para", "perante", "por", "sem", "sob", "sobre", "trás"]

  return feats

In [182]:
from nltk.tag.sequential import ClassifierBasedTagger
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from nltk.classify import NaiveBayesClassifier

tagged_sents = [[(word.form,word.pos) for word in sent] for sent in bosque.sentences]
# data = [(repr(sent, i, sent[:i]), sent[i][1]) for sent in tagged_sents for i in range(len(sent))]
# labels = [sent[i][1] for sent in tagged_sents for i in range(len(sent))]
# features = [list(repr(sent, i, sent[:i]).values()) for sent in tagged_sents for i in range(len(sent))]

train_X = [list(repr(sent, i, sent[:i]).values()) for sent in tagged_sents for i in range(len(sent))]
train_y = [sent[i][1] for sent in tagged_sents for i in range(len(sent))]

# data = [(list(repr(sent, i, sent[:i]).values()), sent[i][1]) for sent in tagged_sents for i in range(len(sent))]

In [183]:
train_x_np = np.array(train_X)
train_y_np = np.array(train_y)

In [184]:
clf = GaussianNB()
clf = clf.fit(train_x_np, train_y_np)

ValueError: dtype='numeric' is not compatible with arrays of bytes/strings.Convert your data to numeric values explicitly instead.

In [None]:
# Utilizando o classificador DecisionTree
# classifier = SklearnClassifier(DecisionTreeClassifier(criterion='entropy'))
# classifier = SklearnClassifier(KNeighborsClassifier(n_neighbors=3))
# classifier = SklearnClassifier(SVC(kernel='linear', C=1))

# classifier = classifier.train(data)
# tagger = ClassifierBasedTagger(feature_detector=repr, classifier = classifier)

In [None]:
# print(tagger.tag(["o","rato","roeu","a","roupa","do","rei","de","roma"]))
# print(tagger.accuracy(tagged_sents))

[('o', 'DET'), ('rato', 'PROPN'), ('roeu', 'PROPN'), ('a', 'ADP'), ('roupa', 'PROPN'), ('do', 'PUNCT'), ('rei', 'PROPN'), ('de', 'PUNCT'), ('roma', 'PUNCT')]
0.3224082526080477
