<a href="https://colab.research.google.com/github/joaoBernardinoo/formas-research/blob/main/atividade_01_formas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pickle
import nltk
import pandas as pd
import numpy as np
import spacy
import spacy.cli
from sklearn.model_selection import train_test_split

In [9]:
!pip install conllu
!wget http://marlovss.work.gd:8080/tomorrow/aula2/bosque.conllu

--2024-10-25 01:34:35--  http://marlovss.work.gd:8080/tomorrow/aula2/bosque.conllu
Resolving marlovss.work.gd (marlovss.work.gd)... 177.180.148.12
Connecting to marlovss.work.gd (marlovss.work.gd)|177.180.148.12|:8080... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11291250 (11M)
Saving to: ‘bosque.conllu.2’


2024-10-25 01:34:38 (4.40 MB/s) - ‘bosque.conllu.2’ saved [11291250/11291250]



In [10]:
import conllu
import itertools as it

class AttributeDict(dict):
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

class CoNLLU:
   def __init__(self, files):
      self.words = []
      self.sentences = []
      for f in files:
         parsed = conllu.parse(open(f).read())
         sents = [[AttributeDict(form = token['form'], lemma=token['lemma'],pos=token['upos'],feats=token['feats']) for token in tokenlist if token['upos']!='_'] for tokenlist in parsed]
         self.sentences.extend(sents)
         self.words.extend([word for sent in sents for word in sent])
      self.pos_tags = set([word.pos for word in self.words])
      self.feats_dict ={pos:set(it.chain.from_iterable([list(word.feats.keys()) for word in self.words if word.pos==pos and word.feats!= None])) for pos in self.pos_tags}


In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
bosque = CoNLLU(files=["bosque.conllu"])

In [47]:
# aqui train_data, patch_data e test_data são o corpus "bosque" particionado por suas sentenćas, não palavras
# deve-se verificar se todas as partićões abrangem todas as "universal pos tags"

train_data, temp_data = train_test_split(bosque.sentences, test_size=0.1, random_state=42) # 90% train, 10% temp
patch_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42) # Split the 10% into 5% patch and 5% test

print(f"Training data size: {len(train_data)}")
print(f"Patch data size: {len(patch_data)}")
print(f"Test data size: {len(test_data)}")

Training data size: 6316
Patch data size: 351
Test data size: 351


In [48]:
train_data[0]

[{'form': 'Afirmou',
  'lemma': 'afirmar',
  'pos': 'VERB',
  'feats': {'Mood': 'Ind',
   'Number': 'Sing',
   'Person': '3',
   'Tense': 'Past',
   'VerbForm': 'Fin'}},
 {'form': 'que', 'lemma': 'que', 'pos': 'SCONJ', 'feats': None},
 {'form': 'o',
  'lemma': 'o',
  'pos': 'DET',
  'feats': {'Definite': 'Def',
   'Gender': 'Masc',
   'Number': 'Sing',
   'PronType': 'Art'}},
 {'form': 'conjunto',
  'lemma': 'conjunto',
  'pos': 'NOUN',
  'feats': {'Gender': 'Masc', 'Number': 'Sing'}},
 {'form': 'de', 'lemma': 'de', 'pos': 'ADP', 'feats': None},
 {'form': 'fatos',
  'lemma': 'fato',
  'pos': 'NOUN',
  'feats': {'Gender': 'Masc', 'Number': 'Plur'}},
 {'form': ',', 'lemma': ',', 'pos': 'PUNCT', 'feats': None},
 {'form': 'em', 'lemma': 'em', 'pos': 'ADP', 'feats': None},
 {'form': 'princípio',
  'lemma': 'princípio',
  'pos': 'NOUN',
  'feats': {'Gender': 'Masc', 'Number': 'Sing'}},
 {'form': ',', 'lemma': ',', 'pos': 'PUNCT', 'feats': None},
 {'form': 'aponta',
  'lemma': 'apontar',
  'p

In [55]:
train_words = [word for sentence in train_data for word in sentence]

In [59]:
from nltk.probability import FreqDist
suffixes = set([word.form.lower()[-3:] for word in train_words])

In [62]:
# O artigo utiliza 3 ultimos caracteres do token do corpus anotado na lingua inglesa,
# hipótese ( precisa testar ): a língua portuguesa é mais verbosa, seria 3 caracteres o suficiente
# para, por exemplo, contemplar todas as conjugaćões verbais??

try:
    with open('/content/drive/MyDrive/Colab Notebooks/suf_to_tag.pkl', 'rb') as f:
        suf_to_tag = pickle.load(f)
except FileNotFoundError:
    print("Arquivo nao encontrado, extraindo os sufixos...")
    suf_to_tag = {suf: FreqDist([word.pos for word in train_words if word.form.lower()[-3:] == suf]).max() for suf in suffixes}

    with open('/content/drive/MyDrive/Colab Notebooks/suf_to_tag.pkl', 'wb') as f:
        pickle.dump(suf_to_tag, f)

Arquivo nao encontrado, extraindo os sufixos...


In [64]:
rules = {
    'ADJ': [],
    'ADP': [],
    'ADV': [],
    'AUX': [],
    'CCONJ': [],
    'DET': [],
    'INTJ': [],
    'NOUN': [],
    'NUM': [],
    'PART': [],
    'PRON': [],
    'PROPN': [],
    'PUNCT': [],
    'SCONJ': [],
    'SYM': [],
    'VERB': [],
    'X': []
}

df = pd.DataFrame(list(rules.items()), columns=['pos_tag', 'token'])
df['token'] = df['token'].apply(set)


In [65]:
df.head()

Unnamed: 0,pos_tag,token
0,ADJ,{}
1,ADP,{}
2,ADV,{}
3,AUX,{}
4,CCONJ,{}


In [91]:
train_sents = [[word.form for word in sent] for sent in train_data]
patch_sents = [[word.form for word in sent] for sent in patch_data]
patch_gold = [[(word.form.lower(),word.pos) for word in sent] for sent in patch_data]
test_sents  = [[(word.form.lower(),word.pos) for word in sent] for sent in test_data]

In [99]:
# primeiro vamos etiquetar o patch
# quantificando as vezes ao invés de etiquetar tag b, etiquetou tag a
# < tagA, tagB, number >

def lexic_tag(tokens):
  tagged = []
  for token in tokens:
    if token.lower()[-3:] in suffixes:
       tagged.append((token,suf_to_tag[token.lower()[-3:]]))
    else:
       tagged.append((token,"_"))
  return tagged

In [66]:
def tag(tokens):
  tagged = []
  for token in tokens:
    if token.lower()[-3:] in suffixes:
       tagged.append((token,suf_to_tag[token.lower()[-3:]]))
    else:
       tagged.append((token,"_"))
  return tagged

In [19]:
def accuracy(predicted,gold):

   acertos = len([predicted[i][j][1] for i in range(len(gold)) for j in range(len(gold[i])) if predicted[i][j][1]==gold[i][j][1]])
   totais = sum([len(sent) for sent in gold])
   return acertos/totais

def abrangencia(predicted,gold):
  tagged_tokens = 0

  for sent in predicted:
    for _, predicted_tag in sent:
      if predicted_tag != "_":
        tagged_tokens += 1
  total_tokens = 0

  for sent in gold:
    for _, gold_tag in sent:
      if gold_tag != "_":
        total_tokens += 1
  return tagged_tokens / total_tokens

def F(predicted,gold):
  return 2 * (abrangencia(predicted,gold) * accuracy(predicted,gold)) / (abrangencia(predicted,gold) + accuracy(predicted,gold))

In [20]:
!wget http://marlovss.work.gd:8080/tomorrow/aula2/test.conllu

--2024-10-25 01:35:11--  http://marlovss.work.gd:8080/tomorrow/aula2/test.conllu
Resolving marlovss.work.gd (marlovss.work.gd)... 177.180.148.12
Connecting to marlovss.work.gd (marlovss.work.gd)|177.180.148.12|:8080... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1819980 (1.7M)
Saving to: ‘test.conllu’


2024-10-25 01:35:13 (1.40 MB/s) - ‘test.conllu’ saved [1819980/1819980]



In [21]:
test = CoNLLU(files=["test.conllu"])
test_sents = [[word.form for word in sent] for sent in test.sentences]
gold = [[(word.form.lower(),word.pos) for word in sent] for sent in test.sentences]
predicted = [tag(sent) for sent in test_sents]

In [22]:
def validate(train,test):
  gold = [[(word.form.lower(),word.pos) for word in sent] for sent in test]
  predicted = [tag(sent) for sent in train]
  return {
        'accuracy': accuracy(predicted,gold),
        'coverage': abrangencia(predicted,gold),
        "F" : F(predicted,gold)
}

In [23]:
def humanTaggingHelper():
    tagged_sent = next((sent for sent in predicted if any(tag == "_" for _, tag in sent)), None)
    if tagged_sent:
        print(tagged_sent)
        tokens = [token for token, _ in tagged_sent]
        doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
        for token, (text, tag) in zip(doc, tagged_sent):
            if tag != "_":
                token.pos_ = tag
            else:
                token.tag_ = tag
        displacy.render(doc, style='dep', jupyter=True, options={'distance': 70})

In [24]:
def add_rule_to_df(df, category, token):
    token_lower = token.lower()
    idx = df[df['pos_tag'] == category].index[0]
    df.iloc[idx,1].add(token_lower)
    print(f"'{token_lower}' adicionado à regra {category}")

In [25]:
from spacy import displacy
categories = df.iloc[:,0]

def humanTaggingHelper():
    tagged_sent = next((sent for sent in predicted if any(tag == "_" for _, tag in sent)), None)
    if tagged_sent:
        print(tagged_sent)
        tokens = [token for token, _ in tagged_sent]
        doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
        for token, (text, tag) in zip(doc, tagged_sent):
            if tag != "_":
                token.pos_ = tag
            else:
                token.tag_ = tag
        displacy.render(doc, style='dep', jupyter=True, options={'distance': 70})

        print("Mapeamento de Categorias:")
        for i, category in enumerate(categories):
            print(f"{i} - {category}")

        for token, (text, tag) in zip(doc, tagged_sent):
            if tag == "_":
                new_tag = input(f"Qual é a categoria de '{text}'? (Q para sair) ").strip().upper()
                if new_tag == "Q":
                    print("Encerrando o programa.")
                    return
                if new_tag.isdigit() and int(new_tag) < len(categories):
                    category = categories[int(new_tag)]
                    add_rule_to_df(df, category, text)
                else:
                    print(f"Entrada inválida ou categoria não encontrada.")

In [26]:
predicted[0]

[('Folha', 'NOUN'),
 ('--', 'PUNCT'),
 ('Como', 'ADP'),
 ('você', 'PRON'),
 ('recebeu', 'VERB'),
 ('a', 'DET'),
 ('notícia', 'NOUN'),
 ('de', 'ADP'),
 ('que', 'PRON'),
 ('seria', 'NOUN'),
 ('substituído', 'VERB'),
 ('?', 'PUNCT')]

In [27]:
humanTaggingHelper()

[('A', 'DET'), ('reivindicação', 'NOUN'), ('é', 'AUX'), ('de', 'ADP'), ('equiparação', 'NOUN'), ('salarial', 'ADJ'), ('com', 'ADP'), ('a', 'DET'), ('Polícia', 'NOUN'), ('Civil', 'ADJ'), ('de', 'ADP'), ('o', 'DET'), ('DF', '_'), (',', 'PUNCT'), ('o', 'DET'), ('que', 'PRON'), ('representaria', 'NOUN'), ('um', 'DET'), ('reajuste', 'DET'), ('de', 'ADP'), ('300', 'NUM'), ('%', 'SYM'), ('.', 'PUNCT')]


NameError: name 'nlp' is not defined

In [None]:
df

In [None]:
for sent in predicted:
    if any(predicted_tag == "_" for _,predicted_tag in sent):
      print(str(sent))
      break

In [None]:
test_sents[0]
sent = ""
for word in test_sents[0]:
  sent += word + " "
print(sent)

## Fazendo nosso lematizador e analisador de flexões

O lematizador e a análise flexional podem ser realizados em conuunto, uma vez que determinar as flexões nos permitem "desfazê-las", i.e. obter uma versão "normalizada" do item lexical