In [58]:
# Install/upgrade the packages this notebook depends on: seqeval provides
# classification_report, sklearn_crfsuite provides the CRF model, and
# scikit-learn is pinned to 1.3.2.
!python -m pip install --upgrade pip
!pip install seqeval
!pip install -U sklearn_crfsuite
!pip install scikit-learn==1.3.2

# Download the small Portuguese spaCy model that data_preprocessing loads.
!python -m spacy download pt_core_news_sm

[0m2023-12-08 11:55:39.161788: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-08 11:55:39.161849: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-08 11:55:39.161901: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting pt-core-news-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.6.0/pt_core_news_sm-3.6.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[0m[38;5;2m✔ Download and installation successful[0m
Yo

In [59]:
import os
import numpy as np

from sklearn_crfsuite import CRF
from seqeval.metrics import classification_report

In [60]:
# Download the train/test splits of both corpora into the working directory.
# le_ner corpus: train.conll / test.conll
!wget https://raw.githubusercontent.com/messias077/ner_pt/main/data/corpora/le_ner/train.conll
!wget https://raw.githubusercontent.com/messias077/ner_pt/main/data/corpora/le_ner/test.conll
# ulysses_ner corpus (pl_corpus_categorias): train.txt / test.txt
!wget https://raw.githubusercontent.com/ulysses-camara/ulysses-ner-br/main/annotated-corpora/PL_corpus_conll/pl_corpus_categorias/train.txt
!wget https://raw.githubusercontent.com/ulysses-camara/ulysses-ner-br/main/annotated-corpora/PL_corpus_conll/pl_corpus_categorias/test.txt

--2023-12-08 11:55:57--  https://raw.githubusercontent.com/messias077/ner_pt/main/data/corpora/le_ner/train.conll
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2142199 (2.0M) [text/plain]
Saving to: ‘train.conll’


2023-12-08 11:55:57 (45.3 MB/s) - ‘train.conll’ saved [2142199/2142199]

--2023-12-08 11:55:57--  https://raw.githubusercontent.com/messias077/ner_pt/main/data/corpora/le_ner/test.conll
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 438441 (428K) [text/plain]
Saving to: ‘test.conll’


2023-12-08 11:55:57 (15.2 MB/s) 

## Função para leitura da base de dados no padrão BIO

In [61]:
def read_corpus_file(corpus_file, delimiter='\t', ner_column=1):
    """Read a CoNLL-style corpus file into a list of sentences.

    Every non-empty line describes one token: its fields are separated by
    ``delimiter``, the first field is the word and the field at index
    ``ner_column`` is its tag. An empty line closes the current sentence.
    Non-empty lines that do not contain ``delimiter`` are ignored.

    Args:
        corpus_file: Path of the UTF-8 encoded corpus file.
        delimiter: String that separates the fields of a token line.
        ner_column: Index of the field that holds the tag.

    Returns:
        A list of ``(words, tags)`` tuples, one per sentence, where ``words``
        and ``tags`` are parallel lists of strings. Sentences with fewer than
        two tokens are discarded.

    Raises:
        IndexError: If a token line has no field at index ``ner_column``.
    """
    data = []
    words = []
    tags = []
    with open(corpus_file, encoding='utf-8') as file:
        for line in file:
            line = line.replace('\n', '')
            if line == '':
                # Sentence boundary: keep the sentence only if it has more
                # than one token, then start a new one.
                if len(words) > 1:
                    data.append((words, tags))
                words = []
                tags = []
            elif delimiter in line:
                fragments = line.split(delimiter)
                words.append(fragments[0])
                tags.append(fragments[ner_column])
    # A file that does not end with an empty line still has a pending
    # sentence here; without this flush the last sentence would be lost.
    if len(words) > 1:
        data.append((words, tags))
    return data

## Leitura dos dados de treinamento e teste

In [74]:
# Corpus selection: set corpus_name to 'le_ner' or 'ulysses_ner'.
# corpus_name = 'le_ner'
corpus_name = 'ulysses_ner'

# Base directory for reports; a sub-directory per corpus is created below.
report_dir = 'report/'

# Tag column index and field separator passed to read_corpus_file.
id_ner = 1
delimiter = ' '

# The files are referenced relative to the working directory, which is where
# the download cell saves them, instead of a hard-coded absolute location.
if corpus_name == 'le_ner':
  train_file = 'train.conll'
  test_file = 'test.conll'
elif corpus_name == 'ulysses_ner':
  train_file = 'train.txt'
  test_file = 'test.txt'
else:
  # Fail fast with a clear message instead of passing None to open() later.
  raise ValueError(f"Unknown corpus_name: {corpus_name!r} (expected 'le_ner' or 'ulysses_ner')")

print(f'\nCorpus: {corpus_name}')

report_dir = os.path.join(report_dir, corpus_name)

os.makedirs(report_dir, exist_ok=True)

train_data = read_corpus_file(train_file, delimiter=delimiter, ner_column=id_ner)
test_data = read_corpus_file(test_file, delimiter=delimiter, ner_column=id_ner)

print(f'\nTrain data: {len(train_data)}')
print(f'Test data: {len(test_data)}')

# Copy of the test sentences taken before any preprocessing is applied.
test_data_original = np.array(test_data, dtype=object)


Corpus: ulysses_ner

Train data: 2269
Test data: 523


## Função que executa o pré-processamento do corpus usando a ferramenta Spacy

In [75]:
import spacy

def data_preprocessing(data):
    """Add part-of-speech tags to every sentence using spaCy.

    The spaCy ``Doc`` is built directly from the corpus tokens instead of
    from the joined sentence text. Re-tokenizing the text lets spaCy split or
    merge tokens differently from the corpus, which would leave the POS list
    with a different length than the word and tag lists.

    Args:
        data: Iterable of sentences whose first item is the list of words and
            whose second item is the list of tags.

    Returns:
        A list of ``(words, pos_tags, tags)`` tuples where ``pos_tags`` has
        exactly one entry per word.
    """
    # Local import: Doc is only needed here.
    from spacy.tokens import Doc

    nlp = spacy.load(name='pt_core_news_sm',
                     disable=['parser', 'ner', 'lemmatizer', 'textcat'])
    preprocessed_data = []
    for d in data:
        # Keep the corpus tokenization: one spaCy token per corpus word.
        doc = nlp(Doc(nlp.vocab, words=list(d[0])))
        pos_tags = [t.pos_ for t in doc]
        preprocessed_data.append((d[0], pos_tags, d[1]))
    return preprocessed_data

In [76]:
# Add POS tags to both splits; every item becomes (words, pos_tags, tags).
# NOTE: the inputs are overwritten, so run this cell only once after the
# corpus is read. Running it again would feed the triples back in and the POS
# tags would take the place of the NER tags.
train_data = data_preprocessing(train_data)

test_data = data_preprocessing(test_data)

In [77]:
# Inspect the first preprocessed training sentence: (words, POS tags, NER tags).
train_data[0]

(['Sala', 'das', 'Sessões', ',', 'em', 'de', 'de', '2019', '.'],
 ['PROPN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'ADP', 'ADP', 'NUM', 'PUNCT'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATA', 'O'])

## Funções usadas para extrair as features dos tokens e de seus vizinhos.

In [78]:
def extract_sent_features(sentence):
    """Return one feature dictionary per token of ``sentence``.

    Args:
        sentence: Sequence of token tuples as expected by ``extract_features``.

    Returns:
        A list with the result of ``extract_features`` for every position.
    """
    sentence_features = []
    for position in range(len(sentence)):
        sentence_features.append(extract_features(sentence, position))
    return sentence_features


def extract_labels(sentence):
    """Return the label (third item) of every token in ``sentence``.

    Args:
        sentence: Iterable of 3-item tokens; the last item is the label.

    Returns:
        A list with the labels in sentence order.
    """
    labels = []
    for token in sentence:
        # Unpack exactly three items so malformed tokens are still rejected.
        _, _, label = token
        labels.append(label)
    return labels


def _neighbor_features(sentence, j, prefix):
    """Return the features of the context token at index ``j``, keys prefixed by ``prefix``."""
    word = sentence[j][0]
    postag = sentence[j][1]
    return {
        prefix + 'word.lower()': word.lower(),
        prefix + 'word.istitle()': word.istitle(),
        prefix + 'word.isupper()': word.isupper(),
        prefix + 'postag': postag,
        prefix + 'postag[:2]': postag[:2],
        prefix + 'word.islower()': word.islower()
    }


def extract_features(sentence, i):
    """Build the feature dictionary of the token at position ``i``.

    The features describe the token itself (lower-cased form, suffixes, shape
    flags and POS tag) and up to two tokens on each side of it.

    Args:
        sentence: Sequence of tokens; item 0 of a token is the word and item 1
            is its POS tag.
        i: Index of the token to describe.

    Returns:
        A dict mapping feature names to values. ``'BOS'`` is set for the first
        token and ``'EOS'`` for the last one; neighbour features use the
        prefixes ``'-1:'``, ``'-2:'``, ``'+1:'`` and ``'+2:'``.
    """
    word = sentence[i][0]
    postag = sentence[i][1]
    # Slicing instead of indexing: an empty token yields '' rather than
    # raising IndexError, and for any other token word[:1] == word[0].
    first_char = word[:1]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'word.islower()': word.islower(),
        'word[0].isupper()': first_char.isupper(),
        'word[0].islower()': first_char.islower(),
        'not word[0].isalnum()': not first_char.isalnum(),
        'not word.isalnum()': not word.isalnum(),
        'word.isalpha()': word.isalpha()
    }
    if i > 0:
        features.update(_neighbor_features(sentence, i - 1, '-1:'))
    else:
        features['BOS'] = True # BOS = Begin of Sentence
    if i > 1:
        features.update(_neighbor_features(sentence, i - 2, '-2:'))
    if i < len(sentence) - 1:
        features.update(_neighbor_features(sentence, i + 1, '+1:'))
    else:
        features['EOS'] = True # EOS = End of Sentence
    if i < len(sentence) - 2:
        features.update(_neighbor_features(sentence, i + 2, '+2:'))
    return features


def convert_data(data):
    sentences = []
    for d in data:
        sentences.append(list(zip(d[0], d[1], d[2])))
    x_data = [extract_sent_features(s) for s in sentences]
    y_data = [extract_labels(s) for s in sentences]
    return x_data, y_data

In [79]:
# Convert both splits into per-sentence feature dictionaries (X) and label
# sequences (y).
X_train, y_train = convert_data(train_data)

X_test, y_test = convert_data(test_data)

In [80]:
# Show the features and the labels of the first training sentence.
print(f'\nExample features: {X_train[0]}')

print(f'\nLabel: {y_train[0]}')


Example features: [{'bias': 1.0, 'word.lower()': 'sala', 'word[-3:]': 'ala', 'word[-2:]': 'la', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'postag': 'PROPN', 'postag[:2]': 'PR', 'word.islower()': False, 'word[0].isupper()': True, 'word[0].islower()': False, 'not word[0].isalnum()': False, 'not word.isalnum()': False, 'word.isalpha()': True, 'BOS': True, '+1:word.lower()': 'das', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:postag': 'ADP', '+1:postag[:2]': 'AD', '+1:word.islower()': True, '+2:word.lower()': 'sessões', '+2:word.istitle()': True, '+2:word.isupper()': False, '+2:postag': 'PROPN', '+2:postag[:2]': 'PR', '+2:word.islower()': False}, {'bias': 1.0, 'word.lower()': 'das', 'word[-3:]': 'das', 'word[-2:]': 'as', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'postag': 'ADP', 'postag[:2]': 'AD', 'word.islower()': True, 'word[0].isupper()': False, 'word[0].islower()': True, 'not word[0].isalnum()': False, 'n

## Reconhecimento de Entidades Nomeadas utilizando o método CRF (*Conditional Random Fields*)

In [81]:
# CRF model configuration.
# NOTE(review): per the sklearn-crfsuite docs, c1 and c2 are the L1 and L2
# regularization coefficients - confirm against the installed version.
crf = CRF(max_iterations=100, c1=0.1, c2=0.1, all_possible_transitions=True)

In [82]:
# Train the CRF. An AttributeError is tolerated on purpose, as before, but it
# is now reported instead of being discarded silently.
# NOTE(review): presumably a workaround for an incompatibility between
# sklearn_crfsuite and the installed scikit-learn - confirm.
try:
  crf.fit(X_train, y_train)
except AttributeError as error:
  print(f'Ignoring AttributeError raised by crf.fit: {error!r}')

In [83]:
# Predict one label sequence per test sentence.
y_pred = crf.predict(X_test)

In [84]:
import pandas as pd

# Text report comparing the gold and predicted label sequences (seqeval).
report = classification_report(y_test, y_pred)

print(report)

              precision    recall  f1-score   support

        DATA       0.96      0.94      0.95        98
      EVENTO       1.00      0.22      0.36         9
  FUNDAMENTO       0.85      0.85      0.85       124
       LOCAL       0.81      0.72      0.76       101
 ORGANIZACAO       0.76      0.72      0.74        94
      PESSOA       0.92      0.81      0.86       119
PRODUTODELEI       0.73      0.65      0.69        54

   micro avg       0.85      0.79      0.82       599
   macro avg       0.86      0.70      0.74       599
weighted avg       0.85      0.79      0.81       599



In [85]:
# Same report as a dictionary so it can be loaded into a DataFrame; after the
# transpose each report entry is a row and each metric a column.
report = classification_report(y_test, y_pred, output_dict=True)
df = pd.DataFrame(report).transpose()

print(f'\nCorpus: {corpus_name}\n')

# Keep only the per-entity rows and drop the support column.
df_results = df.drop(index=['micro avg', 'macro avg', 'weighted avg'], columns='support')
df_results


Corpus: ulysses_ner



Unnamed: 0,precision,recall,f1-score
DATA,0.958333,0.938776,0.948454
EVENTO,1.0,0.222222,0.363636
FUNDAMENTO,0.853659,0.846774,0.850202
LOCAL,0.811111,0.722772,0.764398
ORGANIZACAO,0.755556,0.723404,0.73913
PESSOA,0.923077,0.806723,0.860987
PRODUTODELEI,0.729167,0.648148,0.686275
