In [1]:
import spacy
from spacy.tokens import Span
import re

In [2]:
text = "Higor Miller was an Student, but Higor Grassi is a British TV Host. The name Higor is quite commun"

In [3]:
pattern = r"Higor [A-Z]\w+" #encontrar qualquer instancia de Paul neste caso uma letra maiscula ate a palavra real

In [4]:
matches = re.finditer(pattern, text) #encontrar o padrao neste texto
for match in matches:
    print(match)

<re.Match object; span=(0, 12), match='Higor Miller'>
<re.Match object; span=(33, 45), match='Higor Grassi'>


In [5]:
nlp = spacy.blank("en")
doc = nlp(text)
original_ents = list(doc.ents)
mwt_ents = [] #lista que sera preenchida pelas entidades
for match in re.finditer(pattern, doc.text): #itera sobre as correspondencias do regex
    start, end = match.span() #posicao inicial e final do texto
    span = doc.char_span(start, end) #define o span
    if span is not None:
        mwt_ents.append((span.start, span.end, span.text)) #comeco meio e fim do proprio texto
for ent in mwt_ents:
    start, end, name = ent #comeco, fim e nome
    per_ent = Span(doc,start, end, label = "PERSON" ) #criamos o objeto span, onde podemos injetar na lista com o rotulo person
    original_ents.append(per_ent)
doc.ents = original_ents # atualizar as entidades do Doc
for ent in doc.ents: #exibir entidades conhecidas
    print(ent.text, ent.label)

Higor Miller 380
Higor Grassi 380


In [6]:
print(mwt_ents) #esta neste intervalo de palavras os respectivos nomes

[(0, 2, 'Higor Miller'), (7, 9, 'Higor Grassi')]


In [7]:
from spacy.language import Language
@Language.component("paul_ner")
def paul_ner(doc):
    pattern = r"Paul [A-Z]\w+" #encontrar qualquer instancia de Paul neste caso uma letra maiscula ate a palavra real
    original_ents = list(doc.ents)
    mwt_ents = [] #lista que sera preenchida pelas entidades
    for match in re.finditer(pattern, doc.text): #itera sobre as correspondencias do regex
        start, end = match.span() #posicao inicial e final do texto
        span = doc.char_span(start, end) #define o span
        if span is not None:
            mwt_ents.append((span.start, span.end, span.text)) #comeco meio e fim do proprio texto
    for ent in mwt_ents:
        start, end, name = ent #comeco, fim e nome
        per_ent = Span(doc,start, end, label = "PERSON" ) #criamos o objeto span, onde podemos injetar na lista com o rotulo person
        original_ents.append(per_ent)
    doc.ents = original_ents # atualizar as entidades do Doc
    return(doc)


In [8]:
nlp2 = spacy.blank("en")
nlp2.add_pipe("paul_ner")

<function __main__.paul_ner(doc)>

In [9]:
doc2 = nlp2(text)
print(doc2)

Higor Miller was an Student, but Higor Grassi is a British TV Host. The name Higor is quite commun


In [10]:
import re
import spacy
from spacy.tokens import Span
from spacy.language import Language
from spacy.util import filter_spans  # importar a funcao filter_spans

# definir o componente personalizado
@Language.component("cinema_ner")
def cinema_ner(doc):
    pattern = r"Hollywood"  # padrao regex para encontrar "Hollywood"
    original_ents = list(doc.ents)  # entidades originais do doc
    mwt_ents = []  # lista para armazenar as novas entidades

    # iterar sobre as correspondencias do regex
    for match in re.finditer(pattern, doc.text):
        start, end = match.span()  # posicao inicial e final da correspondencia
        span = doc.char_span(start, end, label="CINEMA")  # criar um span com o rotulo "CINEMA"
        if span is not None:  # verificar se o span é valido
            mwt_ents.append(span)  # adicionar à lista de novas entidades

    # adicionar as novas entidades à lista de entidades originais
    original_ents.extend(mwt_ents)

    # filtrar spans sobrepostos e resolver conflitos
    filtered_ents = filter_spans(original_ents)

    # atualizar as entidades do doc
    doc.ents = filtered_ents
    return doc

In [11]:
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")

<function __main__.cinema_ner(doc)>

In [12]:
doc3 = nlp3(text)
for ent in doc3.ents:
    print(ent.text, ent.label_)

Higor Miller PERSON
Student ORG
Higor Grassi PERSON
British NORP
Higor PERSON
