In [1]:
#when to use regex
#when the pattern matching is independent of the lemma, pos or any other 
#linguistic features

import re


In [6]:
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."

In [7]:
pattern = r"Paul [A-Z]\w+"

In [8]:
matches = re.finditer(pattern, text)
for match in matches:
    print(match)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [9]:
import spacy
from spacy.tokens import Span

In [17]:
nlp = spacy.blank("en")
doc = nlp(text)
original_ents = list(doc.ents)
#multiword tokens
mwt_ents = []
for match in re.finditer(pattern, doc.text):
    #characte spans
    start, end = match.span()
     #converted to token level
    span = doc.char_span(start, end)
    if span is not None:
        mwt_ents.append((span.start, span.end, span.text))
    #start token, end token, actual token 

#injecting the spans into doc.ents
for ent in mwt_ents:
    start, end, name = ent
    per_ent = Span(doc, start, end, label = "Person")
    original_ents.append(per_ent)
doc.ents = original_ents
for ent in doc.ents:
    print(ent.text, ent.label_)
    

Paul Newman Person
Paul Hollywood Person


In [16]:
print(mwt_ents)

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]


In [24]:
from spacy import Language
@Language.component("paul_ner")
def paul_ner(doc):
    pattern = r"Paul [A-Z]\w+"
    
    original_ents = list(doc.ents)
    
    #multiword tokens
    mwt_ents = []
    for match in re.finditer(pattern, doc.text):
        #characte spans
        start, end = match.span()
         #converted to token level
        span = doc.char_span(start, end)
        if span is not None:
            mwt_ents.append((span.start, span.end, span.text))
            #start token, end token, actual token 

    #injecting the spans into doc.ents
    for ent in mwt_ents:
        start, end, name = ent
        per_ent = Span(doc, start, end, label = "Person")
        original_ents.append(per_ent)
    doc.ents = original_ents
    return (doc)
    



In [25]:
nlp2 = spacy.blank("en")
nlp2.add_pipe("paul_ner")

<function __main__.paul_ner(doc)>

In [26]:
doc2 = nlp2(text)
print(doc2.ents)

(Paul Newman, Paul Hollywood)


In [28]:
from spacy import Language
@Language.component("cinema_ner")
def cinema_ner(doc):
    pattern = r"Hollywood"
    
    original_ents = list(doc.ents)
    
    #multiword tokens
    mwt_ents = []
    for match in re.finditer(pattern, doc.text):
        #characte spans
        start, end = match.span()
         #converted to token level
        span = doc.char_span(start, end)
        if span is not None:
            mwt_ents.append((span.start, span.end, span.text))
            #start token, end token, actual token 

    #injecting the spans into doc.ents
    for ent in mwt_ents:
        start, end, name = ent
        per_ent = Span(doc, start, end, label = "CINEMA")
        original_ents.append(per_ent)
    doc.ents = original_ents
    return (doc)
    

In [29]:
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipes("cinema_ner")
#cant assign two labels to the same token

AttributeError: 'English' object has no attribute 'add_pipes'

In [30]:
from spacy import Language
from spacy.util import filter_spans
@Language.component("cinema_ner")
def cinema_ner(doc):
    pattern = r"Hollywood"
    
    original_ents = list(doc.ents)
    
    #multiword tokens
    mwt_ents = []
    for match in re.finditer(pattern, doc.text):
        #characte spans
        start, end = match.span()
         #converted to token level
        span = doc.char_span(start, end)
        if span is not None:
            mwt_ents.append((span.start, span.end, span.text))
            #start token, end token, actual token 

    #injecting the spans into doc.ents
    for ent in mwt_ents:
        start, end, name = ent
        per_ent = Span(doc, start, end, label = "CINEMA")
        original_ents.append(per_ent)
    #gives priority to longer spans
    filtered = filter_spans(original_ents)
    doc.ents = filtered
    return (doc)
    

In [32]:
nlp4 = spacy.load("en_core_web_sm")
nlp4.add_pipe("cinema_ner")

<function __main__.cinema_ner(doc)>

In [35]:
doc4 = nlp4(text)
for ent in doc4.ents:
    print(ent.text, ent.label_)

Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP
Paul PERSON
