In [116]:
import spacy

In [117]:
#Sample text: a sentence containing a phone number with the area code in parentheses
text = "This is a phone number (555) 555-5555"

In [118]:
#Start from a blank English pipeline (spacy.blank does not load the trained small model)
nlp = spacy.blank("en")

In [119]:
#Create the entity ruler and add it to the pipeline
ruler = nlp.add_pipe("entity_ruler")

In [120]:
#Entity labels and their token patterns.
#PHONE_NUMBER: a three-digit token, an optional "-" token, then a four-digit token.
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [
            {"SHAPE": "ddd"},
            {"ORTH": "-", "OP": "?"},
            {"SHAPE": "dddd"},
        ],
    },
]

In [121]:
#Register the patterns with the entity ruler
ruler.add_patterns(patterns)

In [122]:
#Run the pipeline (including the entity ruler) on the sample text
doc = nlp(text)

In [123]:
#Print every entity found in the doc together with its label
for entity in doc.ents:
    print(entity.text, entity.label_)

555-5555 PHONE_NUMBER


In [124]:
#The same match done with a plain regular expression instead of token patterns.
#Note: the pattern contains three capture groups (the whole number and the two
#repeated digit groups), so re.findall returns one tuple per match rather than
#a plain string.
import re
pattern = r"((\d){3}-(\d){4})"

In [125]:
#Sample text for the regex version: a phone number without an area code
text = "This is a phone number 555-5555."

In [126]:
#Collect every occurrence of the pattern in the text
matches = re.findall(pattern,text)

In [127]:
#Show the findall result: one tuple of captured groups per match
print (matches)

[('555-5555', '5', '5')]


In [128]:
## Combine the two: use a regex inside an EntityRuler pattern

In [129]:
#Import the requisite library
import spacy

#Sample text
text = "This is a phone number (555) 555-5555."

#Start from a blank English pipeline (spacy.blank does not load a trained model)
nlp = spacy.blank("en")

#Create the entity ruler and add it to the pipeline
ruler = nlp.add_pipe("entity_ruler")

#List of Entities and Patterns (source: https://spacy.io/usage/rule-based-matching)
#A REGEX inside a token pattern is tested against ONE token at a time, never
#against the whole text. The tokenizer splits "555-5555" into the three tokens
#"555", "-" and "5555" (which is why the SHAPE-based pattern earlier in this
#notebook needs three token specs), so a single-token regex that contains the
#hyphen can never match. The phone number is therefore described token by
#token, with one regex per digit group. The regexes are anchored with ^...$
#because the matcher searches within the token text, and they are raw strings
#so that \d is not treated as a string escape.
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [
            {"TEXT": {"REGEX": r"^\d{3}$"}},
            {"ORTH": "-"},
            {"TEXT": {"REGEX": r"^\d{4}$"}},
        ],
    }
]

#Add the patterns to the ruler
ruler.add_patterns(patterns)

#Create the doc
doc = nlp(text)

#Extract the entities
for ent in doc.ents:
    print(ent.text, ent.label_)

In [130]:
#Display the raw text held by the doc
doc.text

'This is a phone number (555) 555-5555.'

## Multi-Word Tokens

In [131]:
import re
import spacy
from spacy.tokens import Span

In [132]:

#Sample text with two full names starting with "Paul" and one bare "Paul"
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."
#"Paul", a space, then a capitalised word of at least two characters
pattern = r"Paul [A-Z]\w+"

In [133]:
#Process the text with a blank English pipeline
nlp = spacy.blank("en")
doc = nlp(text)

In [134]:
#Take a list copy of the entities currently set on the doc
original_ents = list(doc.ents)

In [135]:
#Run the regex over the raw text and convert each character-level match into
#token indices. doc.char_span returns None when a match cannot be turned into
#a span, and those matches are skipped.
mwt_ents = []
for hit in re.finditer(pattern, doc.text):
    span = doc.char_span(*hit.span())
    if span is None:
        continue
    mwt_ents.append((span.start, span.end, span.text))


In [136]:
#Each tuple is (span.start, span.end, span.text) for one regex match
print(mwt_ents)

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]


### Inject the spans into `doc.ents`

In [137]:
#Build one PERSON span per regex match. The spans are collected in a NEW list
#rather than appended to original_ents: appending would make this cell
#non-repeatable, because a second run would add the same spans again and
#doc.ents does not accept overlapping (here: duplicated) spans.
person_ents = [
    Span(doc, start, end, label="PERSON")
    for start, end, _name in mwt_ents
]

#Entities that were already on the doc come first, then the new PERSON spans
doc.ents = original_ents + person_ents
for ent in doc.ents:
    print(ent.text, ent.label_)

Paul Newman PERSON
Paul Hollywood PERSON


In [138]:
## Wrap all of the above in a custom pipeline component

In [142]:
from spacy import Language
from spacy.util import filter_spans

pattern = r"Paul [A-Z]\w+"
#Compiled once and bound to its own name, so the component keeps working even
#if another cell later rebinds the generic name `pattern`
PAUL_NAME_RE = re.compile(pattern)


@Language.component("paul_entity")
def paul_entity(doc):
    """Label every "Paul <Capitalised word>" in the doc as a PERSON entity.

    The regex is run over ``doc.text``. Each match is converted to a token
    span with ``doc.char_span``; matches for which ``char_span`` returns
    ``None`` are skipped.

    The new spans are merged with the entities already present on the doc.
    ``filter_spans`` drops overlapping spans (keeping the longest one) before
    the assignment, since ``doc.ents`` does not accept overlapping spans.

    Args:
        doc: The spaCy Doc passed through the pipeline.

    Returns:
        The same Doc, with ``doc.ents`` updated.
    """
    person_ents = []
    for match in PAUL_NAME_RE.finditer(doc.text):
        #char_span attaches the label directly, so no manual Span() is needed
        span = doc.char_span(match.start(), match.end(), label="PERSON")
        if span is not None:
            person_ents.append(span)
    doc.ents = filter_spans(list(doc.ents) + person_ents)
    return doc


In [143]:
#Create a second blank English pipeline and add the custom component by its registered name
nlp2 = spacy.blank("en") #insert into blank model
nlp2.add_pipe("paul_entity")

<function __main__.paul_entity(doc)>

In [144]:
#Run the custom pipeline on the sample text and show the entities it set
doc2 = nlp2(text)
print(doc2.ents)

(Paul Newman, Paul Hollywood)
