In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
def show_ents(doc):
    """Print the named entities of ``doc``, one per line.

    Each line has the form ``<text> - <label> - <explanation>``, where the
    explanation is the result of ``spacy.explain`` for the entity label passed
    through ``str`` (so a ``None`` result is printed as the text ``None``).
    If ``doc.ents`` is empty, a single ``No entities found`` line is printed
    instead.
    """
    # Guard clause: nothing to list.
    if not doc.ents:
        print('No entities found')
        return

    for entity in doc.ents:
        columns = [entity.text, entity.label_, str(spacy.explain(entity.label_))]
        print(' - '.join(columns))

In [4]:
doc = nlp(u"Hi, how are you?")

In [5]:
show_ents(doc)

No entities found


In [8]:
# Sample sentence in which "May" appears twice (sentence-initial and in
# "next May") and "Washington" appears in two different mentions
# ("Washington, DC" and "the Washington Monument").
doc = nlp(u"May I go to Washington, DC next May to see the Washington Monument?")

In [9]:
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [10]:
# Parse a sentence, then list every entity together with its token span
# (start, end), its character span (start_char, end_char) and its label.
doc = nlp(u'Can I please borrow 500 dollars from you to buy some Microsoft stock?')

for ent in doc.ents:
    print(
        ent.text,
        ent.start, ent.end,
        ent.start_char, ent.end_char,
        ent.label_,
    )

500 dollars 4 6 20 31 MONEY
Microsoft 11 12 53 62 ORG


In [11]:
show_ents(doc)

500 dollars - MONEY - Monetary values, including unit
Microsoft - ORG - Companies, agencies, institutions, etc.


In [12]:
# Adding a named entity to a Doc
doc = nlp(u"Tesla to build a U.K. factory for $6 million")

In [13]:
show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [14]:
# Notice that "Tesla" is not recognized as a named entity; we need to add it as an ORG.

from spacy.tokens import Span


In [15]:
# Look up the integer ID that the vocab's string store holds for the "ORG" label.
ORG = doc.vocab.strings[u"ORG"]

In [16]:
ORG

381

In [18]:
new_ent = Span(doc, 0, 1, label=ORG)

In [19]:
# Append the new entity, first dropping any existing entity whose token range
# overlaps it. Entity spans must not overlap, so without this filter the cell
# breaks when it is re-run (the span is already present) or when the model has
# already tagged part of the same range; spaCy releases that validate
# doc.ents raise ValueError in that case.
doc.ents = [
    ent for ent in doc.ents
    if ent.end <= new_ent.start or ent.start >= new_ent.end
] + [new_ent]

In [20]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [21]:
%%latex
$\textbf{Adding multiple named entities}$

<IPython.core.display.Latex object>

In [22]:
# Adjacent string literals are concatenated verbatim, so the first literal
# ends with an explicit space to keep the two sentences separated.
doc = nlp(u"Our company created a brand new vacuum cleaner. "
          u"This new vacuum-cleaner is the best in show.")

In [23]:
show_ents(doc)

No entities found


In [24]:
from spacy.matcher import PhraseMatcher

In [25]:
matcher = PhraseMatcher(nlp.vocab)

In [26]:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

In [27]:
# Phrase patterns only need to be tokenized, so build them with nlp.make_doc
# rather than running the whole pipeline on every phrase.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [28]:
# Register all phrase patterns under the match key 'newproduct'.
# NOTE(review): the None in second position looks like the older
# add(key, on_match, *docs) call form — confirm against the installed spaCy
# version, since newer releases expect add(key, docs).
matcher.add('newproduct', None, *phrase_patterns)

In [29]:
found_matches = matcher(doc)

In [30]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [31]:

from spacy.tokens import Span

In [32]:
PROD = doc.vocab.strings[u"PRODUCT"]

In [33]:
found_matches


[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [34]:
# Each match is a (match_id, start, end) triple; wrap its token range in a
# Span labelled with PROD.
new_ents = [Span(doc, start, end, label=PROD) for _match_id, start, end in found_matches]

In [35]:
# Append the matched spans, first dropping any existing entity whose token
# range overlaps one of them. Entity spans must not overlap, so without this
# filter the cell breaks when it is re-run (the spans are already present) or
# when the model has already tagged part of a matched phrase; spaCy releases
# that validate doc.ents raise ValueError in that case.
doc.ents = [
    ent for ent in doc.ents
    if all(ent.end <= span.start or ent.start >= span.end for span in new_ents)
] + new_ents

In [36]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [37]:
%%latex
$\textbf{Counting named entities}$

<IPython.core.display.Latex object>

In [38]:
doc = nlp(u"originally I paid $29.95 for his car, but now it is marked down by $10")


In [39]:
[ent for ent in doc.ents if ent.label_ == 'MONEY']

[29.95, 10]

In [40]:
# Count the MONEY entities without materialising an intermediate list.
sum(1 for ent in doc.ents if ent.label_ == 'MONEY')

2