In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_ + ' - ' + str(spacy.explain(ent.label_)))
    else:
        print('No entities found')

In [4]:
doc = nlp(u"HI how are you?")

In [5]:
show_ents(doc)

No entities found


In [6]:
doc = nlp(u"May I go to Washington, DC next May to see the Washington Monument?")

In [7]:
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [13]:
doc = nlp(u"Can I please have 500 dollars of Microsoft Stock?")

In [14]:
show_ents(doc)

500 dollars - MONEY - Monetary values, including unit
Microsoft Stock - ORG - Companies, agencies, institutions, etc.


In [19]:
doc = nlp(u"Tesla to build a U.K. factory for $6 million")

In [20]:
show_ents(doc)

Tesla - ORDINAL - "first", "second", etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [21]:
from spacy.tokens import Span

In [22]:
ORG = doc.vocab.strings[u"ORG"]

In [23]:
ORG

383

In [27]:
new_ent = Span(doc,0,1,label=ORG)

In [29]:
doc.ents = list(doc.ents) + [new_ent]

ValueError: [E103] Trying to set conflicting doc.ents: '(0, 1, 'ORDINAL')' and '(0, 1, 'ORG')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.

In [5]:
doc = nlp(u"Our company created a brand new vacuum cleaner."
         u"This new vacuum-cleaner is the best in show.")

In [8]:
show_ents(doc)

No entities found


In [9]:
from spacy.matcher import PhraseMatcher

In [10]:
matcher = PhraseMatcher(nlp.vocab)

In [11]:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

In [12]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [13]:
matcher.add('newproduct',None,*phrase_patterns)

In [14]:
found_matches  = matcher(doc)

In [15]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [16]:
from spacy.tokens import Span

In [17]:
PROD = doc.vocab.strings[u"PRODUCT"]

In [18]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [19]:
new_ents = [Span(doc,match[1],match[2],label=PROD) for match in found_matches]

In [25]:
# manually add in own named entity
doc.ents = list(doc.ents) + new_ents

ValueError: [E103] Trying to set conflicting doc.ents: '(6, 8, 'PRODUCT')' and '(6, 8, 'PRODUCT')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.

In [24]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [5]:
doc = nlp(u"Originally I paid $29.95 for this car toy, but now it is marked down by 10 dollars")

In [8]:
len([ent for ent in doc.ents if ent.label_ == "MONEY"])

2

In [4]:
from spacy import displacy

In [6]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million."
         u"By contrast, Sony only sold 8 thousand Walkman music players.")

In [7]:
displacy.render(doc,style='ent',jupyter=True)

In [8]:
for sent in doc.sents:
    displacy.render(nlp(sent.text),style='ent',jupyter=True)

In [9]:
colors = {'ORG':'linear-gradient(90deg,#aa9cfc,#fc9ce7)'}
options = {'ents':['PRODUCT','ORG'],'colors':colors}

In [10]:
displacy.render(doc,style='ent',jupyter=True,options=options)

In [None]:
displacy.serve(doc,style='ent',options=options)




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...



# Sentence segmentation

In [5]:
doc = nlp(u'This is the first sentence.  This is another sentence.  This is a third sentence.')

In [6]:
for sent in doc.sents:
    print(sent)

This is the first sentence.  
This is another sentence.  
This is a third sentence.


In [14]:
list(doc.sents)[0]

This is the first sentence.  

In [15]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [16]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [17]:
doc.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

In [18]:
for sent in doc.sents:
    print(sent)
    print('\n')

"Management is doing the right things; leadership is doing the right things."


-Peter Drucker




# ADD A SEGMENTATION RULE

In [22]:
def set_custom_boundaries(doc):
    for token in doc:
        print(token)
        print(token.i)

In [23]:
set_custom_boundaries(doc)

"
0
Management
1
is
2
doing
3
the
4
right
5
things
6
;
7
leadership
8
is
9
doing
10
the
11
right
12
things
13
.
14
"
15
-Peter
16
Drucker
17
