In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
def show_ents(doc):
    if doc.ents :
        for ent in doc.ents:
            print(ent.text+'-'+ ent.label_+'-'+ str(spacy.explain(ent.label_)))
    else:
        print('No entities found')

In [12]:
doc = nlp('May I go to Washington, DC next May to see the Washington Monument?')

In [13]:
show_ents(doc)

Washington, DC-GPE-Countries, cities, states
next May-DATE-Absolute or relative dates or periods
the Washington Monument-ORG-Companies, agencies, institutions, etc.


In [14]:
obj = show_ents(doc)

Washington, DC-GPE-Countries, cities, states
next May-DATE-Absolute or relative dates or periods
the Washington Monument-ORG-Companies, agencies, institutions, etc.


## Entity annotations
`Doc.ents` are token spans with their own set of annotations.
<table>
<tr><td>`ent.text`</td><td>The original entity text</td></tr>
<tr><td>`ent.label`</td><td>The entity type's hash value</td></tr>
<tr><td>`ent.label_`</td><td>The entity type's string description</td></tr>
<tr><td>`ent.start`</td><td>The token span's *start* index position in the Doc</td></tr>
<tr><td>`ent.end`</td><td>The token span's *stop* index position in the Doc</td></tr>
<tr><td>`ent.start_char`</td><td>The entity text's *start* index position in the Doc</td></tr>
<tr><td>`ent.end_char`</td><td>The entity text's *stop* index position in the Doc</td></tr>
</table>


## NER Tags
Tags are accessible through the `.label_` property of an entity.
<table>
<tr><th>TYPE</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>
<tr><td>`PERSON`</td><td>People, including fictional.</td><td>*Fred Flintstone*</td></tr>
<tr><td>`NORP`</td><td>Nationalities or religious or political groups.</td><td>*The Republican Party*</td></tr>
<tr><td>`FAC`</td><td>Buildings, airports, highways, bridges, etc.</td><td>*Logan International Airport, The Golden Gate*</td></tr>
<tr><td>`ORG`</td><td>Companies, agencies, institutions, etc.</td><td>*Microsoft, FBI, MIT*</td></tr>
<tr><td>`GPE`</td><td>Countries, cities, states.</td><td>*France, UAR, Chicago, Idaho*</td></tr>
<tr><td>`LOC`</td><td>Non-GPE locations, mountain ranges, bodies of water.</td><td>*Europe, Nile River, Midwest*</td></tr>
<tr><td>`PRODUCT`</td><td>Objects, vehicles, foods, etc. (Not services.)</td><td>*Formula 1*</td></tr>
<tr><td>`EVENT`</td><td>Named hurricanes, battles, wars, sports events, etc.</td><td>*Olympic Games*</td></tr>
<tr><td>`WORK_OF_ART`</td><td>Titles of books, songs, etc.</td><td>*The Mona Lisa*</td></tr>
<tr><td>`LAW`</td><td>Named documents made into laws.</td><td>*Roe v. Wade*</td></tr>
<tr><td>`LANGUAGE`</td><td>Any named language.</td><td>*English*</td></tr>
<tr><td>`DATE`</td><td>Absolute or relative dates or periods.</td><td>*20 July 1969*</td></tr>
<tr><td>`TIME`</td><td>Times smaller than a day.</td><td>*Four hours*</td></tr>
<tr><td>`PERCENT`</td><td>Percentage, including "%".</td><td>*Eighty percent*</td></tr>
<tr><td>`MONEY`</td><td>Monetary values, including unit.</td><td>*Twenty Cents*</td></tr>
<tr><td>`QUANTITY`</td><td>Measurements, as of weight or distance.</td><td>*Several kilometers, 55kg*</td></tr>
<tr><td>`ORDINAL`</td><td>"first", "second", etc.</td><td>*9th, Ninth*</td></tr>
<tr><td>`CARDINAL`</td><td>Numerals that do not fall under another type.</td><td>*2, Two, Fifty-two*</td></tr>
</table>

In [19]:
from spacy.tokens import Span

In [21]:
org = doc.vocab.strings[u'ORG']

In [22]:
org

383

In [30]:
doc1 = nlp(u'Google is one of the leading companies, it has a revenue of 25760 crores USD as of 2021')

print(show_ents(doc1))

25760-DATE-Absolute or relative dates or periods
USD-ORG-Companies, agencies, institutions, etc.
2021-DATE-Absolute or relative dates or periods
None


In [32]:
from spacy.tokens import Span

# Get the hash value of the ORG entity label
ORG = doc.vocab.strings[u'ORG']  

# Create a Span for the new entity
new_ent = Span(doc, 0, 1, label=ORG)

# Add the entity to the existing Doc object
doc1.ents = list(doc1.ents) + [new_ent]

<font color=green>In the code above, the arguments passed to `Span()` are:</font>
-  `doc` - the name of the Doc object
-  `0` - the *start* index position of the span
-  `1` - the *stop* index position (exclusive)
-  `label=ORG` - the label assigned to our entity

In [33]:
show_ents(doc1)

Google-ORG-Companies, agencies, institutions, etc.
25760-DATE-Absolute or relative dates or periods
USD-ORG-Companies, agencies, institutions, etc.
2021-DATE-Absolute or relative dates or periods


In [37]:
doc3 = nlp(u'Our company plans to introduce a new vaccum-cleaner. '
          u'If successful, the vaccumcleaner will be our first product.')

In [38]:
show_ents(doc3)

first-ORDINAL-"first", "second", etc.


In [39]:
from spacy.matcher import PhraseMatcher

In [40]:
matcher = PhraseMatcher(nlp.vocab)

In [41]:
phrase_list = ['vaccumcleaner','vaccum-cleaner']

In [42]:
phrase_matcher = [nlp(text) for text in phrase_list]

In [43]:
matcher.add('newproduct',phrase_matcher)

In [46]:
found_matches = matcher(doc3)

In [47]:
found_matches

[(2689272359382549672, 7, 10), (2689272359382549672, 15, 16)]

In [49]:
prod = doc3.vocab.strings[u'PRODUCT']

In [50]:
prod

386

In [53]:
new_ents = [Span(doc3,match[1],match[2],label=prod) for match in found_matches]

In [54]:
new_ents

[vaccum-cleaner, vaccumcleaner]

In [55]:
doc3.ents = list(doc.ents)+new_ents

In [56]:
show_ents(doc3)

Our-ORG-Companies, agencies, institutions, etc.
vaccum-cleaner-PRODUCT-Objects, vehicles, foods, etc. (not services)
vaccumcleaner-PRODUCT-Objects, vehicles, foods, etc. (not services)


In [57]:
from spacy import displacy

In [58]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, Sony sold only 7 thousand Walkman music players.')

displacy.render(doc, style='ent', jupyter=True)

In [59]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)

In [60]:
colors = {'ORG': '#aa9cfc'}
options = {'ents':['PRODUCT','ORG'],'colors':colors}

In [61]:
displacy.render(doc,style='ent', jupyter=True,options=options)

In [66]:
displacy.serve(doc, style='ent',options=options,port= 8000)




Using the 'ent' visualizer
Serving on http://0.0.0.0:8000 ...



127.0.0.1 - - [20/Jun/2022 12:09:40] "GET / HTTP/1.1" 200 1175
127.0.0.1 - - [20/Jun/2022 12:09:40] "GET /favicon.ico HTTP/1.1" 200 1175


Shutting down server on port 8000.
