In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back ")

In [3]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back 


In [4]:
print(doc[4])

jumped


In [6]:
print(doc[4].text, doc[4].pos_)

jumped VERB


In [9]:
print(doc[4].tag_) # fine-grained pos tag # past tense verb

VBD


In [10]:
doc[4].pos

100

In [16]:
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{5}} {spacy.explain(token.tag_)}")

The        DET        DT    determiner
quick      ADJ        JJ    adjective
brown      ADJ        JJ    adjective
fox        NOUN       NN    noun, singular or mass
jumped     VERB       VBD   verb, past tense
over       ADP        IN    conjunction, subordinating or preposition
the        DET        DT    determiner
lazy       ADJ        JJ    adjective
dog        NOUN       NN    noun, singular or mass
's         PART       POS   possessive ending
back       NOUN       NN    noun, singular or mass


In [28]:
doc = nlp(u"I read books on NLP")

In [29]:
word = doc[1]
word

read

In [30]:
word.text

'read'

In [31]:
token = word

In [32]:
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{5}} {spacy.explain(token.tag_)}")

read       VERB       VBD   verb, past tense


In [33]:
doc = nlp(u"I read a book on NLP.")

In [34]:
word = doc[1]

In [35]:
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{5}} {spacy.explain(token.tag_)}")

read       VERB       VBD   verb, past tense


In [37]:
doc = nlp(u'I read books on NLP.')
r = doc[1]

print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')

read       VERB     VBD    verb, past tense


In [38]:
doc = nlp(u'I read a book on NLP.')
r = doc[1]

print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')

read       VERB     VBD    verb, past tense


In [39]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# Count the frequencies of different coarse-grained POS tags:
POS_counts = doc.count_by(spacy.attrs.POS)
POS_counts

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}

In [40]:
doc.vocab[90].text

'DET'

In [51]:
for k, v in sorted(POS_counts.items()):
    print(f'{k:{3}}. {doc.vocab[k].text:{6}} :{v} {spacy.explain(r.tag_)}')

 84. ADJ    :3 verb, past tense
 85. ADP    :1 verb, past tense
 90. DET    :2 verb, past tense
 92. NOUN   :3 verb, past tense
 94. PART   :1 verb, past tense
 97. PUNCT  :1 verb, past tense
100. VERB   :1 verb, past tense


In [44]:
TAG_counts = doc.count_by(spacy.attrs.TAG)

In [52]:
for k,v in sorted(TAG_counts.items()):
    print(f'{k:{20}}. {doc.vocab[k].text:{4}}: {v} {spacy.explain(r.tag_)}')

                  74. POS : 1 verb, past tense
 1292078113972184607. IN  : 1 verb, past tense
10554686591937588953. JJ  : 3 verb, past tense
12646065887601541794. .   : 1 verb, past tense
15267657372422890137. DT  : 2 verb, past tense
15308085513773655218. NN  : 3 verb, past tense
17109001835818727656. VBD : 1 verb, past tense


In [47]:
# used more often get smaller Id, closer to the list

In [53]:
DEP_counts = doc.count_by(spacy.attrs.DEP)
for k,v in sorted(DEP_counts.items()):
    print(f'{k:{20}}. {doc.vocab[k].text:{4}}: {v} {spacy.explain(r.tag_)}')

                 402. amod: 3 verb, past tense
                 415. det : 2 verb, past tense
                 429. nsubj: 1 verb, past tense
                 439. pobj: 1 verb, past tense
                 440. poss: 1 verb, past tense
                 443. prep: 1 verb, past tense
                 445. punct: 1 verb, past tense
 8110129090154140942. case: 1 verb, past tense
 8206900633647566924. ROOT: 1 verb, past tense


In [54]:
# Visualize Part of Speech 

In [55]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back ")

In [56]:
from spacy import displacy

In [58]:
displacy.render(doc, style = 'dep', jupyter = True)

In [60]:
options = {'distance': 100, 'compact':'True', 'color':'yellow',
          'bg':'#09a3d5', 'font':'Times'}

In [61]:
displacy.render(doc, style = 'dep', jupyter = True, options = options)

In [62]:
doc2 = nlp("This is a sentence. This ia another sentence, possibly longer")

In [63]:
spans = list(doc2.sents)

In [64]:
displacy.serve(spans, style = 'dep', options = options)



OSError: [Errno 48] Address already in use

In [65]:
# http://127.0.0.1:5000/ to view (serve)

Named Entity Recognition

 NER : unstructured texts into pre-defined categories : person names, orgs, locations, medical records, time expressions, quantities, monetary values, percentages, etc 

In [68]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [119]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_ + ' - ' + spacy.explain(ent.label_)+ ' - '+ str(ent.start) + ' - ' + str(ent.end))
            print(ent.start_char, ent.end_char) 
    else:
        print("No entities found")

In [120]:
doc = nlp(u"hi how are you?")

In [121]:
show_ents(doc)

No entities found


In [122]:
doc = nlp(u"May I go to Washington, DC next May to see the Washington Monument?")

In [123]:
show_ents(doc)

Washington, DC - GPE - Countries, cities, states - 4 - 7
12 26
next May - DATE - Absolute or relative dates or periods - 7 - 9
27 35
the Washington Monument - ORG - Companies, agencies, institutions, etc. - 11 - 14
43 66


In [129]:
doc = nlp(u"Can I please have 500 dollars Tesla stock?")

In [125]:
show_ents(doc)

500 dollars - MONEY - Monetary values, including unit - 4 - 6
18 29


In [126]:
doc = nlp(u"Can I please have 500 dollars Apple stock?")
show_ents(doc)

500 dollars - MONEY - Monetary values, including unit - 4 - 6
18 29
Apple - ORG - Companies, agencies, institutions, etc. - 6 - 7
30 35


In [127]:
# add named entities
from spacy.tokens import Span
ORG = doc.vocab.strings[u"ORG"]

In [128]:
ORG

383

In [130]:
new_ent = Span(doc,6, 7, label = ORG)

In [131]:
doc.ents = list(doc.ents) + [new_ent]

In [132]:
show_ents(doc)

500 dollars - MONEY - Monetary values, including unit - 4 - 6
18 29
Tesla - ORG - Companies, agencies, institutions, etc. - 6 - 7
30 35


In [134]:
# add several terms into NERS
# vaccum cleaner or vaccum-cleaner as the products

In [135]:
doc = nlp(u"Our company created a brand new vaccum cleaner."
         u"This new vaccum-cleaner is the best!")

In [136]:
show_ents(doc)

No entities found


In [137]:
from spacy.matcher import PhraseMatcher

In [138]:
matcher = PhraseMatcher(nlp.vocab)

In [139]:
phrase_list = ['vaccum cleaner', 'vaccum-cleaner']

In [141]:
phrase_pattern = [nlp(text) for text in phrase_list]

In [142]:
matcher.add('newproduct', None, *phrase_pattern)

In [143]:
found_matches = matcher(doc)

In [144]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [145]:
from spacy.tokens import Span

In [146]:
PROD = doc.vocab.strings[u"PRODUCT"]

In [147]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [149]:
new_ents = [Span(doc,match[1], match[2], label = PROD) for match in found_matches]

In [150]:
doc.ents = list(doc.ents) + new_ents

In [151]:
show_ents(doc)

vaccum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services) - 6 - 8
32 46
vaccum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services) - 11 - 14
56 70


In [152]:
#  count by NER
doc = nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars.')

show_ents(doc)

29.50 - MONEY - Monetary values, including unit - 4 - 5
22 27
five dollars - MONEY - Monetary values, including unit - 12 - 14
60 72


In [158]:
len([ent for ent in doc.ents if ent.label_ == 'MONEY'])

2

In [160]:
spacy.__version__

'3.0.3'

In [174]:
doc = nlp(u'Originally priced at $29.50,\n    the sweater was marked down to five dollars.')
show_ents(doc)

29.50 - MONEY - Monetary values, including unit - 4 - 5
22 27
five dollars - MONEY - Monetary values, including unit - 13 - 15
64 76


In [179]:
nlp.remove_pipe("remove_whitespace_entities")

from spacy.language import Language

@Language.component("remove_whitespace_entities")
def remove_whitespace_entities(doc):
    doc.ents = [e for e in doc.ents if not e.text.isspace()]
    return doc

nlp.add_pipe("remove_whitespace_entities", after='ner')

<function __main__.remove_whitespace_entities(doc)>

In [180]:
doc = nlp(u'Originally priced at $29.50,\nthe sweater was marked down to five dollars.')

show_ents(doc)

29.50 - MONEY - Monetary values, including unit - 4 - 5
22 27
five dollars - MONEY - Monetary values, including unit - 13 - 15
60 72


In [181]:
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'remove_whitespace_entities', 'attribute_ruler', 'lemmatizer']


Noun Chunks

Doc.noun_chunks are base noun phrases: token spans that include the noun and words describing the noun. Noun chunks cannot be nested, cannot overlap, and do not involve prepositional phrases or relative clauses.
Where Doc.ents rely on the ner pipeline component, Doc.noun_chunks are provided by the parser.

In [182]:

doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

for chunk in doc.noun_chunks:
    print(chunk.text+' - '+chunk.root.text+' - '+chunk.root.dep_+' - '+chunk.root.head.text)

Autonomous cars - cars - nsubj - shift
insurance liability - liability - dobj - shift
manufacturers - manufacturers - pobj - toward


In [183]:
len(list(doc.noun_chunks))

3

In [185]:
len(doc.noun_chunks) # generator functions

TypeError: object of type 'generator' has no len()

Visualize NER

In [187]:
# treat as a big string
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, Sony sold only 7 thousand Walkman music players.')

displacy.render(doc, style='ent', jupyter=True)

In [189]:
# sentence
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)

In [190]:
doc2 = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, my kids sold a lot of lemonade.')

In [191]:
for sent in doc2.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)



In [192]:
for sent in doc2.sents:
    docx = nlp(sent.text)
    if docx.ents:
        displacy.render(docx, style='ent', jupyter=True)
    else:
        print(docx.text)

By contrast, my kids sold a lot of lemonade.


In [194]:
# view specific entities
options = {'ents': ['ORG', 'PRODUCT']}

displacy.render(doc, style='ent', jupyter=True, options=options)

In [195]:

colors = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', 'PRODUCT': 'radial-gradient(yellow, green)'}

options = {'ents': ['ORG', 'PRODUCT'], 'colors':colors}

displacy.render(doc, style='ent', jupyter=True, options=options)

In [197]:
displacy.serve(doc, style='ent', options=options)

OSError: [Errno 48] Address already in use

http://127.0.0.1:5000

Sentence Segmentation

In [198]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [199]:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


doc.sents is a generator, so doc.sents[1], indexing will fail.
We just ietarte
doc[0] ok
but doc.sents[0] not ok

In [200]:
list(doc.sents)

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [202]:
list(doc.sents)[0]  # you can get sentence this way

This is the first sentence.

In [204]:
type(list(doc.sents)[0]) # not string, but span

spacy.tokens.span.Span

In [206]:
doc_sents = [sent for sent in doc.sents]
doc_sents

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [207]:
print(doc_sents[1].start, doc_sents[1].end)

6 11


In [209]:
doc2 = nlp(u'This is a sentence. This is a sentence. This is a sentence.')

for token in doc2:
    print(token.is_sent_start, ' '+token.text)

True  This
False  is
False  a
False  sentence
False  .
True  This
False  is
False  a
False  sentence
False  .
True  This
False  is
False  a
False  sentence
False  .


In [210]:
doc3 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc3.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things."
-Peter Drucker


In [211]:
# originally, split by period and space, we just add a new one based on ;

from spacy.language import Language

@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    for token in doc[:-1]:  # ensure i+1 doesn't get past the index range
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc

nlp.add_pipe("set_custom_boundaries", before='parser')

nlp.pipe_names


['tok2vec',
 'tagger',
 'set_custom_boundaries',
 'parser',
 'ner',
 'attribute_ruler',
 'lemmatizer']

In [212]:
doc4 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc4.sents:
    print(sent)

"Management is doing things right;
leadership is doing the right things."
-Peter Drucker


In [213]:
# And yet the new rule doesn't apply to the older Doc object:
for sent in doc3.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things."
-Peter Drucker


In [214]:
doc3[7]

leadership

In [215]:
doc3[7].is_sent_start = True

ValueError: [E043] Refusing to write to token.sent_start if its document is parsed, because this may cause inconsistent state.

In [216]:
nlp = spacy.load('en_core_web_sm')  # reset to the original

mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

# SPACY DEFAULT BEHAVIOR:
doc = nlp(mystring)

for sent in doc.sents:
    print([token.text for token in sent])

['This', 'is', 'a', 'sentence', '.']
['This', 'is', 'another', '.']
['\n\n', 'This', 'is', 'a', '\n', 'third', 'sentence', '.']


In [217]:
print(mystring)

This is a sentence. This is another.

This is a 
third sentence.


In [1]:
# complete changing the rules
