In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [7]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [9]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [12]:
print(doc[4].pos_,doc[4].tag_)

VERB VBD


In [22]:
for token in doc:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

The        DET      DT     determiner
quick      ADJ      JJ     adjective (English), other noun-modifier (Chinese)
brown      ADJ      JJ     adjective (English), other noun-modifier (Chinese)
fox        NOUN     NN     noun, singular or mass
jumped     VERB     VBD    verb, past tense
over       ADP      IN     conjunction, subordinating or preposition
the        DET      DT     determiner
lazy       ADJ      JJ     adjective (English), other noun-modifier (Chinese)
dog        NOUN     NN     noun, singular or mass
's         PART     POS    possessive ending
back       NOUN     NN     noun, singular or mass
.          PUNCT    .      punctuation mark, sentence closer


In [23]:
POS_counts = doc.count_by(spacy.attrs.POS)

In [24]:
POS_counts

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}

In [31]:
doc.vocab[84].text

'ADJ'

In [39]:
for key,value in sorted(POS_counts.items()):
    print(f"{key}. {doc.vocab[key].text:{5}} - {value}")

84. ADJ   - 3
85. ADP   - 1
90. DET   - 2
92. NOUN  - 3
94. PART  - 1
97. PUNCT - 1
100. VERB  - 1


In [40]:
TAG_counts = doc.count_by(spacy.attrs.TAG)

for key,value in sorted(TAG_counts.items()):
    print(f"{key}. {doc.vocab[key].text:{5}} - {value}")

74. POS   - 1
1292078113972184607. IN    - 1
10554686591937588953. JJ    - 3
12646065887601541794. .     - 1
15267657372422890137. DT    - 2
15308085513773655218. NN    - 3
17109001835818727656. VBD   - 1


In [43]:
from spacy import displacy
displacy.render(doc, style='dep', jupyter=True, options={'distance': 70})

In [45]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
           print(ent.text + " - " + ent.label_ + " - " + str(spacy.explain(ent.label_))) 
    else:
        print("No entities")

In [46]:
doc = nlp(u"Hi how are you?")
show_ents(doc)

No entities


In [48]:
doc = nlp(u"may I go to Washington,DC next May to see the Washington Monument")
show_ents(doc)

Washington,DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [49]:
doc = nlp(u"Tesla to build U.K. factory for $6 million")
show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [53]:
from spacy.tokens import Span

ORG = doc.vocab.strings[u"ORG"]

In [51]:
ORG

383

In [54]:
new_ent = Span(doc,0,1,label=ORG)

In [55]:
doc.ents = list(doc.ents) + [new_ent]

In [56]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [57]:
doc = nlp(u"Our company created a brand new vacuum cleaner."
          u"This new vacuum-cleaner is the best in show")

In [58]:
show_ents(doc)

No entities


In [59]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [60]:
phrase_list = ['vacuum cleaner','vacuum-cleaner']

In [61]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [62]:
matcher.add('newproduct',phrase_patterns)

In [64]:
found_matches = matcher(doc)
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [65]:
from spacy.tokens import Span

PROD = doc.vocab.strings[u"PRODUCT"]

In [68]:
new_ents =  [Span(doc,match[1],match[2],label=PROD) for match in found_matches]

In [69]:
doc.ents = list(doc.ents) + new_ents

In [70]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [71]:
doc = nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars.')

show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [72]:
len([ent for ent in doc.ents if ent.label_ == 'MONEY'])

2

In [86]:
from spacy import displacy

doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million."
          u"By contrast, Sony only sold 8 thousand Walkman music players.")

In [81]:
displacy.render(doc,style='ent',jupyter=True)

In [82]:
for sent in doc.sents:
    displacy.render(nlp(sent.text),style='ent',jupyter=True)

In [89]:
options = {'ents':['PRODUCT','ORG']}

displacy.render(doc,style='ent',jupyter=True,options=options)

In [102]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million."
          u"By contrast, Sony only sold 8 thousand Walkman music players.")

PROD = doc.vocab.strings[u"PRODUCT"]

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

phrase_list = ['Walkman']
phrase_patterns = [nlp(text) for text in phrase_list]

matcher.add('newproduct',phrase_patterns)

found_matches = matcher(doc)

new_ents =  [Span(doc,match[1],match[2],label=PROD) for match in found_matches]

doc.ents = list(doc.ents) + new_ents

displacy.render(doc,style='ent',jupyter=True,options=options)

In [103]:
doc = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

for sent in doc.sents:
    print(sent.text)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [106]:
list(doc.sents)

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [107]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [108]:
doc.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

In [110]:
for sent in doc.sents:
    print(sent,end='\n\n')

"Management is doing the right things; leadership is doing the right things."

-Peter Drucker



In [120]:
from spacy.language import Language

@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if(token.text == ';'):
            doc[token.i+1].is_sent_start = True
    return doc

In [122]:
nlp.add_pipe("set_custom_boundaries", before="parser")

nlp.pipe_names

['tok2vec',
 'tagger',
 'set_custom_boundaries',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [124]:
doc2 = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

for sent in doc2.sents:
    print(sent,end='\n\n')

"Management is doing the right things;

leadership is doing the right things."

-Peter Drucker



In [125]:
nlp = spacy.load('en_core_web_sm')

In [127]:
mystring = u"This is a sentence. This is another.\n\nThis is a\nthird sentence"
print(mystring)

This is a sentence. This is another.

This is a
third sentence


In [128]:
doc = nlp(mystring)

for sent in doc.sents:
    print(sent)

This is a sentence.
This is another.


This is a
third sentence


In [134]:
from spacy.language import Language

@Language.component("newline_segmenter")
def split_on_newlines(doc):
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'): 
            seen_newline = True
    yield doc[start:]
    new_docs = [span for span in doc]
    return Doc(doc.vocab, words=new_docs)

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Add the custom component for newline-based segmentation
nlp.add_pipe("newline_segmenter")

# Add the default sentencizer component for other sentence boundaries
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")


In [135]:
doc = nlp(mystring)

for sent in doc.sents:
    print(sent)

ValueError: [E005] Pipeline component 'newline_segmenter' returned <class 'generator'> instead of a Doc. If you're using a custom component, maybe you forgot to return the processed Doc?