In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
# run the pipeline on a sample sentence and echo the text stored on the Doc
doc = nlp("The quick brown fox jumped over the lazy dog's back.")
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [3]:
# coarse part of speech and fine-grained tag of the token at index 4;
# the tag inventory is listed at https://spacy.io/api/annotation#pos-tagging
(doc[4].pos_, doc[4].tag_)

('VERB', 'VBD')

In [4]:
# spacy.explain turns a tag abbreviation into a readable description
spacy.explain("VBD")

'verb, past tense'

In [5]:
# one row per token: text, coarse POS, fine-grained tag, and the tag's description
for token in doc:
    description = spacy.explain(token.tag_)
    print(f"{token.text:10} {token.pos_:10} {token.tag_:10} {description}")

The        DET        DT         determiner
quick      ADJ        JJ         adjective
brown      ADJ        JJ         adjective
fox        PROPN      NNP        noun, proper singular
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


In [6]:
# first of two sentences used to compare how the verb "read" gets tagged
doc1 = nlp("I read books on NLP.")
token1 = doc1[1]
tag_description1 = spacy.explain(token1.tag_)
print(f"{token1.text:10} {token1.pos_:10} {token1.tag_:10} {tag_description1}")

read       VERB       VBD        verb, past tense


In [7]:
# second sentence of the comparison: same verb, different object
doc2 = nlp("I read a book on NLP.")
token2 = doc2[1]
tag_description2 = spacy.explain(token2.tag_)
print(f"{token2.text:10} {token2.pos_:10} {token2.tag_:10} {tag_description2}")

read       VERB       VBD        verb, past tense


In [8]:
# count the tokens per coarse part of speech; the resulting dict is keyed by
# the integer ID of each POS value, not by its name
pos_counts = doc.count_by(spacy.attrs.POS)
pos_counts

{90: 2, 84: 3, 96: 1, 100: 1, 85: 1, 92: 2, 94: 1, 97: 1}

In [9]:
# decode one of the integer IDs from pos_counts back into its POS name
doc.vocab[84].text

'ADJ'

In [10]:
# the same POS name read directly from a token via its pos_ attribute
doc[2].pos_

'ADJ'

In [11]:
# one line per POS: numeric ID, POS name, number of tokens
for pos_id, count in sorted(pos_counts.items()):
    pos_name = doc.vocab[pos_id].text
    print(f"{pos_id}. {pos_name:5} {count}")

84. ADJ   3
85. ADP   1
90. DET   2
92. NOUN  2
94. PART  1
96. PROPN 1
97. PUNCT 1
100. VERB  1


In [12]:
# one line per fine-grained tag: numeric ID, tag name, number of tokens
tag_counts = doc.count_by(spacy.attrs.TAG)
for tag_id, count in sorted(tag_counts.items()):
    tag_name = doc.vocab[tag_id].text
    print(f"{tag_id}. {tag_name:5} {count}")

74. POS   1
1292078113972184607. IN    1
10554686591937588953. JJ    3
12646065887601541794. .     1
15267657372422890137. DT    2
15308085513773655218. NN    2
15794550382381185553. NNP   1
17109001835818727656. VBD   1


In [13]:
# one line per dependency label: numeric ID, label name, number of tokens
dep_counts = doc.count_by(spacy.attrs.DEP)
for dep_id, count in sorted(dep_counts.items()):
    dep_name = doc.vocab[dep_id].text
    print(f"{dep_id}. {dep_name:5} {count}")

402. amod  3
415. det   2
429. nsubj 1
439. pobj  1
440. poss  1
443. prep  1
445. punct 1
8110129090154140942. case  1
8206900633647566924. ROOT  1


### visualizing POS

In [14]:
from spacy import displacy

In [15]:
# rebuild the sample Doc that the dependency visualizations render
doc = nlp("The quick brown fox jumped over the lazy dog's back.")

In [16]:
# default syntactic (dependency) visualization
displacy.render(doc, style="dep", jupyter=True)

In [17]:
# modifying the visual through an options dictionary
# 'compact' is a boolean option, so pass True rather than the string 'True'
# (any non-empty string, including 'False', would be truthy)
options = {'distance': 100, 'compact': True, 'color': 'white', 'bg': '#09a3d5', 'font': 'Times'}
displacy.render(doc, style='dep', jupyter=True, options=options)

In [18]:
# collect the sentences of a short text into a list of spans
doc3 = nlp("This is a sentence. This is another sentence, possibly longer than other.")
spans = [sentence for sentence in doc3.sents]

In [19]:
# uncomment the line below to serve the visualization, then open
# 'http://127.0.0.1:5000' in a web browser to display it
# displacy.serve(spans, style='dep', options={'distance':100})




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


### named entity recognition (NER)        

In [20]:
def show_ents(doc):
    """Print every named entity of ``doc`` as ``text - label - description``.

    Prints "No entity found!" when ``doc.ents`` is empty. Returns None.
    """
    if not doc.ents:
        print("No entity found!")
        return
    for entity in doc.ents:
        # spacy.explain returns None for labels it has no glossary entry for
        # (e.g. custom labels); fall back to a placeholder instead of letting
        # string concatenation with None raise a TypeError
        description = spacy.explain(entity.label_) or "no description available"
        print(f"{entity.text} - {entity.label_} - {description}")

In [21]:
# a short greeting: exercises the branch where no entity is detected
doc4 = nlp("Hi, how are you?")
show_ents(doc4)

No entity found!


In [22]:
# one text mixing places, dates, organisations and a money amount
doc5 = nlp("May I go to Washington, DC next May to see the Washington Monument? or Can I have 500 dollars Microsoft stock?")
show_ents(doc5)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.
500 dollars - MONEY - Monetary values, including unit
Microsoft - ORG - Companies, agencies, institutions, etc.


In [23]:
# a sentence where the model misses an organisation
doc6 = nlp("Google to build a U.K. facotyr for $6 million.")
show_ents(doc6)
# note: 'Google' is not reported as an 'ORG' entity here

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [24]:
# manually adding an entity that the model did not detect
from spacy.tokens import Span
# look up the hash value of the ORG label
ORG = doc6.vocab.strings["ORG"]
# span covering 'Google' in doc6: start=0, end=1
new_ent = Span(doc6, 0, 1, label=ORG)
# doc.ents is reassigned as a whole: the existing entities plus the new one
doc6.ents = [*doc6.ents, new_ent]
show_ents(doc6)

Google - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [25]:
# the two literals are joined by implicit concatenation, so the first one must
# end with a space; without it the sentences fuse into "cleaner.This"
doc7 = nlp(u"Our company created a brand new vacuum cleaner. "
          u"This new vacuum-cleaner is the best in show.")
show_ents(doc7)

No entity found!


In [26]:
from spacy.matcher import PhraseMatcher
# phrase matcher built on the pipeline's vocabulary
matcher = PhraseMatcher(nlp.vocab)
# spellings that should all be recognised as the same product
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
# every phrase is run through nlp to obtain its pattern
phrase_patterns = list(map(nlp, phrase_list))
# register all patterns under a single rule name
matcher.add('newproduct', None, *phrase_patterns)
# each match comes back as a (match_id, start, end) tuple
found_matches = matcher(doc7)
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [27]:
from spacy.tokens import Span
# hash value of the PRODUCT label
PROD = doc7.vocab.strings["PRODUCT"]
# turn every phrase match into a PRODUCT-labelled span over the same tokens
new_ents = [Span(doc7, start, end, label=PROD) for _, start, end in found_matches]
doc7.ents = [*doc7.ents, *new_ents]
show_ents(doc7)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [28]:
# sentence with two money amounts written in different notations
doc8 = nlp("Originally I paid $29.95 for a toy and now it is marked as 15 dollars.")

In [29]:
# keep only the entities whose label is MONEY
money_entities = [entity for entity in doc8.ents if entity.label_ == "MONEY"]
money_entities

[29.95, 15 dollars]

### visualizing NER

In [30]:
from spacy import displacy
# the two literals are joined by implicit concatenation, so the first one must
# end with a space; without it the sentences fuse into "million.By"
doc9 = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. "
          u"By contrast, Sony only sold 8 thousand Walkman music players.")

In [31]:
# highlight the named entities of the whole text
displacy.render(doc9, style='ent', jupyter=True)

In [32]:
# render the entity visual one sentence at a time
for sentence in doc9.sents:
    displacy.render(sentence, style='ent', jupyter=True)

In [33]:
# options dictionary for the entity visual: which labels to show and a custom colour
colors = dict(ORG='red')
options = dict(ents=['PRODUCT', 'ORG'], colors=colors)
displacy.render(doc9, style='ent', jupyter=True, options=options)

### sentence segmentation

In [34]:
# three short sentences to demonstrate sentence segmentation
doc10 = nlp("This is the first sentence. This is the second. This is the last one.")

In [35]:
# doc.sents produces one span per detected sentence
for sentence in doc10.sents:
    print(sentence)

This is the first sentence.
This is the second.
This is the last one.


In [36]:
# doc.sents is a generator, so indexing it directly raises an error:
# doc10.sents[0]
# materialise it into a list first, then index the list
list(doc10.sents)[0]

This is the first sentence.

###### adding new segmentation rule

In [37]:
# in some texts the period is not the only correct sentence separator
doc11 = nlp('"Management is doing the right things; leadership is doing the right things." - Peter Ducker')
for sentence in doc11.sents:
    print(sentence)

"Management is doing the right things; leadership is doing the right things."
- Peter Ducker


In [38]:
# defining a new sentence separator
def set_custom_boundaries(doc):
    """Mark the token after every semicolon as the start of a sentence.

    The last token is excluded from the scan because no token follows it.
    Modifies ``doc`` in place and returns it.
    """
    semicolon_positions = [tok.i for tok in doc[:-1] if tok.text == ';']
    for position in semicolon_positions:
        doc[position + 1].is_sent_start = True  # next token opens a sentence
    return doc

In [39]:
# add the custom boundary rule to the pipeline, ahead of the parser.
# The membership check keeps this cell safe to re-run: adding a component
# whose name is already in the pipeline raises an error.
if 'set_custom_boundaries' not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundaries, before='parser')
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [40]:
# re-process the quote now that the custom rule is part of the pipeline
doc11 = nlp('"Management is doing the right things; leadership is doing the right things." - Peter Ducker')
for sentence in doc11.sents:
    print(sentence)

"Management is doing the right things;
leadership is doing the right things."
- Peter Ducker


###### changing segmentation rules

In [41]:
# reload en_core_web_sm to get a pipeline without the earlier customisation
nlp = spacy.load("en_core_web_sm")
doc12 = nlp("This is a sentence. This is another \n\nThis is \nthird sentence.")
doc12

This is a sentence. This is another 

This is 
third sentence.

In [42]:
# sentences produced by the default segmentation for the multi-line text
for sentence in doc12.sents:
    print(sentence)

This is a sentence.
This is another 


This is 
third sentence.


In [43]:
from spacy.pipeline import SentenceSegmenter

In [44]:
# to segment sentences on a custom separator, define a segmentation strategy
def split_on_new_lines(doc):
    """Yield consecutive slices of ``doc``, cutting after each newline token.

    A token whose text starts with a newline ends the current slice; the
    newline token stays at the end of that slice and the following token
    opens the next one. Yields nothing for an empty ``doc``.
    """
    start = 0
    seen_new_line = False
    for word in doc:
        if seen_new_line:
            yield doc[start:word.i]
            start = word.i
            seen_new_line = False
        # checked for every token, including one that has just opened a new
        # slice, so a newline token directly after another one still cuts
        if word.text.startswith('\n'):
            seen_new_line = True
    # an empty doc has nothing left to emit; skip the empty trailing slice
    if start < len(doc):
        yield doc[start:]

In [45]:
# wrap the custom strategy in a SentenceSegmenter and add it to the pipeline,
# then create a Doc so the new segmentation is applied to it
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_new_lines)
nlp.add_pipe(sbd)
doc13 = nlp("This is a sentence. This is another \n\nThis is \nthird sentence.")

In [46]:
# sentences produced by the newline-based segmentation
for sentence in doc13.sents:
    print(sentence)

This is a sentence. This is another 


This is 

third sentence.
