In [272]:
import spacy

In [273]:
nlp = spacy.load('en_core_web_sm')

In [276]:
doc = nlp(u'Bank of America is looking to buy a U.S. startup for 6 millions and sell it to 100 Billions')

In [277]:
for token in doc:
    print(token.text, token)

Bank Bank
of of
America America
is is
looking looking
to to
buy buy
a a
U.S. U.S.
startup startup
for for
6 6
millions millions
and and
sell sell
it it
to to
100 100
Billions Billions


In [278]:
for token in doc:
    print(token.text, token.pos_)

Bank PROPN
of ADP
America PROPN
is VERB
looking VERB
to PART
buy VERB
a DET
U.S. PROPN
startup NOUN
for ADP
6 NUM
millions NOUN
and CCONJ
sell VERB
it PRON
to ADP
100 NUM
Billions NOUN


In [279]:
for token in doc:
    print(token.text, token.pos_, token.dep_)

Bank PROPN nsubj
of ADP prep
America PROPN pobj
is VERB aux
looking VERB ROOT
to PART aux
buy VERB xcomp
a DET det
U.S. PROPN compound
startup NOUN dobj
for ADP prep
6 NUM nummod
millions NOUN pobj
and CCONJ cc
sell VERB conj
it PRON dobj
to ADP prep
100 NUM nummod
Billions NOUN pobj


In [280]:
'''ner: name entity recognizer'''
nlp.pipeline


[('tagger', <spacy.pipeline.pipes.Tagger at 0x1a2b5cad30>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1a2ca60228>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1a2ca60288>)]

In [281]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [282]:
doc2 = nlp(u"Tesla isn't looking into         startups anymore.")

In [283]:
for token in doc2:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is VERB aux
n't ADV neg
looking VERB ROOT
into ADP prep
         SPACE 
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


### Spans

#### Large documents can be difficult to work with. A span slices the large document in of form of Doc[start:stop]

In [284]:
doc3 = nlp(u'Health officials in Dallas City and Los Angeles County are signaling a change in local strategy when it comes to coronavirus testing, recommending that doctors avoid testing patients except in cases where a test result would significantly change the course of treatment. A news release from the Los Angeles Department of Public Health this week advised doctors not to test those experiencing only mild respiratory symptoms unless “a diagnostic result will change clinical management or inform public health response. The recommendation reflects a "shifting from a strategy of case containment to slowing disease transmission and averting excess morbidity and mortality," according to the statement.The guidance said coronavirus testing at L.A. County public health labs will prioritized those with symptoms, health care workers, residents of long-term care facilities, paramedics and other high-risk situations. Others are encouraged to simply stay at home. At about the same time, the New York City Department of Health directed all healthcare facilities to immediately stop testing non-hospitalized patients for Covid-19.')

In [287]:
shift_quote = doc3[87:107]

In [288]:
'''this is the span of the document'''
print(shift_quote)

"shifting from a strategy of case containment to slowing disease transmission and averting excess morbidity and mortality,"


In [289]:
type(shift_quote)

spacy.tokens.span.Span

In [290]:
type(doc3)

spacy.tokens.doc.Doc

In [291]:
doc4 = nlp(u"This is the first sentence. This is the second sentence. This is another sentence. This is the last sentence")

In [292]:
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is the second sentence.
This is another sentence.
This is the last sentence


In [293]:
doc4[6]

This

In [294]:
doc4[6].is_sent_start

True

### Tokens are the basic building blocks of the document object, which helps to understand the meaning of the text, which is is derived from token and shows the relationship to one to another

#### prefix: char at the beginning, suffix: at the end, infix: char in between

In [295]:
mystring = '"We\'re moving to S.F.!"'

In [296]:
mystring

'"We\'re moving to S.F.!"'

In [297]:
print(mystring)

"We're moving to S.F.!"


In [298]:
doc5 = nlp(mystring)

In [299]:
for token in doc5:
    print(token.text)

"
We
're
moving
to
S.F.
!
"


In [300]:
doc6 = nlp(u"please send-email to emailme@krishana.com, and visit me at http://www.krishanagyanwali.com!!")

In [301]:
for t in doc6:
    print(t)

please
send
-
email
to
emailme@krishana.com
,
and
visit
me
at
http://www.krishanagyanwali.com
!
!


In [302]:
doc7 = nlp(u"A 5km SF cab charges $12.45, let's have a party")

In [303]:
for t in doc7:
    print(t)

A
5
km
SF
cab
charges
$
12.45
,
let
's
have
a
party


In [304]:
len(doc7)

14

In [305]:
len(doc7.vocab)

631

In [306]:
"""document object can't be reassigned"""

"document object can't be reassigned"

In [307]:
doc7[0] = 'bill'

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [310]:
doc8 = nlp(u"Bank of America to build a Tech Center in Plano for $20 Million dollars!!!")

In [311]:
for token in doc8:
    print(token.text, end="|")

Bank|of|America|to|build|a|Tech|Center|in|Plano|for|$|20|Million|dollars|!|!|!|

In [312]:
for entity in doc8.ents:
    print(entity)
    print(str(spacy.explain(entity.label_)))
    print(entity.label_, '\n')

Bank of America
Companies, agencies, institutions, etc.
ORG 

Tech Center
Buildings, airports, highways, bridges, etc.
FAC 

Plano
Countries, cities, states
GPE 

$20 Million dollars
Monetary values, including unit
MONEY 



In [313]:
doc9 = nlp(u"Autonomous cars shift insurance liability towards manufacturers.")

In [314]:
for chunk in doc9.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


In [315]:
from spacy import displacy
"""built in visitor"""

'built in visitor'

In [316]:
doc = nlp(u"Apple is going to build a U.K. factory for $6 million.")

In [317]:
"""style -> dep syntactic dependency, ent -> entity"""
displacy.render(doc, style='dep', jupyter=True, options={'distance':70})

In [318]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million")

In [319]:
displacy.render(doc, style='ent')

In [320]:
doc_1 = nlp(u"This is a sentence")

In [None]:
displacy.serve(doc, style='dep')

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



### Stemming 
#### searching for the key word-write, it might return writes, writing, wrote, in this case, "write" is the stem for 'write, writing, writes...'
#### It is a method for cataloging the related words, it essentially chops off letters from end until it reaches the stem is reached. in some cases this may not handle all cases, at that point lemmatization comes into picture
#### common and effective stemming tools is Porter's Algorithms developed by Martin Porter in 1980, five phases of word reduction each with its own set of mapping rule, Snowball is the of stemming language also developed by Martin Porter, this algorithm is called "English Stemmer" or "Porter2 Stemmer"

In [None]:
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
p_stemmer = PorterStemmer()

In [None]:
words = ['run', 'runner', 'running', 'ran', 'easily', 'fairly', 'fairness']

In [None]:
for word in words:
    print(word + '------>' + p_stemmer.stem(word))

In [None]:
from nltk.stem.snowball import SnowballStemmer

In [None]:
s_stemmer = SnowballStemmer(language='english')

In [None]:
for word in words:
    print(words, '----->', s_stemmer.stem(word))

In [17]:
words = ['generous', 'generation', 'generously', 'generate']

In [18]:
for word in words:
    print(words, '----->', s_stemmer.stem(word))

['generous', 'generation', 'generously', 'generate'] -----> generous
['generous', 'generation', 'generously', 'generate'] -----> generat
['generous', 'generation', 'generously', 'generate'] -----> generous
['generous', 'generation', 'generously', 'generate'] -----> generat


#### Lemmatization
#### lemmatization looks beyond reduction, and consider language's full vocabulary to apply a morphological analysis to the word. Lemma of 'was' is 'be', the lemma of mice is 'mouse', lemma of meeting might be 'meet' or 'meeting depending on its use in the sentence. Spacy only has lemmatization, it doesn't have stemming like NLTK has

In [22]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

In [23]:
for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 VERB 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 ADP 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 ADP 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [31]:
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [32]:
doc2 = nlp(u"I saw ten mice today!")

In [34]:
show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
ten          NUM    7970704286052693043    ten
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


#### Stop words
#### words like 'a', 'the'.. should be filter words, stacy holds 326 english stopwords

In [37]:
print(nlp.Defaults.stop_words)

{'latterly', 'him', "'ve", 'themselves', 'only', 'although', 'off', 'always', 'quite', 'they', '’s', 'ourselves', 'everywhere', 'through', 'put', 'whereafter', 'about', 'where', 'twenty', 'mine', 'doing', 'anyone', 'two', 'hers', 'thereby', 'throughout', 'to', 'does', 'empty', 'again', 'namely', '’re', 'further', 'bottom', 'am', 'after', 'hence', 'show', 'whole', 'you', 'your', 'five', '’m', 'though', 'once', 'before', 'otherwise', 'whither', 'out', 'but', 'into', 'i', 'them', 'up', 'together', 'go', 'from', 'three', 'whose', 'noone', 'be', 'yourselves', 'thus', 'ca', 'third', 'alone', 'been', 'never', 'all', 'front', 'thence', 'me', 'someone', 'between', 'not', 'nothing', 'very', 'almost', 'made', 'than', 'among', 'down', 'per', 'must', 'hereby', '‘ve', 'n’t', '’ve', 'over', 'whence', 'hereupon', 'beforehand', 'how', 'even', 'it', 'when', 'anyhow', 'can', 'regarding', 'therein', 'cannot', 'towards', 'other', 'wherein', 'nine', 'above', 're', 'n‘t', 'ten', 'onto', 'was', 'latter', 'cal

In [38]:
len(nlp.Defaults.stop_words)

326

In [39]:
nlp.vocab['is']

<spacy.lexeme.Lexeme at 0x1a2b1dc8b8>

In [40]:
nlp.vocab['mystery'].is_stop

False

In [41]:
nlp.vocab['is'].is_stop

True

#### You can manually add stop words

In [42]:
nlp.Defaults.stop_words.add('btw')

In [43]:
nlp.vocab['btw'].is_stop

True

In [44]:
len(nlp.Defaults.stop_words)

327

In [45]:
nlp.Defaults.stop_words.add('DDowddwo')

In [46]:
nlp.vocab['DDowddwo'].is_stop

False

In [47]:
nlp.vocab['DDowddwo'].is_stop = True

In [48]:
nlp.vocab['DDowddwo'].is_stop

True

In [49]:
nlp.Defaults.stop_words.remove('DDowddwo')

In [51]:
nlp.vocab['DDowddwo'].is_stop = False

In [52]:
nlp.vocab['DDowddwo'].is_stop

False

#### Vocabulary and Phrase Matching with Spacy

In [55]:
"""Rule matching, like regex but powerful"""

'Rule matching, like regex but powerful'

In [56]:
from spacy.matcher import Matcher

In [57]:
matcher = Matcher(nlp.vocab)

In [59]:
# SolarPower
pattern1 = [{'LOWER': 'solarpower'}]
# Solar-power
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]
# Solar Power
pattern3 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]

In [60]:
matcher.add('SolarPower', None,pattern1, pattern2, pattern3 )

In [61]:
doc = nlp(u"The Solar Power continues to grow solarpower for betterment of solar-power")

In [62]:
found_matches = matcher(doc)

In [64]:
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 6, 7), (8656102463236116519, 10, 13)]


In [65]:
found_matches

[(8656102463236116519, 1, 3),
 (8656102463236116519, 6, 7),
 (8656102463236116519, 10, 13)]

In [66]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id] # get string representation
    span = doc[start:end] # get the start and end of matched span
    print(match_id, start, end, span.text)

8656102463236116519 1 3 Solar Power
8656102463236116519 6 7 solarpower
8656102463236116519 10 13 solar-power


In [67]:
"""remove pattern or matcher"""

'remove pattern or matcher'

In [68]:
matcher.remove('SolarPower')

In [69]:
#solarpower SolarPower
pattern1 = [{'LOWER': 'solarpower'}]
# solar--_+.Power
pattern4 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'power'}]

In [77]:
matcher.add('SolarPower', None, pattern1, pattern4)

In [78]:
doc_match = nlp(u"Solar--power is solarpower, solar__*power is a solar")

In [79]:
found_matches = matcher(doc_match)

In [80]:
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


#### Phrase matcher

In [139]:
from spacy.matcher import PhraseMatcher

In [140]:
matcher = PhraseMatcher(nlp.vocab)

In [141]:
with open('./book.txt') as f:
    doc3 = nlp(f.read())

In [142]:
phrase_list = ['orchard-planting', 'pointed out the room', 'Eastern Home']

In [143]:
phrase_pattern = [nlp(text) for text in phrase_list]

In [144]:
phrase_pattern

[orchard-planting, pointed out the room, Eastern Home]

In [145]:
type(phrase_pattern)

list

In [146]:
type(phrase_pattern[0])

spacy.tokens.doc.Doc

In [147]:
matcher.add('storymatcher', None, *phrase_pattern)

In [148]:
found_matches = matcher(doc3)

In [149]:
found_matches

[(14996842965689958555, 44649, 44652), (14996842965689958555, 79085, 79089)]

In [150]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc3[start:end]
    print(match_id, string_id, start, end, span.text)

14996842965689958555 storymatcher 44649 44652 orchard-planting
14996842965689958555 storymatcher 79085 79089 pointed out the room


In [271]:
for match_id, start, end in found_matches:
    span = doc3[start-5:end+10]
    print(start, end, span.text)

44649 44652 little has been done in orchard-planting. Figs
are readily grown and it is said
79085 79089 before the house, and pointed out the room the great writer
occupied during his stay in Monterey


### Parts of Speech tag - POS and NER- Name Entity Recognition

In [154]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back")

In [155]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back


In [156]:
doc[4]

jumped

In [157]:
doc[4].pos_

'VERB'

In [160]:
print(doc[4].tag_)
#'''it return verb in past tense fine grained tag'''

VBD


In [168]:
for token in doc:
    print(f"{token.text:{20}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

The                  DET        DT         determiner
quick                ADJ        JJ         adjective
brown                ADJ        JJ         adjective
fox                  NOUN       NN         noun, singular or mass
jumped               VERB       VBD        verb, past tense
over                 ADP        IN         conjunction, subordinating or preposition
the                  DET        DT         determiner
lazy                 ADJ        JJ         adjective
dog                  NOUN       NN         noun, singular or mass
's                   PART       POS        possessive ending
back                 NOUN       NN         noun, singular or mass


In [195]:
# Spacy is smart enough to read the context
doc = nlp(u"I read books on NLP.")

In [196]:
word = doc[1]

In [197]:
word.text

'read'

In [198]:
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read       VERB       VBP        verb, non-3rd person singular present


In [208]:
doc1 = nlp(u"I read a book on NLP.")

In [209]:
token = doc1[1]
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read       VERB       VBP        verb, non-3rd person singular present


In [214]:
# part of speech count
doc = nlp(u"The quick brown fox jumped over the lazy dog's back")
pos_count = doc.count_by(spacy.attrs.POS)

In [215]:
pos_count

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1}

In [218]:
doc.vocab[84].text

'ADJ'

In [219]:
doc.vocab[92].text

'NOUN'

In [221]:
len(doc.vocab)

10086

In [222]:
# Visualize parts of speech
doc = nlp(u"The quick brown fox jumped over the lazy dog")

In [225]:
from spacy import displacy
displacy.render(doc, style='dep', )

In [232]:
options = {'distance': 100, 'compact': 'True', 'color': 'yellow', 'bg': '#09a3d5', 'font': 'Times'}

In [233]:
displacy.render(doc, style='dep', options = options )

In [234]:
## NER: Name Entity Recognition
# It loces and classify named entity such as person's name, company, location, 
# medical codes, time expression, quantities, values, percentage, etc


In [247]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text, ent.label, str(spacy.explain(ent.label_)))
    else: 
        print('No entities found')

In [248]:
doc = nlp(u"Hi, how are you?")

In [249]:
show_ents(doc)

No entities found


In [252]:
doc = nlp(u"My I got to Washington, DC next May to see the Washington Monument")

In [253]:
show_ents(doc)

Washington, DC 384 Countries, cities, states
next May 391 Absolute or relative dates or periods
the Washington Monument 383 Companies, agencies, institutions, etc.


In [256]:
doc = nlp(u"Can I buy 800 dollars stocks of Microsoft")

In [258]:
show_ents(doc)

800 dollars 394 Monetary values, including unit
Microsoft 383 Companies, agencies, institutions, etc.


In [259]:
doc = nlp(u"Can I buy 800 dollars stocks of Tesla")

In [260]:
show_ents(doc)

800 dollars 394 Monetary values, including unit


In [261]:
from spacy.tokens import Span
ORG = doc.vocab.strings[u"ORG"]

In [262]:
ORG

383

In [267]:
new_ent = Span(doc, 7,8, label=ORG)

In [268]:
new_ent

Tesla

In [269]:
doc.ents = list(doc.ents) + [new_ent]

In [270]:
show_ents(doc)

800 dollars 394 Monetary values, including unit
Tesla 383 Companies, agencies, institutions, etc.


In [None]:
# add multi term entities 