In [1]:
import spacy

In [None]:
#!python -m spacy download en_core_web_sm

In [4]:
nlp=spacy.load('en_core_web_sm')

## Tokenization

In [5]:
#Create a Doc object
doc = nlp(u'Corona will go very soon. Do not get panic, maintain social distancing and follow the instructions. Cases in U.S. have reduced in last 48 hours')

#Print each token separately
for token in doc:
    print(token.text, token.pos_, token.dep_)

Corona PROPN nsubj
will AUX aux
go VERB ROOT
very ADV advmod
soon ADV advmod
. PUNCT punct
Do AUX aux
not PART neg
get VERB ROOT
panic NOUN dobj
, PUNCT punct
maintain VERB conj
social ADJ amod
distancing NOUN dobj
and CCONJ cc
follow VERB conj
the DET det
instructions NOUN dobj
. PUNCT punct
Cases NOUN nsubj
in ADP prep
U.S. PROPN pobj
have AUX aux
reduced VERB ROOT
in ADP prep
last ADJ amod
48 NUM nummod
hours NOUN pobj


In [6]:
nlp.pipeline


[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x17f12a260>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x17f129120>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x17ed398c0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x17e2609c0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x17f2a7440>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x17ed39d20>)]

In [7]:
doc2 = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc2:
    print(token.text, token.pos_, token.dep_)
    

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN dobj
startup VERB dep
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


## POS Tagging 

In [8]:
doc4 = nlp(u"Apple isn't looking at buying U.K. startup.")
for token in doc4:     
    print(token.text, token.pos_)

Apple PROPN
is AUX
n't PART
looking VERB
at ADP
buying VERB
U.K. PROPN
startup NOUN
. PUNCT


In [9]:
spacy.explain('PROPN')

'proper noun'

## Span (slicing)

In [10]:
doc5 = nlp(u"Apple isn't looking at buying U.K. startup.")
sliced_text = doc5[4:7]
sliced_text

at buying U.K.

## Sentence 

In [11]:
doc6 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')
for sent in doc6.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


## Lemmatization
Reduce the form of a word  by applying morphological analysis

In [12]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")
for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


## Stop Words

In [14]:
nlp.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [15]:
nlp.vocab['fifteen'].is_stop

True

### Add a stop word

In [16]:
#Add the word to the set of stop words. Use lowercase!
nlp.Defaults.stop_words.add('btw') #alwasy use lowercase while adding the stop words

In [17]:
#Set the stop_word tag on the lexeme
nlp.vocab['btw'].is_stop = True

In [19]:
nlp.vocab['btw'].is_stop

True

### Removing a stop word

In [21]:
#Remove the word from the set of stop words from the list
nlp.Defaults.stop_words.remove('without')

KeyError: 'without'

In [22]:
#Remove the stop_word tag from the lexeme
nlp.vocab['without'].is_stop = False

In [25]:
nlp.vocab['without'].is_stop

False

## Vocabulary and matching

### Rule-based matching

In [27]:
#Import the Matcher library
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [30]:
#Creating patterns
# We want to develop a matcher that find all forms of spelling of united states
# pattern1 looks for a single token whose lowercase text reads ‘unitedstates’
# pattern2 looks for two adjacent tokens that read ‘united’ and ‘states’ in that order
# pattern3 looks for three adjacent tokens, with a middle token that can be any punctuation.*
pattern1 = [{'LOWER': 'unitedstates'}]
pattern2 = [{'LOWER': 'united'}, {'LOWER': 'states'}]
pattern3 = [{'LOWER': 'united'}, {'IS_PUNCT': True}, {'LOWER': 'states'}]
patterns=[pattern1,pattern2,pattern3]

In [33]:
#add patterns to the matcher
matcher.add('UnitedStates', patterns)


In [34]:
# Applying the matcher
doc = nlp(u'The United States of America is a country consisting of 50 independent states. The first constitution of the UnitedStates was adopted in 1788. The current United-States flag was designed by a high school student – Robert G. Heft.')

```matcher``` returns a list of tuples. Each tuple contains (ID for the matcher, start token, end token) that map to the span ```doc[start:end]```

In [35]:
found_matches = matcher(doc)
print(found_matches)

[(15845173719804281779, 1, 3), (15845173719804281779, 19, 20), (15845173719804281779, 27, 30)]


### Setting pattern options and quantifier

In [36]:
#Redefine the patterns:
pattern1 = [{'LOWER': 'unitedstates'}]
#Token rule is made optional by passing an 'OP:'*' argument. '
pattern2 = [{'LOWER': 'united'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'states'}]

# These quantifiers can be added to the OP key
# !: Negate the pattern, by requiring it to match exactly 0 times
# ?: Make the pattern optional, by allowing it to match 0 or 1 time
# +: require the pattern to match 1 or more times
# *: Allow the pattern to match zero or more times 

In [37]:
#Remove the old patterns to avoid duplication:
matcher.remove('UnitedStates')

In [38]:
#Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('someNameToMatcher', [ pattern1, pattern2])

In [40]:
doc = nlp(u'United--States has the world’s largest coal reserves.')
found_matches = matcher(doc)
print(found_matches)

[(14270899081666383025, 0, 3)]


### Lemma as a key

In [45]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LEMMA': 'power'}] # CHANGE THIS PATTERN
#Remove the old patterns to avoid duplication:
matcher.remove('someNameToMatcher') #remove the previously added matcher name
#Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('SolarPower',[pattern1, pattern2])


In [46]:
doc2 = nlp(u'Solar-powered energy runs solar-powered cars.')
found_matches = matcher(doc2)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 5, 8)]


In [None]:
# Other tokens that can be put in
# ORTH - The exact verbatim text of a token
# LOWER - The lowercase form of a token text
# LENGTH - the length of the token text
# IS_ALPHA, IS ASCII, IS_DIGIT - Token text consists of alphanumeric, characters, ASCII characters, digits
# IS_LOWER, IS_UPPER, IS_TITLE - Token text in lowercase, uppercase,titlecase
# IS_PUNCT, IS_SPACE, IS_STOP - Token is punctuation, whitespace, stop word
# LIKE_NUM, LIKE_URL, LIKE_EMAIL - Token text resembles a number, URL, email 
# POS, TAG, DEP, LEMMA, SHAPE - The token's simple and extended POS tag, dependency label, lemma, shape
# ENT_TYPE - The token's entity label

### Token wildcard
`[{'ORTH':'#'],{}]` matches whatever token follow the hashtag

## Phrase Matcher
The PhraseMatcher lets you efficiently match large terminology lists. While the Matcher lets you match sequences based on lists of token descriptions, the PhraseMatcher accepts match patterns in the form of Doc objects. 

--> Instead of matching a token, match a phrase that's turned into an NLP doc

In [47]:
#Perform standard imports, reset nlp
import spacy
nlp = spacy.load('en_core_web_sm')

In [60]:
# Import the PhraseMatcher library
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
#create the list of words to match
fruit_list = ['apple','orange','banana',]
#obtain doc object for each word in the list and store it in a list
patterns = [nlp(fruit) for fruit in fruit_list]
#add the pattern to the matcher
matcher.add("FRUIT_PATTERN", patterns)
#process some text
doc = nlp("An orange contains citric acid and an apple contains oxalic acid")
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)


orange
apple


In [61]:
def on_match(matcher, doc, id, matches):
    print('Matched!', matches)

matcher = PhraseMatcher(nlp.vocab)
matcher.add("OBAMA", [nlp("Barack Obama")], on_match=on_match)
matcher.add("HEALTH", [nlp("health care reform"), nlp("healthcare reform")], on_match=on_match)
doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms")
matches = matcher(doc)

Matched! [(7732777389095836264, 0, 2)]


## Viewing Token 
spaCy encodes all strings to hash values to reduce memory usage and improve efficiency. So to get the readable string representation of an attribute, we need to add an underscore _ to its name: Note that token.pos and token.tag return integer hash values; by adding the underscores we get the text equivalent that lives in doc.vocab.



In [62]:
nlp=spacy.load('en_core_web_sm')

In [63]:
doc=nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [67]:
#{10} is the space between
for token in doc:
    print(f'{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}')

The        DET        DT         determiner
quick      ADJ        JJ         adjective (English), other noun-modifier (Chinese)
brown      ADJ        JJ         adjective (English), other noun-modifier (Chinese)
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective (English), other noun-modifier (Chinese)
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


### Counting POS Tags

In [69]:
POS_counts=doc.count_by(spacy.attrs.POS)
POS_counts

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}

In [70]:
#DET apperas twice in the doc
doc.vocab[90].text

'DET'

In [71]:
### Create a frequency list of POS tags from the entire document

In [74]:
for k,v in sorted (POS_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{5}}: {v}')

84. ADJ  : 3
85. ADP  : 1
90. DET  : 2
92. NOUN : 3
94. PART : 1
97. PUNCT: 1
100. VERB : 1


In [77]:
DEP_counts = doc.count_by(spacy.attrs.DEP)
for k,v in sorted (DEP_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{4}}: {v}')

402. amod: 3
415. det : 2
429. nsubj: 1
439. pobj: 1
440. poss: 1
443. prep: 1
445. punct: 1
8110129090154140942. case: 1
8206900633647566924. ROOT: 1


## Visualising POS

In [79]:
from spacy import displacy

In [80]:
displacy.render(doc,style='dep',jupyter=True,options={'distance':95})

In [81]:
for token in doc: 
    print(f'{token.text:{10}} {token.pos_:{7}} {token.dep_:{7}} {spacy.explain(token.dep_)}')

The        DET     det     determiner
quick      ADJ     amod    adjectival modifier
brown      ADJ     amod    adjectival modifier
fox        NOUN    nsubj   nominal subject
jumped     VERB    ROOT    None
over       ADP     prep    prepositional modifier
the        DET     det     determiner
lazy       ADJ     amod    adjectival modifier
dog        NOUN    poss    possession modifier
's         PART    case    case marking
back       NOUN    pobj    object of preposition
.          PUNCT   punct   punctuation


In [None]:
https://ashutoshtripathi.com/2020/04/27/named-entity-recognition-ner-using-spacy-nlp-part-4/