In [1]:
# import spaCy and load the English model 'en_core_web_sm' as `nlp`
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
# calling the nlp model on a string creates a Doc object
# the u prefix marks a unicode string literal (optional in Python 3, where every str is unicode)
doc = nlp(u"Tesla is looking at buying U.S. startup for $6 million. It couldn't be real.")

In [3]:
# a Doc can be iterated token by token
# pos: integer ID of the part-of-speech tag, pos_: the same tag as a readable string,
# dep_: syntactic dependency label
for token in doc:
    print(token.text, token.pos, token.pos_, token.dep_)

Tesla 96 PROPN nsubj
is 87 AUX aux
looking 100 VERB ROOT
at 85 ADP prep
buying 100 VERB pcomp
U.S. 96 PROPN compound
startup 92 NOUN dobj
for 85 ADP prep
$ 99 SYM quantmod
6 93 NUM compound
million 93 NUM pobj
. 97 PUNCT punct
It 95 PRON nsubj
could 100 VERB aux
n't 94 PART neg
be 87 AUX ROOT
real 84 ADJ acomp
. 97 PUNCT punct


<img src="sources/pipeline.png" width="1000">

In [4]:
# list the (name, component) pairs of the pipeline that the nlp model applies to a text
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x294dc06cdf0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x294dbf5cf40>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x294dbf5cee0>)]

In [5]:
# names of the pipeline components only
nlp.pipe_names

['tagger', 'parser', 'ner']

In [6]:
# a Doc can be indexed to inspect a single token and its attributes
doc[2], doc[2].pos_

(looking, 'VERB')

In [7]:
# part-of-speech tag of the first token, as a string
doc[0].pos_
# tag reference: https://spacy.io/api/annotation#pos-tagging

'PROPN'

In [8]:
# syntactic dependency label of the first token
doc[0].dep_

'nsubj'

In [9]:
# spacy.explain returns a human-readable description of a tag or label
spacy.explain('PROPN')

'proper noun'

### Additional Token Attributes

|Attribute|Description|Value for `doc[0]`|
|:------|:------:|:------|
|`.text`|The original word text<!-- .element: style="text-align:left;" -->|`Tesla`|
|`.lemma_`|The base form of the word|`tesla`|
|`.pos_`|The simple part-of-speech tag|`PROPN`/`proper noun`|
|`.tag_`|The detailed part-of-speech tag|`NNP`/`noun, proper singular`|
|`.shape_`|The word shape – capitalization, punctuation, digits|`Xxxxx`|
|`.is_alpha`|Does the token consist of alphabetic characters?|`True`|
|`.is_stop`|Is the token part of a stop list, i.e. the most common words of the language?|`False`|

In [10]:
# Lemma (the base form of the word) printed below the original token text:
print(doc[2].text)
print(doc[2].lemma_)

looking
look


In [11]:
# Simple part-of-speech tag and detailed tag of the same token (doc[2]):
print(doc[2].pos_)
# explain the tag of the token whose tag is printed, so that the tag and its
# description always belong together
print(doc[2].tag_ + ' / ' + spacy.explain(doc[2].tag_))

VERB
VBG / verb, gerund or present participle


In [12]:
# Word shapes: letters are mapped to X/x by case, punctuation is kept (see the printed output)
print(doc[0].text+': '+doc[0].shape_)
print(doc[5].text+' : '+doc[5].shape_)

Tesla: Xxxxx
U.S. : X.X.


In [13]:
# Boolean flags of the first token: is it alphabetic, and is it a stop word?
print(doc[0].is_alpha)
print(doc[0].is_stop)

True
False


In [14]:
# a longer text, written as one string literal continued over three source lines
# NOTE(review): "commmonly" is misspelled in the sample text; it is left unchanged here
# because the text is runtime data that later cells slice by token index
doc2 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [15]:
# slicing a Doc by token index takes a span out of it (here tokens 16 up to, not including, 30)
life_quote = doc2[16:30]
print(life_quote)

"Life is what happens to us while we are making other plans"


In [16]:
# compare the types: the full text is a Doc, the slice taken from it is a Span
type(doc2), type(life_quote)

(spacy.tokens.doc.Doc, spacy.tokens.span.Span)

In [17]:
# the sents attribute of a Doc separates the text into sentences
doc3 = nlp(u"This is the first sentence. This is the second! This is the last.")
for sentence in doc3.sents:
    print(sentence)

This is the first sentence.
This is the second!
This is the last.


In [18]:
# is_sent_start tells whether a token starts a sentence
doc3[6].is_sent_start

True

In [19]:
# Create a Doc object and print its tokens on one line, separated by ' | '
mystring = '"We\'re moving to L.A.!"'
doc4 = nlp(mystring)

for token in doc4:
    print(token.text, end=' | ')

" | We | 're | moving | to | L.A. | ! | " | 

<img src="sources/tokenization.png" width="800">

-  **Prefix**:	Character(s) at the beginning &#9656; `$ ( “ ¿`
-  **Suffix**:	Character(s) at the end &#9656; `km ) , . ! ”`
-  **Infix**:	Character(s) in between &#9656; `- -- / ...`
-  **Exception**: Special-case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied &#9656; `St. U.S.`

In [20]:
# spaCy can tokenize more complex text: quotes, contractions, abbreviations,
# a web address and a hyphenated word (one token per output line)
# note: mystring and doc4 are rebound here to the longer example
mystring = '"We\'re moving to L.A.!". Our website is www.website.com. Visit us very-fast!'
doc4 = nlp(mystring)
for token in doc4:
    print(token)

"
We
're
moving
to
L.A.
!
"
.
Our
website
is
www.website.com
.
Visit
us
very
-
fast
!


In [21]:
# number of tokens in the whole Doc (doc4 holds several sentences, not just one)
len(doc4)

20

In [22]:
# printing all the tokens on one line by replacing the default line ending
# NOTE(review): 'miillion' is misspelled in the sample text; this is presumably why the
# recorded entity output further down shows only "$6" as MONEY — confirm before fixing,
# since the recorded outputs depend on this exact text
doc5 = nlp(u'Apple to build a Hong Kong factory for $6 miillion.')
for token in doc5:
    print(token, end=' | ')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | miillion | . | 

In [23]:
# a slice of the Doc: tokens 2, 3 and 4
doc5[2:5]

build a Hong

### named entities
https://spacy.io/usage/linguistic-features#named-entities

In [24]:
# for every named entity: its text, its label and the explanation of the label,
# followed by blank lines that separate the entries
for entity in doc5.ents:
    label_description = str(spacy.explain(entity.label_))
    print(entity, entity.label_, label_description, '\n', sep='\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6
MONEY
Monetary values, including unit




In [25]:
# iterate over the noun chunks of a text via the noun_chunks attribute
doc6 = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')
for chunk in doc6.noun_chunks:
    print(chunk)
# reference: https://spacy.io/usage/linguistic-features#noun-chunks

Autonomous cars
insurance liability
manufacturers


### visualization

In [26]:
from spacy import displacy
# https://spacy.io/usage/visualizers

In [27]:
# visualizing the syntactic dependencies of a sentence inside the notebook
# the 'distance' option sets the spacing between words in the drawing
doc7 = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc7, style='dep', jupyter=True, options={'distance':80})

In [28]:
# visualizing the named entities of a sentence by highlighting them in the text
doc8 = nlp(u'Over the last quarter, Apple sold nearly 20 thousands iPods for a profit of $6 million.')
displacy.render(doc8, style='ent', jupyter=True)

In [29]:
# alternative: serve the visualization from a local web server instead of rendering it inline
# displacy.serve(doc8, style='dep')
# then open 'http://127.0.0.1:5000/' in the web browser

### stemming

![stemming1.png](sources/stemming1.png)

In [30]:
# stemming is finding the roots of words
# spaCy doesn't have stemming libraries, so the NLTK stemmers are imported here
# (the stemmer objects themselves are created in the cells that use them)
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [31]:
# sample words used to compare the stemmers
words = ['run','runner','ran','runs','easily','fairly','fairness']

In [32]:
# stem every sample word with the Porter stemmer
# stemmers convert words by fixed rules, so the result is not always a real word
# (watch the stems ending in 'li' in the output)
p_stemmer = PorterStemmer()
for word in words:
    print(f'{word}--->{p_stemmer.stem(word)}')

run--->run
runner--->runner
ran--->ran
runs--->run
easily--->easili
fairly--->fairli
fairness--->fair


In [33]:
# stem the same sample words with the Snowball stemmer for English
# its output looks better than the Porter stemmer's for these words
s_stemmer = SnowballStemmer(language='english')
for word in words:
    print(f'{word}--->{s_stemmer.stem(word)}')

run--->run
runner--->runner
ran--->ran
runs--->run
easily--->easili
fairly--->fair
fairness--->fair


In [34]:
# similar-looking words with different meanings, for the next stemming example
words2 = ['generous', 'generation', 'generously','generate']

In [35]:
# words that look alike but mean different things should end up with different stems
for word in words2:
    print(f'{word}--->{s_stemmer.stem(word)}')

generous--->generous
generation--->generat
generously--->generous
generate--->generat


### lemmatization

In [36]:
# NOTE(review): loads the same model name that the first cell already loaded into `nlp`;
# presumably kept so this section can be run on its own — confirm whether it is needed
nlp = spacy.load('en_core_web_sm')

In [37]:
# lemmatization of a sentence: a morphological analysis maps each word to its base form
# columns: token text, part-of-speech ID, lemma hash, lemma as text
doc9 = nlp(u'I am a runner running in a race because I love to run since I ran today')
for token in doc9:
    print(f'{token.text:10} {token.pos:5} {token.lemma:<22} {token.lemma_}')

I             95 561228191312463089     -PRON-
am            87 10382539506755952630   be
a             90 11901859001352538922   a
runner        92 12640964157389618806   runner
running      100 12767647472892411841   run
in            85 3002984154512732771    in
a             90 11901859001352538922   a
race          92 8048469955494714898    race
because       98 16950148841647037698   because
I             95 561228191312463089     -PRON-
love         100 3702023516439754181    love
to            94 3791531372978436496    to
run          100 12767647472892411841   run
since         98 10066841407251338481   since
I             95 561228191312463089     -PRON-
ran          100 12767647472892411841   run
today         92 11042482332948150395   today


### stop words

In [38]:
# spaCy's built-in stop words (printing the whole set produces a very long output)
# NOTE(review): the model is loaded once more here although `nlp` already exists — confirm whether needed
nlp = spacy.load('en_core_web_sm')
print(nlp.Defaults.stop_words)

{'somewhere', 'will', "'ll", 'someone', 'same', '’ll', 'when', 'now', 'namely', 'through', 'beyond', 'very', 'anything', 'n’t', 'get', 'also', 'ours', 'all', 'forty', 'then', 'doing', 'less', 'whereas', 'off', 'again', 'name', 'full', 'under', 'she', 'always', 'keep', 'both', 'please', 'itself', 'too', 'without', 'or', 'ca', 'serious', 'out', 'whom', 'ten', 'beforehand', 'several', 'hereby', 'was', 'though', 'about', 'front', 'everyone', 'go', 'such', 'quite', 'eight', 'herself', 'their', 'among', 'for', 'seem', 'twenty', '’s', 'the', 'but', 'whither', 'thereby', 'up', 'once', 'if', 'onto', 'behind', 'something', 'hundred', '‘d', "n't", 'seeming', 'might', 'used', 'not', 'we', 'at', 'enough', 'show', '’ve', 'whenever', 'take', 'anyhow', "'m", 'over', 'this', 'n‘t', 'somehow', 'whereafter', 'an', "'ve", 'into', 'thru', 'afterwards', 'none', 'amount', 'where', 'besides', 'whose', 'be', 'few', 'most', 'others', 'part', 'nothing', 'just', 'after', 'via', 'and', 'often', 'me', 'his', 'moreo

In [39]:
# NLTK's built-in English stop words and how many there are
# NOTE(review): assumes the NLTK 'stopwords' corpus is already available locally — confirm
from nltk.corpus import stopwords
stopWords = stopwords.words('english')
len(stopWords)

179

In [40]:
# Scikit-learn's built-in stop words and how many there are
from sklearn.feature_extraction import text
# bound to its own name: assigning to `stopwords` would overwrite the NLTK corpus
# reader imported under that name and break later uses of `stopwords.words(...)`
sklearn_stop_words = text.ENGLISH_STOP_WORDS
len(sklearn_stop_words)

318

In [41]:
# number of stop words defined in spaCy's defaults (326 in the recorded output)
len(nlp.Defaults.stop_words)

326

In [42]:
# checking whether a word is a stop word, via its entry in the vocabulary
nlp.vocab['is'].is_stop

True

In [43]:
# adding a stop word manually: add it to the default set and set the flag on its
# vocabulary entry, then read the flag back to check it
nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop = True
nlp.vocab['btw'].is_stop

True

In [44]:
# removing a stop word from the list and clearing the flag on its vocabulary entry
# discard() instead of remove(): no KeyError when this cell is run again or when
# 'btw' is not in the set
nlp.Defaults.stop_words.discard('btw')
nlp.vocab['btw'].is_stop = False

### phrase matching and vocabulary

In [45]:
# create a token-based Matcher that shares the vocabulary of the nlp model
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [46]:
# defining patterns: each pattern is a list with one dict per token to match
# one token whose lowercase form is 'solarpower'
pattern1 = [{'LOWER': 'solarpower'}]
# two tokens: 'solar' followed by 'power' (compared in lowercase)
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
# three tokens: 'solar', one punctuation token, 'power' (e.g. solar-power)
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]

In [47]:
# register the three patterns on the matcher under the name 'SolarPower'
# (the second argument is the optional on_match callback; None means no callback)
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [48]:
# create a Doc and run the matcher over it to collect the matches
doc10 = nlp(u'The Solar Power industry continuous to grow a solarpower increases. Solar-power is awesome.')
found_matches = matcher(doc10)

In [49]:
# each match is a tuple (match_id, start, end) with token positions in doc10
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [50]:
# print every match as: id, pattern name looked up from the id, start, end, matched text
for match_id, start, end in found_matches:
    print(
        match_id,
        nlp.vocab.strings[match_id],  # string representation of the id
        start,
        end,
        doc10[start:end].text,        # text of the matched span
    )

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [51]:
# removing everything that was registered under the name 'SolarPower'
matcher.remove('SolarPower')

In [52]:
# one token, any capitalization: Solarpower, solarPower etc.
pattern4 = [{'LOWER':'solarpower'}]
# solar-power, solar power, solar--power etc.
# the 'OP': '*' quantifier lets the punctuation token match zero or more times
pattern5 = [{'LOWER':'solar'},{'IS_PUNCT':True, 'OP':'*'},{'LOWER':'power'}]

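With the `'OP': '*'` quantifier on the punctuation token, `pattern5` matches both two-word forms, with and without the hyphen (the matcher is run on an example further below).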

The following quantifiers can be passed to the `'OP'` key:
<table><tr><th>OP</th><th>Description</th></tr>

<tr ><td><span >\!</span></td><td>Negate the pattern, by requiring it to match exactly 0 times</td></tr>
<tr ><td><span >?</span></td><td>Make the pattern optional, by allowing it to match 0 or 1 times</td></tr>
<tr ><td><span >\+</span></td><td>Require the pattern to match 1 or more times</td></tr>
<tr ><td><span >\*</span></td><td>Allow the pattern to match zero or more times</td></tr>
</table>


In [53]:
# register the two new patterns under 'SolarPower' and run the matcher on a text
# that contains the phrase with a double hyphen and with a plain space
matcher.add('SolarPower', None, pattern4, pattern5)
doc11 = nlp(u'Solar--power is solar power yay!')
found_matches2 = matcher(doc11)
print(found_matches2)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 6)]


## Other token attributes
Besides lemmas, there are a variety of token attributes we can use to determine matching rules:
<table><tr><th>Attribute</th><th>Description</th></tr>

<tr ><td><span >`ORTH`</span></td><td>The exact verbatim text of a token</td></tr>
<tr ><td><span >`LOWER`</span></td><td>The lowercase form of the token text</td></tr>
<tr ><td><span >`LENGTH`</span></td><td>The length of the token text</td></tr>
<tr ><td><span >`IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`</span></td><td>Token text consists of alphabetic characters, ASCII characters, digits</td></tr>
<tr ><td><span >`IS_LOWER`, `IS_UPPER`, `IS_TITLE`</span></td><td>Token text is in lowercase, uppercase, titlecase</td></tr>
<tr ><td><span >`IS_PUNCT`, `IS_SPACE`, `IS_STOP`</span></td><td>Token is punctuation, whitespace, stop word</td></tr>
<tr ><td><span >`LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`</span></td><td>Token text resembles a number, URL, email</td></tr>
<tr ><td><span >`POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE`</span></td><td>The token's simple and extended part-of-speech tag, dependency label, lemma, shape</td></tr>
<tr ><td><span >`ENT_TYPE`</span></td><td>The token's entity label</td></tr>

</table>

### Token wildcard
You can pass an empty dictionary `{}` as a wildcard to represent **any token**. For example, you might want to retrieve hashtags without knowing what might follow the `#` character:
>`[{'ORTH': '#'}, {}]`

In [54]:
# create a PhraseMatcher that shares the vocabulary of the nlp model
# note: this rebinds `matcher`, which referred to the token-based Matcher until here
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [55]:
# read a text file and turn its whole content into a Doc
# NOTE(review): no encoding is passed, so the platform default is used — confirm the
# file's encoding and pass it explicitly if the notebook must run on other systems
with open('sources/reaganomics.txt') as f:
    doc12 = nlp(f.read())

In [56]:
# creating a match phrases list
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

# converting each phrase to a Doc object
# make_doc only tokenizes; the PhraseMatcher compares the verbatim token text by
# default, so running the full pipeline on every pattern is unnecessary work.
# The loop variable is called `phrase` so it does not shadow the sklearn `text`
# module imported earlier in this notebook.
phrase_patterns = [nlp.make_doc(phrase) for phrase in phrase_list]

# Pass each Doc object into matcher (note the use of the asterisk!):
matcher.add('EconMatcher', None, *phrase_patterns)

# Build a list of matches:
found_matches3 = matcher(doc12)
print(found_matches3)

[(3680293220734633682, 41, 45), (3680293220734633682, 49, 53), (3680293220734633682, 54, 56), (3680293220734633682, 61, 65), (3680293220734633682, 673, 677), (3680293220734633682, 2987, 2991)]


In [57]:
# widening the start and end of each match shows it together with its context
for match_id, start, end in found_matches3:
    string_id = nlp.vocab.strings[match_id]
    # clamp the window to the document: for a match at token 0 or 1, start - 2
    # would be negative and be read as a position counted from the end of the Doc
    context_start = max(start - 2, 0)
    context_end = min(end + 3, len(doc12))
    span = doc12[context_start:context_end]
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 associated with supply-side economics, referred to
3680293220734633682 EconMatcher 49 53 to as trickle-down economics or voodoo economics
3680293220734633682 EconMatcher 54 56 economics or voodoo economics by political opponents
3680293220734633682 EconMatcher 61 65 , and free-market economics by political advocates
3680293220734633682 EconMatcher 673 677 from the supply-side economics movement, which
3680293220734633682 EconMatcher 2987 2991 as "trickle-down economics", due
