In [1]:
# importing the spaCy library and loading the 'en_core_web_sm' model as `nlp`
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
# calling the nlp model on a string creates a text object (a Doc)
# the u prefix marks a Unicode string literal (optional in Python 3, where every str is Unicode)
doc = nlp(u"Tesla is looking at buying U.S. startup for $6 million. It couldn't be real.")

In [3]:
# it is possible to loop over the tokens of a doc
# pos: integer ID of the part-of-speech tag, pos_: readable name of that tag (e.g. PROPN),
# dep_: syntactic dependency label
for token in doc:
    print(token.text, token.pos, token.pos_, token.dep_)

Tesla 96 PROPN nsubj
is 87 AUX aux
looking 100 VERB ROOT
at 85 ADP prep
buying 100 VERB pcomp
U.S. 96 PROPN compound
startup 92 NOUN dobj
for 85 ADP prep
$ 99 SYM quantmod
6 93 NUM compound
million 93 NUM pobj
. 97 PUNCT punct
It 95 PRON nsubj
could 100 VERB aux
n't 94 PART neg
be 87 AUX ROOT
real 84 ADJ acomp
. 97 PUNCT punct


In [4]:
# listing the pipeline: the (name, component) pairs that the nlp model applies to a piece of text
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x201863f5850>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x201862e3580>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x201862e3520>)]

In [5]:
# a single token can be picked out of the doc by index and inspected on its own
third_token = doc[2]
third_token, third_token.pos_

(looking, 'VERB')

In [6]:
# a longer text to slice from; the trailing backslashes continue the string literal on the next line
doc2 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [7]:
# taking a span (a slice of tokens) from a doc
life_quote = doc2[16:30]
print(life_quote)

"Life is what happens to us while we are making other plans"


In [8]:
# slicing a Doc does not give another Doc: spaCy returns a Span object
type(doc2), type(life_quote)

(spacy.tokens.doc.Doc, spacy.tokens.span.Span)

In [9]:
# iterating over the sents attribute of a doc yields its sentences one by one
doc3 = nlp(u"This is the first sentence. This is the second! This is the last.")
for sentence in doc3.sents:
    print(sentence)

This is the first sentence.
This is the second!
This is the last.


In [10]:
# is_sent_start is used to check whether a token starts a sentence
# (token 6 is the "This" that opens the second sentence)
doc3[6].is_sent_start

True

In [11]:
# spaCy can tokenize tricky text: contractions, abbreviations, web addresses and hyphenated words
mystring = '"We\'re moving to L.A.!". Our website is www.website.com. Visit us very-fast!'
doc4 = nlp(mystring)
for token in doc4:
    print(token)

"
We
're
moving
to
L.A.
!
"
.
Our
website
is
www.website.com
.
Visit
us
very
-
fast
!


In [12]:
# number of tokens in the whole doc (all of its sentences together)
len(doc4)

20

In [13]:
# printing all the tokens on one line is possible by changing print's end argument
# NOTE(review): 'miillion' looks like a typo for 'million'; with it, the entity listing
# further down tags only '$6' as MONEY — confirm whether that is intended
doc5 = nlp(u'Apple to build a Hong Kong factory for $6 miillion.')
for token in doc5:
    print(token, end=' | ')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | miillion | . | 

In [14]:
# for every named entity: its text, its label, and spaCy's explanation of that label,
# followed by blank lines as a separator
for entity in doc5.ents:
    label = entity.label_
    print(entity, label, str(spacy.explain(label)), '\n', sep='\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6
MONEY
Monetary values, including unit




In [15]:
# noun chunks in a text: noun_chunks yields the noun phrases found in the doc
doc6 = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')
for chunk in doc6.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


### visualization

In [16]:
from spacy import displacy

In [17]:
# visualizing the syntactic dependencies of a sentence (style='dep'), rendered inside the notebook
doc7 = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc7, style='dep', jupyter=True, options={'distance':80})

In [18]:
# visualizing the entities of a sentence (style='ent') by highlighting them in the text
doc8 = nlp(u'Over the last quarter, Apple sold nearly 20 thousands iPods for a profit of $6 million.')
displacy.render(doc8, style='ent', jupyter=True)

In [19]:
# showing visualizations in a web browser
# NOTE: serve() starts a web server; judging by the "Shutting down server" line in the
# captured output, the cell only finishes once that server has been stopped
displacy.serve(doc8, style='dep')
# go to 'http://127.0.0.1:5000/' address in the web browser




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


### stemming

In [20]:
# importing the NLTK stemmers (the stemmer objects themselves are created in the cells below)
# spaCy does not include a stemmer, so NLTK is used for this part
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [21]:
# sample words to compare the stemmers on
words = ['run','runner','ran','runs','easily','fairly','fairness']

In [22]:
# stemming the words with NLTK's PorterStemmer
# a stemmer applies fixed rewriting rules, so its result is not always a real word:
# watch the stems that end in 'li' (easily -> easili, fairly -> fairli)
p_stemmer = PorterStemmer()
for word in words:
    print(f'{word}--->{p_stemmer.stem(word)}')

run--->run
runner--->runner
ran--->ran
runs--->run
easily--->easili
fairly--->fairli
fairness--->fair


In [23]:
# stemming the same words with the English SnowballStemmer
# on this list it does a little better than PorterStemmer: 'fairly' becomes 'fair' instead of 'fairli'
s_stemmer = SnowballStemmer(language='english')
for word in words:
    print(word, s_stemmer.stem(word), sep='--->')

run--->run
runner--->runner
ran--->ran
runs--->run
easily--->easili
fairly--->fair
fairness--->fair


In [24]:
# look-alike words from two different word families (generous / generate)
words2 = ['generous', 'generation', 'generously','generate']

In [25]:
# words that look alike but belong to different families should end up with different stems:
# generous/generously -> generous, generation/generate -> generat
for word in words2:
    stem = s_stemmer.stem(word)
    print(word + '--->' + stem)

generous--->generous
generation--->generat
generously--->generous
generate--->generat


### lemmatization

In [26]:
# reloading 'en_core_web_sm' — the same model name that the first cell already loaded into `nlp`
nlp = spacy.load('en_core_web_sm')

In [27]:
# lemmatization of a sentence: every word is reduced to its base form (lemma)
# columns: token text, part-of-speech ID, lemma ID (a large integer), lemma text
doc9 = nlp(u'I am a runner running in a race because I love to run since I ran today')
for token in doc9:
    row = f'{token.text:10} {token.pos:5} {token.lemma:<22} {token.lemma_}'
    print(row)

I             95 561228191312463089     -PRON-
am            87 10382539506755952630   be
a             90 11901859001352538922   a
runner        92 12640964157389618806   runner
running      100 12767647472892411841   run
in            85 3002984154512732771    in
a             90 11901859001352538922   a
race          92 8048469955494714898    race
because       98 16950148841647037698   because
I             95 561228191312463089     -PRON-
love         100 3702023516439754181    love
to            94 3791531372978436496    to
run          100 12767647472892411841   run
since         98 10066841407251338481   since
I             95 561228191312463089     -PRON-
ran          100 12767647472892411841   run
today         92 11042482332948150395   today


### stop words

In [28]:
# the default English stop words, stored as a set on nlp.Defaults
# (the model is loaded once more here, under the same name as before)
nlp = spacy.load('en_core_web_sm')
print(nlp.Defaults.stop_words)

{'everywhere', 'noone', 'before', 'keep', 'together', 'used', 'empty', 'during', 'towards', 'themselves', 'hence', 'until', 'my', 'since', 'due', 'much', 'mine', 'almost', 'enough', '’m', 'whole', 'am', 'whenever', 'along', 'could', 'so', 'made', 'never', 'being', 'anyway', '‘m', 'between', 'all', 'this', 'across', 'ever', 'least', '’d', 'therefore', 'first', 'no', 'anywhere', 'after', 'moreover', 'by', 'too', 'does', 'whence', 'himself', 'bottom', 'each', 'afterwards', 'thereafter', 'meanwhile', 'twelve', 'full', 'her', 'else', 'when', 'such', '’ve', 'really', 'everyone', 'onto', 'something', 'toward', 'top', 'whither', 'for', 'except', 'three', 'cannot', 'side', 'was', 'whoever', 'us', 'nevertheless', 'also', 'thence', 'various', 'them', 'last', 'which', 'always', 'front', 'other', 'third', "'re", '’re', 'upon', 'be', 'or', 'own', 'whom', 'few', 'even', 'he', 'yet', 'must', 'quite', 'using', 'done', 'amount', 'about', 'another', 'perhaps', 'there', 'through', 'indeed', '‘s', 'him', '

In [29]:
# counting the default stop words (326 in the spaCy version this notebook was run with)
len(nlp.Defaults.stop_words)

326

In [30]:
# checking whether a word is a stop word or not
nlp.vocab['is'].is_stop

True

In [31]:
# adding a stop word manually: put it into the default set and also set the
# is_stop flag on its vocabulary entry, then read the flag back
nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop = True
nlp.vocab['btw'].is_stop

True

In [32]:
# removing a stop word from the set and clearing the is_stop flag on its vocabulary entry
# discard() does nothing when the word is already absent, so this cell can be re-run safely
# (remove() would raise KeyError the second time)
nlp.Defaults.stop_words.discard('btw')
nlp.vocab['btw'].is_stop = False

### phrase matching and vocabulary

In [33]:
# creating a Matcher for token patterns; it is built on the vocabulary of the nlp model
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [34]:
# defining patterns: each pattern is a list with one dict per token to match
# 'LOWER' compares against the lowercased token text, so 'Solar Power' also matches
# one token: solarpower
pattern1 = [{'LOWER': 'solarpower'}]
# two tokens: solar power
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
# three tokens with a punctuation token in the middle: solar-power
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]

In [35]:
# registering the three patterns on the matcher under the 'SolarPower' key
# the patterns are passed as one list: this signature works on spaCy 2.2.2+ and is the
# only one accepted by spaCy 3 (the older add(key, None, *patterns) form fails there)
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

In [36]:
# applying the matcher to a doc: calling it returns the list of matches found
doc10 = nlp(u'The Solar Power industry continuous to grow a solarpower increases. Solar-power is awesome.')
found_matches = matcher(doc10)

In [37]:
# each match is a tuple: (match id, start token index, end token index)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [38]:
# printing every match together with the name of its rule and the matched text
for match_id, start, end in found_matches:
    rule_name = nlp.vocab.strings[match_id]   # the match id is the hash of the rule name
    matched = doc10[start:end]                # slice of the doc covered by the match
    print(match_id, rule_name, start, end, matched.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [39]:
# removing the 'SolarPower' rule from the matcher so it can be redefined
matcher.remove('SolarPower')

In [40]:
# Solarpower, solarPower etc.
pattern4 = [{'LOWER':'solarpower'}]
# solar-power, solar power, solar-=)power etc.
# 'OP': '*' makes the punctuation token optional and repeatable (zero or more times)
pattern5 = [{'LOWER':'solar'},{'IS_PUNCT':True, 'OP':'*'},{'LOWER':'power'}]

In [41]:
# registering the more general patterns under the 'SolarPower' key and trying them on a new text
# the patterns are passed as one list: this signature works on spaCy 2.2.2+ and is the
# only one accepted by spaCy 3 (the older add(key, None, *patterns) form fails there)
matcher.add('SolarPower', [pattern4, pattern5])
doc11 = nlp(u'Solar--power is solar power yay!')
found_matches2 = matcher(doc11)
print(found_matches2)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 6)]


In [42]:
# creating a PhraseMatcher, which matches whole phrases given as Doc objects
# note: this rebinds the name `matcher`, replacing the token Matcher used in the cells above
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [43]:
# reading a text file and processing its whole content with the nlp model
# NOTE(review): no encoding is passed to open(), so the platform default is used —
# confirm that it matches the encoding of the file
with open('reaganomics.txt') as f:
    doc12 = nlp(f.read())

In [44]:
# creating a match phrases list
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

# converting each phrase to a Doc object
# make_doc only tokenizes the text; the matcher here compares token text, so running the
# tagger, parser and entity recognizer on every pattern would be wasted work
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

# registering all phrase Docs under one key, passed as a single list
# (this signature works on spaCy 2.2.2+ and is the only one accepted by spaCy 3)
matcher.add('EconMatcher', phrase_patterns)

# Build a list of matches:
found_matches3 = matcher(doc12)
print(found_matches3)

[(3680293220734633682, 41, 45), (3680293220734633682, 49, 53), (3680293220734633682, 54, 56), (3680293220734633682, 61, 65), (3680293220734633682, 673, 677), (3680293220734633682, 2987, 2991)]


In [47]:
# widening the slice around each match shows it in context
# (two tokens before the match, three tokens after it)
for match_id, start, end in found_matches3:
    string_id = nlp.vocab.strings[match_id]
    # clamp at 0: a negative start index would be counted from the end of the doc and
    # give a wrong (empty) context for a match located within the first two tokens
    context_start = max(start - 2, 0)
    span = doc12[context_start:end + 3]
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 associated with supply-side economics, referred to
3680293220734633682 EconMatcher 49 53 to as trickle-down economics or voodoo economics
3680293220734633682 EconMatcher 54 56 economics or voodoo economics by political opponents
3680293220734633682 EconMatcher 61 65 , and free-market economics by political advocates
3680293220734633682 EconMatcher 673 677 from the supply-side economics movement, which
3680293220734633682 EconMatcher 2987 2991 as "trickle-down economics", due
