In [1]:
import spacy

In [2]:
# Load the small English pipeline by its full package name. The bare 'en'
# shortcut link only resolves if a link was created locally and is not
# supported by newer spaCy releases.
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [4]:
# One row per token: surface text, part-of-speech tag, dependency label.
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_)

Tesla PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [5]:
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x1db11e07c88>),
 ('parser', <spacy.pipeline.DependencyParser at 0x1db11e0a3a8>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x1db11e0a948>)]

In [6]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [7]:
doc2 = nlp(u"Tesla isn't looking into startups anymore.")

In [8]:
# One row per token of doc2: surface text, part-of-speech tag, dependency label.
for tok in doc2:
    print(tok.text, tok.pos_, tok.dep_)

Tesla PROPN nsubj
is VERB aux
n't ADV neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [9]:
doc2[0].pos_

'PROPN'

In [10]:
doc2[0].dep_

'nsubj'

In [11]:
nlp = spacy.load('en_core_web_sm')

In [12]:
mystring = '"We\'re moving to L.A.!"'

In [13]:
print(mystring)

"We're moving to L.A.!"


In [14]:
doc = nlp(mystring)

In [15]:
# Print each token on its own line to show where the tokenizer split the string.
for tok in doc:
    print(tok.text)

"
We
're
moving
to
L.A.
!
"


In [16]:
doc2 = nlp(u"We're here to help! Send snail-mail email support@oursite.com or visit us at http://www.oursite.com!")

In [17]:
# Print each token of doc2 on its own line.
for tok in doc2:
    print(tok)

We
're
here
to
help
!
Send
snail
-
mail
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [18]:
doc3 = nlp(u"A 5km NYC cab ride costs $10.30")

In [19]:
# Print each token of doc3 on its own line.
for tok in doc3:
    print(tok)

A
5
km
NYC
cab
ride
costs
$
10.30


In [20]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

In [21]:
# Print each token of doc4 on its own line.
for tok in doc4:
    print(tok)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [22]:
len(doc4)

11

In [23]:
doc4.vocab

<spacy.vocab.Vocab at 0x1db11ce8248>

In [24]:
len(doc4.vocab)

57852

In [25]:
doc5 = nlp(u"It is better to give than receive.")

In [26]:
doc5[0]

It

In [27]:
doc5[2:5]

better to give

In [28]:
doc8 = nlp(u"Apple to build a Hong Kong factory for $6 million.")

In [29]:
# Print all tokens of doc8 on a single line, each followed by ' | '.
for tok in doc8:
    print(tok.text, end=' | ')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | . | 

In [30]:
# For every named entity print its text, its label and spaCy's explanation of
# that label, each on its own line, followed by two blank lines.
for ent in doc8.ents:
    print(ent, ent.label_, spacy.explain(ent.label_), '\n', sep='\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [31]:
doc9 = nlp(u"Autonomous cars shift insurance liability towards manufacturers.")

In [32]:
# Print each noun chunk of doc9 on its own line.
for noun_chunk in doc9.noun_chunks:
    print(noun_chunk)

Autonomous cars
insurance liability
manufacturers


In [33]:
from spacy import displacy

In [34]:
doc = nlp(u"Apple is going to build a U.K. factory for $6 million.")

In [35]:
displacy.render(doc, style='dep', jupyter=True, options={'distance':110})

In [36]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.")

In [37]:
displacy.render(doc, style='ent', jupyter=True)

In [38]:
# doc = nlp(u"This is a sentence.")
# displacy.serve(doc, style='dep')

<h1>Stemming & Lemmatization</h1>

<h2>Porter Stemmer & Snowball Stemmer</h2>

<h4>Porter Stemmer</h4>

<p>Porter's Algorithm was developed by Martin Porter in 1980.</p>
<p>The algorithm employs five phases of word reduction, each with its own mapping rules.</p>


<h4>Snowball Stemmer</h4>

<p>Snowball is a stemming language, also developed by Martin Porter.</p>
<p>The English stemming algorithm written in it is called the "English Stemmer" or "Porter2 Stemmer".</p>
<p>It offers a slight improvement to the original Porter stemmer, both in logic and speed.</p>


<h1>Examples:</h1>

In [41]:
import nltk

In [43]:
from nltk.stem.porter import PorterStemmer

In [44]:
p_stemmer = PorterStemmer()

In [50]:
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly', 'fairness']

In [51]:
# Show each word next to the stem produced by the Porter stemmer.
for w in words:
    print(f'{w}  ------>  {p_stemmer.stem(w)}')

run  ------>  run
runner  ------>  runner
ran  ------>  ran
runs  ------>  run
easily  ------>  easili
fairly  ------>  fairli
fairness  ------>  fair


In [52]:
from nltk.stem.snowball import SnowballStemmer

In [53]:
s_stemmer = SnowballStemmer(language='english')

In [54]:
# Show each word next to the stem produced by the Snowball stemmer.
for w in words:
    print(f'{w}  ------>  {s_stemmer.stem(w)}')

run  ------>  run
runner  ------>  runner
ran  ------>  ran
runs  ------>  run
easily  ------>  easili
fairly  ------>  fair
fairness  ------>  fair


In [55]:
words = ['generous', 'generation', 'generously', 'generate']

In [56]:
# Show each word of the current list next to its Snowball stem.
for w in words:
    print(f'{w}  ------>  {s_stemmer.stem(w)}')

generous  ------>  generous
generation  ------>  generat
generously  ------>  generous
generate  ------>  generat


<h2>Lemmatization</h2>

In [57]:
import spacy

In [58]:
nlp = spacy.load("en_core_web_sm")

In [60]:
doc1 = nlp(u"I am a runner in a race because I love to run since I ran today.")

In [61]:
# One row per token: text, POS tag, numeric lemma ID, lemma string.
# The separator reproduces the ' \t ' spacing between the columns.
for tok in doc1:
    print(tok.text, tok.pos_, tok.lemma, tok.lemma_, sep=' \t ')

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 VERB 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 ADP 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 ADP 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today
. 	 PUNCT 	 12646065887601541794 	 .


In [62]:
def show_lemmas(text):
    """Print one aligned row per token: text, POS tag, numeric lemma ID, lemma.

    `text` is iterated directly, so each item must expose `.text`, `.pos_`,
    `.lemma` and `.lemma_` (for example the tokens of a processed Doc).
    Columns are left-aligned with widths 12, 6 and 22; the last is unpadded.
    """
    row_format = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_format.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [63]:
doc2 = nlp(u"I saw ten mice today!")

In [64]:
show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
ten          NUM    7970704286052693043    ten
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


<h2>Stop Words</h2>

In [65]:
print(nlp.Defaults.stop_words)

{'twenty', 'out', 'eleven', 'amongst', 'an', 'indeed', 'keep', 'therefore', 'seemed', 'although', 'off', 'any', 'anywhere', 'still', 'but', 'front', 'however', 'about', 'must', 'him', 'forty', 'no', 'above', 'much', 'up', 'meanwhile', 'not', 'us', 'give', 'whatever', 'nor', 'without', 'just', 'hundred', 'throughout', 'wherever', 'every', 'seem', 'whole', 'sixty', 'besides', 'well', 'whenever', 'via', 'who', 'many', 'hers', 'his', 'six', 'that', 'their', 'whoever', 'ever', 'become', 'four', 'further', 'all', 'beforehand', 'down', 'which', 'when', 'they', 'before', 'top', 'though', 'becomes', 'herself', 'same', 'or', 'becoming', 'to', 'both', 'made', 'yourself', 'moreover', 'our', 'its', 'you', 'among', 'because', 'name', 'otherwise', 'myself', 'be', 'less', 'a', 'eight', 'ten', 'put', 'whether', 'almost', 'thru', 'me', 'anyone', 'now', 'either', 'two', 'latterly', 'elsewhere', 'it', 'see', 'is', 'through', 'such', 'whereby', 'over', 'say', 'my', 'should', 'what', 'this', 'seems', 'them'

In [66]:
len(nlp.Defaults.stop_words)

305

In [67]:
nlp.vocab['nine'].is_stop

True

In [68]:
nlp.vocab['mystery'].is_stop

False

In [69]:
nlp.Defaults.stop_words.add('btw')

In [70]:
nlp.vocab['btw'].is_stop = True

In [71]:
len(nlp.Defaults.stop_words)

306

In [72]:
nlp.vocab['btw'].is_stop

True

In [73]:
# discard() rather than remove(): once 'beyond' is gone, re-running this cell
# with remove() would raise KeyError; discard() is a no-op in that case.
nlp.Defaults.stop_words.discard('beyond')

In [74]:
nlp.vocab['beyond'].is_stop = False

In [75]:
nlp.vocab['beyond'].is_stop

False

<h2>Phrase Matching & Vocab</h2>

In [76]:
from spacy.matcher import Matcher

In [77]:
matcher = Matcher(nlp.vocab)

In [78]:
# Three token patterns for the same term. 'LOWER' compares against the
# lowercased token text, so casing in the document does not matter.
# 'SolarPower' written as one token
pattern1 = [{'LOWER': 'solarpower'}]
# 'Solar-power': 'solar', then one punctuation token, then 'power'
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]
# 'Solar power': 'solar' directly followed by 'power'
pattern3 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]

In [79]:
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [80]:
doc = nlp(u"The Solar Power industry continues to grow as solarpower increases. Solar-power is amazing.")

In [81]:
found_matches = matcher(doc)

In [82]:
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [83]:
# Each match is (rule hash, start token index, end token index). Look the hash
# up in the string store to recover the rule name, then show the matched text.
for rule_hash, first, last in found_matches:
    rule_name = nlp.vocab.strings[rule_hash]
    matched = doc[first:last]
    print(rule_hash, rule_name, first, last, matched.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [84]:
matcher.remove('SolarPower')

In [85]:
# Single-token spelling, compared on the lowercased token text.
pattern1 = [{'LOWER': 'solarpower'}]
# 'OP': '*' lets the punctuation token occur zero or more times, so this one
# pattern covers 'solar power', 'solar-power' and 'solar--power'.
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'power'}]

In [86]:
matcher.add('SolarPower', None, pattern1, pattern2)

In [87]:
doc2 = nlp(u"Solar--power is solarpower yaya!")

In [88]:
found_matches = matcher(doc2)

In [89]:
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [90]:
from spacy.matcher import PhraseMatcher

In [91]:
matcher = PhraseMatcher(nlp.vocab)

In [92]:
# Read the whole article first, then parse it once the file handle is closed.
# NOTE(review): open() is called without an encoding, so the platform default
# applies; confirm it matches how reaganomics.txt is encoded.
with open('reaganomics.txt') as article_file:
    article_text = article_file.read()
doc3 = nlp(article_text)

In [93]:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

In [94]:
# Phrase patterns only need to be tokenized, so build them with make_doc()
# instead of running the tagger, parser and NER on every phrase. The result is
# still a list of Doc objects.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [95]:
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [96]:
matcher.add('EconMatcher', None, *phrase_patterns)

In [97]:
found_matches = matcher(doc3)

In [98]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2985, 2989)]

In [101]:
# Print every phrase match together with up to five tokens of context on
# either side.
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # rule name registered for this hash
    # Clamp the left edge at 0: a negative slice start is taken relative to the
    # end of the Doc, which would give the wrong span for a match that begins
    # within the first five tokens. An end past the last token is safe.
    span = doc3[max(start - 5, 0):end + 5]   # the match plus surrounding context
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 policies are commonly associated with supply-side economics, referred to as trickle
3680293220734633682 EconMatcher 49 53 economics, referred to as trickle-down economics or voodoo economics by political
3680293220734633682 EconMatcher 54 56 trickle-down economics or voodoo economics by political opponents, and
3680293220734633682 EconMatcher 61 65 by political opponents, and free-market economics by political advocates.


3680293220734633682 EconMatcher 673 677 attracted a following from the supply-side economics movement, which formed in
3680293220734633682 EconMatcher 2985 2989 became widely known as "trickle-down economics", due to the
