In [None]:
import spacy

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
doc = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [None]:
doc2 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [None]:
for sentence in doc2.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [None]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7fc70654b390>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fc705866708>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fc705866768>)]

In [None]:
nlp.pipe_names

['tagger', 'parser', 'ner']

## Tokenization

Tokenization is the process of breaking the original text into component pieces (tokens).
Tokens are the basic building blocks of a Doc object.

In [None]:
mystring = '"We\'re moving to L.A. !"'

In [None]:
print(mystring)

"We're moving to L.A. !"


In [None]:
doc = nlp(mystring)

In [None]:
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
!
"


In [None]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com")

In [None]:
for t in doc2:
    print(t)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com


In [None]:
len(doc2)

18

In [None]:
doc2.vocab

<spacy.vocab.Vocab at 0x7fc7066ed448>

In [None]:
len(doc2.vocab)

533

In [None]:
doc2[2]

here

In [None]:
# `doc` holds only 8 tokens here, so the stop index 9 runs past the end;
# the slice is clamped and returns tokens 3 through 7.
doc[3:9]

moving to L.A. !"

Tokens in a Doc object cannot be reassigned: a Doc does not support item assignment (e.g. `doc[0] = ...` raises an error).

In [None]:
doc3 = nlp(u"Apple to build a Hong Kong factory for $6 million")

In [None]:
for token in doc3:
    print(token.text, end=" | ")

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [None]:
# For every named entity: its text, its label, and spaCy's plain-English
# description of that label, followed by two blank lines.
for ent in doc3.ents:
    label = ent.label_
    description = str(spacy.explain(label))
    print(ent, label, description, "\n", sep="\n")

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




spaCy recognizes named entities such as organizations, places, and monetary values.

In [None]:
doc4 = nlp(u"Autonomous cars shift insurance liability towards manufacturers.")

In [None]:
for chunk in doc4.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


## Tokenization visualized

In [None]:
from spacy import displacy

In [None]:
doc5 = nlp(u"Apple is going to build a U.K. factory for $6 million")

In [None]:
displacy.render(doc5, style="dep",jupyter = True, options={"distance":70})

In [None]:
doc6 = nlp(u"Over the last quarter Apple sold nearly 20 thousands iPods for profit for $6 million")

In [None]:
displacy.render(doc6, style="ent", jupyter=True)

# Stemming


*   Process of finding base words. For example, the base word of boating is boat.
*   Chops off ends of words until the base/root (stem) word is achieved

*   Popular stemming algorithms - Porter and Porter2 (also known as Snowball)
*   spaCy does not have any stemming methods, so we will use NLTK







In [None]:
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
p_stemmer = PorterStemmer()

In [None]:
words = ["run","runner","ran","runs","easily","fairly"]

In [None]:
# Show each word next to its Porter stem.
for word in words:
    print(f"{word}----->{p_stemmer.stem(word)}")

run----->run
runner----->runner
ran----->ran
runs----->run
easily----->easili
fairly----->fairli


In [None]:
from nltk.stem.snowball import SnowballStemmer

In [None]:
s_stemmer = SnowballStemmer(language="english")

In [None]:
# Show each word next to its Snowball stem.
for word in words:
    print(f"{word}----->{s_stemmer.stem(word)}")

run----->run
runner----->runner
ran----->ran
runs----->run
easily----->easili
fairly----->fair


In [None]:
words = ["generous","generation","generously","generate"]

In [None]:
# Snowball stems for the second word list.
for word in words:
    print(f"{word}----->{s_stemmer.stem(word)}")

generous----->generous
generation----->generat
generously----->generous
generate----->generat


# Lemmatization

* Much more sophisticated than stemming
* Looks beyond text reduction and it is much more informative
* spaCy provides lemmatization only; it has no stemmer
* For example lemma of mice is mouse

In [None]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

In [None]:
# Columns: token text, part-of-speech tag, lemma hash (integer ID), lemma text.
for token in doc1:
    columns = (token.text, token.pos_, token.lemma, token.lemma_)
    print(*columns, sep=" \t ")

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


You can see that running and ran are both reduced to the lemma run.

In [None]:
def show_lemmas(text):
    """Print one aligned row per token: text, POS tag, lemma hash, lemma string.

    `text` is iterated directly; each item must expose the attributes
    `text`, `pos_`, `lemma` and `lemma_` (the notebook passes a spaCy Doc).
    Column widths are 12, 6 and 22 characters; the last column is unpadded.
    """
    row_template = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_template.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [None]:
doc2 = nlp(u"I saw ten mice today!")

In [None]:
show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
ten          NUM    7970704286052693043    ten
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


# Stop words

* Words like "a" and "the" do not require any tagging and are removed before processing

* spaCy has a built-in list of stop words (326 in the version used here; the count varies between spaCy versions)

In [None]:
print(nlp.Defaults.stop_words)

{'something', 'nevertheless', 'whose', 'used', 'go', 'such', 'per', 'every', 'whither', 'towards', 'another', 'off', 'nor', 'whoever', "'s", '‘m', 'therefore', 'three', 'back', 'and', 'have', 'me', 'thence', 'while', 'show', 'they', 'anyway', 'must', 'you', 'latter', 'am', 'wherein', 'is', 'perhaps', 'sixty', 'part', 'side', 'thereafter', 'becoming', 'made', '’m', 'their', 'nine', 'until', 'seem', 'someone', 'whenever', 'than', 'does', 'for', 'neither', 'thereby', 'was', 'whatever', 'everything', 'at', 'yours', 'too', 'none', 'front', 'take', 'noone', 'any', 'did', 'throughout', 'who', 'several', 'she', 'eight', 'somehow', 'within', 'seemed', 'are', 'just', 'many', 'through', 'might', 'done', 'one', 'sometimes', 'in', 'onto', 'around', 'therein', 'whereas', '’ll', 'without', 'because', 'almost', 'though', 'using', 'were', 'most', 'i', 'keep', 'less', 'very', 'please', 're', 'would', 'ever', 'where', 'yourself', "'m", 'been', 'latterly', 'either', 'somewhere', 'whether', 'alone', 'befor

In [None]:
print(len(nlp.Defaults.stop_words))

326


In [None]:
nlp.vocab["is"].is_stop

True

In [None]:
# Add "btw" (by the way) to the default stop-word set.
# The lexeme's own is_stop flag is set separately in the next cell.

nlp.Defaults.stop_words.add("btw")

In [None]:
nlp.vocab["btw"].is_stop = True

In [None]:
print(len(nlp.Defaults.stop_words))     # btw added

327


# Phrase matching and vocabulary

We will identify and label specific phrases that match patterns we can define ourselves.

In [None]:
from spacy.matcher import Matcher

In [None]:
matcher = Matcher(nlp.vocab)

spaCy offers a rule-matching tool called `Matcher` that allows you to build a library of token patterns and then match those patterns against a Doc object to return a list of found matches.

In [None]:
# Token patterns for the different ways "solar power" can be written.
# LOWER compares against the lowercased token text, so matching ignores case.

# One token: "solarpower" / "SolarPower"
pattern1 = [{"LOWER":'solarpower'}] 

# Three tokens: "solar", one punctuation token, "power" (e.g. "Solar-power")
pattern2 = [{"LOWER":'solar'},{"IS_PUNCT":True},{'LOWER':'power'}]

# Two tokens: "solar" followed by "power" (e.g. "Solar Power")
pattern3 = [{"LOWER":'solar'},{'LOWER':'power'}]

In [None]:
# Register all three patterns under one key. The second argument (None) fills the
# on_match callback slot of the spaCy v2 signature add(key, on_match, *patterns).
# NOTE(review): spaCy v3 expects matcher.add(key, [patterns]) instead — confirm the installed version.
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [None]:
doc = nlp(u"The Solar Power industry continues to grow as solarpower increases. Solar-power is amazing.")

In [None]:
found_matches = matcher(doc)

In [None]:
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [None]:
# Each match is (match_id, start, end): resolve the hash back to the rule name
# and show the text of the matched token span in `doc`.
for match_id, start, end in found_matches:
    print(match_id, nlp.vocab.strings[match_id], start, end, doc[start:end].text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [None]:
# Let us remove these patterns

matcher.remove('SolarPower')

In [None]:
# Matches the single token "solarpower" in any casing (e.g. "SolarPower")
pattern1 = [{"LOWER":'solarpower'}] 

# Matches "solar", then zero or more punctuation tokens, then "power"
# (e.g. "solar power", "Solar-power", "Solar--power")
pattern2 = [{"LOWER":'solar'},{"IS_PUNCT":True,'OP':'*'},{'LOWER':'power'}] # OP '*': the punctuation token may occur zero or more times

In [None]:
matcher.add('SolarPower',None,pattern1, pattern2)

In [None]:
doc2 = nlp(u"Solar--power is solarpower yay!")

In [None]:
found_matches = matcher(doc2)

In [None]:
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [None]:
# Resolve each match hash to its rule name and show the matched text from `doc2`.
for match_id, start, end in found_matches:
    print(match_id, nlp.vocab.strings[match_id], start, end, doc2[start:end].text)

8656102463236116519 SolarPower 0 3 Solar--power
8656102463236116519 SolarPower 4 5 solarpower


In [None]:
from spacy.matcher import PhraseMatcher

In [None]:
matcher = PhraseMatcher(nlp.vocab)

In [None]:
with open("reaganomics.txt",encoding='cp1252') as f:
    doc3 = nlp(f.read())

In [None]:
# Phrases to search for. PhraseMatcher compares token text exactly, so each
# phrase must be spelled the way it appears in the document.
phrase_list = ["voodoo economics","supply-side economics","trickle-down economics","free-market economics"]

In [None]:
# Build one pattern Doc per phrase. PhraseMatcher matches on token text by default,
# so only tokenization is needed: nlp.make_doc skips the tagger, parser and NER
# that a full nlp(text) call would run on every phrase.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [None]:
matcher.add('EconMatcher',None,*phrase_patterns)

In [None]:
found_matches = matcher(doc3)

In [None]:
# Resolve each match hash to its rule name and show the matched phrase from `doc3`.
for match_id, start, end in found_matches:
    print(match_id, nlp.vocab.strings[match_id], start, end, doc3[start:end].text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics


In [None]:
# Show each match together with the words around it: up to five tokens of
# context on either side of the matched phrase.

window = 5
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # rule name for the match hash
    # Clamp at 0: a negative start would be read as an offset from the end of
    # the Doc and yield the wrong (usually empty) span for early matches.
    context_start = max(start - window, 0)
    span = doc3[context_start:end + window]  # matched phrase plus surrounding context
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 policies are commonly associated with supply-side economics, referred to as trickle
3680293220734633682 EconMatcher 54 56 trickle-down economics or voodoo economics by political opponents, and
3680293220734633682 EconMatcher 61 65 by political opponents, and free-market economics by political advocates.


3680293220734633682 EconMatcher 673 677 attracted a following from the supply-side economics movement, which formed in


In [None]:
# Inspect the first match only: the (match_id, start, end) tuple and its start index.
for i in found_matches:
    print(i)
    print(i[1])
    # NOTE(review): `doc` is the earlier solar-power Doc, not doc3 (which produced
    # found_matches); doc3[i[1]] may have been intended — confirm.
    print(doc[1])
    break

(3680293220734633682, 41, 45)
41
Solar


In [None]:
from collections import defaultdict

In [None]:
x = defaultdict(list,{'chicken parmigiana': [4, 5, 4, 5, 5,5, 5, 5,4,4,4,3,4,5,5,4,5],'steak and cheese': [5, 5, 5, 5, 4, 5, 5, 5, 5]})

In [None]:
# Mean rating per key of `x`: print each mean and keep it in `y` under the same key.
y = {}
for dish, ratings in x.items():
    average = sum(ratings) / len(ratings)
    print(average)
    y[dish] = average

4.470588235294118
4.888888888888889


In [None]:
print(y.values())

dict_values([4.470588235294118, 4.888888888888889])


In [None]:
min(y.values())

4.470588235294118