In [1]:
# Note (quoted from the Stanford IR book):
# https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html
# "Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word,
# which is known as the lemma."

In [3]:
import spacy
# Load the small English pipeline; the cells below all go through this shared `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [4]:
# Sample sentence containing several forms related to "run" (runner, running, run, ran).
doc1 = nlp("I am a runner running in a race because i love to run and i ran today")

In [5]:
# One line per token: text, POS tag, integer lemma ID and lemma string,
# with the fields separated by " <tab> ".
for token in doc1:
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=" \t ")

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
i 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
and 	 CCONJ 	 2283656566040971221 	 and
i 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [6]:
def show_lemmas(text):
    """Print one aligned row per token: text, POS tag, lemma ID and lemma string.

    Args:
        text: An iterable of tokens exposing ``text``, ``pos_``, ``lemma`` and
            ``lemma_`` attributes (the notebook passes the result of ``nlp(...)``).

    Returns:
        None. Rows are written to stdout.
    """
    # Minimum column widths: 12 for the text, 6 for the POS tag, 22 for the
    # left-aligned integer lemma ID; the lemma string is printed as-is.
    row_template = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_template.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [8]:
# Irregular forms: the output below shows "saw" -> "see" and "mice" -> "mouse".
doc2 = nlp(u"I saw eighteen mice today!")

show_lemmas(doc2)

I            PRON   4690420944186131903    I
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


In [9]:
# "meeting" occurs twice: tagged VERB (lemma "meet") and NOUN (lemma "meeting") in the output below.
doc3 = nlp(u"I am meeting him tomorrow at the meeting.")

show_lemmas(doc3)

I            PRON   4690420944186131903    I
am           AUX    10382539506755952630   be
meeting      VERB   6880656908171229526    meet
him          PRON   1655312771067108281    he
tomorrow     NOUN   3573583789758258062    tomorrow
at           ADP    11667289587015813222   at
the          DET    7425985699627899538    the
meeting      NOUN   14798207169164081740   meeting
.            PUNCT  12646065887601541794   .


In [10]:
# The contraction is split into "That" + "'s"; the output below shows "'s" lemmatized to "be".
doc4 = nlp(u"That's an enormous automobile")

show_lemmas(doc4)

That         DET    4380130941430378203    that
's           AUX    10382539506755952630   be
an           DET    15099054000809333061   an
enormous     ADJ    17917224542039855524   enormous
automobile   NOUN   7211811266693931283    automobile


In [11]:
# Stopword

In [13]:
# Default English stop words. This is a Python set, so the printed order is arbitrary.
print(nlp.Defaults.stop_words) # set

{'namely', 'somehow', 'such', 'your', 'but', "'d", 'nowhere', 'thereupon', 'one', 'all', 'and', 'say', 'seem', 'twenty', 'because', 'move', 'five', 'many', 'he', 'due', 'made', 'quite', 'ca', 'which', 'last', 'could', 'out', 'too', 'formerly', 'whom', 'nobody', 'even', 'no', 'eleven', 'several', "'ll", 'becomes', 'along', 'did', 'have', 'seemed', 'became', 'before', 'back', 'or', 'themselves', '‘m', 'fifteen', 'eight', 'after', 'few', 'either', 'then', 'thereby', 'however', 'their', 'while', 'hers', 'is', 'already', 'just', 'both', 'own', 'well', 'than', 'what', 'upon', 'otherwise', 'thus', 'two', 'whatever', 'against', 'per', 'n‘t', 'wherever', 'ourselves', 'me', 'over', 'whoever', 'go', 'show', '‘ve', 'almost', 'always', 'until', 'only', 'whither', 'become', 'third', 'anywhere', 'ever', 'herself', 'she', 'within', 'does', 'a', 'thence', 'amount', 'elsewhere', 'although', 'besides', 'anything', 'yours', 'anyway', 'becoming', 'except', 'behind', 'often', 'whereby', 'be', 'doing', '’ve'

In [14]:
# Number of default stop words before any modification.
len(nlp.Defaults.stop_words)

326

In [16]:
# Look the word up in the vocabulary and read its stop-word flag.
nlp.vocab['is'].is_stop

True

In [17]:
# For contrast: a word whose stop-word flag is not set.
nlp.vocab['mystery'].is_stop

False

In [18]:
# Step 1 of adding a custom stop word: put it into the defaults set.
nlp.Defaults.stop_words.add('btw')

In [19]:
# Step 2: set the stop-word flag on the vocabulary entry itself (this notebook
# updates the defaults set and the vocabulary flag as two separate steps).
nlp.vocab['btw'].is_stop = True

In [20]:
# The set has grown by one entry after adding 'btw'.
len(nlp.Defaults.stop_words)

327

In [21]:
# Confirm that the vocabulary entry now reports 'btw' as a stop word.
nlp.vocab['btw'].is_stop 

True

In [22]:
# Remove the custom stop word again: drop it from the defaults set and clear
# the flag on the vocabulary entry.
# discard() is used instead of remove() because remove() raises KeyError when
# 'btw' is already absent, which would make this cell fail on a second run.
nlp.Defaults.stop_words.discard('btw')
nlp.vocab['btw'].is_stop = False
len(nlp.Defaults.stop_words)

326

In [23]:
# Confirm that the stop-word flag for 'btw' has been cleared.
nlp.vocab['btw'].is_stop

False

In [24]:
# Phrase matching

In [25]:
# Rule-based matching
from spacy.matcher import Matcher

In [26]:
# Create a rule-based Matcher that shares the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [28]:
# Three token patterns for the spellings "solarpower", "solar power" and
# "solar<punctuation>power"; LOWER compares against the lowercased token text.
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]

# Matcher.add() extends the patterns of a key that already exists, so drop any
# rule left over from a previous run of this cell before registering it again.
if 'SolarPower' in matcher:
    matcher.remove('SolarPower')
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

In [29]:
# Test text containing three spellings: "Solar Power", "solarpower" and "Solar-power".
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [30]:
# Run the matcher; each result is a (match_id, start, end) tuple, as unpacked in the next cell.
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [31]:
# Resolve every match to its rule name and the text it covers.
for match_id, start, end in found_matches:
    span = doc[start:end]                    # tokens covered by this match
    string_id = nlp.vocab.strings[match_id]  # rule name stored for the match ID
    print(f"{match_id} {string_id} {start} {end} {span.text}")

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


In [33]:
# 'OP': '*' makes the punctuation token optional: it may occur zero or more times,
# so one pattern covers "solar power" with or without punctuation in between.
# Redefine the patterns:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]

# Remove the old patterns to avoid duplication:
matcher.remove('SolarPower')

# Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('SolarPower', [pattern1, pattern2])

In [34]:
# Re-run the matcher on the same text with the redefined patterns.
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [35]:
# Show rule name and matched text for the results of the redefined patterns.
for match_id, start, end in found_matches:
    span = doc[start:end]                    # tokens covered by this match
    string_id = nlp.vocab.strings[match_id]  # rule name stored for the match ID
    print(f"{match_id} {string_id} {start} {end} {span.text}")

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


In [36]:
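# 'OP': '!' negates a token pattern, i.e. the token must match exactly 0 times.
# Be careful with LEMMA patterns: the result depends on how the model tags and
# lemmatizes each token, so you may not get the matches you expect.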

In [37]:
# Same rule as before, except that the last token of pattern2 is matched on its
# LEMMA instead of its lowercased text.
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LEMMA': 'power'}] # LEMMA replaces LOWER for the final token

# Remove the old patterns to avoid duplication:
matcher.remove('SolarPower')

# Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('SolarPower',  [pattern1, pattern2])

In [38]:
# New test text using the inflected form "powered".
# Note: this rebinds doc2, replacing the document assigned in the lemmatization section.
doc2 = nlp(u'Solar-powered energy runs solar-powered cars.')
found_matches = matcher(doc2)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 5, 8)]


In [39]:
# Print the text covered by each match in doc2.
for match_id, start, end in found_matches:
    span = doc2[start:end]
    print(span)

Solar-powered
solar-powered


In [40]:
# Alternative that does not rely on lemmas: list the "power" and "powered"
# spellings explicitly, each with and without optional punctuation.
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]
pattern3 = [{'LOWER': 'solarpowered'}]
pattern4 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'powered'}]

# Remove the old patterns to avoid duplication:
matcher.remove('SolarPower')

# Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('SolarPower', [pattern1, pattern2, pattern3, pattern4])

In [41]:
# Re-run the matcher on doc2 with the explicit "powered" patterns.
found_matches = matcher(doc2)
print(found_matches)


[(8656102463236116519, 0, 3), (8656102463236116519, 5, 8)]


In [42]:
# Example pattern: [{'ORTH': '#'}, {}]
# You can pass an empty dictionary {} as a wildcard that represents any token. For example, you might want to retrieve hashtags without knowing what might follow the # character.

In [43]:
# PhraseMatcher

In [44]:
from spacy.matcher import PhraseMatcher

In [45]:
# Note: this rebinds `matcher`, which held the rule-based Matcher in the cells above.
matcher = PhraseMatcher(nlp.vocab)

In [59]:
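# NOTE: loading the corpus is currently disabled. While these lines stay
# commented out, doc3 keeps the value assigned in the lemmatization section.
# with open('../TextFiles/reaganomics.txt', encoding='utf_16') as f:
#     doc3 = nlp(f.read())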

In [50]:
# NOTE(review): debugging aid that prints the working directory, presumably to
# resolve the relative '../TextFiles/...' corpus path; verify whether this cell
# is still needed.
import os
print(os.getcwd())

/Users/ivytong/Documents/GitHub/BigDataCourses_Spark_Hadoop/NLP_COURSE/01-NLP-Python-Basics


In [54]:
# First, create a list of match phrases:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

# Next, convert each phrase to a Doc object:
phrase_patterns = [nlp(text) for text in phrase_list]

# Register all phrase Docs under one key. They are passed as a single list, the
# same calling convention this notebook uses for Matcher.add(), instead of the
# legacy `None, *docs` form.
matcher.add('VoodooEconomics', phrase_patterns)

# The matcher is applied to a document in a later cell.

In [55]:
# Display the phrase Docs that were registered with the matcher.
phrase_patterns

[voodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [56]:
# phrase_patterns is a plain Python list (built by the list comprehension above).
type(phrase_patterns)

list

In [57]:
# NOTE(review): the cell that would load the reaganomics text into doc3 is
# commented out, so doc3 here is still the short sentence assigned in the
# lemmatization section, which would explain the empty result shown below.
matches = matcher(doc3)

In [58]:
# Display the phrase matches found in doc3.
matches

[]

In [None]:
# Collect the sentence spans of the document once.
sents = list(doc3.sents)

# Sentence spans carry start and end token indices:
print(sents[0].start, sents[0].end)

# Show the sentence that contains the fifth match (index 4). The original note
# placed that match at doc3[673], which presumes doc3 holds the full
# reaganomics text (TODO confirm once that file is loaded).
match_index = 4
if len(matches) > match_index:
    match_start = matches[match_index][1]
    for sent in sents:
        # The first sentence ending after the match start is the one containing it.
        if match_start < sent.end:
            print(sent)
            break
else:
    # Fewer matches than expected (e.g. doc3 is not the intended document):
    # report it instead of failing with IndexError on matches[4].
    print(f'Only {len(matches)} match(es) found; cannot show match #{match_index + 1}.')