In [44]:
import spacy

In [45]:
# Load the small English pipeline; its vocab is what the matchers below are built on.
nlp = spacy.load('en_core_web_sm')

In [46]:
from spacy.matcher import Matcher

In [47]:
# Token-pattern matcher that shares the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [48]:
# Token patterns for the different spellings of 'SolarPower'.
# Each pattern is a list holding one attribute dict per token to match.

# Single token: "solarpower" (compared on the lowercased form).
pattern1 = [
    {'LOWER': 'solarpower'},
]

# Two adjacent tokens: "solar" "power".
pattern2 = [
    {'LOWER': 'solar'},
    {'LOWER': 'power'},
]

# Three tokens: "solar", exactly one punctuation token, "power".
pattern3 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True},
    {'LOWER': 'power'},
]

In [49]:
# Register all three patterns under the single key 'SolarPower'.
# NOTE(review): the positional (key, on_match, *patterns) form with None as the
# callback is the spaCy v2 signature; spaCy v3 expects matcher.add(key, [patterns]).
# Confirm the installed spaCy version before re-running.
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [50]:
# Sample text containing three spellings: "Solar Power", "solarpower", "Solar-power".
# NOTE(review): the two fragments are joined with no space, so the text contains
# "demandfor" as one word. This looks like an unintended missing space (TODO confirm);
# the string is kept as-is because the token offsets recorded for this doc depend on it.
doc = nlp(
    u'The Solar Power industry continues to grow as demand'
    u'for solarpower increases. Solar-power cars are gaining popularity.'
)

In [51]:
# Run the matcher over the doc; each result is a (match_id, start, end) tuple,
# where start/end are token offsets into the doc.
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 9, 10), (8656102463236116519, 12, 15)]


In [52]:
# Turn each match hash back into its string key and print it with the matched text.
for match_id, start, end in found_matches:
    span = doc[start:end]
    string_id = nlp.vocab.strings[match_id]  # hash -> 'SolarPower'
    row = (match_id, string_id, start, end, span.text)
    print(*row)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 9 10 solarpower
8656102463236116519 SolarPower 12 15 Solar-power


In [53]:
# Remove the 'SolarPower' patterns from the matcher (the matcher object itself is kept)

In [54]:
# Drop the rule registered under 'SolarPower' so the key can be re-added with new patterns.
matcher.remove('SolarPower')

In [55]:
# Revised patterns: the punctuation token between "solar" and "power" is now
# given the '*' operator instead of being required exactly once.

# Single token: "solarpower" (compared on the lowercased form).
pattern1 = [
    {'LOWER': 'solarpower'},
]

# "solar", then a punctuation token carrying the '*' operator, then "power".
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'power'},
]

In [56]:
# Re-register the 'SolarPower' key with the two revised patterns.
# NOTE(review): the positional (key, on_match, *patterns) form with None as the
# callback is the spaCy v2 signature; spaCy v3 expects matcher.add(key, [patterns]).
# Confirm the installed spaCy version before re-running.
matcher.add('SolarPower', None, pattern1, pattern2)

In [57]:
# Test sentence with punctuation ("--") between "Solar" and "Power", plus the
# one-word spelling "solarpower", to exercise both revised patterns.
doc2 = nlp(u'Solar--Power is solarpower for cars!')

In [58]:
# Match against the new sentence; results are (match_id, start, end) tuples.
matches_found = matcher(doc2)
print(matches_found)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


# Phrase Matching

In [69]:
import spacy
from spacy.matcher import PhraseMatcher

In [70]:
# Phrase-based matcher on the same vocabulary.
# NOTE: this rebinds `matcher`, replacing the token-based Matcher used in the cells above.
matcher = PhraseMatcher(nlp.vocab)

In [71]:
# Read the whole article as latin1 text, then run it through the pipeline.
with open(r'../TextFiles/reaganomics.txt', encoding='latin1') as f:
    raw_text = f.read()

doc3 = nlp(raw_text)

In [72]:
# Phrases to search for in the article (one per line for easy editing).
phrase_list = [
    'voodoo economics',
    'supply-side economics',
    'trickle-down economics',
    'free-market economics',
]

In [73]:
# Convert every phrase string into a Doc by running it through the pipeline.
phrase_patterns = [
    nlp(phrase)
    for phrase in phrase_list
]

In [74]:
# Check the type of one pattern: it is a spaCy Doc, not a plain string.
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [75]:
# Bare expression: the notebook displays the Doc as its text.
phrase_patterns[0]

voodoo economics

In [76]:
# print() shows the same text, also without quotes.
print(phrase_patterns[0])

voodoo economics


In [77]:
# Observation: the output shows no quotes because the object is a spacy.tokens.doc.Doc, not a str.

In [78]:
# Display all four pattern Docs.
phrase_patterns

[voodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [79]:
# Register every phrase Doc under the key 'EconMatcher'; the asterisk unpacks
# the list so each Doc is passed as a separate positional argument.
# NOTE(review): the positional (key, on_match, *docs) form with None as the
# callback is the spaCy v2 signature; spaCy v3 expects matcher.add(key, [docs]).
# Confirm the installed spaCy version before re-running.
matcher.add('EconMatcher', None, *phrase_patterns)

In [80]:
# Run the phrase matcher over the article.
# NOTE: this rebinds `found_matches`, which earlier held the 'SolarPower' matches.
found_matches = matcher(doc3)

In [81]:
# Raw results: (match_id, start, end) tuples with token offsets into doc3.
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2984, 2988)]

In [83]:
# Show exactly which text each match covers: resolve the hash to its key
# and print it alongside the token offsets and the matched span.
for match_id, start, end in found_matches:
    span = doc3[start:end]
    string_id = nlp.vocab.strings[match_id]  # hash -> 'EconMatcher'
    row = (match_id, string_id, start, end, span.text)
    print(*row)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2984 2988 trickle-down economics


In [84]:
# Show each match with up to five tokens of context on either side.

for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    # Clamp the window start at 0: a negative slice start is counted from the
    # end of the doc, so a match within the first five tokens would otherwise
    # yield an empty or wrong context window.
    context_start = max(start - 5, 0)
    span = doc3[context_start : end + 5]
    print(match_id, string_id, start, end, span.text)


3680293220734633682 EconMatcher 41 45 policies are commonly associated with supply-side economics, referred to as trickle
3680293220734633682 EconMatcher 49 53 economics, referred to as trickle-down economics or voodoo economics by political
3680293220734633682 EconMatcher 54 56 trickle-down economics or voodoo economics by political opponents, and
3680293220734633682 EconMatcher 61 65 by political opponents, and free-market economics by political advocates.


3680293220734633682 EconMatcher 673 677 attracted a following from the supply-side economics movement, which formed in
3680293220734633682 EconMatcher 2984 2988 became widely known as "trickle-down economics", due to the
