In [1]:
import spacy

In [2]:
# Load spaCy's small English pipeline; `nlp` builds every Doc used below.
nlp = spacy.load('en_core_web_sm')

In [3]:
%%latex
$\textbf{Rule-based Matching}$

<IPython.core.display.Latex object>

In [5]:
from spacy.matcher import Matcher

In [6]:
# Token-pattern matcher that shares the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [7]:
# Token patterns for the three spellings of "solar power".
# Each dict describes one token; LOWER compares the lowercased token text.

# "SolarPower" written as a single token
pattern1 = [
    {'LOWER': 'solarpower'},
]

# "Solar-power": two words joined by exactly one punctuation token
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True},
    {'LOWER': 'power'},
]

# "Solar power": two adjacent word tokens
pattern3 = [
    {'LOWER': 'solar'},
    {'LOWER': 'power'},
]

In [8]:
# Register all three patterns under the single match key 'SolarPower'.
# The second argument is the optional on-match callback (none is used here).
# NOTE(review): the positional (key, callback, *patterns) form is the spaCy v2
# signature; newer releases expect matcher.add(key, [patterns]) — confirm the
# installed spaCy version before re-running.
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [9]:
# Sample text containing all three spellings: "Solar Power", "solarpower", "Solar-power".
doc = nlp(u"The Solar Power industry continuers to grow a solarpower increases. Solar-power is amazing")

In [10]:
# Run the matcher over the Doc and keep the list of matches.
found_matches = matcher(doc)

In [11]:
print(found_matches)  # each result is a (match_id, start, end) tuple

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [19]:
# Print every match with its rule name and the text it covers in `doc`.
for match_id, start, end in found_matches:
    rule_name = nlp.vocab.strings[match_id]  # look the hash up to get the rule's string key
    matched_text = doc[start:end].text       # text of the tokens in [start, end)
    print(match_id, rule_name, start, end, matched_text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [20]:
# Remove the 'SolarPower' rule so it can be registered again with revised patterns.
matcher.remove('SolarPower')

In [21]:
# Revised patterns for the 'SolarPower' rule.

# "solarpower" / "SolarPower" written as a single token
pattern1 = [
    {'LOWER': 'solarpower'},
]

# "solar" followed by "power", with zero or more punctuation tokens in between
# ('OP': '*'), e.g. "solar power", "solar-power", "solar--power"
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'power'},
]

In [22]:
# Re-register the rule using only the two patterns defined in the cell above.
# pattern2's optional punctuation ('OP': '*') already covers the plain two-token
# "solar power" spelling, so the older pattern3 from the first version of the
# rule is not added again (it was redundant and only present via leftover state).
matcher.add('SolarPower', None, pattern1, pattern2)

In [23]:
# Second sample: a double-hyphen spelling plus the single-token spelling.
doc2 = nlp(u"Solar--power is solarpower yay")

In [26]:
# Run the revised rule over the second sample (overwrites the earlier result list).
found_matches = matcher(doc2)

In [27]:
print(found_matches)  # each result is a (match_id, start, end) tuple

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [28]:
%%latex
$\textbf{Phrase Matching}$

<IPython.core.display.Latex object>

In [29]:
from spacy.matcher import PhraseMatcher

In [30]:
# Rebinds `matcher`: from here on it is a PhraseMatcher, not the token Matcher used above.
matcher = PhraseMatcher(nlp.vocab)

In [32]:
# Read the whole text file and run it through the pipeline as one Doc.
# NOTE(review): no explicit encoding is passed, so decoding uses the platform
# default — confirm the file's actual encoding and pass it explicitly.
with open('../TextFiles/reaganomics.txt') as f:
    doc3 = nlp(f.read())

In [34]:
# Phrases to look for in the text, one per line for easy editing.
phrase_list = [
    'voodoo economics',
    'supply-side economics',
    'trickle-down economics',
    'free-market economics',
]

In [37]:
# Convert each phrase to a Doc object to use as a PhraseMatcher pattern.
# nlp.make_doc only tokenizes, which is all a phrase pattern needs when matching
# on the verbatim token text; it avoids running the full pipeline on every phrase.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [38]:
# Display the first pattern.
phrase_patterns[0]

voodoo economics

In [36]:
# Show the type of a pattern element.
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [39]:
# Unpack the list so each phrase Doc is passed as its own pattern under the
# 'EconMatcher' key; the None is the optional on-match callback.
matcher.add('EconMatcher', None, *phrase_patterns)

In [40]:
# Run the phrase matcher over the article Doc.
found_matches = matcher(doc3)

In [41]:
# Display the raw (match_id, start, end) tuples.
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2985, 2989)]

In [43]:
# Print every phrase match with its rule name and the exact text it covers in `doc3`.
for match_id, start, end in found_matches:
    rule_name = nlp.vocab.strings[match_id]  # look the hash up to get the rule's string key
    phrase = doc3[start:end].text            # text of the tokens in [start, end)
    print(match_id, rule_name, start, end, phrase)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2985 2989 trickle-down economics


In [44]:
# Print each match together with some surrounding context:
# up to 4 tokens before the match and 10 tokens after it.
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    # Clamp the left edge at 0: a negative slice start would be counted from the
    # end of the document and give the wrong span for matches in the first 4 tokens.
    context_start = max(start - 4, 0)
    span = doc3[context_start:end + 10]      # matched phrase plus its context window
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 are commonly associated with supply-side economics, referred to as trickle-down economics or voodoo
3680293220734633682 EconMatcher 49 53 , referred to as trickle-down economics or voodoo economics by political opponents, and free-
3680293220734633682 EconMatcher 54 56 -down economics or voodoo economics by political opponents, and free-market economics by
3680293220734633682 EconMatcher 61 65 political opponents, and free-market economics by political advocates.

The four pillars of Reagan
3680293220734633682 EconMatcher 673 677 a following from the supply-side economics movement, which formed in opposition to Keynesian demand-
3680293220734633682 EconMatcher 2985 2989 widely known as "trickle-down economics", due to the significant cuts in the upper
