# Rule-based matching

In [1]:
import spacy
# Load the small English pipeline; `nlp` is reused by the cells below to
# turn raw text into Doc objects and to supply the shared vocabulary.
nlp = spacy.load("en_core_web_sm")

In [2]:
from spacy.matcher import Matcher

In [3]:
# Token-based matcher built on the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [4]:
# Three token patterns, one per spelling variant. Each dict describes one token;
# "LOWER" is compared against the lowercased token text.

# Single token: SolarPower
pattern1 = [{"LOWER": "solarpower"}]

# Three tokens with any one punctuation mark in the middle: Solar-power
pattern2 = [
    {"LOWER": "solar"},
    {"IS_PUNCT": True},
    {"LOWER": "power"},
]

# Two adjacent tokens: Solar power
pattern3 = [
    {"LOWER": "solar"},
    {"LOWER": "power"},
]

## Add patterns to the Matcher

In [5]:
# Register all three patterns under one rule name; a hit on any of them is
# reported under that name.
matcher.add("SolarPower", [pattern1, pattern2, pattern3])
#               |             |__________|________|__________ List of token patterns
#               |_____________________________________________ Rule name / match key (any string)

In [6]:
# Sample sentence containing every spelling variant the patterns target.
doc = nlp(
    "The Solar Power industry continues to grow as demand "
    "for solarpower increases. (SolaR:pOwEr) Solar-power cars are gaining popularity."
)

In [7]:
# Run the matcher over the Doc; each hit is a (match_id, start, end) tuple.
found_matches = matcher(doc)

In [8]:
# Show the raw match tuples before decoding them.
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 14, 17), (8656102463236116519, 18, 21)]


In [9]:
# (8656102463236116519, 1, 3)
#             |         |  |_ End token index (exclusive: token 3 is not part of the match)
#             |         |____ Start token index (tokens are numbered from 0, so 'The' is token 0)
#             |______________ Match ID

In [10]:
# One row per hit: match ID, rule name, start token, end token, matched text.
for match_id, start, end in found_matches:
    # Tokens start..end-1 make up the match (end is exclusive).
    span = doc[start:end]
    # The match ID is a hash; the vocab's string store maps it back to the rule name.
    string_id = nlp.vocab.strings[match_id]
    print(f"{match_id:<{25}} {string_id:{15}} {start:{10}} {end:{10}} \t {span.text}")

8656102463236116519       SolarPower               1          3 	 Solar Power
8656102463236116519       SolarPower              10         11 	 solarpower
8656102463236116519       SolarPower              14         17 	 SolaR:pOwEr
8656102463236116519       SolarPower              18         21 	 Solar-power


## Remove matcher

In [11]:
# Drop the "SolarPower" rule so the same name can be registered again with a
# different pattern.
matcher.remove("SolarPower")

### Add a new pattern

In [12]:
# "OP": "*" lets the punctuation token occur zero or more times in a row,
# so a single pattern covers Solar-power, Solar--power, ...
solar_punct_power = [
    {"LOWER": "solar"},
    {"IS_PUNCT": True, "OP": "*"},
    {"LOWER": "power"},
]
patterns = [solar_punct_power]
matcher.add("SolarPower", patterns)

In [13]:
# Second sample: double hyphen, single hyphen, and the one-word spelling.
doc2 = nlp("Solar--power is solar-power or solarpower.")

In [14]:
# Re-run the matcher on the second sample; this overwrites the earlier
# `found_matches` result.
found_matches = matcher(doc2)

In [15]:
# Show the raw (match_id, start, end) tuples for the second sample.
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 7)]


In [16]:
# One row per hit: match ID, rule name, start token, end token, matched text.
for match_id, start, end in found_matches:
    # Tokens start..end-1 of the second sample make up the match (end is exclusive).
    span = doc2[start:end]
    # The match ID is a hash; the vocab's string store maps it back to the rule name.
    string_id = nlp.vocab.strings[match_id]
    print(f"{match_id:<{25}} {string_id:{15}} {start:{10}} {end:{10}} \t {span.text}")

8656102463236116519       SolarPower               0          3 	 Solar--power
8656102463236116519       SolarPower               4          7 	 solar-power


# PhraseMatcher

In [17]:
from spacy.matcher import PhraseMatcher

In [18]:
# Create a PhraseMatcher on the pipeline's vocabulary.
# NOTE: this rebinds `matcher`, so the token-based Matcher created earlier is
# no longer reachable under that name.
matcher = PhraseMatcher(nlp.vocab)

In [19]:
# Read the whole article and run it through the pipeline in one go.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm it matches the file.
with open("reaganomics.txt") as article_file:
    doc3 = nlp(article_file.read())

In [20]:
# Phrases to look for in the article.
phrase_list = [
    "voodoo economics",
    "supply-side economics",
    "trickle-down economics",
    "free-market economics",
]

In [22]:
# Convert each phrase to a Doc object (PhraseMatcher patterns are Docs).
# The PhraseMatcher is created without an `attr` argument, i.e. it matches on
# the verbatim token text, so tokenization alone is enough: nlp.make_doc()
# skips the tagger/parser/NER that nlp(text) would run on every phrase.
# If the matcher is ever switched to an attribute such as LEMMA or POS,
# go back to nlp(text) so those annotations exist on the patterns.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [23]:
# Register every phrase pattern under one rule name. The patterns are passed
# as a list, the same add(key, patterns) form used for the token Matcher
# earlier in this notebook, instead of the legacy add(key, None, *patterns).
matcher.add('VoodooEconomics', phrase_patterns)

In [24]:
# Run the phrase matcher over the article; each hit is a (match_id, start, end) tuple.
matches = matcher(doc3)

In [25]:
# Display the raw match tuples.
matches

[(3473369816841043438, 41, 45),
 (3473369816841043438, 49, 53),
 (3473369816841043438, 54, 56),
 (3473369816841043438, 61, 65),
 (3473369816841043438, 673, 677),
 (3473369816841043438, 2987, 2991)]

In [29]:
# One row per hit: match ID, rule name, start token, end token, matched phrase.
for match_id, start, end in matches:
    # Tokens start..end-1 of the article make up the match (end is exclusive).
    span = doc3[start:end]
    # The match ID is a hash; the vocab's string store maps it back to the rule name.
    string_id = nlp.vocab.strings[match_id]
    print(f"{match_id:<{25}} {string_id:{15}} {start:{10}} {end:{10}} \t {span.text}")

3473369816841043438       VoodooEconomics         41         45 	 supply-side economics
3473369816841043438       VoodooEconomics         49         53 	 trickle-down economics
3473369816841043438       VoodooEconomics         54         56 	 voodoo economics
3473369816841043438       VoodooEconomics         61         65 	 free-market economics
3473369816841043438       VoodooEconomics        673        677 	 supply-side economics
3473369816841043438       VoodooEconomics       2987       2991 	 trickle-down economics


In [32]:
# Print each match together with up to 5 tokens of surrounding context.
# `start` and `end` in the output are still the bounds of the match itself.
context_tokens = 5
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # rule name behind the hashed match ID
    # Clamp the window to the document. Without the clamp, a match in the first
    # few tokens yields a negative start, which slicing interprets as an offset
    # from the END of the Doc and so returns the wrong (or an empty) span.
    context_start = max(start - context_tokens, 0)
    context_end = min(end + context_tokens, len(doc3))
    span = doc3[context_start:context_end]   # match plus surrounding context
    print(f"{match_id:<{25}} {string_id:{15}} {start:{10}} {end:{10}} \t {span.text}")

3473369816841043438       VoodooEconomics         41         45 	 policies are commonly associated with supply-side economics, referred to as trickle
3473369816841043438       VoodooEconomics         49         53 	 economics, referred to as trickle-down economics or voodoo economics by political
3473369816841043438       VoodooEconomics         54         56 	 trickle-down economics or voodoo economics by political opponents, and
3473369816841043438       VoodooEconomics         61         65 	 by political opponents, and free-market economics by political advocates.


3473369816841043438       VoodooEconomics        673        677 	 attracted a following from the supply-side economics movement, which formed in
3473369816841043438       VoodooEconomics       2987       2991 	 became widely known as "trickle-down economics", due to the
