# Vocabulary Matching

In [1]:
import spacy
# Load the 'en_core_web_sm' pipeline once; `nlp` is reused by every cell below
# to turn raw text into a Doc, and its vocab is handed to the Matcher.
nlp = spacy.load("en_core_web_sm")

In [2]:
from spacy.matcher import Matcher
# Build the matcher on the pipeline's own vocab; the same `nlp.vocab.strings`
# store is used further down to turn match IDs back into rule names.
matcher = Matcher(nlp.vocab)

## Rule-Based Matching

In [8]:
# Three token-level spellings of the same phrase, all registered under one rule name.
pattern1 = [{'LOWER': 'solarpower'}]    # written as a single word
pattern2 = [                            # written as two words
    {'LOWER': 'solar'},
    {'LOWER': 'power'},
]
pattern3 = [                            # joined by one punctuation token, e.g. a hyphen
    {'LOWER': 'solar'},
    {'IS_PUNCT': True},
    {'LOWER': 'power'},
]
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

`pattern1` looks for a single token whose lowercase text reads 'solarpower' $\\$
`pattern2` looks for two adjacent tokens that read 'solar' and 'power' in that order $\\$
`pattern3` looks for three consecutive tokens: one whose lowercase text reads 'solar', then a single punctuation token (such as a hyphen), then one whose lowercase text reads 'power'.

In [9]:
# The backslash continuation sits inside the string literal, so the indentation
# of the second line is part of the text. The token offsets printed below
# (e.g. 'solarpower' at index 11) only add up if that run of spaces is counted
# as a token of its own.
doc = nlp(u'The Solar Power industry continues to grow as demand \
    for solarpower increases. Solar-power cars are gaining popularity.')

# Calling the matcher on a Doc yields a list of (match_id, start, end) tuples.
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 11, 12), (8656102463236116519, 14, 17)]


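Calling `matcher(doc)` returns a list of `(match_id, start, end)` tuples. `match_id` is the hash of the rule name (look it up in `nlp.vocab.strings` to get the name back), and `start` / `end` are token indices such that the matched span is `doc[start:end]`, with `end` exclusive.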

In [10]:
# Turn each raw match tuple into a readable line:
# hash, rule name, token offsets and the matched text.
for match_id, start, end in found_matches:
    span = doc[start:end]                      # tokens covered by this match
    string_id = nlp.vocab.strings[match_id]    # hash -> rule name
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 11 12 solarpower
8656102463236116519 SolarPower 14 17 Solar-power


In [25]:
# Start this experiment from a clean slate. Matcher.remove() raises ValueError
# for a key that was never added, and no visible cell registers 'West Point',
# so the removal is guarded. 'Random' is cleared as well because Matcher.add()
# extends an existing key, which would duplicate matches when this cell is re-run.
for stale_key in ('West Point', 'Random'):
    if stale_key in matcher:
        matcher.remove(stale_key)

# Candidate patterns. A dict that holds only 'OP' puts no constraint on the
# token itself, so {'OP': '*'} reads as "zero or more tokens of any kind".
# 'ADJ' is a part-of-speech tag, so it belongs under POS; IS_STOP is a boolean flag.
pattern1 = [{'POS':'ADJ','OP':'*'}]
pattern2 = [{'POS':'NOUN'},{'OP':'+'}]    # a noun followed by one or more arbitrary tokens
pattern3 = [{'POS':'PROPN'},{'OP': '*'}]  # a proper noun followed by any run of tokens

# Only pattern3 is registered; pattern1 and pattern2 are defined but not added.
matcher.add('Random', [pattern3])

In [26]:
# The indentation of each continuation line sits inside the string literal and
# is therefore part of the text that gets tokenized.
west_point_text = "West Point, officially known as the United States Military Academy, \
        is a prestigious military institution located in West Point, New York. \
        Established in 1802, it is one of the oldest military academies in the world. \
        West Point's mission is to educate and train future leaders of the U.S. Army,\
        emphasizing academic excellence, physical fitness, and moral-ethical development. Its graduates, known as cadets, earn a bachelor's degree \
        and are commissioned as officers in the Army. The academy is renowned for its rigorous academic programs, military discipline, and traditions."
doc = nlp(west_point_text)
found_matches = matcher(doc)
print(found_matches)

# One line per match: hash, rule name, token offsets and the matched text.
for match_id, start, end in found_matches:
    span = doc[start:end]
    string_id = nlp.vocab.strings[match_id]
    print(match_id, string_id, start, end, span.text)

[(3428331172314779660, 0, 1), (3428331172314779660, 0, 2), (3428331172314779660, 1, 2), (3428331172314779660, 0, 3), (3428331172314779660, 1, 3), (3428331172314779660, 0, 4), (3428331172314779660, 1, 4), (3428331172314779660, 0, 5), (3428331172314779660, 1, 5), (3428331172314779660, 0, 6), (3428331172314779660, 1, 6), (3428331172314779660, 0, 7), (3428331172314779660, 1, 7), (3428331172314779660, 0, 8), (3428331172314779660, 1, 8), (3428331172314779660, 7, 8), (3428331172314779660, 0, 9), (3428331172314779660, 1, 9), (3428331172314779660, 7, 9), (3428331172314779660, 8, 9), (3428331172314779660, 0, 10), (3428331172314779660, 1, 10), (3428331172314779660, 7, 10), (3428331172314779660, 8, 10), (3428331172314779660, 9, 10), (3428331172314779660, 0, 11), (3428331172314779660, 1, 11), (3428331172314779660, 7, 11), (3428331172314779660, 8, 11), (3428331172314779660, 9, 11), (3428331172314779660, 10, 11), (3428331172314779660, 0, 12), (3428331172314779660, 1, 12), (3428331172314779660, 7, 12)

In [27]:
# Plain-Python reminder that slices are end-exclusive, the same convention as doc[start:end].
alist = list(range(1, 6))
alist[:3]

[1, 2, 3]