# Vocabulary and Matching

## Rule-based matching

In [1]:
# import spacy  (left disabled: the next cell imports `load` and `Matcher` by name instead)

In [17]:
# Bring in the pipeline loader and the token-pattern matcher, then load the
# `en_core_web_md` pipeline once; `nlp` (and its vocab) is reused by every
# cell below.
from spacy import load
from spacy.matcher import Matcher

nlp = load('en_core_web_md')

In [18]:
# Token-pattern matcher that shares the loaded pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [19]:
# Each pattern is a list holding one attribute dict per token to match.

# pat1: "hello" immediately followed by "world", compared on lowercased text.
pat1 = [
    {'LOWER': 'hello'},
    {'LOWER': 'world'},
]

# pat2: the same two words with one punctuation token between them.
pat2 = [
    {'LOWER': 'hello'},
    {'IS_PUNCT': True},
    {'LOWER': 'world'},
]

In [22]:
# `s1` is the key the patterns are registered under, not text to be searched;
# it is what `nlp.vocab.strings[match_id]` resolves to when matches are printed.
s1 = 'Hello world'
# Both patterns are added under that single key.
matcher.add(s1, [pat1, pat2])

In [27]:
# Sample sentence to run the matcher on. The leading space is presumably
# tokenized as its own token, which would explain why the match reported
# below starts at token index 1 rather than 0.
doc = nlp(" Hello World are the first two words. ")
doc

 Hello World are the first two words. 

In [28]:
# Run the matcher over the doc; each hit is a (match_id, start, end) tuple.
find_matches = matcher(doc)

In [29]:
# Show the raw match tuples; the first element is an integer match id that
# the next cell resolves back to the registered key.
print(find_matches)

[(6760932446404879437, 1, 3)]


In [30]:
# Decode every hit: slice the matched tokens out of the doc and look up the
# key that the integer match id stands for.
for match_id, start, end in find_matches:
    span = doc[start:end]                    # matched tokens; `end` is exclusive
    string_id = nlp.vocab.strings[match_id]  # integer id -> registered key
    print(match_id, string_id, start, end, span)

6760932446404879437 Hello world 1 3 Hello World


## Phrase Matching

In [42]:
# PhraseMatcher is used below with Doc objects as patterns (rather than the
# token-attribute dicts used with Matcher).
from spacy.matcher import PhraseMatcher
# NOTE: this rebinds `matcher`, replacing the token-based Matcher created earlier.
matcher = PhraseMatcher(nlp.vocab)
# Sample passage to search; the triple-quoted literal begins and ends with a newline.
text = """
Barack Hussein Obama II (/bəˈrɑːk huːˈseɪn oʊˈbɑːmə/ (listen) bə-RAHK hoo-SAYN oh-BAH-mə;[1] born August 4, 1961) is an American former politician who served as the 44th president of the United States from 2009 to 2017. A member of the Democratic Party, he was the first African-American president of the United States.[2] Obama previously served as a U.S. senator representing Illinois from 2005 to 2008 and as an Illinois state senator from 1997 to 2004, and worked as a civil rights lawyer before holding public office.
"""

In [43]:
# Terms to look for in the passage; an entry may span more than one word.
phrase_list = [
    "Barack",
    "United States",
    "American",
]

In [44]:
# Build one pattern Doc per phrase.
# - `nlp.make_doc` only tokenizes. That is all the PhraseMatcher needs for its
#   default exact-text matching, so the rest of the pipeline is skipped.
# - The loop variable is `phrase`, not `text`, so it does not shadow the
#   notebook-level `text` passage that is parsed further down.
phrase_patterns = [nlp.make_doc(phrase) for phrase in phrase_list]
phrase_patterns

[Barack, United States, American]

In [45]:
# Confirm that each pattern is a spaCy Doc rather than a plain string.
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [46]:
# Register all pattern Docs under one key; every hit is reported under it.
matcher.add("TerminologyList", phrase_patterns)

In [47]:
# Parse the full passage. This rebinds `doc`, replacing the earlier example sentence.
doc = nlp(text)

In [48]:
# Search the passage for the registered phrases. This rebinds `find_matches`,
# replacing the result of the rule-based example.
find_matches = matcher(doc)

In [49]:
# Raw hits as (match_id, start, end). All share one match id because every
# phrase was added under the same key.
print(find_matches)

[(3766102292120407359, 1, 2), (3766102292120407359, 32, 33), (3766102292120407359, 43, 45), (3766102292120407359, 63, 64)]


In [50]:
# Report each phrase hit: the token slice that matched, plus the key that the
# integer match id resolves to in the vocab's string store.
for match_id, start, end in find_matches:
    span = doc[start:end]                    # matched tokens; `end` is exclusive
    string_id = nlp.vocab.strings[match_id]  # integer id -> registered key
    print(match_id, string_id, start, end, span)

3766102292120407359 TerminologyList 1 2 Barack
3766102292120407359 TerminologyList 32 33 American
3766102292120407359 TerminologyList 43 45 United States
3766102292120407359 TerminologyList 63 64 American
