In [1]:
import spacy



In [2]:
from spacy.matcher import Matcher

In [5]:
# Load the small English pipeline; the matcher is built on its vocabulary.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# A pattern is a list of token specs. This one is a single token that
# looks like an e-mail address.
patterns = [{"LIKE_EMAIL": True}]
matcher.add("EMAIL_ADDRESS", [patterns])

doc = nlp("This is an email address: wmattingly@aol.com")

# Calling the matcher on a Doc yields (match_id, start, end) tuples.
matches = matcher(doc)
matches

[(16571425990740197027, 6, 7)]

In [6]:
# Each match is (match_id, start token, end token). The match_id is a hash;
# looking it up in the vocab gives back the rule name passed to matcher.add.
first_match_id = matches[0][0]
nlp.vocab[first_match_id].text

'EMAIL_ADDRESS'

##  Attributes Taken by Matcher

- ORTH - The exact verbatim text of a token (str)
- TEXT - The exact verbatim text of a token (str)
- LOWER - The lowercase form of the token text (str)
- LENGTH - The length of the token text (int)
- IS_ALPHA
- IS_ASCII
- IS_DIGIT
- IS_LOWER
- IS_UPPER
- IS_TITLE
- IS_PUNCT
- IS_SPACE
- IS_STOP
- IS_SENT_START
- LIKE_NUM
- LIKE_URL
- LIKE_EMAIL
- SPACY
- POS
- TAG
- MORPH
- DEP
- LEMMA
- SHAPE
- ENT_TYPE
- _ - Custom extension attributes (Dict[str, Any])
- OP


## Applied Matcher


In [11]:
# Decode explicitly instead of relying on the platform default encoding:
# the article contains non-ASCII characters (e.g. the en dash in the date
# range).
# NOTE(review): assumes wiki_mlk.txt is stored as UTF-8 - confirm.
with open("wiki_mlk.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Preview the first 100 characters.
text[:100]

'Martin Luther King Jr. (born Michael King Jr.; January 15, 1929 – April 4, 1968) was an American Bap'

## Grabbing all Proper Nouns

In [12]:
# Reuse the `nlp` pipeline loaded earlier in the notebook rather than loading
# the same "en_core_web_sm" model from disk a second time.
matcher = Matcher(nlp.vocab)

# A single token whose coarse part-of-speech tag is PROPN (proper noun).
patterns = [{'POS': 'PROPN'}]
matcher.add("PROPER_NOUNS", [patterns])

doc = nlp(text)
matches = matcher(doc)

# Report the total, then show the first ten matches with their token spans.
print(len(matches))
for match in matches[:10]:
    match_id, start, end = match
    print(match, doc[start:end])

103
(3232560085755078826, 0, 1) Martin
(3232560085755078826, 1, 2) Luther
(3232560085755078826, 2, 3) King
(3232560085755078826, 3, 4) Jr.
(3232560085755078826, 6, 7) Michael
(3232560085755078826, 7, 8) King
(3232560085755078826, 8, 9) Jr.
(3232560085755078826, 10, 11) January
(3232560085755078826, 15, 16) April
(3232560085755078826, 49, 50) King


###  Improving it with Multi-Word Tokens

In [13]:
# Fresh matcher: the '+' operator lets the proper-noun token repeat one or
# more times, so a run of consecutive proper nouns can match as one span.
matcher = Matcher(nlp.vocab)
patterns = [{'POS': 'PROPN', 'OP': '+'}]
matcher.add("PROPER_NOUNS", [patterns])

doc = nlp(text)
matches = matcher(doc)

# With no greedy setting, every sub-span of a run is reported separately.
print(len(matches))
for match in matches[:10]:
    match_id, start, end = match
    print(match, doc[start:end])

176
(3232560085755078826, 0, 1) Martin
(3232560085755078826, 0, 2) Martin Luther
(3232560085755078826, 1, 2) Luther
(3232560085755078826, 0, 3) Martin Luther King
(3232560085755078826, 1, 3) Luther King
(3232560085755078826, 2, 3) King
(3232560085755078826, 0, 4) Martin Luther King Jr.
(3232560085755078826, 1, 4) Luther King Jr.
(3232560085755078826, 2, 4) King Jr.
(3232560085755078826, 3, 4) Jr.


### Greedy Keyword Argument

In [14]:
# Same repeated proper-noun pattern, but greedy='LONGEST' filters overlapping
# matches so that only the longest candidate span is kept.
matcher = Matcher(nlp.vocab)
patterns = [{'POS': 'PROPN', 'OP': '+'}]
matcher.add("PROPER_NOUNS", [patterns], greedy='LONGEST')

doc = nlp(text)
matches = matcher(doc)

# The matches are printed in the order the matcher returns them.
print(len(matches))
for match in matches[:10]:
    match_id, start, end = match
    print(match, doc[start:end])

62
(3232560085755078826, 83, 88) Martin Luther King Sr.
(3232560085755078826, 469, 474) Martin Luther King Jr. Day
(3232560085755078826, 536, 541) Martin Luther King Jr. Memorial
(3232560085755078826, 0, 4) Martin Luther King Jr.
(3232560085755078826, 128, 132) Southern Christian Leadership Conference
(3232560085755078826, 247, 251) Director J. Edgar Hoover
(3232560085755078826, 6, 9) Michael King Jr.
(3232560085755078826, 325, 328) Nobel Peace Prize
(3232560085755078826, 422, 425) James Earl Ray
(3232560085755078826, 463, 466) Congressional Gold Medal


### Sorting by Order of Appearance

In [15]:
# Longest-match proper-noun matcher, as in the previous section.
matcher = Matcher(nlp.vocab)
patterns = [{'POS': 'PROPN', 'OP': '+'}]
matcher.add("PROPER_NOUNS", [patterns], greedy='LONGEST')

doc = nlp(text)

# Order the matches by start token so they follow the text.
matches = sorted(matcher(doc), key=lambda m: m[1])
print(len(matches))
for match in matches[:10]:
    match_id, start, end = match
    print(match, doc[start:end])

62
(3232560085755078826, 0, 4) Martin Luther King Jr.
(3232560085755078826, 6, 9) Michael King Jr.
(3232560085755078826, 10, 11) January
(3232560085755078826, 15, 16) April
(3232560085755078826, 49, 50) King
(3232560085755078826, 69, 71) Mahatma Gandhi
(3232560085755078826, 83, 88) Martin Luther King Sr.
(3232560085755078826, 89, 90) King
(3232560085755078826, 113, 114) King
(3232560085755078826, 117, 118) Montgomery


### Adding in Sequences

In [16]:
# Sequence pattern: one or more proper nouns immediately followed by a verb.
matcher = Matcher(nlp.vocab)
patterns = [
    {'POS': 'PROPN', 'OP': '+'},
    {'POS': 'VERB'},
]
matcher.add("PROPER_NOUNS", [patterns], greedy='LONGEST')

doc = nlp(text)

# Sort by start token so the matches are listed in document order.
matches = sorted(matcher(doc), key=lambda m: m[1])
print(len(matches))
for match in matches[:10]:
    match_id, start, end = match
    print(match, doc[start:end])

8
(3232560085755078826, 49, 51) King advanced
(3232560085755078826, 89, 91) King participated
(3232560085755078826, 113, 115) King led
(3232560085755078826, 167, 169) King helped
(3232560085755078826, 198, 200) SCLC put
(3232560085755078826, 247, 252) Director J. Edgar Hoover considered
(3232560085755078826, 322, 324) King won
(3232560085755078826, 485, 488) United States beginning


## Finding Quotes and Speakers

In [17]:
import json

# Decode the JSON file as UTF-8 explicitly rather than relying on the
# platform's default encoding.
# NOTE(review): assumes alice.json is stored as UTF-8 - confirm.
with open("alice.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [18]:
# data[0][2] is a list of text passages; take the first one and display it.
texts = data[0][2]
text = texts[0]
text

"Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, `and what is the use of a book,' thought Alice `without pictures or conversation?'"

In [19]:
# The source marks opening quotes with a backtick; replace it with an
# apostrophe so both ends of a quotation use the same character.
backtick, apostrophe = "`", "'"
text = data[0][2][0].replace(backtick, apostrophe)
text

"Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'"

In [21]:
# Quotation pattern: an apostrophe, one or more alphabetic tokens, optional
# punctuation, then a closing apostrophe.
# NOTE(review): the rule key is still "PROPER_NOUNS" although this pattern
# matches quotations; it is left unchanged here.
matcher = Matcher(nlp.vocab)
patterns = [
    {'ORTH': "'"},
    {'IS_ALPHA': True, 'OP': '+'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'ORTH': "'"},
]
matcher.add("PROPER_NOUNS", [patterns], greedy='LONGEST')

doc = nlp(text)

# List the matches in document order.
matches = sorted(matcher(doc), key=lambda m: m[1])
print(len(matches))
for match in matches[:10]:
    match_id, start, end = match
    print(match, doc[start:end])

2
(3232560085755078826, 47, 58) 'and what is the use of a book,'
(3232560085755078826, 60, 67) 'without pictures or conversation?'


### Find Speaker

In [23]:
# Verb lemmas that are treated as introducing or attributing speech.
speak_lemmas = ['think', 'say']
text = data[0][2][0].replace("`", "'")

# Pattern shape: 'quote' <speech verb> <speaker> 'quote'
# i.e. a quotation that is interrupted by its attribution.
patterns_1 = [
    # opening quotation
    {'ORTH': "'"},
    {'IS_ALPHA': True, "OP": "+"},
    {'IS_PUNCT': True, "OP": "*"},
    {'ORTH': "'"},
    # attribution: a speech verb followed by one or more proper nouns
    {"POS": "VERB", "LEMMA": {"IN": speak_lemmas}},
    {"POS": "PROPN", "OP": "+"},
    # continuation of the quotation
    {'ORTH': "'"},
    {'IS_ALPHA': True, "OP": "+"},
    {'IS_PUNCT': True, "OP": "*"},
    {'ORTH': "'"},
]

matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUNS", [patterns_1], greedy='LONGEST')

doc = nlp(text)

# List the matches in document order.
matches = sorted(matcher(doc), key=lambda m: m[1])
print(len(matches))
for match in matches[:10]:
    match_id, start, end = match
    print(match, doc[start:end])

1
(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'


In [24]:
# Run the matcher defined in the preceding cell over every text in data[0][2],
# printing the match count and up to ten matches for each one.
for raw_text in data[0][2]:
    # Normalise backtick opening quotes to apostrophes before parsing.
    text = raw_text.replace("`", "'")
    doc = nlp(text)
    matches = sorted(matcher(doc), key=lambda m: m[1])
    print(len(matches))
    for match in matches[:10]:
        match_id, start, end = match
        print(match, doc[start:end])

1
(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


### Adding More Patterns

In [25]:
# Verb lemmas that are treated as introducing or attributing speech.
speak_lemmas = ["think", "say"]

# Building blocks shared by all three patterns, defined once instead of being
# copy-pasted into each pattern.
# A quotation: apostrophe, alphabetic tokens, optional punctuation, apostrophe.
quote = [{'ORTH': "'"}, {'IS_ALPHA': True, "OP": "+"}, {'IS_PUNCT': True, "OP": "*"}, {'ORTH': "'"}]
# A verb whose lemma is one of speak_lemmas.
speech_verb = [{"POS": "VERB", "LEMMA": {"IN": speak_lemmas}}]
# One or more consecutive proper nouns naming the speaker.
speaker = [{"POS": "PROPN", "OP": "+"}]

# 'quote' <verb> <speaker> 'quote'  - attribution in the middle of the quote
pattern1 = quote + speech_verb + speaker + quote
# 'quote' <verb> <speaker>          - attribution after the quote
pattern2 = quote + speech_verb + speaker
# <speaker> <verb> 'quote'          - attribution before the quote
pattern3 = speaker + speech_verb + quote

matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUNS", [pattern1, pattern2, pattern3], greedy='LONGEST')

# Apply all three patterns to every text in data[0][2]. The loop assigns
# `text` itself, so no separate assignment is needed before it.
for text in data[0][2]:
    text = text.replace("`", "'")
    doc = nlp(text)
    matches = matcher(doc)
    matches.sort(key=lambda x: x[1])
    print(len(matches))
    for match in matches[:10]:
        print(match, doc[match[1]:match[2]])

1
(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
0
0
0
0
0
1
(3232560085755078826, 0, 6) 'Well!' thought Alice
0
0
0
0
0
0
0
1
(3232560085755078826, 57, 68) 'which certainly was not here before,' said Alice
0
0
