In [227]:
import spacy

## Spacy Entity Ruler

In [228]:
nlp = spacy.load("en_core_web_sm")

In [229]:
text = "West Chestertenfieldville was referenced in Mr. Deeds."

In [230]:
doc = nlp(text)

In [231]:
for ent in doc.ents:
    print(ent.text,ent.label_)

West Chestertenfieldville LOC
Deeds PERSON


In [232]:
ruler = nlp.add_pipe("entity_ruler")

In [233]:
nlp.analyze_pipes() #with added entity ruler

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [234]:
#add patterns
#list of dicts

In [235]:
patterns = [
    {"label": "GPE","pattern":"West Chestertenfieldville"}
]

In [236]:
ruler.add_patterns(patterns)

In [237]:
doc2 = nlp(text)
for ent in doc2.ents:
    print(ent.text,ent.label_)

West Chestertenfieldville LOC
Deeds PERSON


In [238]:
#nothing changes bcz entity ruler comes after ner

In [239]:
nlp2 = spacy.load("en_core_web_sm")

In [240]:
ruler = nlp2.add_pipe("entity_ruler",before = "ner")

In [241]:
ruler.add_patterns(patterns)

In [242]:
doc = nlp2(text)

In [243]:
for ent in doc.ents:
    print(ent.text,ent.label_)

West Chestertenfieldville GPE
Deeds PERSON


In [244]:
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [245]:
#Now entity ruler comes before ner

In [246]:
nlp3 = spacy.load("en_core_web_sm")

In [247]:
ruler = nlp3.add_pipe("entity_ruler",before = "ner")

In [248]:
patterns = [
    {"label": "GPE","pattern":"West Chestertenfieldville"},
    {"label": "FILM","pattern": "Mr. Deeds"}
]

In [249]:
ruler.add_patterns(patterns)

In [250]:
doc = nlp3(text)

In [251]:
for ent in doc.ents:
    print(ent.text,ent.label_)

West Chestertenfieldville GPE
Mr. Deeds FILM


## Spacy Matcher

In [252]:
from spacy.matcher import Matcher

In [253]:
nlp = spacy.load("en_core_web_sm")

In [254]:
matcher = Matcher(nlp.vocab)
pattern = [{"LIKE_EMAIL":True}]
matcher.add("EMAIL_ADDRESS",[pattern]) #email address is the LABEL TO ADD

In [255]:
doc = nlp("This is an email address: wmattingly@aol.com")

In [256]:
matches = matcher( doc )

In [257]:
print(matches)

[(16571425990740197027, 6, 7)]


In [258]:
#1st is a Lexeme, 2nd is Start token ,3rd is End token

In [259]:
print(nlp.vocab[matches[0][0]].text)

EMAIL_ADDRESS


In [260]:
with open("wiki_mlk.txt","r") as f:
    text = f.read()

In [261]:
print(text)

Martin Luther King Jr. (born Michael King Jr.; January 15, 1929 â€“ April 4, 1968) was an American Baptist minister and activist who became the most visible spokesman and leader in the American civil rights movement from 1955 until his assassination in 1968. King advanced civil rights through nonviolence and civil disobedience, inspired by his Christian beliefs and the nonviolent activism of Mahatma Gandhi. He was the son of early civil rights activist and minister Martin Luther King Sr.

King participated in and led marches for blacks' right to vote, desegregation, labor rights, and other basic civil rights.[1] King led the 1955 Montgomery bus boycott and later became the first president of the Southern Christian Leadership Conference (SCLC). As president of the SCLC, he led the unsuccessful Albany Movement in Albany, Georgia, and helped organize some of the nonviolent 1963 protests in Birmingham, Alabama. King helped organize the 1963 March on Washington, where he delivered his famou

In [262]:
#Task e.g = extract all proper nouns

In [263]:
nlp = spacy.load("en_core_web_sm")

In [264]:
matcher = Matcher(nlp.vocab)

In [265]:
pattern = [{"POS": "PROPN"}]
matcher.add("PROPER_NOUN",[pattern])

In [266]:
doc = nlp(text)
matches = matcher(doc)
print (len(matches))

103


In [267]:
for match in matches [:10]:
    print (match,doc[match[1]:match[2]])

(451313080118390996, 0, 1) Martin
(451313080118390996, 1, 2) Luther
(451313080118390996, 2, 3) King
(451313080118390996, 3, 4) Jr.
(451313080118390996, 6, 7) Michael
(451313080118390996, 7, 8) King
(451313080118390996, 8, 9) Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 16, 17) April
(451313080118390996, 50, 51) King


In [268]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN","OP":"+"}]
matcher.add("PROPER_NOUN",[pattern])
doc = nlp(text)
matches = matcher(doc)
print (len(matches))
for match in matches [:10]:
    print (match,doc[match[1]:match[2]])

176
(451313080118390996, 0, 1) Martin
(451313080118390996, 0, 2) Martin Luther
(451313080118390996, 1, 2) Luther
(451313080118390996, 0, 3) Martin Luther King
(451313080118390996, 1, 3) Luther King
(451313080118390996, 2, 3) King
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 1, 4) Luther King Jr.
(451313080118390996, 2, 4) King Jr.
(451313080118390996, 3, 4) Jr.


In [269]:
#Here the problem is that it grabs all proper nouns as asked,but also any combination between them

In [270]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN","OP":"+"}]
matcher.add("PROPER_NOUN",[pattern],greedy = "LONGEST") #greedy = "LONGEST" grabs the highest token in combinations
doc = nlp(text)
matches = matcher(doc)
print (len(matches))
for match in matches [:10]:
    print (match,doc[match[1]:match[2]])

62
(451313080118390996, 84, 89) Martin Luther King Sr.
(451313080118390996, 470, 475) Martin Luther King Jr. Day
(451313080118390996, 537, 542) Martin Luther King Jr. Memorial
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 129, 133) Southern Christian Leadership Conference
(451313080118390996, 248, 252) Director J. Edgar Hoover
(451313080118390996, 6, 9) Michael King Jr.
(451313080118390996, 326, 329) Nobel Peace Prize
(451313080118390996, 423, 426) James Earl Ray
(451313080118390996, 464, 467) Congressional Gold Medal


In [271]:
#now the problem is that its all out of order,it goes from largest to smallest

In [272]:
#But we can sort them

In [273]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN","OP":"+"}]
matcher.add("PROPER_NOUN",[pattern],greedy = "LONGEST") #greedy = "LONGEST" grabs the highest token in combinations
doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x:x[1]) #x[1] is the Start TOKEN
print (len(matches))
for match in matches [:10]:
    print (match,doc[match[1]:match[2]])

62
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 6, 9) Michael King Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 16, 17) April
(451313080118390996, 50, 51) King
(451313080118390996, 70, 72) Mahatma Gandhi
(451313080118390996, 84, 89) Martin Luther King Sr.
(451313080118390996, 90, 91) King
(451313080118390996, 114, 115) King
(451313080118390996, 118, 119) Montgomery


In [274]:
#If we want to find what nouns are followed by a verb

In [275]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN","OP":"+"},{"POS":"VERB"}] #sequence with propernoun + verb
matcher.add("PROPER_NOUN",[pattern],greedy = "LONGEST") #greedy = "LONGEST" grabs the highest token in combinations
doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x:x[1]) #x[1] is the Start TOKEN
print (len(matches))
for match in matches [:10]:
    print (match,doc[match[1]:match[2]])

8
(451313080118390996, 50, 52) King advanced
(451313080118390996, 90, 92) King participated
(451313080118390996, 114, 116) King led
(451313080118390996, 168, 170) King helped
(451313080118390996, 199, 201) SCLC put
(451313080118390996, 248, 253) Director J. Edgar Hoover considered
(451313080118390996, 323, 325) King won
(451313080118390996, 486, 489) United States beginning


In [276]:
import json
with open("alice.json","r") as f :
    data = json.load(f)

In [277]:

text = data [0][2][0]
print(text)

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, `and what is the use of a book,' thought Alice `without pictures or conversation?'


In [278]:
#Lets say we want to know what person was the propernoun before the " "

In [279]:
text = text.replace ("`","'")

In [280]:
print(text)

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'


In [281]:
#capture all quotation marks in text

In [282]:
matcher = Matcher(nlp.vocab)
pattern = [
    {"ORTH":"'"},
    {"IS_ALPHA":True,"OP":"+"}, #it must be true 1 or more times
    {"IS_PUNCT": True,"OP":"*"}, #it can be true or not true
    {"ORTH":"'"}      
]
matcher.add("PROPER_NOUN",[pattern],greedy = "LONGEST") #greedy = "LONGEST" grabs the highest token in combinations
doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x:x[1])
print (len(matches))
for match in matches [:10]:
    print (match,doc[match[1]:match[2]])

2
(451313080118390996, 47, 58) 'and what is the use of a book,'
(451313080118390996, 60, 67) 'without pictures or conversation?'


In [283]:
speak_lemmas = ["think","say"] #to actually get who is talking
matcher = Matcher(nlp.vocab)
pattern = [
    {"ORTH":"'"},
    {"IS_ALPHA":True,"OP":"+"}, #it must be true 1 or more times
    {"IS_PUNCT": True,"OP":"*"}, #it can be true or not true
    {"ORTH":"'"},
    {"POS":"VERB","LEMMA":{"IN":speak_lemmas}}, #next token is going to be a verb inside speak_lemmas
    {"POS":"PROPN","OP":"+"}, #next is going to be a proper noun,with 1or more tokens
    {"ORTH":"'"},
    {"IS_ALPHA":True,"OP":"+"}, 
    {"IS_PUNCT": True,"OP":"*"}, 
    {"ORTH":"'"}
]

matcher.add("PROPER_NOUN",[pattern],greedy = "LONGEST") #greedy = "LONGEST" grabs the highest token in combinations
doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x:x[1])
print (len(matches))
for match in matches [:10]:
    print (match,doc[match[1]:match[2]])

1
(451313080118390996, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'


In [284]:
for text in data[0][2]:
    text = text.replace ("`","'")
    doc = nlp(text)
    matches = matcher(doc)
    print (len(matches))
    matches.sort(key = lambda x:x[1])
    for match in matches [:10]:
        print (match,doc[match[1]:match[2]])

1
(451313080118390996, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [285]:
#We only grabbed one match,because we only made only one pattern

In [286]:
matcher = Matcher(nlp.vocab)
pattern1 = [{"ORTH":"'"},{'IS_ALPHA':True,"OP":"+"},
            {"IS_PUNCT":True,"OP":"*"},{"ORTH":"'"},
            {"POS":"VERB","LEMMA":{"IN":speak_lemmas}},
            {"POS": "PROPN", "OP": "+"},
            {'ORTH': "'"},{'IS_ALPHA': True, "OP": "+"}, 
            {'IS_PUNCT': True, "OP": "*"},{'ORTH': "'"}
           ]
pattern2 = [{"ORTH":"'"},{'IS_ALPHA':True,"OP":"+"},
            {"IS_PUNCT":True,"OP":"*"},{"ORTH":"'"},
            {"POS":"VERB","LEMMA":{"IN":speak_lemmas}},
            {"POS": "PROPN", "OP": "+"}]
pattern3 = [{"POS": "PROPN", "OP": "+"},{"POS": "VERB", "LEMMA": {"IN": speak_lemmas}},
            {'ORTH': "'"}, {'IS_ALPHA': True, "OP": "+"},
            {'IS_PUNCT': True, "OP": "*"}, {'ORTH': "'"}]
matcher.add("PROPER_NOUNS", [pattern1, pattern2, pattern3], greedy='LONGEST')
for text in data[0][2]:
    text = text.replace("`", "'")
    doc = nlp(text)
    matches = matcher(doc)
    matches.sort(key = lambda x: x[1])
    print (len(matches))
    for match in matches[:10]:
        print (match, doc[match[1]:match[2]])

1
(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
0
0
0
0
0
1
(3232560085755078826, 0, 6) 'Well!' thought Alice
0
0
0
0
0
0
0
1
(3232560085755078826, 57, 68) 'which certainly was not here before,' said Alice
0
0


## Custom components

In [287]:
from spacy.language import Language

In [288]:
nlp = spacy.load("en_core_web_sm")

In [289]:
doc = nlp("Britain is a place.Mary is a doctor")

In [290]:
for ent in doc.ents:
    print(ent.text,ent.label_)

Britain GPE


In [291]:
@Language.component("remove_gpe")
def remove_gpe(doc):
    original_ents = list(doc.ents)
    for ent in doc.ents:
        if ent.label_ == "GPE":
            original_ents.remove(ent)
    doc.ents = original_ents
    return(doc)

In [292]:
nlp.add_pipe("remove_gpe")

<function __main__.remove_gpe(doc)>

In [293]:
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'remove_gpe': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  

In [294]:
doc = nlp("Britain is a place.Mary is a doctor")
for ent in doc.ents:
    print(ent.text,ent.label_)