## Entity Ruler

In [1]:
import spacy

In [16]:
# Load the small English pipeline; the entity loop further down reads the
# `doc.ents` it produces.
nlp = spacy.load("en_core_web_sm")

# Sample headline used as NER input. The leading newline and the line break
# before the comma are part of the string and are kept as-is.
text = (
    "\n"
    "Former Malaysian minister Syed Saddiq gets 7 years' jail in Singapore\n"
    ", US$2.1m fine and caning for corruption\n"
)

In [14]:
# Run the loaded pipeline over the sample text; the next cell reads `doc.ents`.
doc = nlp(text)

In [15]:
# Print each named entity in the document together with its label.
# NOTE(review): the saved output below ("Treblinka", "Poland") does not come
# from the current `text`; the execution counts show these cells were run out
# of order. Re-run the notebook from the top to refresh it.
for entity in doc.ents:
    print(entity.text, entity.label_)

Treblinka GPE
Poland GPE


In [17]:
# Inspect the pipeline: for every component, list what it assigns, what it
# requires, which scores it reports, and any detected problems.
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'parser': [],
  'attribute_ruler': [],
  'lemmatizer': [],
  'ner': []},
 'att

### Define a custom rule to match a phone number

In [18]:
# Import the requisite library
import spacy

# Sample text containing one phone number
text = "This is a sample number (555) 555-5555."

# Start from a blank English pipeline (spacy.blank), not the small trained
# model, so the entity ruler added below is the only source of entities here.
nlp = spacy.blank("en")

# Create the ruler and add it to the pipeline; add_pipe returns the component
ruler = nlp.add_pipe("entity_ruler")

# Token pattern (source: https://spacy.io/usage/rule-based-matching):
# "(" + 3 digits + ")" + 3 digits + optional "-" + 4 digits
phone_number_tokens = [
    {"ORTH": "("},
    {"SHAPE": "ddd"},
    {"ORTH": ")"},
    {"SHAPE": "ddd"},
    {"ORTH": "-", "OP": "?"},
    {"SHAPE": "dddd"},
]
patterns = [{"label": "PHONE_NUMBER", "pattern": phone_number_tokens}]

# Register the patterns with the ruler
ruler.add_patterns(patterns)

# Create the doc and print every entity with its label
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

(555) 555-5555 PHONE_NUMBER


## Matcher

In [19]:
import spacy
from spacy.matcher import Matcher

In [20]:
# Load the small English pipeline; its vocab is shared with the Matcher below.
nlp = spacy.load("en_core_web_sm")
doc = nlp("This is some email address abc@cd.com")

# Single-token pattern: any token flagged as looking like an e-mail address.
pattern = [{"LIKE_EMAIL": True}]
matcher = Matcher(nlp.vocab)
matcher.add("EMAIL_ADDRESS", [pattern])

# Each match is a (match_id, start_token, end_token) tuple.
matches = matcher(doc)
print(matches)

[(16571425990740197027, 5, 6)]


In [21]:
# Resolve the numeric match id of the first match back to the rule name that
# was passed to matcher.add(). The id is a hash stored in the vocab's
# StringStore, so it is looked up there directly instead of via a Lexeme.
nlp.vocab.strings[matches[0][0]]

'EMAIL_ADDRESS'

### Use Matcher to find proper nouns in the Wikipedia summary of Shakespeare

In [24]:
import wikipedia
# Query the English-language Wikipedia.
wikipedia.set_lang("en")

# Fetch the summary of the article; it is the input text for the Matcher
# example in the following cells.
page_title = "William Shakespeare"
text = wikipedia.summary(page_title)

In [25]:
# Display the fetched summary to eyeball its content before matching.
text

'William Shakespeare (bapt.Tooltip baptised 26 April 1564 – 23 April 1616) was an English playwright, poet and actor. He is widely regarded as the greatest writer in the English language and the world\'s pre-eminent dramatist. He is often called England\'s national poet and the "Bard of Avon" (or simply "the Bard"). His extant works, including collaborations, consist of some 39 plays, 154 sonnets, three long narrative poems, and a few other verses, some of uncertain authorship. His plays have been translated into every major living language and are performed more often than those of any other playwright. Shakespeare remains arguably the most influential writer in the English language, and his works continue to be studied and reinterpreted.\nShakespeare was born and raised in Stratford-upon-Avon, Warwickshire. At the age of 18, he married Anne Hathaway, with whom he had three children: Susanna, and twins Hamnet and Judith. Sometime between 1585 and 1592, he began a successful career in 

In [26]:
# Length of the summary in characters.
len(text)

2496

In [30]:
# Fresh matcher so the e-mail rule from the earlier cell is not carried over.
matcher = Matcher(nlp.vocab)

# One or more consecutive proper-noun tokens; "+" lets a name span several tokens.
pattern = [{"POS": "PROPN", "OP": "+"}]
# Greedy matching: with 'LONGEST' the longest candidate is preferred when
# matches overlap (see spaCy's Matcher.add documentation).
matcher.add("PROPER_NOUNS", [pattern], greedy='LONGEST')

doc = nlp(text)
# Order the matches by start token so they read in document order.
matches = sorted(matcher(doc), key=lambda m: m[1])

print(len(matches))
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

42
(3232560085755078826, 0, 2) William Shakespeare
(3232560085755078826, 5, 6) Tooltip
(3232560085755078826, 8, 9) April
(3232560085755078826, 12, 13) April
(3232560085755078826, 49, 50) England
(3232560085755078826, 56, 57) Bard
(3232560085755078826, 58, 59) Avon
(3232560085755078826, 65, 66) Bard
(3232560085755078826, 123, 124) Shakespeare
(3232560085755078826, 146, 147) Shakespeare


### Finding Quotes and Speakers

In [35]:
import requests
import re

# URL for Alice in Wonderland text from Project Gutenberg
url = 'https://www.gutenberg.org/files/11/11-0.txt'

# Send a GET request to the URL. A timeout is passed so that a stalled
# connection raises an exception instead of hanging the kernel indefinitely.
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    alice_text = response.text

    # Replace all newline characters and carriage returns with spaces
    alice_text = alice_text.replace('\n', ' ').replace('\r', ' ')

    # Keep only ASCII letters, digits, whitespace and the listed punctuation.
    # NOTE: anything else is dropped, including typographic (curly) quotes and
    # apostrophes -- the snippet printed below reads "Alices Adventures".
    alice_text_cleaned = re.sub(r'[^a-zA-Z0-9\s.,!?;:\'\"-]', '', alice_text)

    print(alice_text_cleaned[:1000])  # Print the first 1000 characters as a snippet
else:
    # NOTE: `alice_text_cleaned` stays undefined on this path, so the cells
    # that use it will fail until the download succeeds.
    print("Failed to retrieve the text.")

The Project Gutenberg eBook of Alices Adventures in Wonderland, by Lewis Carroll    This eBook is for the use of anyone anywhere in the United States and  most other parts of the world at no cost and with almost no restrictions  whatsoever. You may copy it, give it away or re-use it under the terms  of the Project Gutenberg License included with this eBook or online at  www.gutenberg.org. If you are not located in the United States, you  will have to check the laws of the country where you are located before  using this eBook.    Title: Alices Adventures in Wonderland    Author: Lewis Carroll    Release Date: January, 1991 eBook 11  Most recently updated: October 12, 2020    Language: English    Character set encoding: UTF-8    Produced by: Arthur DiBianca and David Widger     START OF THE PROJECT GUTENBERG EBOOK ALICES ADVENTURES IN WONDERLAND     Illustration          Alices Adventures in Wonderland    by Lewis Carroll    THE MILLENNIUM FULCRUM EDITION 3.0    Contents     CHAPTER I. 

In [39]:
# Try to match quoted speech: a straight single quote, one or more alphabetic
# tokens, optional punctuation, then a closing straight single quote.
# NOTE(review): this finds 0 matches on `alice_text_cleaned`; the cleaning
# regex presumably removed the book's quotation marks (the printed snippet
# shows "Alices" with its apostrophe stripped) -- confirm against the raw text.
matcher = Matcher(nlp.vocab)
pattern = [
    {'ORTH': "'"},
    {'IS_ALPHA': True, "OP": "+"},
    {'IS_PUNCT': True, "OP": "*"},
    {'ORTH': "'"},
]
matcher.add("PROPER_NOUNS", [pattern], greedy='LONGEST')

doc = nlp(alice_text_cleaned)
# Order the matches by start token so they read in document order.
matches = sorted(matcher(doc), key=lambda m: m[1])

print(len(matches))
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

0


In [37]:
# Display the downloaded text (line breaks already replaced by spaces).
# NOTE(review): this echoes the entire book into the cell output; the output
# is empty in the saved notebook. Consider slicing if only a sample is needed.
alice_text



In [40]:
# Length of the cleaned text in characters.
len(alice_text_cleaned)

163858

In [42]:
# Save the cleaned text next to the notebook. The encoding is given explicitly
# so the file is written the same way on every platform; the cleaning regex
# keeps `\s`, which can include non-ASCII whitespace that some default
# locale encodings cannot represent.
with open('alice.txt', 'w', encoding='utf-8') as f:
    f.write(alice_text_cleaned)

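Our cleaned text is different: the cleaning step above kept only ASCII letters, digits, whitespace and basic punctuation, so the quotation marks appear to have been stripped and the quote pattern above finds no matches. Instead, we look for a speaking verb followed by a proper noun.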

In [44]:
# Lemmas of the reporting verbs to search for. `speak_lemmas` was not defined
# in any earlier cell, so a fresh kernel raised NameError here.
# NOTE(review): values inferred from the saved output ("thought Alice",
# "said Alice") -- confirm against the original, missing cell.
speak_lemmas = ["think", "say"]

# Match a speaking verb followed by one or more proper-noun tokens,
# e.g. "said Alice".
matcher = Matcher(nlp.vocab)
pattern1 = [{"POS": "VERB", "LEMMA": {"IN": speak_lemmas}}, {"POS": "PROPN", "OP": "+"}]
# The rule name is left as "PROPER_NOUNS" so the match ids stay the same as in
# the saved output.
matcher.add("PROPER_NOUNS", [pattern1], greedy='LONGEST')

doc = nlp(alice_text_cleaned)
matches = matcher(doc)
# Order the matches by start token so they read in document order.
matches.sort(key=lambda x: x[1])

print(len(matches))
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

139
(3232560085755078826, 369, 371) thought Alice
(3232560085755078826, 862, 864) thought Alice
(3232560085755078826, 2074, 2076) said Alice
(3232560085755078826, 2337, 2339) said Alice
(3232560085755078826, 2433, 2435) said Alice
(3232560085755078826, 2627, 2629) said Alice
(3232560085755078826, 3130, 3132) thought Alice
(3232560085755078826, 3343, 3345) said Alice
(3232560085755078826, 4361, 4363) said Alice
(3232560085755078826, 4630, 4632) said Alice


Get the whole sentence in which each match was found.

In [45]:
# For the first ten matches, print the match id followed by the full sentence
# that contains the matched span (`Span.sent` relies on the sentence
# boundaries set by the pipeline).
for match_id, start, end in matches[:10]:
    print(match_id, doc[start:end].sent)

3232560085755078826 the Rabbit-Hole      Alice was beginning to get very tired of sitting by her sister on the  bank, and of having nothing to do: once or twice she had peeped into  the book her sister was reading, but it had no pictures or  conversations in it, and what is the use of a book, thought Alice  without pictures or conversations?    
3232560085755078826 thought Alice to herself, after such a fall as this, I shall  think nothing of tumbling down stairs!
3232560085755078826 There seemed to be no use in waiting by the little door, so she went  back to the table, half hoping she might find another key on it, or at  any rate a book of rules for shutting people up like telescopes: this  time she found a little bottle on it, which certainly was not here  before, said Alice, and round the neck of the bottle was a paper  label, with the words DRINK ME, beautifully printed on it in large  letters.    
3232560085755078826 said Alice; I must be shutting up like a  telescope.    
323256