In [2]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy


In [3]:
nlp=spacy.load('en_core_web_sm')

In [4]:
doc=nlp('Hello world!')

In [5]:
for token in doc:
    print(token)

Hello
world
!


In [6]:
# token absed matching
pattern=[{"LOWER":"hello",'OP':'?'},{"IS_PUNCT":True, 'OP':'?'},{"LOWER":"world"}]

In [7]:
matcher=Matcher(nlp.vocab)
matcher.add('HelloWorld',None,pattern)

In [8]:
doc = nlp("Hello, world!")

In [9]:
matches=matcher(doc)

In [10]:
matches

[(15578876784678163569, 0, 3),
 (15578876784678163569, 1, 3),
 (15578876784678163569, 2, 3)]

In [11]:
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id,string_id,start,end,span.text)

15578876784678163569 HelloWorld 0 3 Hello, world
15578876784678163569 HelloWorld 1 3 , world
15578876784678163569 HelloWorld 2 3 world


In [12]:
text="my mobile number is 1234. I think its is wrong as it doesn't follow the rules. So the correct one is 9003469542"

In [13]:
import re

In [14]:
re.search(r'\d{10}',text)

<re.Match object; span=(101, 111), match='9003469542'>

In [15]:
# wildcard text
re.findall(r'm.....',text)

['my mob', 'mber i']

In [16]:
# Exclusion 

In [17]:
re.findall(r'[^\W]+',text)

['my',
 'mobile',
 'number',
 'is',
 '1234',
 'I',
 'think',
 'its',
 'is',
 'wrong',
 'as',
 'it',
 'doesn',
 't',
 'follow',
 'the',
 'rules',
 'So',
 'the',
 'correct',
 'one',
 'is',
 '9003469542']

In [18]:
text="Google announced new pixxel in Google I/O. Google I/O is a great place to get updates from Google."

In [19]:
#  #of times google appeared

In [20]:
pattern = [{'TEXT':'Google'},{'TEXT':'I'},{'TEXT':'/'},{'TEXT':'O'}]

In [21]:
def call_back(matcher,doc,i,matches):
    match_id, start, end = matches[i]
    entity = doc[start:end]
    print(entity.text)

In [22]:
matcher = Matcher(nlp.vocab)
matcher.add('Google', call_back, pattern)

In [23]:
doc=nlp(text)

In [24]:
matcher(doc)

Google I/O


[(11578853341595296054, 9, 13)]

In [25]:
# finding matched sentences
matched_sent = []

In [26]:
pattern= [{"LOWER":"facebook"},{"LEMMA":"be"},{"POS":"ADV","OP":"*"},{"POS":"ADJ"}]

In [27]:
def call_back_method_fb(matcher,doc,i,matches):
    matched_id, start, end =matches[i]
    span=doc[start:end]
    sent=span.sent
    
    match_ents=[{
        'start':span.start_char-sent.start_char,
        'end': span.end_char-sent.start_char,
        'label':'MATCH'
    }]
    matched_sent.append({'text':sent.text, 'ents':match_ents})

In [28]:
matcher.add("fb",call_back_method_fb,pattern)

In [29]:
doc=nlp("I'd say that facebook is evil.- Facebook is pretty cool, right?")

In [30]:
matches=matcher(doc)

In [31]:
matches

[(8017838677478259815, 7, 11)]

In [32]:
matched_sent

[{'text': 'evil.- Facebook is pretty cool, right?',
  'ents': [{'start': 7, 'end': 30, 'label': 'MATCH'}]}]

In [33]:
displacy.render(matched_sent,style='ent',manual=True)

## Extracting mobile numbers from text

In [None]:
# match pattern has to look out for number sequencs of certain length, surrounded by sepcific punctuation

In [43]:
pattern = [{"ORTH":"("},{"SHAPE":"ddd"},{"ORTH":")"},{"SHAPE":"dddd"},{"ORTH":"-","OP":"?"},{"SHAPE":"dddd"}]

In [44]:
matcher = Matcher(nlp.vocab)
matcher.add("PhoneNumber",None,pattern)

In [45]:
doc=nlp("call me at (123) 4560 7890")

In [46]:
print([t.text for t in doc])

['call', 'me', 'at', '(', '123', ')', '4560', '7890']


In [47]:
matches=matcher(doc)
matches

[(7978097794922043545, 3, 8)]

In [48]:
for match_id , start, end in matches:
    span = doc[start:end]
    print(span.text)

(123) 4560 7890


## Email Address matching

In [50]:
pattern = [{"TEXT":{"REGEX":"[a-zA-z0-9-_.]+@[a-zA-Z0-9-_.]+"}}]

In [52]:
matcher = Matcher(nlp.vocab)
matcher.add("Email",None,pattern)

In [53]:
text="Email to me  at kingis__King@gmai_l.com"

In [54]:
doc = nlp(text)

In [55]:
matches = matcher(doc)
matches

[(11010771136823990775, 5, 6)]

In [56]:
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

kingis__King@gmai_l.com


## Efficient Phrase Matching

In [None]:
# to match large terminology lists, as we can do using PhraseMatcher and create doc objects instead of token patterns,
# Doc patterns contain single or multiple tokens

In [58]:
from spacy.matcher import PhraseMatcher

In [59]:
matcher = PhraseMatcher(nlp.vocab)

In [60]:
terms = ['Barac Obama','ANGELA MERKEL','WASHINGTON D.C.']

In [61]:
pattern = [nlp.make_doc(text) for text in terms]

In [62]:
pattern

[Barac Obama, ANGELA MERKEL, WASHINGTON D.C.]

In [63]:
matcher.add('term',None, *pattern)

In [67]:
doc = nlp("German Chancellor Angela MERKEL and president Barac Obama had a stupednous conversation in WASHINGTON D.C.")

In [68]:
doc

German Chancellor Angela MERKEL and president Barac Obama had a stupednous conversation in WASHINGTON D.C.

In [70]:
matches= matcher(doc)

In [71]:
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

Barac Obama
WASHINGTON D.C.


In [72]:
matches

[(4519742297340331040, 6, 8), (4519742297340331040, 13, 15)]

## Custom Rule Based Entity Recognition

In [None]:
#Entity Ruler is a pipeline component that's typically added viap nlp.addIpipe when nlp obejct is called ona text.
# it will find amtches in doc and add them as entities to the  doc.ents 
# using the specified pattern albel asthe entity label.

In [73]:
from spacy.pipeline import EntityRuler

In [74]:
nlp = spacy.load('en_core_web_sm')

In [75]:
ruler = EntityRuler(nlp)

In [76]:
patterns = [{'label':'ORG','pattern':'KGP Talkie'},{'label':'GPE','pattern':[{'LOWER':'san'},{'LOWER':'francisco'}]}]

In [77]:
patterns

[{'label': 'ORG', 'pattern': 'KGP Talkie'},
 {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]

In [78]:
ruler.add_patterns(patterns)

In [79]:
nlp.add_pipe(ruler)

In [80]:
doc = nlp("KGP Talkie is open its first big office in san francisco")

In [81]:
doc

KGP Talkie is open its first big office in san francisco

In [82]:
for ent in doc.ents:
    print(ent.text,ent.label_)

KGP Talkie PERSON
first ORDINAL
san francisco GPE
