In [87]:
import spacy
import re
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
from spacy.matcher import PhraseMatcher
from spacy.pipeline import EntityRuler

### **Using linguistic annotations**

In [20]:
# Token-pattern Matcher that shares the loaded pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [21]:
# Accumulator filled by callback_method: one dict per match.
matched_sents = []

In [22]:
# "facebook" (any casing) + a form of "be" + zero or more adverbs + one adjective.
pattern = [{'LOWER': 'facebook'}, {'LEMMA': 'be'}, {"POS": 'ADV', 'OP': '*'}, {'POS': 'ADJ'}]

In [23]:
def callback_method(matcher, doc, i, matches):
  """Record the sentence around match ``i`` in the global ``matched_sents``.

  Signature follows the on_match callback convention used by ``matcher.add``.

  Args:
    matcher: the matcher that fired (unused here).
    doc: the document being matched; sliced with the match's token bounds.
    i: index of the current match inside ``matches``.
    matches: list of ``(match_id, start, end)`` tuples.

  Side effects:
    Appends ``{'text': <sentence text>, 'ents': [<one MATCH span>]}`` to
    ``matched_sents``. The span offsets are character positions relative to
    the start of the sentence, not to the whole document.
  """
  _, tok_start, tok_end = matches[i]
  hit = doc[tok_start:tok_end]
  sentence = hit.sent

  # Shift document-level character offsets so they index into sentence.text.
  offset = sentence.start_char
  highlight = {
      'start': hit.start_char - offset,
      'end': hit.end_char - offset,
      'label': 'MATCH',
  }
  matched_sents.append({'text': sentence.text, 'ents': [highlight]})

In [24]:
# Register the pattern under the key 'fb'; callback_method runs on every hit.
# Patterns go in as a list and the callback as a keyword: this form is accepted
# by spaCy >= 2.2.2 and is the only one spaCy v3 accepts (the old positional
# on_match argument was removed there).
matcher.add('fb', [pattern], on_match=callback_method)

In [25]:
# Two sentences, each with a "Facebook is ..." phrase for the pattern to hit.
doc = nlp("I'd say that Facebook is evil. - Facebook is pretty cool, right?")

In [26]:
# callback_method appends to matched_sents on every hit, so empty it first;
# otherwise re-running this cell would duplicate the collected sentences.
matched_sents.clear()
matches = matcher(doc)

In [27]:
# Raw matcher output: (match_id, start_token, end_token) tuples.
matches

[(8017838677478259815, 4, 7), (8017838677478259815, 9, 13)]

In [28]:
# Sentences gathered by the callback, with match offsets relative to each sentence.
matched_sents

[{'ents': [{'end': 29, 'label': 'MATCH', 'start': 13}],
  'text': "I'd say that Facebook is evil."},
 {'ents': [{'end': 23, 'label': 'MATCH', 'start': 0}],
  'text': 'Facebook is pretty cool, right?'}]

In [29]:
# Highlight each match inside its sentence; manual=True because we pass
# pre-built dicts ('text' + 'ents') rather than Doc objects.
displacy.render(matched_sents, style='ent', manual=True)

'<div class="entities" style="line-height: 2.5; direction: ltr">I\'d say that \n<mark class="entity" style="background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Facebook is evil\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">MATCH</span>\n</mark>\n.</div>\n\n<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Facebook is pretty cool\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">MATCH</span>\n</mark>\n, right?</div>'

### **Phone Numbers**

In [30]:
# "(ddd) dddd[-]dddd": a bracketed 3-digit group, then two 4-digit groups
# with an optional '-' token between them.
pattern = [{'ORTH': '('}, {'SHAPE': 'ddd'}, {'ORTH': ')'}, {'SHAPE': 'dddd'}, {'ORTH': '-', 'OP': '?'}, {'SHAPE': 'dddd'}]

In [31]:
# New Matcher instance, so the 'fb' rule registered on the previous one is not carried over.
matcher = Matcher(nlp.vocab)

In [32]:
# No callback needed here: pass the patterns as a list (the signature accepted
# by spaCy >= 2.2.2 and required by v3) instead of the legacy positional None.
matcher.add('Phonenumber', [pattern])

In [36]:
# Sample with no hyphen between the two 4-digit groups (exercises the optional '-').
doc = nlp('Call me at (123) 4562 7893')

In [37]:
# Inspect tokenization: the pattern relies on '(', '123', ')' being separate tokens.
print([t.text for t in doc])

['Call', 'me', 'at', '(', '123', ')', '4562', '7893']


In [38]:
# Run the phone-number rule and show the (match_id, start, end) tuples.
matches = matcher(doc)
matches

[(2874678971812469239, 3, 8)]

In [41]:
# Turn each token range back into the matched text.
for match_id, start, end in matches:
  span = doc[start:end]  # start/end are token indices, end exclusive
  print(span.text)

(123) 4562 7893


### **Email address matching**

In [42]:
# Single-token pattern: the token text is tested against a loose local@domain
# regex (letters, digits, '-', '_' and '.' allowed on both sides of '@').
# In this character class the '-' after '0-9' is a literal hyphen, not a range.
pattern = [{'TEXT': {'REGEX': '[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+'}}]

In [43]:
# New Matcher instance that will hold only the email rule.
matcher = Matcher(nlp.vocab)

In [44]:
# Patterns passed as a list (accepted by spaCy >= 2.2.2, required by v3)
# rather than via the legacy positional on_match=None slot.
matcher.add('email', [pattern])

In [47]:
# Two addresses, one of them with a dot in the local part.
doc = nlp('Email me at test@gmail.com and talk.me@gmail.com')

In [48]:
# Apply the email rule to the document.
matches = matcher(doc)

In [49]:
# Each hit spans a single token, since the pattern has only one token spec.
matches

[(7320900731437023467, 3, 4), (7320900731437023467, 5, 6)]

In [50]:
# Print the text of every matched email token.
for match_id, start, end in matches:
  span = doc[start:end]  # token slice covering the match
  print(span.text)

test@gmail.com
talk.me@gmail.com


### **Hashtags and emoji detection on social media**

In [52]:
# Emoji used as sentiment signals; each list is turned into Matcher patterns below.
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]  # Positive emoji
neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"]  # Negative emoji
pos_emoji

['😀', '😃', '😂', '🤣', '😊', '😍']

In [53]:
# One single-token pattern per emoji: the token's exact text must equal the emoji.
pos_patterns = [
    [{'ORTH': symbol}]
    for symbol in pos_emoji
]
neg_patterns = [
    [{'ORTH': symbol}]
    for symbol in neg_emoji
]

In [54]:
# Inspect the generated positive patterns (a list of one-token patterns).
pos_patterns

[[{'ORTH': '😀'}],
 [{'ORTH': '😃'}],
 [{'ORTH': '😂'}],
 [{'ORTH': '🤣'}],
 [{'ORTH': '😊'}],
 [{'ORTH': '😍'}]]

In [55]:
# Inspect the generated negative patterns (a list of one-token patterns).
neg_patterns

[[{'ORTH': '😞'}],
 [{'ORTH': '😠'}],
 [{'ORTH': '😩'}],
 [{'ORTH': '😢'}],
 [{'ORTH': '😭'}],
 [{'ORTH': '😒'}]]

In [56]:
def label_sentiment(matcher, doc, i, matches):
  """Nudge ``doc.sentiment`` according to which emoji rule produced match ``i``.

  Args:
    matcher: the matcher that fired (unused here).
    doc: document whose ``sentiment`` attribute is adjusted in place.
    i: index of the current match inside ``matches``.
    matches: list of ``(match_id, start, end)`` tuples.

  A 'HAPPY' match adds 0.1, a 'SAD' match subtracts 0.1, and any other rule
  name leaves the sentiment untouched.
  """
  rule_hash, _, _ = matches[i]
  # match_id is a hash; the vocab's string store maps it back to the rule name.
  rule_name = doc.vocab.strings[rule_hash]
  shift = {'HAPPY': 0.1, 'SAD': -0.1}.get(rule_name)
  if shift is not None:
    doc.sentiment += shift

In [58]:
# New Matcher instance for the emoji and hashtag rules.
matcher = Matcher(nlp.vocab)

In [60]:
# Both rules share label_sentiment, which tells them apart by rule name.
# The pattern lists are passed directly and the callback by keyword: this is the
# signature accepted by spaCy >= 2.2.2 and the only one spaCy v3 accepts.
matcher.add('HAPPY', pos_patterns, on_match=label_sentiment)
matcher.add('SAD', neg_patterns, on_match=label_sentiment)

In [61]:
# Hashtag = a '#' token followed by one ASCII-only token; no callback needed.
# Patterns are wrapped in a list (signature accepted by spaCy >= 2.2.2, required by v3).
matcher.add('HASHTAG', [[{'TEXT': '#'}, {'IS_ASCII': True}]])

In [69]:
# Sample post containing one positive emoji and one hashtag.
doc = nlp("Hello world 😀 #Krish")

In [70]:
# Running the matcher also triggers label_sentiment for every HAPPY/SAD hit.
matches = matcher(doc)

In [71]:
# Show which rule fired and the text it covered.
for match_id, start, end in matches:
  string_id = doc.vocab.strings[match_id]  # hash -> rule name
  span = doc[start:end]
  print(string_id, span.text)

HAPPY 😀
HASHTAG #Krish


### **Efficient phrase matching**

In [75]:
# PhraseMatcher is given Doc objects as patterns (built a few cells below)
# rather than lists of token-attribute dicts.
matcher = PhraseMatcher(nlp.vocab)

In [76]:
# Phrases to search for, written in the same upper-case form as the sample text.
terms = ['BARACK OBAMA', 'ANGELA MERKEL', "WASHINGTON D.C."]

In [77]:
# Convert every term into a Doc via nlp.make_doc so it can serve as a phrase pattern.
pattern = list(map(nlp.make_doc, terms))

In [78]:
# The patterns are Doc objects, so they display as their text.
pattern

[BARACK OBAMA, ANGELA MERKEL, WASHINGTON D.C.]

In [80]:
# Register all term Docs under one key. The list is passed directly (signature
# accepted by spaCy >= 2.2.2 and required by v3) instead of the legacy
# positional on_match=None followed by unpacked patterns.
matcher.add('TERM', pattern)

In [82]:
# Sentence containing all three registered terms, spelled exactly as registered.
doc = nlp("German Chancellor ANGELA MERKEL and US President BARACK OBAMA "
          "converse in the Oval Office inside the White House in WASHINGTON D.C.")

In [83]:
# Display the parsed document text.
doc

German Chancellor ANGELA MERKEL and US President BARACK OBAMA converse in the Oval Office inside the White House in WASHINGTON D.C.

In [84]:
# Look up the registered phrases in the document.
matches = matcher(doc)

In [85]:
# All hits share one match_id because every term was added under the key 'TERM'.
matches

[(1187835807175779026, 2, 4),
 (1187835807175779026, 7, 9),
 (1187835807175779026, 19, 21)]

In [86]:
# Print the rule name next to each matched phrase.
for match_id, start, end in matches:
  string_id = doc.vocab.strings[match_id]  # hash -> rule name ('TERM')
  span = doc[start:end]
  print(string_id, span.text)

TERM ANGELA MERKEL
TERM BARACK OBAMA
TERM WASHINGTON D.C.


### **Custom rule-based entity recognition**

In [88]:
# Load a fresh pipeline instance; the entity ruler is added to this one below.
nlp = spacy.load('en_core_web_sm')

In [89]:
# Rule-based entity component bound to the freshly loaded pipeline.
# NOTE(review): building EntityRuler directly and adding the object to the pipeline
# is the spaCy v2 style; v3 creates it with nlp.add_pipe("entity_ruler") instead.
# Confirm the target spaCy version.
ruler = EntityRuler(nlp)

In [90]:
# Two rule styles: a plain string (exact phrase) and a token pattern
# (case-insensitive "san francisco" via LOWER).
patterns = [{'label': 'ORG', 'pattern': 'KRISH LTD'},
            {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]

In [91]:
# Inspect the rule definitions before registering them.
patterns

[{'label': 'ORG', 'pattern': 'KRISH LTD'},
 {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]

In [92]:
# Register both entity rules with the ruler.
ruler.add_patterns(patterns)

In [93]:
# Attach the ruler to the pipeline so it runs as part of nlp(...).
# NOTE(review): passing a component object is the spaCy v2 call form; v3 expects the
# registered name, e.g. nlp.add_pipe("entity_ruler"). Confirm the target version.
# Re-running this cell on the same nlp would try to add the component a second time.
nlp.add_pipe(ruler)

In [94]:
# Text containing both rule targets: 'KRISH LTD' and 'San Francisco'.
doc = nlp('KRISH LTD is opening its first big office in San Francisco.')

In [95]:
# Display the processed document text.
doc

KRISH LTD is opening its first big office in San Francisco.

In [97]:
# List every entity with its label: the two rule-defined ones plus
# anything else the pipeline tagged (e.g. 'first').
for ent in doc.ents:
  print(ent.text, ent.label_)

KRISH LTD ORG
first ORDINAL
San Francisco GPE
