Reference for Python regular expressions (RegEx):
https://www.programiz.com/python-programming/regex

In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import span
from spacy import displacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp('HEllo WoRld!')

In [4]:
for token in doc:
    print(token.text)

HEllo
WoRld
!


In [5]:
# Token pattern: an optional "hello", an optional punctuation token, then a
# required "world" (LOWER is compared against the lower-cased token text).
pattern = [
    {'LOWER': 'hello', 'OP': '?'},
    {'IS_PUNCT': True, 'OP': '?'},
    {'LOWER': 'world'},
]

In [6]:
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld',None,pattern)

In [7]:
doc = nlp('HEllo. WOrld!')

In [8]:
matches = matcher(doc)

In [9]:
matches

[(15578876784678163569, 0, 3),
 (15578876784678163569, 1, 3),
 (15578876784678163569, 2, 3)]

In [10]:
# Report each match as: hash ID, rule name, start/end token offsets, matched text.
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # hash ID -> name passed to matcher.add
    span = doc[start:end]
    print(f"{match_id} {string_id} {start} {end} {span.text}")

15578876784678163569 HelloWorld 0 3 HEllo. WOrld
15578876784678163569 HelloWorld 1 3 . WOrld
15578876784678163569 HelloWorld 2 3 WOrld


### Regular Expression

In [11]:
text = "My phone number is 6274. Oops its wrong. My correct number is 62745131."

In [12]:
import re

In [13]:
re.search(r'\d{8}',text)

<re.Match object; span=(62, 70), match='62745131'>

In [14]:
re.findall(r'\d{3,8}', text)

['6274', '62745131']

In [15]:
re.findall(r'\w{4}', text)

['phon', 'numb', '6274', 'Oops', 'wron', 'corr', 'numb', '6274', '5131']

In [16]:
re.findall(r'\w{4,}', text)

['phone', 'number', '6274', 'Oops', 'wrong', 'correct', 'number', '62745131']

In [17]:
re.findall(r'i..', text)

['is ', 'its', 'is ']

In [18]:
text = "1 ad jee ajaja cjcjaae 21"

In [19]:
re.findall(r'[^\d]+',text)

[' ad jee ajaja cjcjaae ']

In [20]:
re.findall(r'[^\D]+', text)

['1', '21']

In [21]:
text = "abc-defe aasd-aaaa"

In [22]:
re.findall(r'[\w]+-[\w]+',text)

['abc-defe', 'aasd-aaaa']

### RegEx using SpaCy

In [23]:
text = "Google announced a new Pixel at Google I/O Google I/O is a great place to get all updates from Google."

In [24]:
# "Google", optionally followed by the separate tokens "I", "/" and "O".
# Each of the three trailing tokens is optional on its own.
pattern = [
    {'TEXT': 'Google'},
    {'TEXT': 'I', 'OP': '?'},
    {'TEXT': '/', 'OP': '?'},
    {'TEXT': 'O', 'OP': '?'},
]

In [25]:
def callback_method(matcher, doc, i, matches):
    """Print the text of the i-th match.

    Only ``doc``, ``i`` and ``matches`` are used; ``matcher`` is accepted but
    ignored.  ``matches[i]`` must be a ``(match_id, start, end)`` triple of
    token offsets into ``doc``.
    """
    _, first_token, stop_token = matches[i]
    print(doc[first_token:stop_token].text)

In [26]:
matcher.add('Google', callback_method, pattern)

In [27]:
doc = nlp(text)

In [28]:
matches = matcher(doc)

Google
Google
Google I
Google I/
Google I/O
Google
Google I
Google I/
Google I/O
Google


### Linguistic Annotations

In [29]:
# "facebook" (any casing), a token whose lemma is "be", zero or more adverbs,
# then an adjective.
pattern = [
    {'LOWER': 'facebook'},
    {'LEMMA': 'be'},
    {'POS': 'ADV', 'OP': '*'},
    {'POS': 'ADJ'},
]

In [30]:
match_sent = []  # one displaCy "manual" record per match, filled by the callback

def callback_method_2(matcher, doc, i, matches):
    """Record the i-th match as a sentence plus one 'MATCH' entity.

    Appends to the module-level ``match_sent`` list a dict with the text of
    the sentence containing the match and a single entity whose character
    offsets are measured from the start of that sentence.  ``matcher`` is
    accepted but not used.
    """
    _, first_token, stop_token = matches[i]
    hit = doc[first_token:stop_token]
    sentence = hit.sent
    # Span offsets are relative to the document; shift them to the sentence.
    offset = sentence.start_char
    match_sent.append({
        'text': sentence.text,
        'ents': [{
            'start': hit.start_char - offset,
            'end': hit.end_char - offset,
            'label': 'MATCH',
        }],
    })

In [31]:
matcher.add('fb',callback_method_2,pattern)

In [32]:
doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?")

In [33]:
matches = matcher(doc)

In [34]:
matches

[(8017838677478259815, 4, 7), (8017838677478259815, 9, 13)]

In [35]:
match_sent

[{'text': "I'd say that Facebook is evil.",
  'ents': [{'start': 13, 'end': 29, 'label': 'MATCH'}]},
 {'text': '– Facebook is pretty cool, right?',
  'ents': [{'start': 2, 'end': 25, 'label': 'MATCH'}]}]

In [36]:
displacy.render(match_sent, style='ent', manual = True)

### Extracting Phone number (U.S.A. Format)

In [37]:
# "(" + three digits + ")" + four digits + optional "-" + four digits,
# e.g. "(123) 4560-7890" or "(123) 4560 7890".
# NOTE(review): the section heading says U.S.A. format, but this matches 4+4
# digit groups after the area code rather than 3+4 - confirm this is intended.
pattern = [
    {'ORTH': '('},
    {'SHAPE': 'ddd'},
    {'ORTH': ')'},
    {'SHAPE': 'dddd'},
    {'ORTH': '-', 'OP': '?'},
    {'SHAPE': 'dddd'},
]

In [38]:
doc = nlp('Call me at (123) 4560-7890 or (123) 4560 7890')

In [39]:
print([t.text for t in doc])

['Call', 'me', 'at', '(', '123', ')', '4560', '-', '7890', 'or', '(', '123', ')', '4560', '7890']


In [40]:
matcher.add('Phone Number', callback_method, pattern)

In [41]:
matches = matcher(doc)

(123) 4560-7890
(123) 4560 7890


### Email Address Extraction

In [42]:
# Single-token pattern applied to each token's text: one or more letters,
# digits, "-", "_" or "." on each side of an "@".
pattern = [
    {"TEXT": {"REGEX": "[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+"}},
]

In [43]:
matcher.add("Email", callback_method, pattern)

In [44]:
doc = nlp('email me at krutarth.798@gmail.com or dkrutarth21@gmail.com')
matches = matcher(doc)

krutarth.798@gmail.com
dkrutarth21@gmail.com


### Emoji & Hashtag Extraction 

In [45]:
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"] 
neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"]

In [46]:
# Build one single-token pattern per emoji: each pattern matches exactly one
# token whose text is that emoji.
pos_patterns = [[{"ORTH": symbol}] for symbol in pos_emoji]
neg_patterns = [[{"ORTH": symbol}] for symbol in neg_emoji]

In [47]:
def label_sentiment(matcher, doc, i, matches):
    """Adjust ``doc.sentiment`` for the i-th match and print the rule name.

    The rule name is looked up from the match ID via ``doc.vocab.strings``.
    A 'HAPPY' match adds 0.1 to ``doc.sentiment`` and a 'SAD' match subtracts
    0.1; in both cases the rule name is printed.  A match from any other rule
    leaves the document untouched and prints nothing.  ``matcher`` is accepted
    but not used.
    """
    match_id, _, _ = matches[i]
    rule_name = doc.vocab.strings[match_id]
    sentiment_delta = {'HAPPY': 0.1, 'SAD': -0.1}
    if rule_name not in sentiment_delta:
        return
    doc.sentiment += sentiment_delta[rule_name]
    print(rule_name)

In [48]:
matcher.add("HAPPY", label_sentiment, *pos_patterns)
matcher.add('SAD', label_sentiment, *neg_patterns)

In [49]:
matcher.add('HASHTAG', None, [{'TEXT': '#'}, {'IS_ASCII': True}])

In [50]:
doc = nlp("Hello world 😀 #jantacurfew")

In [51]:
matches=matcher(doc)

HAPPY


In [52]:
# List every match as "<rule name> <matched text>".
for match_id, start, end in matches:
    string_id = doc.vocab.strings[match_id]  # hash ID -> rule name
    span = doc[start:end]
    print(f"{string_id} {span.text}")

HelloWorld Hello world
HelloWorld world
HAPPY 😀
HASHTAG #jantacurfew


### Custom Rule Based Entity Recognition

In [53]:
from spacy.pipeline import EntityRuler

In [54]:
ruler = EntityRuler(nlp)

In [55]:
# Entity rules: the exact string "KFS" is labelled ORG; the token sequence
# "san" "francisco" (compared on lower-cased text) is labelled GPE.
patterns = [
    {"label": "ORG", "pattern": "KFS"},
    {
        "label": "GPE",
        "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}],
    },
]

In [56]:
ruler.add_patterns(patterns)

In [57]:
nlp.add_pipe(ruler)

In [58]:
doc = nlp("KFS is opening its first big office in San Francisco.")

In [59]:
# Print one entity per line: its text padded to a width of 20, then its label.
for ent in doc.ents:
    print(f'{ent.text:{20}} {ent.label_}')

KFS                  ORG
first                ORDINAL
San Francisco        GPE
