In [2]:
# Rule-based matching is one of the steps in extracting information from unstructured text. 
# It’s used to identify and extract tokens and phrases according to patterns (such as lowercase) and grammatical features (such as part of speech).
# Rule-based matching can use regular expressions to extract entities (such as phone numbers) from unstructured text. It differs from extraction with regular expressions alone in that plain regular expressions don’t consider the lexical and grammatical attributes of the text.

import spacy
from spacy.matcher import Matcher
# Load the 'en_core_web_sm' pipeline and create the token matcher, bound to
# that pipeline's vocabulary, that the extraction helpers below register
# their patterns on.
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
# With rule-based matching, you can extract a first name and a last name, which are always proper nouns:

def extract_full_name(nlp_doc):
    """Return the first full name found in *nlp_doc*, or None.

    A full name is taken to be two consecutive tokens that are both tagged
    as proper nouns (first name followed by last name).

    Args:
        nlp_doc: A spaCy Doc produced with the same vocabulary as the
            module-level ``matcher``.

    Returns:
        The text of the first 'FULL_NAME' match, or None when the document
        contains no such pair.
    """
    # List of token descriptions to match in sequence: first name and last
    # name, so two proper-noun tokens.
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]

    # The matcher is shared by every helper in this file, so register the
    # rule only once instead of appending the same pattern on each call.
    # The list-of-patterns form is the one accepted by spaCy 2.2.2+ and 3.x;
    # the older add(key, None, pattern) form was removed in spaCy 3.
    if 'FULL_NAME' not in matcher:
        matcher.add('FULL_NAME', [pattern])

    # Other rules may be registered on the shared matcher too, so only
    # accept matches that belong to this rule.
    full_name_id = nlp_doc.vocab.strings['FULL_NAME']
    for match_id, start, end in matcher(nlp_doc):
        if match_id == full_name_id:
            return nlp_doc[start:end].text
    return None

# Sample biography used to demonstrate full-name extraction. Adjacent string
# literals are joined by the parser, so no explicit '+' is needed.
about_text = (
    'Gus Proto is a Python developer currently'
    ' working for a London-based Fintech'
    ' company. He is interested in learning'
    ' Natural Language Processing.'
)
# Run the pipeline on the sample text, then extract the first full name.
about_doc = nlp(about_text)
# Left as a bare expression so the notebook cell displays the returned value.
extract_full_name(about_doc)

'Gus Proto'

In [None]:
# rule-based matching to extract phone numbers:
# Sample announcement used to demonstrate phone-number extraction. Each
# continuation literal starts with a space so the joined text keeps its
# word boundaries (the original produced "conferencehappening").
conference_org_text = ('There is a developer conference'
    ' happening on 21 July 2019 in London. It is titled'
    ' "Applications of Natural Language Processing".'
    ' There is a helpline number available'
    ' at (123) 456-789')

def extract_phone_number(nlp_doc):
    """Return the first phone number found in *nlp_doc*, or None.

    The rule matches the token sequence ``( ddd ) ddd [-] ddd``: an opening
    parenthesis, three digits, a closing parenthesis, three digits, an
    optional hyphen and three more digits.

    Args:
        nlp_doc: A spaCy Doc produced with the same vocabulary as the
            module-level ``matcher``.

    Returns:
        The text of the first 'PHONE_NUMBER' match, or None when the
        document contains no such sequence.
    """
    pattern = [
        {'ORTH': '('},            # ORTH gives the exact text of the token.
        {'SHAPE': 'ddd'},         # SHAPE is the token's orthographic shape; 'ddd' is three digits.
        {'ORTH': ')'},
        {'SHAPE': 'ddd'},
        {'ORTH': '-', 'OP': '?'}, # OP '?' makes the token optional: it may match 0 or 1 times.
        {'SHAPE': 'ddd'},
    ]

    # The matcher is shared by every helper in this file, so register the
    # rule only once instead of appending the same pattern on each call.
    # The list-of-patterns form is the one accepted by spaCy 2.2.2+ and 3.x;
    # the older add(key, None, pattern) form was removed in spaCy 3.
    if 'PHONE_NUMBER' not in matcher:
        matcher.add('PHONE_NUMBER', [pattern])

    # Other rules (e.g. 'FULL_NAME') may already be registered on the shared
    # matcher; without this filter the first match could be a proper-noun
    # pair instead of the phone number.
    phone_number_id = nlp_doc.vocab.strings['PHONE_NUMBER']
    for match_id, start, end in matcher(nlp_doc):
        if match_id == phone_number_id:
            return nlp_doc[start:end].text
    return None

# Run the pipeline on the announcement text, then extract the phone number.
conference_org_doc = nlp(conference_org_text)
# Left as a bare expression so the notebook cell displays the returned value.
extract_phone_number(conference_org_doc)
