Click <a href='https://www.dataquest.io/blog/web-scraping-tutorial-python/'>here</a> to learn about Regular Expressions (RegEx) using Python.

In [None]:
########################
# DO NOT RUN THIS CELL #
########################

a, X, 9, < -- ordinary characters just match themselves exactly.
. (a period) -- matches any single character except newline '\n'
\w -- matches a "word" character: a letter or digit or underbar [a-zA-Z0-9_].
\W -- matches any non-word character.
\b -- matches word boundary (in between a word character and a non word character)
\s -- matches a single whitespace character -- space, newline, return, tab
\S -- matches any non-whitespace character.
\t, \n, \r -- tab, newline, return
\d -- matches any numeric digit [0-9]
\D matches any non-numeric character.
^ -- matches the beginning of the string, or specify omition of certain characters
$ -- matches the end of the string
\ -- escapes special character.
(x|y|z) matches exactly one of x, y or z.
(x) in general is a remembered group. We can get the value of what matched by using the groups() method of the object returned by re.search.
x? matches an optional x character (in other words, it matches an x zero or one times).
x* matches x zero or more times.
x+ matches x one or more times.
x{m,n} matches an x character at least m times, but not more than n times.
?: matches an expression but do not capture it. Non capturing group.
?= matches a suffix but exclude it from capture. Positive lookahead.
a(?=b) will match the "a" in "ab", but not the "a" in "ac"
In other words, a(?=b) matches the "a" which is followed by the string 'b', without consuming what follows the a.
?! matches if suffix is absent. Negative look ahead.
a(?!b) will match the "a" in "ac", but not the "a" in "ab"
?<= positive look behind
[] matches for groupings of consecutive characters
?<! negative look behind

########################
# DO NOT RUN THIS CELL #
########################

What are word boundaries?
--------------------------------------------------
Before the first character in the string, if the first character is a word character.<br>
After the last character in the string, if the last character is a word character.<br>
Between two characters in the string, where one is a word character and the other is not a word character<br>

In [2]:
import re

# open a new data stream into a file
file = open('./names.txt', encoding='utf-8')

# read the text of the file and store it as Python data
data = file.read()

# always close the data stream
file.close()

### Search for names

In [4]:
# .match()
re.match('Hawkins', data)

<re.Match object; span=(0, 7), match='Hawkins'>

In [5]:
re.match('Patel', data)

In [6]:
re.match('Hawkins, Derek', data)

<re.Match object; span=(0, 14), match='Hawkins, Derek'>

### Search for phone numbers

In [11]:
# 'Derek's'
# "Derek's"
# 'Derek\'s'

"Derek's"

In [7]:
re.search('Patel', data)

<re.Match object; span=(671, 676), match='Patel'>

In [12]:
re.search('\(\d\d\d\) \d\d\d-\d\d\d\d', data)

<re.Match object; span=(38, 52), match='(555) 555-5555'>

In [13]:
re.search('\w\w\w\w\w\w\w, \w\w\w\w\w', data)

<re.Match object; span=(0, 14), match='Hawkins, Derek'>

In [14]:
re.search('\w\w\w\w\w\w\w, \w\w\w\w\w', data)
re.search('\w{7}, \w{5}', data)

<re.Match object; span=(0, 14), match='Hawkins, Derek'>

In [16]:
re.search('\w\w\w\w\w\w\w, \w\w\w\w\w', data)

<re.Match object; span=(0, 14), match='Hawkins, Derek'>

In [17]:
re.search('\w+, \w+', data)

<re.Match object; span=(0, 14), match='Hawkins, Derek'>

In [18]:
re.search('\w{7}, \w+', data)

<re.Match object; span=(0, 14), match='Hawkins, Derek'>

<strong>Exercise 1</strong>:<br>
Write a function that checks for n number of consecutive digits and returns the match

In [22]:
# "g" * 5
# \d * 5

def with_n_digits(num):
    return re.search('\d'*num, data)

In [24]:
with_n_digits(4)

<re.Match object; span=(48, 52), match='5555'>

<strong>Exercise 2</strong>:<br>
Use Regular Expressions to pull the last phone number with the country code in the list using .findall()

In [45]:
# .findall()
phone_numbers = "(555) 555-5559 (555) 555-5558 555 555-5557 555 555-5556 555 555-5555 555-555-5554 555-555-5553 +1 555-555-5552"

In [46]:
# find phone number with parantheses
re.findall('\(\d{3}\) \d{3}-\d{4}', phone_numbers)

['(555) 555-5559', '(555) 555-5558']

In [47]:
# find phone number w/ parantheses AND hyphens
re.findall('\(?\d{3}\)?\s?-?\d{3}-\d{4}', phone_numbers)

['(555) 555-5559',
 '(555) 555-5558',
 '555 555-5557',
 '555 555-5556',
 '555 555-5555',
 '555-555-5554',
 '555-555-5553',
 '555-555-5552']

In [52]:
# find phone number w/ country code
[n.strip() for n in re.findall('\+?\d?\s?\(?\d{3}\)?\s?-?\d{3}-\d{4}', phone_numbers)]

['(555) 555-5559',
 '(555) 555-5558',
 '555 555-5557',
 '555 555-5556',
 '555 555-5555',
 '555-555-5554',
 '555-555-5553',
 '+1 555-555-5552']

In [32]:
information = """
Patel, Ripal ripalp@codingtemple.com : 555 555-5555
Carter, Joel joelc@codingtemple.com : (555) 555-5555
Lang, Lucas lucasl@codingtemple.com : 555-555-5555
Stanton, Brian brians@codingtemple.com : 555 555-5555
Davitt, Sam samd@codingtemple.com : (555) 555-5555
"""

In [40]:
re.findall('[\d\w\'-+.]*@[codingtempl]+[.com]+', information)

['ripalp@codingtemple.com',
 'joelc@codingtemple.com',
 'lucasl@codingtemple.com',
 'brians@codingtemple.com',
 'samd@codingtemple.com']

In [41]:
re.findall('@[codingtempl]+[.com]+', information)

['@codingtemple.com',
 '@codingtemple.com',
 '@codingtemple.com',
 '@codingtemple.com',
 '@codingtemple.com']

In [None]:
# re.X - re.VERBOSE
# re.M - re.MULTILINE
# re.I - re.IGNORECASE

In [62]:
info = re.findall('''
    ([\w]+,\s[\w]+)                     # last_name, first_name
    (\s[\d\w\'-+.]+@[-.\w\d]+)          # email
    (\s:\s\(?\d{3}\)?\s?-?\d{3}-\d{4})  # phone
''', information, re.X)
# ''', information, re.X|re.I|re.M)
info

[('Patel, Ripal', ' ripalp@codingtemple.com', ' : 555 555-5555'),
 ('Carter, Joel', ' joelc@codingtemple.com', ' : (555) 555-5555'),
 ('Lang, Lucas', ' lucasl@codingtemple.com', ' : 555-555-5555'),
 ('Stanton, Brian', ' brians@codingtemple.com', ' : 555 555-5555'),
 ('Davitt, Sam', ' samd@codingtemple.com', ' : (555) 555-5555')]

In [70]:
people = []
for person in info:
    person_dict = {
        'name': person[0],
        'email': person[1].strip(),
        'phone': person[2][3:]
    }
    people.append(person_dict)
    
people

[{'name': 'Patel, Ripal',
  'email': 'ripalp@codingtemple.com',
  'phone': '555 555-5555'},
 {'name': 'Carter, Joel',
  'email': 'joelc@codingtemple.com',
  'phone': '(555) 555-5555'},
 {'name': 'Lang, Lucas',
  'email': 'lucasl@codingtemple.com',
  'phone': '555-555-5555'},
 {'name': 'Stanton, Brian',
  'email': 'brians@codingtemple.com',
  'phone': '555 555-5555'},
 {'name': 'Davitt, Sam',
  'email': 'samd@codingtemple.com',
  'phone': '(555) 555-5555'}]

In [72]:
for p in people:
    print(p.get('name'))
    print(p.get('email'))
    print(p.get('phone'))
    print()

Patel, Ripal
ripalp@codingtemple.com
555 555-5555

Carter, Joel
joelc@codingtemple.com
(555) 555-5555

Lang, Lucas
lucasl@codingtemple.com
555-555-5555

Stanton, Brian
brians@codingtemple.com
555 555-5555

Davitt, Sam
samd@codingtemple.com
(555) 555-5555



In [73]:
contact_search = re.compile('''
    (?P<name>[\w]+,\s[\w]+)                     # last_name, first_name
    (?P<email>\s[\d\w\'-+.]+@[-.\w\d]+)          # email
    (?P<phone>\s:\s\(?\d{3}\)?\s?-?\d{3}-\d{4})  # phone
''', re.X)

In [88]:
# help(contact_search.finditer)
for i in contact_search.finditer(information):
    print(f"Name: {i.group('name')}\nEmail: {i.group('email').strip()}\nPhone: {i.group('phone')[3:]}\n")

Name: Patel, Ripal
Email: ripalp@codingtemple.com
Phone: 555 555-5555

Name: Carter, Joel
Email: joelc@codingtemple.com
Phone: (555) 555-5555

Name: Lang, Lucas
Email: lucasl@codingtemple.com
Phone: 555-555-5555

Name: Stanton, Brian
Email: brians@codingtemple.com
Phone: 555 555-5555

Name: Davitt, Sam
Email: samd@codingtemple.com
Phone: (555) 555-5555



In [92]:
list(contact_search.finditer(information))

[<re.Match object; span=(1, 52), match='Patel, Ripal ripalp@codingtemple.com : 555 555-55>,
 <re.Match object; span=(53, 105), match='Carter, Joel joelc@codingtemple.com : (555) 555-5>,
 <re.Match object; span=(106, 156), match='Lang, Lucas lucasl@codingtemple.com : 555-555-555>,
 <re.Match object; span=(157, 210), match='Stanton, Brian brians@codingtemple.com : 555 555->,
 <re.Match object; span=(211, 261), match='Davitt, Sam samd@codingtemple.com : (555) 555-555>]

### In-class Exercise <br>
<p>Print each persons name and twitter handle etc., using groups, should look like:</p>

In [3]:
# [
#     ([first name] [last name],
#      email, 
#      phone,
#      title,
#      Twitter handle)
# ]

In [1]:
compiler = re.compile(r'''
    (?P<name>[\w ,-]*,?\s[\w-]*)\t                 
    (?P<phone>[\w\d.+-]+@[\w\d.-]+)\t              
    (?P<email>\(?\d{3}\)?-?\s?\d{3}-\d{4})?\t      
    (?P<occupation>[\w ,']+)\t?                    
    (?P<twitter>[@\w]+)?                           
''', re.X)
​
for p in compiler.finditer(data):
    print(f"Name: {p.group('name')}\nPhone: {p.group('phone')}\nEmail: {p.group('email')}\nOccupation: {p.group('occupation')}\nTwitter: {p.group('twitter')}\n")


SyntaxError: invalid non-printable character U+200B (3645313050.py, line 8)