# Introduction to Regular Expressions in Python

In [109]:
# To use regular expressions
import re

In [110]:
# How do we match a number? The simplest pattern is the literal
# character itself: the pattern '1' matches the string '1'.
print(re.match('1', '1'))

<_sre.SRE_Match object; span=(0, 1), match='1'>


In [111]:
# But that only works for just "1": the literal pattern '2' does not
# match the string '1', so `re.match` returns `None`.
print(re.match('2', '1'))

None


In [112]:
# How do we match all the numbers from 0 to 9?
# A character class such as `[0-9]` matches any single character in that range.
# The `r''` is for a "raw" string -- as regexes become more complicated, this will matter!
re.match(r'[0-9]', '1')

<_sre.SRE_Match object; span=(0, 1), match='1'>

In [113]:
# There is a shorthand for the character class `[0-9]`: `\d` (digit).
# NOTE: for `str` patterns in Python 3, `\d` also matches other Unicode
# decimal digits unless the `re.ASCII` flag is used.
re.match(r'\d', '1')

<_sre.SRE_Match object; span=(0, 1), match='1'>

In [114]:
# But this only matches the first digit we see: `\d` consumes exactly
# one character, and `re.match` only looks at the start of the string.
re.match(r'\d', '123')

<_sre.SRE_Match object; span=(0, 1), match='1'>

In [115]:
# Curly braces set how many times the preceding item may repeat:
# `{min,max}`, `{min,}`, `{,max}`, or `{exactly}`.
# Each pattern below is tried against the same ten-digit string.
for pattern in (r'\d{1,4}', r'\d{1,}', r'\d{,5}', r'\d{8}'):
    print(re.match(pattern, '1234567890'))

<_sre.SRE_Match object; span=(0, 4), match='1234'>
<_sre.SRE_Match object; span=(0, 10), match='1234567890'>
<_sre.SRE_Match object; span=(0, 5), match='12345'>
<_sre.SRE_Match object; span=(0, 8), match='12345678'>


In [116]:
# What if we wanted to recognize a US SSN (social security number)?
# `re.compile` builds the regex once so it can be reused for every
# candidate string in the `for` loop.
ssn_re = re.compile(r'\d{3}-\d{2}-\d{4}')
candidates = ('123456789', '123-456-789', '123-45-6789')
for candidate in candidates:
    result = ssn_re.match(candidate)
    print('{}: {}'.format(candidate, result))

123456789: None
123-456-789: None
123-45-6789: <_sre.SRE_Match object; span=(0, 11), match='123-45-6789'>


In [34]:
# SSNs always use a dash (`-`) as the separator, but dates do not:
# a dash-only pattern misses dates written with slashes.
date_re = re.compile(r'\d{4}-\d{2}-\d{2}')
dates = ['1999-01-01', '1999/01/01']
for text in dates:
    result = date_re.match(text)
    print('{}: {}'.format(text, result))

1999-01-01: <_sre.SRE_Match object; span=(0, 10), match='1999-01-01'>
1999/01/01: None


In [35]:
# Just as the character class `[0-9]` represents every digit from
# 0 to 9, the class `[/-]` represents either separator, "/" or "-".
date_re = re.compile(r'\d{4}[/-]\d{2}[/-]\d{2}')
dates = ['1999-01-01', '1999/01/01']
for text in dates:
    result = date_re.match(text)
    print('{}: {}'.format(text, result))

1999-01-01: <_sre.SRE_Match object; span=(0, 10), match='1999-01-01'>
1999/01/01: <_sre.SRE_Match object; span=(0, 10), match='1999/01/01'>


In [87]:
# To extract each part of the date (year, month, day), put
# parentheses `()` around the parts to capture into `groups`.
# Group "0" is the whole string that was matched; the capture
# groups are then numbered sequentially starting at 1.
#
# Can you change the regex to match all three strings?
date_re = re.compile(r'(\d{4})[/-](\d{2})[/-](\d{2})')
dates = ['1999-01-01', '1999/01/01', '1999.01.01']
for text in dates:
    found = date_re.match(text)
    if found:
        print('{}: {}'.format(text, 'match'))
        print(found.groups())
        print('year:', found.group(1))
    else:
        print('{}: {}'.format(text, 'miss'))
    print()

1999-01-01: match
('1999', '01', '01')
year: 1999

1999/01/01: match
('1999', '01', '01')
year: 1999

1999.01.01: miss



In [53]:
# As we add more groups, remembering them by position gets
# confusing, so we can name each one by writing `?P<name>`
# just inside its opening paren.
date_re = re.compile(r'(?P<year>\d{4})[/-](?P<month>\d{2})[/-](?P<day>\d{2})')
dates = ['1999-01-01', '1999/01/01', '1999.01.01']

for text in dates:
    found = date_re.match(text)
    print('{}: {}'.format(text, 'match' if found else 'miss'))
    if found:
        # `group` accepts several names at once and returns a tuple.
        year, month, day = found.group('year', 'month', 'day')
        print('{} = year "{}" month "{}" day "{}"'.format(text, year, month, day))
    print()

1999-01-01: match
1999-01-01 = year "1999" month "01" day "01"

1999/01/01: match
1999/01/01 = year "1999" month "01" day "01"

1999.01.01: miss



In [55]:
# What if we wanted to match a US phone number?
# (This first attempt displays nothing: the unescaped parens form a
# capture group, so the pattern expects a digit where the string has "(".)
phone_re = re.compile(r'(\d{3}) \d{3}-\d{4}')
phone_re.match('(800) 555-1212')

In [56]:
# Why didn't that work?
# What do those parentheses do again? They group!
# So we need to indicate that the parens are literal
# characters to match by using backslashes `\` to escape them.
phone_re = re.compile(r'\(\d{3}\) \d{3}-\d{4}')
phone_re.match('(800) 555-1212')

<_sre.SRE_Match object; span=(0, 14), match='(800) 555-1212'>

In [58]:
# We could also use character classes to make this more readable:
# inside `[...]` a paren is a literal character, so `[(]` and `[)]`
# need no backslash.
phone_re = re.compile(r'[(]\d{3}[)] \d{3}-\d{4}')
phone_re.match('(800) 555-1212')

<_sre.SRE_Match object; span=(0, 14), match='(800) 555-1212'>

In [86]:
# The area code is not always followed by exactly one space: there
# may be none, several, or perhaps a tab.
# `\s` matches any kind of whitespace and `*` means "zero or more".
phone_re = re.compile(r'[(]\d{3}[)]\s*\d{3}-\d{4}')
phones = ['(800)555-1212', '(800) 555-1212', '(800)  555-1212']
for number in phones:
    result = phone_re.match(number)
    print('{}\t{}'.format(number, result))

(800)555-1212	<_sre.SRE_Match object; span=(0, 13), match='(800)555-1212'>
(800) 555-1212	<_sre.SRE_Match object; span=(0, 14), match='(800) 555-1212'>
(800)  555-1212	<_sre.SRE_Match object; span=(0, 15), match='(800)  555-1212'>


In [85]:
# The parens around the area code are optional; when they are
# omitted, a dash usually separates the area code from the rest.
# `?` makes the preceding item optional (zero or one occurrence).
phone_re = re.compile(r'[(]?\d{3}[)]?[-]?\s*\d{3}-\d{4}')
phones = ['(800)555-1212', '(800) 555-1212', '800-555-1212']
for number in phones:
    result = phone_re.match(number)
    print('{}\t{}'.format(number, result))

(800)555-1212	<_sre.SRE_Match object; span=(0, 13), match='(800)555-1212'>
(800) 555-1212	<_sre.SRE_Match object; span=(0, 14), match='(800) 555-1212'>
800-555-1212	<_sre.SRE_Match object; span=(0, 12), match='800-555-1212'>


In [63]:
# This has the side effect of also matching a dash right after the
# closing paren, which is generally not a valid format.
phone_re = re.compile(r'[(]?\d{3}[)]?[-]?\s*\d{3}-\d{4}')
phone_re.match('(800)-555-1212')

<_sre.SRE_Match object; span=(0, 14), match='(800)-555-1212'>

In [84]:
# Handling both formats correctly really takes two regexes: one for
# a parenthesized area code and one for a dash-separated number.
phone_re1 = re.compile(r'[(]\d{3}[)]\s*\d{3}-\d{4}')
phone_re2 = re.compile(r'\d{3}-\d{3}-\d{4}')
phones = ['(800)555-1212', '(800) 555-1212', '800-555-1212', '(800)-555-1212']
for number in phones:
    # A number is accepted when either regex matches it.
    if phone_re1.match(number) or phone_re2.match(number):
        verdict = 'match'
    else:
        verdict = 'miss'
    print('{}\t{}'.format(number, verdict))

(800)555-1212	match
(800) 555-1212	match
800-555-1212	match
(800)-555-1212	miss


In [83]:
# I worked with a graphic artist who always insisted on using
# dots as the number separator, and sometimes there are no
# separators at all. `[.-]` accepts either separator and `[.-]?`
# makes it optional.
phone_re1 = re.compile(r'[(]\d{3}[)]\s*\d{3}[.-]\d{4}')
phone_re2 = re.compile(r'\d{3}[.-]?\d{3}[.-]?\d{4}')
phones = ['8005551212', '(800)555-1212', '(800) 555-1212', '800-555-1212', '(800)-555-1212', '800.555.1212']
patterns = (phone_re1, phone_re2)
for number in phones:
    verdict = 'match' if any(p.match(number) for p in patterns) else 'miss'
    print('{}\t{}'.format(number, verdict))

8005551212	match
(800)555-1212	match
(800) 555-1212	match
800-555-1212	match
(800)-555-1212	miss
800.555.1212	match


In [93]:
# Add capture groups to the first regex only. The first group wraps
# `[(]\d{3}[)]`, so it captures the literal parens along with the
# area code, and numbers without a parenthesized area code do not match.
phone_re1 = re.compile(r'([(]\d{3}[)])\s*(\d{3})[.-](\d{4})')
phones = ['8005551212', '(800)555-1212', '(800) 555-1212', '800-555-1212', '(800)-555-1212', '800.555.1212']
for number in phones:
    result = phone_re1.match(number)
    print('{}\t{}'.format(number, result))

8005551212	None
(800)555-1212	<_sre.SRE_Match object; span=(0, 13), match='(800)555-1212'>
(800) 555-1212	<_sre.SRE_Match object; span=(0, 14), match='(800) 555-1212'>
800-555-1212	None
(800)-555-1212	None
800.555.1212	None


In [102]:
# OK, now let's normalize the numbers: parens capture the area
# code, prefix, and line number, and the captured pieces are then
# reassembled into one standard representation.
phone_re1 = re.compile(r'[(](\d{3})[)]\s*(\d{3})[.-](\d{4})')
phone_re2 = re.compile(r'(\d{3})[.-]?(\d{3})[.-]?(\d{4})')
phones = ['8005551212', '(800)555-1212', '(800) 555-1212', '800-555-1212', '(800)-555-1212', '800.555.1212']
for number in phones:
    found = phone_re1.match(number) or phone_re2.match(number)
    if found:
        # Both regexes define exactly three groups, in the same order.
        normalized = '{}-{}-{}'.format(*found.groups())
    else:
        normalized = 'miss'
    print('{}\t{}'.format(number, normalized))

8005551212	800-555-1212
(800)555-1212	800-555-1212
(800) 555-1212	800-555-1212
800-555-1212	800-555-1212
(800)-555-1212	miss
800.555.1212	800-555-1212


In [117]:
# The same normalization, this time with named capture groups so
# each piece is fetched by name rather than by position.
phone_re1 = re.compile(r'[(](?P<area_code>\d{3})[)]\s*(?P<prefix>\d{3})[.-](?P<line_num>\d{4})')
phone_re2 = re.compile(r'(?P<area_code>\d{3})[.-]?(?P<prefix>\d{3})[.-]?(?P<line_num>\d{4})')
phones = ['8005551212', '(800)555-1212', '(800) 555-1212', '800-555-1212', '(800)-555-1212', '800.555.1212']
for number in phones:
    found = phone_re1.match(number) or phone_re2.match(number)
    if found:
        # `group` with several names returns the values as a tuple.
        pieces = found.group('area_code', 'prefix', 'line_num')
        normalized = '{}-{}-{}'.format(*pieces)
    else:
        normalized = 'miss'
    print('{}\t{}'.format(number, normalized))

8005551212	800-555-1212
(800)555-1212	800-555-1212
(800) 555-1212	800-555-1212
800-555-1212	800-555-1212
(800)-555-1212	miss
800.555.1212	800-555-1212


In [108]:
# Finally, combine named capture groups with named fields in
# `format`. `match.groupdict()` returns a dict mapping each group
# name to its matched text, so it can be unpacked straight into
# `format` instead of passing every group by hand.
phone_re1 = re.compile(r'[(](?P<area_code>\d{3})[)]\s*(?P<prefix>\d{3})[.-](?P<line_num>\d{4})')
phone_re2 = re.compile(r'(?P<area_code>\d{3})[.-]?(?P<prefix>\d{3})[.-]?(?P<line_num>\d{4})')
phones = ['8005551212', '(800)555-1212', '(800) 555-1212', '800-555-1212', '(800)-555-1212', '800.555.1212']
for phone in phones:
    # Try the parenthesized form first, then the plain/dotted/dashed form.
    match = phone_re1.match(phone) or phone_re2.match(phone)
    standard = '{area_code}-{prefix}-{line_num}'.format(**match.groupdict()) if match else 'miss'
    print('{}\t{}'.format(phone, standard))

8005551212	800-555-1212
(800)555-1212	800-555-1212
(800) 555-1212	800-555-1212
800-555-1212	800-555-1212
(800)-555-1212	miss
800.555.1212	800-555-1212
