# Chapter 8 - Text pattern matching with Regular Expressions

## Examples

In [21]:
def is_phone_number(text):
    if len(text) != 12:  # Phone numbers have exactly 12 characters.
        return False
    for i in range(0, 3):  # The first three characters must be numbers.
        if not text[i].isdecimal():
            return False
    if text[3] != '-':  # The fourth character must be a dash.
        return False
    for i in range(4, 7): # The next three characters must be numbers.
        if not text[i].isdecimal():
            return False
    if text[7] != '-':  # The eighth character must be a dash.
        return False
    for i in range(8, 12):  # The next four characters must be numbers.
        if not text[i].isdecimal():
            return False
    return True

print('Is 415-555-4242 a phone number?', is_phone_number('415-555-4242'))
print(is_phone_number('415-555-4242'))
print('Is Moshi moshi a phone number?', is_phone_number('Moshi moshi'))
print(is_phone_number('Moshi moshi'))

Is 415-555-4242 a phone number? True
True
Is Moshi moshi a phone number? False
False


In [22]:
def is_phone_number(us_phone_number):
    # check number length
    if len(us_phone_number) != 12:
        return False
    
    # check dashes
    if us_phone_number[3] != '-' or us_phone_number[7] != '-':
        return False

    #check numbers
    if (us_phone_number[:3].isdecimal() and us_phone_number[4:7].isdecimal() and us_phone_number[8:12].isdecimal()) == False:
        return False

    return True

my_phone = '415-659-8875'
is_phone_number(my_phone)

# Find phone number in larger string
message = 'Call me at 415-659-8875 tomorrow. 654-987-3215 is my office phone number'
for i in range(len(message)):
    segment = message[i:i+12]
    if is_phone_number(segment):
        print('Phone number found: ' + segment)
print('Done')

Phone number found: 415-659-8875
Phone number found: 654-987-3215
Done


In [23]:
import re
phone_re = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phone_re.search('My number is 415-555-4242.')
print(mo.group())
print(mo.group(0))
print(mo.group(1))
print(mo.group(2))

415-555-4242
415-555-4242
415
555-4242


In [24]:
area_code, main_number = mo.groups()
print(area_code)
print(main_number)


415
555-4242


In [25]:
import re
re.compile(r'(\(Parentheses\))')

re.compile(r'(\(Parentheses\))', re.UNICODE)

In [26]:
import re
pattern = re.compile(r'Cat(erpillar|astrophe|ch|egory)')
match = pattern.search('Catch me if you can.')
print(match.group())
print(match.group(1))

Catch
ch


In [27]:
import re
pattern = re.compile(r'\d{3}-\d{3}-\d{4}')  # This regex has no groups.
pattern.findall('Cell: 415-555-9999 Work: 212-555-0000')

['415-555-9999', '212-555-0000']

In [28]:
pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')  # This regex has groups.
pattern.findall('Cell: 415-555-9999 Work: 212-555-0000')

[('415', '555', '9999'), ('212', '555', '0000')]

In [29]:
import re
pattern = re.compile(r'\d{3}')
print(pattern.findall('1234'))
print(pattern.findall('12345'))
print(pattern.findall('123456'))

['123']
['123']
['123', '456']


In [33]:
import re
vowel_pattern = re.compile(r'[aeiouAEIOU]')
consonant_pattern = re.compile(r'[^aeiouAEIOU]') # ^ is a negative symbol

print(vowel_pattern.findall('RoboCop eats BABY FOOD.'))
print(consonant_pattern.findall('RoboCop eats BABY FOOD.'))

['o', 'o', 'o', 'e', 'a', 'A', 'O', 'O']
['R', 'b', 'C', 'p', ' ', 't', 's', ' ', 'B', 'B', 'Y', ' ', 'F', 'D', '.']


In [32]:
import re
pattern = re.compile(r'\d+\s\w+')
pattern.findall('12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge')

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '7 swans',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

In [34]:
import re
at_re = re.compile(r'\w+')
at_re.findall('The cat in the hat sat on the flat mat')

['The', 'cat', 'in', 'the', 'hat', 'sat', 'on', 'the', 'flat', 'mat']

In [38]:
import re
pattern = re.compile(r'42!?')
print(pattern.search('42!'))
print(pattern.search('42'))

<re.Match object; span=(0, 3), match='42!'>
<re.Match object; span=(0, 2), match='42'>


In [40]:
import re
pattern = re.compile(r'42?!')
print(pattern.search('42!'))
print(pattern.search('4!'))
print(pattern.search('42') == None)  # No match

<re.Match object; span=(0, 3), match='42!'>
<re.Match object; span=(0, 2), match='4!'>
True


In [41]:
pattern = re.compile(r'(\d{3}-)?\d{3}-\d{4}')
match1 = pattern.search('My number is 415-555-4242')
print(match1.group())

match2 = pattern.search('My number is 555-4242')
print(match2.group())

415-555-4242
555-4242


In [48]:
import re
pattern = re.compile('Eggs(and spam)*')


print(pattern.search('Eggs'))
print(pattern.search('Eggs and spam'))
print(pattern.search('Eggs and spam and spam'))
print(pattern.search('Eggs and spam and spam and spam'))

<re.Match object; span=(0, 4), match='Eggs'>
<re.Match object; span=(0, 4), match='Eggs'>
<re.Match object; span=(0, 4), match='Eggs'>
<re.Match object; span=(0, 4), match='Eggs'>


In [49]:
pattern = re.compile('Eggs(and spam)+')
print(pattern.search('Eggs and spam'))
print(pattern.search('Eggs and spam and spam'))
print(pattern.search('Eggs and spam and spam and spam'))

None
None
None


In [56]:
import re
haRegex = re.compile(r'(Ha){3}')
match1 = haRegex.search('HaHaHa')
print(match1.group())

match = haRegex.search('HaHa')
match == None

HaHaHa


True

In [55]:
import re
greedy_pattern = re.compile(r'(Ha){3,5}')
match1 = greedy_pattern.search('HaHaHaHaHa')
print(match1.group())

lazy_pattern = re.compile(r'(Ha){3,5}?')
match2 = lazy_pattern.search('HaHaHaHaHa')
print(match2.group())

HaHaHaHaHa
HaHaHa


In [57]:
import re
name_pattern = re.compile(r'First Name: (.*) Last Name: (.*)')
name_match = name_pattern.search('First Name: Al Last Name: Sweigart')
print(name_match.group(1))
print(name_match.group(2))

Al
Sweigart


In [58]:
import re
lazy_pattern = re.compile(r'<.*?>')
match1 = lazy_pattern.search('<To serve man> for dinner.>')
print(match1.group())

greedy_re = re.compile(r'<.*>')
match2 = greedy_re.search('<To serve man> for dinner.>')
print(match2.group())

<To serve man>
<To serve man> for dinner.>


In [61]:
import re
no_newline_re = re.compile('.*')
no_newline_re.search('Serve the public trust.\nProtect the innocent. \nUphold the law.').group()
'Serve the public trust.'

'Serve the public trust.'

In [60]:
newline_re = re.compile('.*', re.DOTALL)
newline_re.search('Serve the public trust.\nProtect the innocent. \nUphold the law.').group()

'Serve the public trust.\nProtect the innocent. \nUphold the law.'

In [63]:
import re
begins_with_hello = re.compile(r'^Hello')
print(begins_with_hello.search('Hello, world!'))
print(begins_with_hello.search('He said "Hello."') == None)

<re.Match object; span=(0, 5), match='Hello'>
True


In [64]:
import re
ends_with_number = re.compile(r'\d$')
print(ends_with_number.search('Your number is 42'))
print(ends_with_number.search('Your number is forty two.') == None)

<re.Match object; span=(16, 17), match='2'>
True


In [65]:
import re
whole_string_is_num = re.compile(r'^\d+$')
print(whole_string_is_num.search('1234567890'))
print(whole_string_is_num.search('12345xyz67890') == None)

<re.Match object; span=(0, 10), match='1234567890'>
True


In [66]:
import re
pattern = re.compile(r'\bcat.*?\b')
pattern.findall('The cat found a catapult catalog in the catacombs.')

['cat', 'catapult', 'catalog', 'catacombs']

In [67]:
import re
pattern = re.compile(r'\Bcat\B')
print(pattern.findall('certificate'))  # Match
print(pattern.findall('catastrophe'))  # No match

['cat']
[]


Quantifier
* ?   {0, 1}
* \*   {0, }
* \+   {1, }

* The ? matches zero or one instance of the preceding qualifier.
* The * matches zero or more instances of the preceding qualifier.
* The + matches one or more instances of the preceding qualifier.
* The {n} matches exactly n instances of the preceding qualifier.
* The {n,} matches n or more instances of the preceding qualifier.
* The {,m} matches 0 to m instances of the preceding qualifier.
* The {n,m} matches at least n and at most m instances of the preceding qualifier.
* {n,m}? or *? or +? performs a non-greedy match of the preceding qualifier.
* ^spam means the string must begin with spam.
* spam$ means the string must end with spam.
* The . matches any character, except newline characters.
* The \d, \w, and \s match a digit, word, or space character, respectively.
* The \D, \W, and \S match anything except a digit, word, or space character, respectively. [abc] matches any character between the square brackets (such as a, b, or c).
* [^abc] matches any character that isn’t between the square brackets.
* (Hello) groups 'Hello' together as a single qualifier.

In [69]:
import re
pattern1 = re.compile('RoboCop')
pattern2 = re.compile('ROBOCOP')
pattern3 = re.compile('robOcop')
pattern4 = re.compile('RobocOp')

pattern = re.compile(r'robocop', re.I)
print(pattern.search('RoboCop is part man, part machine, all cop.').group())
print(pattern.search('ROBOCOP protects the innocent.').group())
print(pattern.search('Have you seen robocop?').group())


RoboCop
ROBOCOP
robocop


In [71]:
import re
agent_pattern = re.compile(r'Agent \w+')
agent_pattern.sub('CENSORED', 'Agent Alice contacted Agent Bob.')

'CENSORED contacted CENSORED.'

In [72]:
import re
agent_pattern = re.compile(r'Agent (\w)\w*')
agent_pattern.sub(r'\1****', 'Agent Alice contacted Agent Bob.')

'A**** contacted B****.'

Verbose mode

In [73]:
pattern = re.compile(r'((\d{3}|\(\d{3}\))?(\s|-|\.)?\d{3}(\s|-|\.)\d{4}(\s*(ext|x|ext\.)\s*\d{2,5})?)')

In [70]:
pattern = re.compile(r'''(
    (\d{3}|\(\d{3}\))?  # Area code
    (\s|-|\.)?  # Separator
    \d{3}  # First three digits
    (\s|-|\.)  # Separator
    \d{4}  # Last four digits
    (\s*(ext|x|ext\.)\s*\d{2,5})?  # Extension
    )''', re.VERBOSE)


In [74]:
some_regex = re.compile('foo', re.IGNORECASE | re.DOTALL | re.VERBOSE)

Extract contact info from large documents

In [76]:
import pyperclip, re

pyperclip.copy('''
    To: johnbee@mail.com
    Dear john bee, 
    Here's my phone n°: 456-123-6548
    Have a nice day!
''')

# phone regex
phone_numbers_pattern = re.compile(r'''(
    (\d{3}|\(\d{3}\))?              # area code
    (\s|-|\.)?                      # first separator
    (\d{3})                         # first three digits
    (\s|-|\.)                       # second separator  
    (\d{4})                         # last four digits   
    (\s*(ext|x|ext\.)\s*(\d{2,5}))? # extension                              
)''', re.VERBOSE)

# mail regex
mails_pattern = re.compile(r'''(
    [a-zA-Z0-9._%+-1]+   # letters or numbers (username)
    @                   # @ symbol
    [a-zA-Z0-9.-]+      # domain name
    (\.[a-zA-Z]{2,})  # dot smth                     
)''', re.VERBOSE)

# find matches in clipboard text
text = str(pyperclip.paste())

matches = []

for groups in phone_numbers_pattern.findall(text):
    phone_num = '-'.join([groups[1], groups[3], groups[5]])
    if groups[6] != '':
        phone_num += ' x' + groups[6]
    matches.append(phone_num)

for groups in mails_pattern.findall(text):
    matches.append(groups[0])

# copy results to clipboard ->join matches into a string
if len(matches) > 0:
    pyperclip.copy('\n'.join(matches))
    print('Copied to clipboard')
    print('\n'.join(matches))
else:
    print('No phone numbers or email adresses detected')

Copied to clipboard
456-123-6548
johnbee@mail.com


### HUMRE

HUMRE         | REGEX
--------------|-----  
DIGIT         | \d  
NONDIGIT      | \D    
WORD          | \w  
NONWORD       | \W  
WHITESPACE    | \s  
NONWHITESPACE | \S  




[Full link](https://automatetheboringstuff.com/3e/chapter9.html)

In [78]:
from humre import *
phone_regex = exactly(3, DIGIT) + '-' + exactly(3, DIGIT) + '-' + exactly(4, DIGIT)
phone_regex

'\\d{3}-\\d{3}-\\d{4}'

In [79]:
import re
pattern = re.compile(phone_regex)
pattern.search('My number is 415-555-4242')

<re.Match object; span=(13, 25), match='415-555-4242'>

In [81]:
import re
from humre import *
phone_regex = group(
    optional_group(either(exactly(3, DIGIT),            # Area code
                          OPEN_PAREN + exactly(3, DIGIT) + CLOSE_PAREN)),
    optional(group_either(WHITESPACE, '-', PERIOD)),    # Separator
    group(exactly(3, DIGIT)),                           # First three digits
    group_either(WHITESPACE, '-', PERIOD),              # Separator
    group(exactly(4, DIGIT)),                           # Last four digits
    optional_group(                                     # Extension
      zero_or_more(WHITESPACE),
      group_either('ext', 'x', r'ext\.'),
      zero_or_more(WHITESPACE),
      group(between(2, 5, DIGIT))
      )
    )

pattern = re.compile(phone_regex)
match = pattern.search('My number is 415-555-1212.')
print(match.group())

415-555-1212


## Practice Questions

### Strong Password Detector

In [84]:
import re

'''
# Non-regex code
def validate_password(password):
    strong_password = True
    
    # check length
    if len(password) < 8:
        strong_password = False
    
    # lowercase, uppercase and number check 
    lowercases = 0
    uppercases = 0
    numbers = 0
    for c in password:
        if c.islower():
            lowercases += 1
        elif c.isupper():
            uppercases += 1
        elif c.isdigit():
            numbers += 1
    if lowercases == 0:
        strong_password = False
    if uppercases == 0:
        strong_password = False
    if numbers == 0:
        strong_password = False
    return strong_password
'''
def validate_password(password):
    # check length
    if len(password) < 8:
        return False
    
    # check uppercase
    if not re.search(r'[A-Z]', password):
        return False
    
    # check lowercase
    if not re.search(r'[a-z]', password):
        return False
    
    # check numbers
    if not re.search(r'\d', password):
        return False
    
    return True
    
my_password = 'MyBeautifulPassword2'
print(validate_password(my_password))

True


### Regex version of the strip() method

In [85]:
def strip_copy(s, chars=None):
    if chars is None:
        return re.sub(r'^\s+|\s+$', '', s)
    else:
        pattern = f'^[{re.escape(chars)}]+|[{re.escape(chars)}]+$'
        return re.sub(pattern, '', s)
sentence = '---so what?---'
print(strip_copy(sentence, '-'))
print('function printed')


so what?
function printed
