# Chapter 7 – Pattern Matching with Regular Expressions
https://automatetheboringstuff.com/chapter7/

In [4]:
def is_phone_number(text):
    """Return True if *text* is exactly a phone number shaped ``ddd-ddd-dddd``.

    The string must be 12 characters long, with a hyphen at index 3 and
    index 7 and a decimal digit (per ``str.isdecimal``) everywhere else.

    Args:
        text: The candidate string to check.

    Returns:
        True when every position matches the pattern, False otherwise.
    """
    if len(text) != 12:
        return False

    for index, char in enumerate(text):
        if index in (3, 7):
            # Separator positions must hold a hyphen.
            if char != '-':
                return False
        elif not char.isdecimal():
            # Every other position, including the middle group at
            # indexes 4-6, must be a decimal digit.
            return False

    return True

# Spot-check the validator on one well-formed and one malformed input.
print('415-555-4242 is a phone number:')
print(is_phone_number('415-555-4242'))

print('Moshi moshi is a phone number:')
print(is_phone_number('Moshi moshi'))

# Scan the message: every 12-character slice is a candidate phone number.
message = 'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'
candidates = (message[start:start + 12] for start in range(len(message)))
for chunk in candidates:
    if not is_phone_number(chunk):
        continue
    print('Phone number found: ' + chunk)
print('Done')

415-555-4242 is a phone number:
True
Moshi moshi is a phone number:
False
Phone number found: 415-555-1011
Phone number found: 415-555-9999
Done


In [6]:
message = 'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'
# Build every 12-character slice of the message, then keep only the slices
# that the validator accepts.
chunks = [message[pos:pos + 12] for pos in range(len(message))]
for chunk in filter(is_phone_number, chunks):
    print('Phone number found: ' + chunk)
print('Done')

Phone number found: 415-555-1011
Phone number found: 415-555-9999
Done


In [7]:
import re

# Compile the pattern into a Regex object. The raw string keeps the
# backslashes literal; each \d matches one digit, giving ddd-ddd-dddd.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
# search() returns a Match object for the first match in the string.
mo = phoneNumRegex.search('My number is 415-555-4242.')
# group() with no argument returns the full matched text.
print('Phone number found: ' + mo.group())

Phone number found: 415-555-4242


#### Review of Regular Expression Matching

While there are several steps to using regular expressions in Python, each step is fairly simple.

Import the regex module with import re.

Create a Regex object with the re.compile() function. (Remember to use a raw string.)

Pass the string you want to search into the Regex object’s search() method. This returns a Match object.

Call the Match object’s group() method to return a string of the actual matched text.

In [11]:
import re

# Parentheses create capture groups: group 1 is the area code and
# group 2 is the remainder of the number.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My number is 415-555-4242.')
# group(n) returns the text captured by the n-th group.
print(mo.group(1))
print(mo.group(2))
# group(0) and group() both return the entire match.
print(mo.group(0))
print(mo.group())

415
555-4242
415-555-4242
415-555-4242


In [12]:
# groups() returns a tuple holding the text of every capture group.
mo.groups()

('415', '555-4242')

In [13]:
# groups() returns a tuple, so it can be unpacked into one name per group.
areaCode, mainNumber = mo.groups()
print(areaCode)
print(mainNumber)

415
555-4242


In [16]:
# \( and \) match literal parentheses in the text; the unescaped ( ) still
# delimit capture groups, so group 1 includes the parentheses themselves.
phoneNumRegex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My phone number is (415) 555-4242.')
print("mo.group(1) >> " + mo.group(1))
print("mo.group(2) >> " + mo.group(2))

mo.group(1) >> (415)
mo.group(2) >> 555-4242


#### Matching Multiple Groups with the Pipe

In [18]:
# The pipe | matches either alternative: 'Batman' or 'Tina Fey'.
heroRegex = re.compile(r'Batman|Tina Fey')
# Both names occur in the text; search() returns the one that appears first.
mo1 = heroRegex.search('Batman and Tina Fey.')
mo1.group()

'Batman'

In [20]:
# Same pattern, but 'Tina Fey' now comes first in the text, so it is the match.
mo2 = heroRegex.search('Tina Fey and Batman.')
mo2.group()

'Tina Fey'

In [25]:
# The shared prefix 'Bat' is written once; the group lists the alternative
# suffixes separated by pipes.
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')
# group() is the whole match; group(1) is only the alternative that matched
# inside the parentheses.
print('mo.group() >> ' + mo.group())
print('mo.group(1) >> ' + mo.group(1))

mo.group() >> Batmobile
mo.group(1) >> mobile


#### Optional Matching with the Question Mark    (....)?
The ? character flags the group that precedes it as an optional part of the pattern.

We can think of the ? as saying, 
#### “Match zero or one of the group preceding this question mark.”

In [4]:
import re

# (wo)? makes the 'wo' group optional: zero or one occurrence.
batRegex = re.compile(r'Bat(wo)?man')
# Zero occurrences of 'wo': the regex matches 'Batman'.
mo1 = batRegex.search('The Adventures of Batman')
mo1.group()

'Batman'

In [6]:
# One occurrence of 'wo': the same regex also matches 'Batwoman'.
mo2 = batRegex.search('The Adventures of Batwoman')
mo2.group()

'Batwoman'

The (wo)? part of the regular expression means that the pattern wo is an optional group. The regex will match text that has zero instances or one instance of wo in it. This is why the regex matches both 'Batwoman' and 'Batman'.

In [11]:
# The area-code group (\d\d\d-) is optional, so the regex matches phone
# numbers with or without it.
phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
mo1 = phoneRegex.search('My number is 415-555-4242')
mo1.group()

'415-555-4242'

In [12]:
# No area code in the text: the optional group matches nothing and only
# the seven-digit number is returned.
mo2 = phoneRegex.search('My number is 555-4242')
mo2.group()

'555-4242'

#### Matching Zero or More with the Star
The * (called the star or asterisk) means “match zero or more”—the group that precedes the star can occur any number of times in the text. It can be completely absent or repeated over and over again.

In [22]:
# (wo)* lets the 'wo' group appear zero or more times.
batRegex = re.compile(r'Bat(wo)*man')
# Zero occurrences of 'wo'.
mo1 = batRegex.search('The Adventures of Batman')
mo1.group()

'Batman'

In [23]:
# One occurrence of 'wo'.
mo2 = batRegex.search('The Adventures of Batwoman')
mo2.group()

'Batwoman'

In [24]:
# Four occurrences of 'wo': the star accepts any number of repetitions.
mo3 = batRegex.search('The Adventures of Batwowowowoman')
mo3.group()

'Batwowowowoman'

#### Matching One or More with the Plus
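The + (plus) means “match one or more.” Unlike the star, the group preceding a plus is not optional: it must appear at least once in the text.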

In [25]:
# (wo)+ requires the 'wo' group to appear at least once.
batRegex = re.compile(r'Bat(wo)+man')
# One occurrence of 'wo'.
mo1 = batRegex.search('The Adventures of Batwoman')
mo1.group()

'Batwoman'

In [26]:
# Four occurrences of 'wo': the plus accepts one or more repetitions.
mo2 = batRegex.search('The Adventures of Batwowowowoman')
mo2.group()

'Batwowowowoman'

In [30]:
# With +, at least one 'wo' is required, so plain 'Batman' does not match
# and search() returns None.
mo3 = batRegex.search('The Adventures of Batman')
# Test for None by identity, the idiomatic form, rather than with ==.
mo3 is None

True

#### Matching Specific Repetitions with Curly Brackets
These two regular expressions match identical patterns:

(Ha){3}
(Ha)(Ha)(Ha)

In [2]:
import re

# (Ha){3} matches exactly three consecutive repetitions of 'Ha'.
haRegex = re.compile(r'(Ha){3}')
mo1 = haRegex.search('HaHaHa')
mo1.group()

'HaHaHa'

In [4]:
# A single 'Ha' is fewer than the three repetitions required, so search()
# returns None.
mo2 = haRegex.search('Ha')
# Test for None by identity, the idiomatic form, rather than with ==.
mo2 is None

True

In [5]:
# {3,5} matches three to five repetitions. Quantifiers are greedy by
# default, so the longest possible match (all five 'Ha') is returned.
greedyHaRegex = re.compile(r'(Ha){3,5}')
mo1 = greedyHaRegex.search('HaHaHaHaHa')
mo1.group()

'HaHaHaHaHa'

In [6]:
# A ? placed after the curly brackets makes the quantifier nongreedy: it
# returns the shortest match that satisfies it (three 'Ha' here).
nongreedyHaRegex = re.compile(r'(Ha){3,5}?')
mo1 = nongreedyHaRegex.search('HaHaHaHaHa')
mo1.group()

'HaHaHa'

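#### The Two Meanings of ? in Regular Expressions
1. After a quantifier such as `{3,5}`, it declares a nongreedy match.
2. After a group such as `(wo)`, it flags that group as optional.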
