 # PATTERN MATCHING WITH REGULAR EXPRESSIONS



## chapter 7

In [1]:
import re

In [2]:
# Compile a pattern for a number of the form ddd-ddd-dddd: three digits,
# a hyphen, three digits, a hyphen, then four digits.
phoneNumRegex = re.compile(r'\d{3}-\d{3}-\d{4}')

where the 'r' prefix marks the string as 'raw' (so backslashes are not treated as escape characters) and the curly brackets indicate how many times the preceding pattern is repeated

In [4]:
# search() returns a Match object for the first occurrence of the pattern
# in the string, or None when the pattern is not found.
mo = phoneNumRegex.search('My number is 415-555-4242.')

In [5]:
# group() with no argument gives the full text of the match.
print(f'Phone number found: {mo.group()}')

Phone number found: 415-555-4242


useful site: https://www.regexpal.com/

In [6]:
# Parentheses create groups: group 1 captures the area code and group 2
# captures the rest of the number.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My number is 415-555-4242.')
# group(1) returns only the text matched by the first pair of parentheses.
mo.group(1)



'415'

In [8]:
# group(2) returns the text matched by the second pair of parentheses.
mo.group(2)

'555-4242'

In [9]:
# group(0) returns the entire matched text.
mo.group(0)

'415-555-4242'

In [10]:
# Calling group() with no argument is equivalent to group(0).
mo.group()

'415-555-4242'

In [11]:
# groups() returns a tuple holding the text of every group at once, so it
# can be unpacked into one variable per group.
print(mo.groups())
areaCode, mainNumber = mo.groups()
print(areaCode)
print(mainNumber)

('415', '555-4242')
415
555-4242


# Pipes

In [15]:
# The pipe (|) matches either alternative. When both occur in the text,
# search() returns whichever one appears first.
heroRegex = re.compile(r'Batman|Tina Fey')

mo1 = heroRegex.search('Batman and Tina Fey.')
mo2 = heroRegex.search('Tina Fey and Batman.')
print(mo1.group(), mo2.group(), sep='\n')

Batman
Tina Fey


In [16]:
# Alternatives inside the parentheses all share the common 'Bat' prefix.
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')

# Show the full match first, then only the part captured by the group.
print(mo.group(0), mo.group(1), sep='\n')

Batmobile
mobile


# Optional

In [17]:
# The ? makes the preceding group optional: 'wo' may appear once or not
# at all.
batRegex = re.compile(r'Bat(wo)?man')

mo1 = batRegex.search('The Adventures of Batman')
mo2 = batRegex.search('The Adventures of Batwoman')
print(mo1.group(), mo2.group(), sep='\n')

Batman
Batwoman


# Zero or more 

In [18]:
# The * lets the preceding group repeat any number of times, including
# zero.
batRegex = re.compile(r'Bat(wo)*man')

mo1 = batRegex.search('The Adventures of Batman')
mo2 = batRegex.search('The Adventures of Batwoman')
mo3 = batRegex.search('The Adventures of Batwowowowoman')
print(mo1.group(), mo2.group(), mo3.group(), sep='\n')

Batman
Batwoman
Batwowowowoman


# One or more

In [19]:
# The + requires the preceding group to appear at least once.
batRegex = re.compile(r'Bat(wo)+man')

mo1 = batRegex.search('The Adventures of Batwoman')
mo2 = batRegex.search('The Adventures of Batwowowowoman')
print(mo1.group(), mo2.group(), sep='\n')

Batwoman
Batwowowowoman


# Greedy

In [20]:
# {3,5} is greedy and takes as many repetitions as it can; adding a
# trailing ? makes it non-greedy so it stops at the minimum.
greedyHaRegex = re.compile(r'(Ha){3,5}')
nongreedyHaRegex = re.compile(r'(Ha){3,5}?')

mo1 = greedyHaRegex.search('HaHaHaHaHa')
mo2 = nongreedyHaRegex.search('HaHaHaHaHa')
print(mo1.group(), mo2.group(), sep='\n')

HaHaHaHaHa
HaHaHa


# findall()

In [21]:
# Unlike search(), findall() returns every match. When the pattern has no
# groups, the result is a list of the matched strings.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # has no groups
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')


['415-555-9999', '212-555-0000']

In [22]:
# When the pattern has groups, findall() returns a list of tuples, with
# one string per group in each tuple.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') # has groups
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

[('415', '555', '9999'), ('212', '555', '0000')]

# Custom character classes

In [24]:
# Square brackets define a character class: this one matches any single
# vowel, lowercase or uppercase.
vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('RoboCop eats baby food. BABY FOOD.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']

In [31]:
# A caret right after the opening bracket negates the class. Note that
# this matches every character that is not a vowel, so spaces, digits and
# punctuation would match too, not only consonants.
consonantRegex = re.compile(r'[^aeiouAEIOU]')
consonantRegex.findall('RoboCop')

['R', 'b', 'C', 'p']

# Beginning and end

In [33]:
# A caret at the start of the pattern anchors the match to the beginning
# of the searched string.
beginsWithHello = re.compile(r'^Hello')
print(beginsWithHello.search('Hello world!'))
# search() returns None when nothing matches; check for that with `is`,
# the identity comparison recommended for None.
beginsWithHello.search('He said hello.') is None

<re.Match object; span=(0, 5), match='Hello'>


True

In [36]:
# A dollar sign at the end of the pattern anchors the match to the end of
# the searched string.
endsWithNumber = re.compile(r'\d$')
print(
    endsWithNumber.search('Your number is 42'),
    endsWithNumber.search('Your number is forty two.'),
    sep='\n',
)

<re.Match object; span=(16, 17), match='2'>
None


In [37]:
# Using ^ and $ together requires the match to span the string from
# beginning to end: here, one or more digits and nothing else.
wholeStringIsNum = re.compile(r'^\d+$')
print(wholeStringIsNum.search('1234567890'))
# A failed search returns None; compare with `is None` (identity) rather
# than `== None`.
print(wholeStringIsNum.search('12345xyz67890') is None)
print(wholeStringIsNum.search('12 34567890') is None)

<re.Match object; span=(0, 10), match='1234567890'>
True
True


# Wildcard

In [38]:
# The dot matches exactly one character, which is why 'flat' only yields
# 'lat' in the result.
atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

In [39]:
# .* matches any run of characters (except newlines), so each group
# captures whatever text follows its label.
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Al Last Name: Sweigart')

# Print each captured group on its own line.
print(*mo.groups(), sep='\n')

Al
Sweigart


In [40]:
# .*? stops at the first closing bracket it can; .* runs on to the last.
nongreedyRegex = re.compile(r'<.*?>')
greedyRegex = re.compile(r'<.*>')

# Non-greedy result first, then greedy; `mo` ends up bound to the greedy
# match.
mo = nongreedyRegex.search('<To serve man> for dinner.>')
print(mo.group(0))
mo = greedyRegex.search('<To serve man> for dinner.>')
print(mo.group(0))

<To serve man>
<To serve man> for dinner.>


# Review

The ? matches zero or one of the preceding group.

The * matches zero or more of the preceding group.

The + matches one or more of the preceding group.

The {n} matches exactly n of the preceding group.

The {n,} matches n or more of the preceding group.

The {,m} matches 0 to m of the preceding group.

The {n,m} matches at least n and at most m of the preceding group.

{n,m}? or *? or +? performs a nongreedy match of the preceding group.

^spam means the string must begin with spam.

spam$ means the string must end with spam.

The . matches any character, except newline characters.

\d , \w , and \s match a digit, word, or space character, respectively.

\D , \W , and \S match anything except a digit, word, or space character,
respectively.

[abc] matches any character between the brackets (such as a, b, or c).

[^abc] matches any character that isn’t between the brackets.

# case insensitive

In [42]:
# re.IGNORECASE (short form: re.I) makes the pattern match regardless of
# letter case.
robocop = re.compile(r'robocop', re.IGNORECASE)
print('\n'.join(
    robocop.search(sentence).group()
    for sentence in (
        'RoboCop is part man, part machine, all cop.',
        'ROBOCOP protects the innocent.',
        'Al, why does your programming book talk about robocop so much?',
    )
))

RoboCop
ROBOCOP
robocop


# sub

In [43]:
# sub() returns a new string in which every match of the pattern has been
# replaced by the first argument.
namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

In [45]:
# Group 1 captures only the first letter of the name. In the replacement
# string, \1 is substituted with that captured text, so each name is
# reduced to its initial followed by asterisks.
agentNamesRegex = re.compile(r'Agent (\w)\w*')
agentNamesRegex.sub(r'\1****', 'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.')

'A**** told C**** that E**** knew B**** was a double agent.'

# Shorthands

\d       Any numeric digit from 0 to 9.

\D       Any character that is not a numeric digit from 0 to 9.

\w       Any letter, numeric digit, or the underscore character.
(Think of this as matching “word” characters.)

\W       Any character that is not a letter, numeric digit, or the
underscore character.

\s       Any space, tab, or newline character. (Think of this as
matching “space” characters.)

\S       Any character that is not a space, tab, or newline.