In [1]:
import re

In [2]:
# Each pair of parentheses in the pattern creates a numbered group:
# group 1 is the area code, group 2 is the rest of the number.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My number is 415-555-4242.')

# group() and group(0) both give the whole match; group(n) gives the
# text captured by the n-th pair of parentheses.
print(mo.group(), mo.group(1), mo.group(2), mo.group(0), sep='\n')

# groups() returns all captured groups at once, as a tuple.
print(mo.groups())

415-555-4242
415
555-4242
415-555-4242
('415', '555-4242')


In [3]:
# groups() returns a tuple, so it can be unpacked in a single assignment.
areaCode, mainNumber = mo.groups()
print(areaCode, mainNumber, sep='\n')

415
555-4242


Characters with a special meaning in regular expressions (put a backslash in front of each one to match it literally):<br>
\. \^ \$ \* \+ \? \{ \} \[ \] \\ \| \( \)

In [4]:
# The pipe (|) matches either alternative. When both occur in the text,
# search() returns whichever one appears first.
heroRegex = re.compile(r'Batman|Tina Fey')
mo1 = heroRegex.search('Batman and Tina Fey')
mo2 = heroRegex.search('Tina Fey and Batman')

print(mo1.group())
print(mo2.group())

Batman
Tina Fey


In [5]:
# Putting the alternatives inside parentheses means the shared 'Bat'
# prefix only has to be written once.
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')

# group() is the full matched text; group(1) is only the part that was
# matched inside the parentheses.
print(mo.group(), mo.group(1), sep='\n')

Batmobile
mobile


In [6]:
# The ? after the group makes (wo) optional: it may appear zero or one time.
batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search('The Adventures of Batman')
mo2 = batRegex.search('The Adventures of Batwoman')

print(mo1.group())
print(mo2.group())

Batman
Batwoman


In [7]:
# The area code group (three digits plus a hyphen) is followed by ?, so
# numbers both with and without an area code are matched.
phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
mo1 = phoneRegex.search('My number is 415-555-4242')
mo2 = phoneRegex.search('My number is 555-4242')

print(mo1.group(), mo2.group(), sep='\n')

415-555-4242
555-4242


In [8]:
# The * (star, or asterisk) means "match zero or more": the group in
# front of it may be absent or repeated any number of times.
batRegex = re.compile(r'Bat(wo)*man')

mo1 = batRegex.search('The Adventures of Batman')            # zero repetitions
mo2 = batRegex.search('The Adventures of Batwoman')          # one repetition
mo3 = batRegex.search('The Adventures of Batwowowowoman')    # four repetitions

print(mo1.group(), mo2.group(), mo3.group(), sep='\n')


Batman
Batwoman
Batwowowowoman


In [9]:
# The + means "match one or more": unlike *, the preceding group must
# appear at least once, so plain 'Batman' is not matched.
batRegex = re.compile(r'Bat(wo)+man')
mo1 = batRegex.search('The Adventures of Batwoman')
print(mo1.group())

mo2 = batRegex.search('The Adventures of Batwowowowoman')
print(mo2.group())

# search() returns None when nothing matches; `is None` is the idiomatic
# way to test for that.
mo3 = batRegex.search('The Adventures of Batman')
print(mo3 is None)


Batwoman
Batwowowowoman
True


In [10]:
# (Ha){3,5} matches three, four, or five consecutive repetitions of 'Ha'.
# It is shorthand for this much longer alternation:
# ((Ha)(Ha)(Ha))|((Ha)(Ha)(Ha)(Ha))|((Ha)(Ha)(Ha)(Ha)(Ha))

In [11]:
# {3} requires exactly three repetitions of the preceding group.
haRegex = re.compile(r'(Ha){3}')
mo1 = haRegex.search('HaHaHa')
print(mo1.group())

# A single 'Ha' is too short, so search() returns None; `is None` is the
# idiomatic way to test for that.
mo2 = haRegex.search('Ha')
print(mo2 is None)

HaHaHa
True


In [12]:
# Both patterns accept three to five 'Ha's. The trailing ? switches the
# repetition from greedy (longest possible match) to non-greedy
# (shortest possible match).
greedyHaRegex = re.compile(r'(Ha){3,5}')
nongreedyHaRegex = re.compile(r'(Ha){3,5}?')

mo1 = greedyHaRegex.search('HaHaHaHaHa')
mo2 = nongreedyHaRegex.search('HaHaHaHaHa')

print(mo1.group())
print(mo2.group())


HaHaHaHaHa
HaHaHa


In [13]:
# This pattern has no groups, so findall() returns a list of strings,
# one for each match found in the text.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

['415-555-9999', '212-555-0000']

In [14]:
# This pattern has groups, so findall() returns a list of tuples: one
# tuple per match, holding one string per group.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)')
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

[('415', '555', '9999'), ('212', '555', '0000')]

\d Any numeric digit from 0 to 9.<br>
\D Any character that is not a numeric digit from 0 to 9.<br>
\w Any letter, numeric digit, or the underscore character.<br>
(Think of this as matching “word” characters.)<br>
\W Any character that is not a letter, numeric digit, or the<br>
underscore character.<br>
\s Any space, tab, or newline character. (Think of this as<br>
matching “space” characters.)<br>
\S Any character that is not a space, tab, or newline.

In [16]:
# One or more digits, a single whitespace character, then one or more
# word characters. '7swans' has no whitespace after the digit, so it is
# not matched.
xmasRegex = re.compile(r'\d+\s\w+')
xmasRegex.findall(
    '12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7swans, '
    '6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge'
)

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

In [17]:
# A character class in square brackets matches any one of the listed
# characters -- here, any lowercase or uppercase vowel.
vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('RoboCop eats baby food. BABY FOOD.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']

Note that inside the square brackets, the normal regular expression <br>symbols are not interpreted as such. This means you do not need <br>to escape the ., *, ?, or () characters with a preceding <br>backslash. For example, the character class [0-5.] will match <br>digits 0 to 5 and a period. You do not need to write it as [0-5\.].

In [19]:
# Placing a caret (^) right after the opening bracket makes a negative
# character class: it matches every character that is NOT listed.
# Note that this includes spaces and punctuation, not just consonants.
consonantRegex = re.compile(r'[^aeiouAEIOU]')
consonantRegex.findall('RoboCop eats baby food. BABY FOOD.')

['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

<h1>“Carrots cost dollars”</h1>

In [22]:
# A caret (^) at the start of the pattern means the match must begin at
# the start of the searched text.
beginsWithHello = re.compile(r'^Hello')
beginsWithHello.search('Hello, world!')

<re.Match object; span=(0, 5), match='Hello'>

In [23]:
# The text does not start with 'Hello', so search() returns None;
# `is None` is the idiomatic way to test for that.
beginsWithHello.search('He said hello.') is None

True

In [24]:
# A dollar sign ($) at the end of the pattern means the text must end
# with the pattern -- here, with a single digit.
endsWithNumber = re.compile(r'\d$')
endsWithNumber.search('Your number is 42')

<re.Match object; span=(16, 17), match='2'>

In [25]:
# The text ends with a period rather than a digit, so search() returns
# None; `is None` is the idiomatic way to test for that.
endsWithNumber.search('Your number is forty two.') is None

True

In [26]:
# With both anchors, r'^\d+$' requires everything between the start and
# the end of the string to be digits (one or more of them). A string
# that merely begins and ends with digits is not enough.
wholeStringIsNum = re.compile(r'^\d+$')
wholeStringIsNum.search('1234567890')

<re.Match object; span=(0, 10), match='1234567890'>

In [27]:
# The letters in the middle break the run of digits, so search() returns
# None; `is None` is the idiomatic way to test for that.
wholeStringIsNum.search('12345xyz67890') is None

True

In [28]:
# The space breaks the run of digits, so search() returns None;
# `is None` is the idiomatic way to test for that.
wholeStringIsNum.search('12 34567890') is None

True

In [29]:
# Wildcard: the dot matches any single character except a newline.
# It stands for exactly one character, which is why 'flat' yields 'lat'.
atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

In [33]:
# Dot-star (.*) matches "anything" and is greedy: each group grabs as
# much text as it can while still letting the rest of the pattern match.
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Al Last Name: Sweigart')

print(mo.group(1), mo.group(2), sep='\n')

Al
Sweigart


In [35]:
# To match any and all text in a non-greedy fashion, use the dot,
# star, and question mark (.*?). The match then stops at the first
# closing angle bracket instead of the last one.

nongreedyRegex = re.compile(r'<.*?>')
mo = nongreedyRegex.search('<To serve man> for dinner.>')
print(mo.group())

<To serve man>


In [36]:
# Without the trailing ?, dot-star is greedy: the match runs on to the
# last closing angle bracket in the text.
greedyRegex = re.compile(r'<.*>')
mo = greedyRegex.search('<To serve man> for dinner.>')
print(mo.group())

<To serve man> for dinner.>


In [38]:
# Without re.DOTALL the dot does not match newline characters, so .*
# stops at the end of the first line.
noNewlineRegex = re.compile('.*')
noNewlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.'

In [39]:
# Passing re.DOTALL as the second argument to re.compile() makes the dot
# match every character, including newlines, so .* spans all three lines.
newlineRegex = re.compile('.*', re.DOTALL)
newlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.\nProtect the innocent.\nUphold the law.'

• The ? matches zero or one of the preceding group.<br>
• The * matches zero or more of the preceding group.<br>
• The + matches one or more of the preceding group.<br>
• The {n} matches exactly n of the preceding group.<br>
• The {n,} matches n or more of the preceding group.<br>
• The {,m} matches 0 to m of the preceding group.<br>
• The {n,m} matches at least n and at most m of the preceding group.<br>
• {n,m}? or *? or +? performs a non-greedy match of the preceding <br>group.<br>
• ^spam means the string must begin with spam.<br>
• spam$ means the string must end with spam.<br>
• The . matches any character, except newline characters.<br>
• \d, \w, and \s match a digit, word, or space character, <br>respectively.<br>
• \D, \W, and \S match anything except a digit, word, or space <br>character,<br>
respectively.<br>
• [abc] matches any character between the brackets (such as a, b, <br>or c).<br>
• [^abc] matches any character that isn’t between the brackets.

In [41]:
# Normally, regular expressions match text with the exact casing you
# specify. Passing re.I (short for re.IGNORECASE) as the second argument
# to re.compile() makes the match case-insensitive.
robocop = re.compile(r'robocop', re.I)
robocop.search('RoboCop is part man, part machine, all cop.').group()

'RoboCop'

In [42]:
# The case-insensitive pattern also matches an all-uppercase spelling.
robocop.search('ROBOCOP protects the innocent.').group()

'ROBOCOP'

In [43]:
# ...and an all-lowercase spelling in the middle of a sentence.
robocop.search('Al, why does your programming book talk about robocop so much?').group()

'robocop'

The sub() method for Regex objects is passed two arguments. <br>The first argument is the string that replaces each match. The second<br> is the string to search, in which every match of the regular expression is replaced. The sub() method <br>returns a new string with the substitutions applied.

In [44]:
# sub() replaces every match of the pattern ('Agent ' followed by a
# word) with the replacement string given as the first argument.
namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

In [48]:
# Group 1 captures only the first letter of the agent's name. In the
# replacement string, \1 is substituted with the text of group 1, so
# each name is reduced to its initial followed by asterisks.
agentNamesRegex = re.compile(r'Agent (\w)\w*')
agentNamesRegex.sub(r'\1****', 'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.')

'A**** told C**** that E**** knew B**** was a double agent.'

<h1>Managing Complex Regex</h1>

In [46]:
# Full phone-number pattern on a single line: optional area code,
# optional separator, three digits, separator, four digits, and an
# optional extension. The dot in 'ext.' is escaped so that it matches
# only a literal period rather than any character.
phoneRegex = re.compile(r'((\d{3}|\(\d{3}\))?(\s|-|\.)?\d{3}(\s|-|\.)\d{4}(\s*(ext|x|ext\.)\s*\d{2,5})?)')

In [51]:
# re.VERBOSE makes the regex engine ignore whitespace and #-comments
# inside the pattern, so each part can sit on its own annotated line.
# Any other text inside the pattern is still treated as part of the
# regex, so only pattern pieces and #-comments may appear here.
phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))? # area code
    (\s|-|\.)? # separator
    \d{3} # first 3 digits
    (\s|-|\.) # separator
    \d{4} # last 4 digits
    (\s*(ext|x|ext\.)\s*\d{2,5})? # extension
    )''', re.VERBOSE)

# Note how the example above uses the triple-quote syntax (''') to
# create a multiline string so that you can spread the regular
# expression definition over many lines, making it much more legible.

If you want a regular expression that is case-insensitive and whose <br>
dot character also matches newlines, combine the flags with the bitwise <br>
OR operator (`|`) in your re.compile() call like this:<br>

In [52]:
# Two flags combined with the bitwise OR operator: case-insensitive
# matching plus a dot that also matches newlines.
someRegexValue = re.compile('foo', re.IGNORECASE | re.DOTALL)

In [53]:
# The same technique extends to three flags: re.VERBOSE is OR-ed in as well.
someRegexValue = re.compile('foo', re.IGNORECASE | re.DOTALL | re.VERBOSE)