In [1]:
import re

In [2]:
# finding phone numbers using Regex
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d')
mo = phoneNumRegex.search('My number is 415-555-4242.')
print('Phone number found:' + mo.group())

Phone number found:415-555-424


In [3]:
#grouping with parentheses
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d)')
mo = phoneNumRegex.search('My number is 415-555-4242.')

print(mo.group(1))
print(mo.group(2))
print(mo.group(0))
print(mo.group())
print(mo.groups())  # all groups 

areaCode, mainNumber = mo.groups()
print(areaCode)
print(mainNumber)

415
555-424
415-555-424
415-555-424
('415', '555-424')
415
555-424


In [6]:
# Matching Multiple Groups with the Pipe
heroRegex = re.compile(r'Batman|Tina Fey')
mo1 = heroRegex.search('Batman and Tina Fey.')
mo2 = heroRegex.search('Tina Fey and Batman.')
mo1.group(), mo2.group()

('Batman', 'Tina Fey')

In [7]:
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo =  batRegex.search('Batmobile lost a wheel')
mo.group(), mo.group(1)

('Batmobile', 'mobile')

In [8]:
# optional matching with the question Mark
batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search('The Adventures of Batman')
print(mo1.group())

mo2 = batRegex.search('The Adventures of Batwoman')
print(mo2.group())

Batman
Batwoman


In [12]:
phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d')
mo1 = phoneRegex.search('My number is 415-415-555-4242')
print(mo1.group())

mo2 = phoneRegex.search('My number is 555-4242')
mo2.group()

415-415-555


'555-424'

In [13]:
# matching zero or More with the star 
batRegex = re.compile(r'Bat(wo)*man')
mo1 = batRegex.search('The Adventures of Batman')
print(mo1.group())

mo2 = batRegex.search("The Adventures Of Batman")
print(mo2.group())

mo3 = batRegex.search('The Adventures of Batwowowowoman')
print(mo3.group())

Batman
Batman
Batwowowowoman


In [17]:
# matching one or more with the Plus
batRegex = re.compile(r'Bat(wo)+man')
mo1 = batRegex.search('The Adventures of Batwoman')
print(mo1.group())

mo2 = batRegex.search('The Adventures of Batwowowowoman')
print(mo2.group())

mo3 = batRegex.search('The Adventures of Batman')
mo3 == None

Batwoman
Batwowowowoman


True

In [19]:
# matching specific repetitions with curly brackets 
haRegex = re.compile(r'(Ha){3}')
mo1 = haRegex.search('HaHaHa')
print(mo1.group())

mo2 = haRegex.search('Ha')
mo2 == None

HaHaHa


True

In [25]:
# greedy and non-greedy matching  
greedyHaRegex = re.compile(r'(Ha){3,5}') # greedy - don't leave a gap in between 3 and 5
mo1 = greedyHaRegex.search('HaHaHaHaHa')
print(mo1.group())

nongreedyHaRegex = re.compile(r'(Ha){3,5}?') # nongreedy
mo2 = nongreedyHaRegex.search('HaHaHaHaHa')
print(mo2.group())

HaHaHaHaHa
HaHaHa


In [26]:
# The Findall() Method 
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo = phoneNumRegex.search('Cell: 415-555-9999 Work: 212-555-0000')
mo.group()

'415-555-9999'

In [27]:
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # has no groups
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

['415-555-9999', '212-555-0000']

### Character Classe
- \d: Any numeric digit from 0 to 9.
- \D:  Any character that is not a numeric digit from 0 to 9.
- \w:  Any letter, numeric digit, or the underscore character.
(Think of this as matching “word” characters.)
- \W:  Any character that is not a letter, numeric digit, or the
underscore character.
- \s:  Any space, tab, or newline character. (Think of this as
matching “space” characters.)
- \S: Any character that is not a space, tab, or newline

In [28]:
xmasRegex = re.compile(r'\d+\s\w+')
xmasRegex.findall('12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge')

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '7 swans',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

In [29]:
# Making Your Own Character Classes
vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('RoboCop eats baby food. BABY FOOD.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']

In [30]:
consonantRegex = re.compile(r'[^aeiouAEIOU]')
consonantRegex.findall('RoboCop eats baby food. BABY FOOD.')

['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

In [32]:
# the caret and dollar sign character
beginsWithHello = re.compile(r'^Hello')
beginsWithHello.search('Hello world!')


<re.Match object; span=(0, 5), match='Hello'>

In [33]:
beginsWithHello.search('He said hello.') == None

True

In [34]:
endsWithNumber = re.compile(r'\d$')
endsWithNumber.search('Your number is 42')

<re.Match object; span=(16, 17), match='2'>

In [35]:
endsWithNumber.search('Your number is forty two.') == None

True

In [36]:
wholeStringIsNum = re.compile(r'^\d+$') # entire string should be regex 
wholeStringIsNum.search('1234567890')

<re.Match object; span=(0, 10), match='1234567890'>

In [38]:
wholeStringIsNum.search('12345xyz67890') == None

True

In [39]:
wholeStringIsNum.search('12 34567890') == None

True

In [41]:
# The Wildcard Character (.)
atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')  

#the dot character will match just one character, which
#is why the match for the text flat matched only lat

['cat', 'hat', 'sat', 'lat', 'mat']

In [43]:
# Matching everything with Dot-Star
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: A1 Last Name: Sweigart')
mo.group(1), mo.group(2)

('A1', 'Sweigart')

In [44]:
nongreedyRegex = re.compile(r'<.*?>')
mo = nongreedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man>'

In [45]:
greedyRegex = re.compile(r'<.*>')
mo = greedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man> for dinner.>'

In [46]:
# Matching Nwlines with the Dot Character 
noNewlineRegex = re.compile('.*')
noNewlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.'

In [48]:
noNewlineRegex = re.compile('.*', re.DOTALL)
noNewlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.\nProtect the innocent.\nUphold the law.'

### Review Of The Regex Symbols 
- The ? matches zero or one of the preceding group.
-	The * matches zero or more of the preceding group.
- The + matches one or more of the preceding group.
- The {n} matches exactly n of the preceding group.
- The {n,} matches n or more of the preceding group.
- The {,m} matches 0 to m of the preceding group.
- The {n,m} matches at least n and at most m of the preceding group.
- {n,m}? or *? or +? performs a nongreedy match of the preceding group.
- ^spam means the string must begin with spam.
- spam$ means the string must end with spam.
- The . matches any character, except newline characters.
- \d, \w, and \s match a digit, word, or space character, respectively.
- \D, \W, and \S match anything except a digit, word, or space character,
respectively.
- [abc] matches any character between the brackets (such as a, b, or c).
- [^abc] matches any character that isn’t between the bracket

In [49]:
# Case-Insentitive Matching 
robocop = re.compile(r'robocop', re.I)
robocop.search('RoboCop is part man, part machine, all cop.').group()

'RoboCop'

In [50]:
# Substituting Strings with the sub() Method 
namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED',  'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

In [51]:
agentNamesRegex = re.compile(r'Agent(\w)\w')
agentNamesRegex.sub(r'\1****', 'Agent Alice told Agent Carol that AgentEve knew Agent Bob was a double agent.')

'Agent Alice told Agent Carol that E****e knew Agent Bob was a double agent.'