# Finding Patterns of Text Without Regular Expressions

In [1]:
def isPhoneNumber(text):
    """Return True if text has exactly the form ###-###-####.

    The value must be 12 characters long, with a hyphen at index 3 and at
    index 7 and a decimal digit (as judged by str.isdecimal) at every other
    index. Anything else yields False.
    """
    if len(text) != 12:
        return False

    hyphen_positions = (3, 7)
    # Walk the characters left to right so the checks run in positional order:
    # area code, hyphen, prefix, hyphen, line number.
    for position, character in enumerate(text):
        if position in hyphen_positions:
            if character != '-':
                return False
        elif not character.isdecimal():
            return False

    return True

# Try the checker on one valid number and on one string that is not a number.
for question, candidate in (
    ('Is 415-555-4242 a phone number?', '415-555-4242'),
    ('Is Moshi moshi a phone number ?', 'Moshi moshi'),
):
    print(question)
    print(isPhoneNumber(candidate))

Is 415-555-4242 a phone number?
True
Is Moshi moshi a phone number ?
False


In [2]:
message = 'Call me at 455-555-6583 tomorrow. 525-342-6646 is my office.'
# Slide a 12-character window across the message, one start index at a time,
# and report every window that isPhoneNumber() accepts.
for start in range(len(message)):
    window = message[start:start + 12]
    if not isPhoneNumber(window):
        continue
    print("Phone Number found: "+ window)
print('Done')

Phone Number found: 455-555-6583
Phone Number found: 525-342-6646
Done


# Finding Patterns of Text with Regular Expressions

1. Creating Regex Objects

In [1]:
import re

# Passing a string value representing your regular expression to re.compile()
# returns a Regex pattern object (or simply, a Regex object).
# The raw string (r'...') keeps the backslashes in \d from being treated as
# Python string escapes.
# Pattern: three digits, a hyphen, three digits, a hyphen, four digits.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')


2. Matching Regex Objects

In [None]:
# A Regex object's search() method searches the string it is passed for a
# match to the regex. It returns a Match object, or None when nothing matches.
mo = phoneNumRegex.search('My number is 424-242-5353.')

# The Match object's group() method returns the actual matched text from the
# searched string.
print('Phone number found: '+mo.group())

Phone number found: 424-242-5353


3. Grouping with Parentheses

In [7]:
# Adding parentheses creates groups in the regex: here group 1 is the area
# code and group 2 is the rest of the number.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('my number is 455-343-2424.')
# group(1) returns only the text matched by the first set of parentheses.
mo.group(1)

'455'

In [8]:
# group(2) returns the text matched by the second set of parentheses.
mo.group(2)

'343-2424'

In [9]:
# group(0) returns the entire matched text.
mo.group(0)

'455-343-2424'

In [10]:
# Calling group() with no argument is the same as group(0).
mo.group()

'455-343-2424'

In [11]:
# groups() returns a tuple holding the text of every group at once.
mo.groups()

('455', '343-2424')

In [12]:
# Because groups() returns a tuple, its items can be unpacked into separate
# variables with multiple assignment.
areaCode, mainNumber = mo.groups()
print("Area Code: "+ areaCode + " Main Number: "+ mainNumber)

Area Code: 455 Main Number: 343-2424


In [13]:
# If you need to match a literal parenthesis in your text, escape it with a
# backslash: \( and \) match the characters themselves instead of opening or
# closing a group. Group 1 here therefore includes the parentheses.
phoneNumRegex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('my number is (456) 345-5242.')
mo.group(1)

'(456)'

In [17]:
# Group 2 is the main number that follows the space.
mo.group(2)

'345-5242'

In [18]:
# Group 0 is the whole match, including the literal parentheses and the space.
mo.group(0)

'(456) 345-5242'

In [16]:
# Both groups as a tuple.
mo.groups()

('(456)', '345-5242')

>NOTE: In regular expressions, the following characters have special meanings: .  ^  $  *  +  ?  {  }  [  ]  \  |  (  ) <br> If you want to detect these characters as part of your text pattern, you need to escape them with a backslash.

4. Matching Multiple Groups with the Pipe

In [None]:
# The pipe (|) matches one of several alternative expressions.
heroRegex = re.compile(r'Batman|Tina Fey')

# When both Batman and Tina Fey occur in the searched string, the first
# occurrence of matching text is returned as the Match object.
mo1 = heroRegex.search("Batman and Tina Fey")
mo1.group()


'Batman'

In [22]:
# Here 'Tina Fey' occurs first in the searched string, so it is the text that
# gets matched.
mo2 = heroRegex.search("Tina Fey and Batman")
mo2.group()

'Tina Fey'

In [2]:
# Since all these strings start with Bat, it would be nice if you could
# specify that prefix only once. Putting the alternatives inside parentheses
# does that: the pipe then only separates the choices within the group.
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')
# group() returns the full match, prefix included.
mo.group()

'Batmobile'

In [5]:
# The regex has a single group, so groups() returns a one-item tuple.
mo.groups()

('mobile',)

In [6]:
# Group 0 is the entire match.
mo.group(0)

'Batmobile'

In [3]:
# Group 1 is only the alternative that matched inside the parentheses.
mo.group(1)

'mobile'

5. Optional Matching with the Question Mark

In [7]:
# The ? character flags the group that precedes it as an optional part of the
# pattern: (wo)? matches zero or one occurrence of 'wo'.
batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search('The adventures of Batman')
mo1.group()

'Batman'

In [12]:
# The same regex also matches when the optional 'wo' is present.
mo2 = batRegex.search('The adventures of Batwoman')
mo2.group()

'Batwoman'

In [13]:
# The area-code group (three digits and a hyphen) is optional, so the regex
# matches numbers with or without an area code.
PhoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
mo1 = PhoneRegex.search('My number is 234-234-2423')
mo1.group()

'234-234-2423'

In [15]:
# No area code in the text: the optional group is skipped and the rest matches.
mo2 = PhoneRegex.search('My number is 555-2324')
mo2.group()

'555-2324'

6. Matching Zero or More with the Star

In [16]:
# The * means the preceding group may occur zero or more times.
batRegex = re.compile(r'Bat(wo)*man')
# Zero occurrences of 'wo'.
mo1 = batRegex.search('The adventures of Batman')
mo1.group()

'Batman'

In [19]:
# One occurrence of 'wo'.
mo2 = batRegex.search('The Adventures of Batwoman')
mo2.group()

'Batwoman'

In [20]:
# Several occurrences of 'wo'.
mo3 = batRegex.search('The Adventures of Batwowowoman')
mo3.group()

'Batwowowoman'

7. Matching One or More with the Plus

In [21]:
# The + means the preceding group must occur one or more times.
batRegex = re.compile(r'Bat(wo)+man')
mo1 = batRegex.search('The Adventures of Batwoman')
mo1.group()

'Batwoman'

In [22]:
# Several occurrences of 'wo' also satisfy the +.
mo2 = batRegex.search('The Adventures of Batwowowoman')
mo2.group()

'Batwowowoman'

In [24]:
# The + requires at least one 'wo', so 'Batman' does not match and search()
# returns None. None is a singleton, so test for it with `is`, not `==`.
mo3 = batRegex.search('The Adventures of Batman')
mo3 is None

True

8. Matching Specific Repetitions with Braces

In [25]:
# A number in braces repeats the preceding group exactly that many times:
# (Ha){3} matches 'HaHaHa'.
haRegex = re.compile(r'(Ha){3}')
mo1 = haRegex.search("HaHaHa")
mo1.group()

'HaHaHa'

In [26]:
# A single 'Ha' is fewer than the three repetitions required, so search()
# returns None. None is a singleton, so test for it with `is`, not `==`.
mo2 = haRegex.search('Ha')
mo2 is None

True

In [27]:
# Two numbers in braces give a range: (Ha){3,5} matches three, four or five
# repetitions of 'Ha'.
haRegex2 = re.compile(r'(Ha){3,5}')
mo1 = haRegex2.search("HaHaHaHaHa")
mo2 = haRegex2.search("HaHaHaHa")
mo3 = haRegex2.search("HaHaHa")
mo1.group(),mo2.group(),mo3.group()


('HaHaHaHaHa', 'HaHaHaHa', 'HaHaHa')

9. Greedy and Non-greedy Matching

In [28]:
# Python's regular expressions are greedy by default, which means that in
# ambiguous situations they will match the longest string possible. The
# non-greedy (also called lazy) version of the braces, which matches the
# shortest string possible, has the closing brace followed by a question mark.

greedyHaRegex = re.compile(r'(Ha){3,5}')
# Greedy: all five repetitions are consumed.
mo1 = greedyHaRegex.search("HaHaHaHaHa")
mo1.group()


'HaHaHaHaHa'

In [30]:
# The trailing ? makes the braces non-greedy, so the match stops at the
# minimum of three repetitions.
non_greedyHaRegex = re.compile(r'(Ha){3,5}?')
mo2 = non_greedyHaRegex.search('HaHaHaHaHa')
mo2.group()

'HaHaHa'

10. findall method

In [31]:
# search() returns a Match object for the first match only, even though the
# text contains two phone numbers.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo = phoneNumRegex.search("Cell: 234-242-5445 Work: 234-242-4644")
mo.group()

'234-242-5445'

In [33]:
# findall() on a regex with no groups returns a list of strings, one per match.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
phoneNumRegex.findall("Cell: 234-242-5445 Work: 234-242-4644") 

['234-242-5445', '234-242-4644']

In [34]:
# findall() on a regex with groups returns a list of tuples: one tuple per
# match, holding the text of each group.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') 
phoneNumRegex.findall("Cell: 234-242-5445 Work: 234-242-4644")

[('234', '242', '5445'), ('234', '242', '4644')]

11. Character classes

* \d : any numeric digit from 0 to 9
* \D : Any character that is not a numeric digit from 0 to 9.
* \w : Any letter, numeric digit, or the underscore character (these are the "word" characters).
* \W : Any character that is not a letter, numeric digit or the underscore character.
* \s : Any space, tab, or newline character.
* \S : Any character that isn't a space, tab, or newline.

In [38]:
# \d+\s+\w+ : one or more digits, then one or more whitespace characters, then
# one or more word characters. The trailing commas are not word characters, so
# they are left out of each match.
xmasRegex = re.compile(r'\d+\s+\w+')
xmasRegex.findall("12  drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge")


['12  drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '7 swans',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

12. Making your own character classes

In [39]:
# Square brackets define your own character class: [aeiouAEIOU] matches any
# single vowel, lowercase or uppercase.
vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('RoboCop eats baby food. BABY FOOD.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']

>Note that inside the square brackets, the normal regular expression symbols are not interpreted as such. This means you do not need to escape the ., *, ?, or () characters with a preceding backslash.

In [40]:
# By placing a caret character (^) just after the character class's opening
# bracket, you can make a negative character class: it matches every character
# that is NOT listed. That includes spaces and periods, not just consonants.
# NOTE(review): the name 'constantRegex' looks like a typo for 'consonantRegex'.
constantRegex = re.compile(r'[^aeiouAEIOU]')
constantRegex.findall('RoboCop eats baby food. BABY FOOD.')

['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

13. The Caret and Dollar Sign Characters

In [41]:
# A caret (^) at the start of a regex means the match must occur at the
# beginning of the searched text.
beginsWithHello = re.compile(r'^Hello')
beginsWithHello.search('Hello, world')

<re.Match object; span=(0, 5), match='Hello'>

In [42]:
# 'Hello' is not at the start of this text, so search() returns None.
# None is a singleton, so test for it with `is`, not `==`.
beginsWithHello.search("He said Hello.") is None

True

In [43]:
# A dollar sign ($) at the end of a regex means the string must end with the
# pattern. \d$ therefore matches only the final character, and only when it is
# a digit.
endsWithNumber = re.compile(r'\d$')
endsWithNumber.search('Your number is 242')


<re.Match object; span=(17, 18), match='2'>

In [45]:
# The text ends with a letter, not a digit, so search() returns None.
# None is a singleton, so test for it with `is`, not `==`.
endsWithNumber.search('Your number is two-four-two') is None

True

In [46]:
# Using ^ and $ together means the entire string must match the pattern:
# ^\d+$ accepts only strings made up of one or more digits and nothing else.
wholeStringNum = re.compile(r'^\d+$')
wholeStringNum.search('123456')

<re.Match object; span=(0, 6), match='123456'>

In [48]:
# Letters in the middle break the all-digits requirement, so there is no match.
# None is a singleton, so test for it with `is`, not `==`.
wholeStringNum.search('1234xyz24242') is None

True

In [49]:
# A space is not a digit either, so the whole-string match fails.
# None is a singleton, so test for it with `is`, not `==`.
wholeStringNum.search('12 234242') is None

True

14. The wildcard character

In [50]:
# The dot character (wildcard) matches any character except a newline.
# A dot matches exactly one character, which is why 'flat' yields 'lat'
# rather than 'flat'.
atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

15. Matching Everything with Dot-star

In [55]:
# The dot-star (.*) stands for "anything": zero or more of any character
# except a newline. Each (.*) group captures the text after its label.
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search("First Name: AL Last Name: Sweigwart")
mo.group(1),mo.group(2)

('AL', 'Sweigwart')

In [56]:
# The dot-star uses greedy mode: it will always try to match as much text as
# possible. To match any and all text in a non-greedy fashion, use the dot,
# star, and question mark (.*?).

non_greedyRegex = re.compile(r'<.*?>')
# Non-greedy: the match stops at the first closing angle bracket.
mo = non_greedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man>'

In [57]:
# Greedy: .* runs on to the last closing angle bracket in the text.
greedyRegex = re.compile(r'<.*>')
mo = greedyRegex.search('<To Search man> for dinner>')
mo.group()

'<To Search man> for dinner>'

16. Matching Newlines with the Dot Character

In [58]:
# The dot-star matches everything except a newline, so the match stops at the
# end of the first line.
noNewLineRegex = re.compile('.*')
noNewLineRegex.search('Serve the public trust. \nProtect the innocent. \nUphold the law.').group()

'Serve the public trust. '

In [59]:
# Passing re.DOTALL as the second argument to re.compile() makes the dot match
# all characters, including the newline character.
newlineRegex = re.compile('.*',re.DOTALL)
newlineRegex.search("Serve the public trust. \nProtect the innocent. \nUphold the law.").group()

'Serve the public trust. \nProtect the innocent. \nUphold the law.'

17. Case-Insensitive Matching

In [60]:
# re.I (short for re.IGNORECASE) makes the regex match letters regardless of
# case; the matched text keeps the casing found in the searched string.
robocop = re.compile(r'robocop',re.I)
robocop.search('RoboCop is part man, part machine, all cop.').group()

'RoboCop'

In [61]:
# All-uppercase text matches too.
robocop.search('ROBOCOP is part man, part machine, all cop.').group()


'ROBOCOP'

In [62]:
# All-lowercase text matches too, anywhere in the string.
robocop.search('Al, why does your programming book talk about robocop so much?').group()


'robocop'

18. Substituting strings with the sub() method

In [63]:
# The sub() method takes the replacement text first and the string to search
# second, and returns a new string with the substitutions applied.
namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

In [66]:
# In the replacement string passed to sub(), you can type \1, \2, \3, and so
# on, to mean "enter the text of group 1, 2, 3, and so on, in the
# substitution". Write the replacement as a raw string (r'...') so Python does
# not turn \1 into an escape character before the regex engine sees it.

# Group 1 captures only the first letter of the agent's name; \w* consumes the
# remaining letters so they are replaced along with the word 'Agent'.
agentNamesRegex = re.compile(r'Agent (\w)\w*')
agentNamesRegex.sub(r'\1****','Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.')

'A**** told C**** that E**** knew B**** was a double agent.'

19. Managing Complex Regexes

In [67]:
# re.VERBOSE tells re.compile() to ignore whitespace and #-comments inside the
# pattern, so a complicated regex can be spread over several annotated lines.
# The literal dot in "ext." is escaped (ext\.); an unescaped dot would match
# any character and accept text such as "extA12" as an extension.
phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?              # area code, optionally in parentheses
    (\s|-|\.)?                      # optional separator
    \d{3}                           # first 3 digits
    (\s|-|\.)                       # separator
    \d{4}                           # last 4 digits
    (\s*(ext|x|ext\.)\s*\d{2,5})?   # optional extension
    )''', re.VERBOSE)

In [68]:
# Flags are combined with the bitwise OR operator (|). This regex is
# case-insensitive (re.IGNORECASE), lets the dot match newlines (re.DOTALL)
# and allows whitespace and comments inside the pattern (re.VERBOSE).
someRegexValue = re.compile('foo',re.IGNORECASE | re.DOTALL | re.VERBOSE)

> Review of Regex Symbols
This chapter covered a lot of notation, so here’s a quick review of what you learned about basic regular expression syntax:

* The ? matches zero or one of the preceding group.
* The * matches zero or more of the preceding group.
* The + matches one or more of the preceding group.
* The {n} matches exactly n of the preceding group.
* The {n,} matches n or more of the preceding group.
* The {,m} matches 0 to m of the preceding group.
* The {n,m} matches at least n and at most m of the preceding group.
* {n,m}? or *? or +? performs a non-greedy match of the preceding group.
* ^spam means the string must begin with spam.
* spam$ means the string must end with spam.
* The . matches any character, except newline characters.
* \d, \w, and \s match a digit, word, or space character, respectively.
* \D, \W, and \S match anything except a digit, word, or space character, respectively.
* [abc] matches any character between the brackets (such as a, b, or c).
* [^abc] matches any character that isn’t between the brackets.