In [1]:
import re

In [5]:
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo = phoneNumRegex.search('My phone num is 636-399-3284.')
print(mo.group())

636-399-3284


In [8]:
#regex search returns a group compose of 2 parts. 
#group() and group(0) both return the entire string
#group(1) returns first portion of string
#group(2) return the second portion

IndexError: no such group

In [17]:
#assign results to multiple strings
#areaCode, mainNumber = mo.groups()

#following characters have special meaning
#.^$+?{}[]\|()
#to scape them put \ in front of r string
phoneNumRegex = re.compile(r'(\(\d\d\d)\)(\d\d\d-\d\d\d\d)')

In [18]:
#now regex detects strings of format (838) 888-3883
mo = phoneNumRegex.search('(383)838-3939')
print(mo.group())

(383)838-3939


In [20]:
#The pipe | can be used to check match for multiple items
# a|b will return a match if either a or b is found.
#will return match for first item found
heroRegex = re.compile( r'Batman|Superman')
mo1 = heroRegex.search("Batman and Superman")
print(mo1.group())
mo1 = heroRegex.search("Superman annnd Batman")
print(mo1.group())

Batman
Superman


In [22]:
#Following can be used to match many strings starting with Bat
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search("Batcopter is down")
print(mo.group())

Batcopter


In [25]:
#? flags preceding group as optional
batRegex = re.compile(r'Bat(wo)?man')  #(wo)is optional
mo1 = batRegex.search('The adventures of Batman')
print(mo1.group())
mo1 = batRegex.search('The great Batwoman')
print(mo1.group())


Batman
Batwoman


In [27]:
#another optional example with phone #
#this time (ddd) is optional
phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
mo1 = phoneRegex.search('423-434-3333')
print(mo1.group())
mo2 = phoneRegex.search("222-3333")
print(mo2.group())

423-434-3333
222-3333


In [29]:
# * matches zero or more of the groups preceding the *
# group before * can be absent, or repeated over and over again
batRegex = re.compile(r'Bat(wo)*man')
mo1 = batRegex.search("blah blah Batman")
print(mo1.group())
mo2 = batRegex.search("blaskdjf Batwoman")
print(mo2.group())
mo3 = batRegex.search("The mighty Batwowowowowowoman")
print(mo3.group())

Batman
Batwoman
Batwowowowowowoman


In [37]:
# + matches one or more. meaning preceding group needs to be in string at least once
# group can not be absent
batRegex = re.compile(r'Bat(wo)+man')
mo1 = batRegex.search('The bla of Batwoman')
mo2 = batRegex.search("The adjdj of Batwowowowowoman")
mo3 = batRegex.search("adventure of Batman") #not a match because it requires at least one 'wo'
print(mo1.group())
print(mo2.group())
print(mo3 == None)

Batwoman
Batwowowowowoman
True


In [31]:
mo3 == None

False

In [38]:
# {} how many matches you want
# (ha){3} will match "hahaha"
#Can also specify a range{min,max}
#(ha){3,5} will match "hahaha", "hahahaha", "hahahahaha"
#(ha) {,5} will match 0 to 5
#(ha) {3,} will match 3 or more
haRegex = re.compile(r'(Ha){3}')
mo = haRegex.search('HaHaHa')
print(mo.group())


HaHaHa


In [39]:
#when using range and 1+ strings match the pattern, by the default search will return longest
greedyHaRegex = re.compile(r'(Ha){3,5}')
mo1 = greedyHaRegex.search("HaHaHaHaHa")
print(mo1.group())

nongreedyHaRegex = re.compile(r'(Ha){3,5}?')
mo2 = nongreedyHaRegex.search("HaHaHaHaHa")
print(mo2.group())

HaHaHaHaHa
HaHaHa


In [40]:
#search() only returns first match
#findall() returns a list of all strings matching the pattern if there are no groups in expression
#findall() returns tuples if expression conteins groups
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') #no groups
mo = phoneNumRegex.search("434-433-4324 and 304-345-4524")
print(mo.group())

434-433-4324


In [47]:
print( phoneNumRegex.findall("323-434-4352 annnnd 324-443-4242"))


['323-434-4352', '324-443-4242']


In [59]:
phoneNumRegex = re.compile(r"(\d\d\d-)(\d\d\d\-)(\d\d\d\d)") # has groups
print(phoneNumRegex.findall('412-333-4241 annnd soem other number 933-324-3298'))


[('412-', '333-', '4241'), ('933-', '324-', '3298')]


In [None]:
#Therefore, if pattern has groups we get list of tuples and if no groups we get a list of strings

In [61]:
#character classes:
# \d -> any digit 0-9
# \D -> any char that is not numeric 0-9
# \w -> any letter, numeric digit or _ character. 
# \W -> any char not a letter, numeric digits or _
# \s -> any space, tab or nl char
# \S -> any char not space, tab, nl
# [0-5] will match numbers 0 to 5
xmasRegex = re.compile(r'\d+\s\w+') #matches digit, space and letter
print(xmasRegex.findall('12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 swans, 6 geese, 5 rings, 3 hens, 2 doves, 1 partridge'))

['12 drummers', '11 pipers', '10 lords', '9 ladies', '8 maids', '7 swans', '6 geese', '5 rings', '3 hens', '2 doves', '1 partridge']


In [63]:
#define own character classes
#[aeiouAEIOU] will match any of vowel
vowelRegex = re.compile(r'[aeiouAEIOU]')
print(vowelRegex.findall('Robocop eats baby food. BABY FOOD.'))

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']


In [64]:
#[a-zA-Z0-0] will match all lowercase, uppercase and numbers
#can also make negative classes by adding ^
#negative classes select all except for what is in the pattern
consonantRegex = re.compile(r'[^aeiouAEIOU]')
print(consonantRegex.findall('Robocop eats baby food. BABY FOOD.'))

['R', 'b', 'c', 'p', ' ', 't', 's', ' ', 'b', 'b', 'y', ' ', 'f', 'd', '.', ' ', 'B', 'B', 'Y', ' ', 'F', 'D', '.']


In [67]:
# ^ at the start of regex indicates match must take place at beginning of str
# $ at end of of regex indicates match must take place at end of string
# can combine ^ and $ so entire string must match the given patten
beginsWithHello = re.compile(r'^Hello')
print(beginsWithHello.search('Hello, world!'))


<re.Match object; span=(0, 5), match='Hello'>


In [68]:
print(beginsWithHello.search("He said Hello") == None)

True


In [69]:
endsWithNumber = re.compile(r'\d$') #check if string ends with number
print(endsWithNumber.search("Your number is 3333"))
print(endsWithNumber.search("Your number is not hereeee") == None)

<re.Match object; span=(18, 19), match='3'>
True


In [70]:
wholeStringIsNum = re.compile(r'^\d+$') #checks entire string is composed of numbers
print(wholeStringIsNum.search('212324242424'))
print(wholeStringIsNum.search('3232d33232') == None)


<re.Match object; span=(0, 12), match='212324242424'>
True


In [71]:
#wildcard . matches any char except nl
atRegex = re.compile(r'.at')
print(atRegex.findall('The cat in the hat sat on the flat mat.'))


['cat', 'hat', 'sat', 'lat', 'mat']


In [74]:
# . and be combined with * as .* to mea any and all after pattern
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Isaac Last Name: Reyes')
print(mo.group(1))
print(mo.group(2))

Isaac
Reyes


In [75]:
#.* is greedy by default and will match as much as possible
#use .*? to make it non greedy
nongreedyRegex = re.compile(r'<.*?>') #match anything after <, followed by >
mo = nongreedyRegex.search('<To server man> for dinner.>')
print(mo.group())

greedyRegex = re.compile(r'<.*>')
mo = greedyRegex.search('<To serve man> for dinner.>')
print(mo.group())

<To server man>
<To serve man> for dinner.>


In [77]:
# . can be forced to match nl
noNewLineRegex = re.compile('.*')
noNewLineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the laaw.').group()

'Serve the public trust.'

In [79]:
NewLineRegex = re.compile('.*', re.DOTALL)
NewLineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the laaw.').group()

'Serve the public trust.\nProtect the innocent.\nUphold the laaw.'

In [81]:
#Make regrex case insensitive to match pattern regardless of caps or lower
#pass second paremeter re.I or re.IGNORECASE
robocop = re.compile(r'robocop', re.I)
robocop.search('Robocop is part man, part machine, all cop.').group()
robocop.search("ROBOCOP protects the innocent.").group()

'Robocop'

In [82]:
#sub() can be used to replace text
#first argument is the text used to replace and second argument is the pattern
namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

In [83]:
#Eh, re-read this one. I am too tired to try to make sense of it rn
agentNamesRegex = re.compile(r'Agent (\w)\w*')
agentNamesRegex.sub(r'\1****', 'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.')

'A**** told C**** that E**** knew B**** was a double agent.'

In [85]:
#Verbose mode allows you to separate regex over multiple lines
phoneRegex = re.compile(r'''(
(\d{3}|\(\d{3}\))?     #area code
(\s|-|\.)?             #separator
\d{3}                  #first 3 digits
(\s|-|\.)              #separator
\d{4}                  #last 4 digits
(\s*(ext|x|ext.)\s*\d{2,5})?   #extension
)''', re.VERBOSE)

In [86]:
#can use | to combine arguments to regex since regex only takes a single second argument
someRegexValue = re.compile('foo', re.IGNORECASE | re.DOTALL | re.VERBOSE)