In [2]:
import re

Based on https://docs.python.org/2/library/re.html#search-vs-match

# Search

re.search(pattern, string, flags=0)
Scan through string looking for the first location where the regular expression pattern produces a match, and return a corresponding MatchObject instance. Return None if no position in the string matches the pattern; note that this is different from finding a zero-length match at some point in the string.

In [3]:
# A plain literal pattern: search() scans the whole subject and reports the
# first place where "cx" occurs.
result = re.search("cx", "abbbbbbbcx")
print(result)

<_sre.SRE_Match object; span=(8, 10), match='cx'>


In [4]:
# "ft" does not occur anywhere in the subject, so search() returns None.
result = re.search("ft", "abbbbbbbcx")
print(result)

None


In [5]:
# "k+" requires at least one "k"; the subject contains none, so no match.
result = re.search("k+", "abbbbbbbcx")
print(result)

None


In [6]:
# "b+" matches the run of consecutive b's; start() and end() give the
# half-open index range [start, end) of the matched text.
result = re.search("b+", "abbbbbbbcx")
print(result, result.start(), result.end(), sep="\n")

<_sre.SRE_Match object; span=(1, 8), match='bbbbbbb'>
1
8


In [7]:
# Cut the matched marker out of the address by joining the text that comes
# before the match with the text that comes after it.
email = "tony@tiremove_thisger.net"
result = re.search("remove_this", email)
"".join([email[:result.start()], email[result.end():]])

'tony@tiger.net'

In [8]:
# group() with no argument is the same as group(0): the whole matched text.
result.group()

'remove_this'

# Match

re.match(pattern, string, flags=0)
If zero or more characters at the beginning of string match the regular expression pattern, return a corresponding MatchObject instance. Return None if the string does not match the pattern; note that this is different from a zero-length match.

In [9]:
# The pattern is a raw string so the backslash in \w reaches the regex engine
# unchanged; in a normal string literal "\w" is an invalid escape sequence and
# triggers a warning on current Python versions.
result = re.match(r"(\w+) (\w+)", "Isaac Newton, physicist")

print(result.group(0))    # The entire match
print(result.group(1))    # The first parenthesized subgroup.
print(result.group(2))    # The second parenthesized subgroup.
print(result.group(1, 2)) # Multiple arguments give us a tuple.

Isaac Newton
Isaac
Newton
('Isaac', 'Newton')


In [10]:
# (?P<name>...) defines named groups that can be fetched by name via group().
# The pattern is a raw string so the backslash in \w is passed to the regex
# engine as-is instead of being treated as an (invalid) string escape.
result = re.match(r"(?P<first_name>\w+) (?P<last_name>\w+)", "Compadre Washington")
print(result.group('first_name'))
print(result.group('last_name'))

Compadre
Washington


# Search vs. Match

Python offers two different primitive operations based on regular expressions: re.match() checks for a match only at the beginning of the string, while re.search() checks for a match anywhere in the string.

In [11]:
# Same pattern, same subject: match() only looks at position 0,
# search() looks at every position.
print(re.match("c", "abcdef"))    # No match
print(re.search("c", "abcdef"))   # Match

None
<_sre.SRE_Match object; span=(2, 3), match='c'>


Regular expressions beginning with '^' can be used with search() to restrict the match at the beginning of the string:

In [12]:
# A leading '^' anchors search() to the start of the string, which makes it
# behave like match() for these inputs.
print(re.match("c", "abcdef"))     # No match
print(re.search("^c", "abcdef"))   # No match
print(re.search("^a", "abcdef"))   # Match

None
None
<_sre.SRE_Match object; span=(0, 1), match='a'>


# Split

re.split(pattern, string, maxsplit=0, flags=0)
Split string by the occurrences of pattern. If capturing parentheses are used in pattern, then the text of all groups in the pattern are also returned as part of the resulting list. If maxsplit is nonzero, at most maxsplit splits occur, and the remainder of the string is returned as the final element of the list. (Incompatibility note: in the original Python 1.5 release, maxsplit was ignored. This has been fixed in later releases.)

In [13]:
# Split on runs of non-word characters. The trailing '.' is a separator too,
# which is why the result ends with an empty string.
# Raw string: keeps the backslash in \W for the regex engine and avoids the
# invalid-escape-sequence warning a plain '\W' literal produces.
re.split(r'\W+', 'Words, words, words.')

['Words', 'words', 'words', '']

In [16]:
# Runs of lowercase letters a-f act as the separators between the digits.
re.split("[a-f]+", "0a3b9")

['0', '3', '9']

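Note that in the Python version used to run this notebook, split never splits a string on an empty pattern match (this changed in Python 3.7: patterns that can match an empty string now also split on empty matches, so newer versions return a different result for the call below). For example: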

In [15]:
# 'x*' can match the empty string. The result recorded below (['foo'], plus a
# warning) is what the interpreter used for this notebook produced.
# NOTE(review): newer Python versions split on empty matches -- confirm which
# version this example is meant to target.
re.split("x*", "foo")

  return _compile(pattern, flags).split(string, maxsplit)


['foo']

In [52]:
# Sample phone-book text. The blank lines are intentional: the example splits
# on runs of one or more newlines. The "..." interpreter continuation prompts
# that were pasted into the literal have been removed, since they would become
# part of the data and show up as bogus entries after splitting.
text = """Ross McFluff: 834.345.1254 155 Elm Street

Ronald Heathmore: 892.345.3428 436 Finley Avenue
Frank Burger: 925.541.7625 662 South Dogwood Way


Heather Albrecht: 548.326.4584 919 Park Place"""

In [54]:
# One entry per chunk of text; any run of consecutive newlines is a single
# separator, so blank lines do not produce empty entries.
entries = re.split(pattern="\n+", string=text)

In [55]:
# Display the list produced by splitting the sample text on newline runs.
entries

['Ross McFluff: 834.345.1254 155 Elm Street',
 'Ronald Heathmore: 892.345.3428 436 Finley Avenue',
 'Frank Burger: 925.541.7625 662 South Dogwood Way',
 'Heather Albrecht: 548.326.4584 919 Park Place']

In [58]:
# Break every entry into at most four fields. The separator is a space,
# optionally preceded by ':' (so the colon after the last name is dropped).
# maxsplit=3 stops after three splits, leaving the rest of the line -- which
# itself contains spaces -- as a single final field.
# maxsplit is passed by keyword: it is self-describing, and passing it
# positionally is deprecated in recent Python releases.
[re.split(":? ", entry, maxsplit=3) for entry in entries]

[['Ross', 'McFluff', '834.345.1254', '155 Elm Street'],
 ['Ronald', 'Heathmore', '892.345.3428', '436 Finley Avenue'],
 ['Frank', 'Burger', '925.541.7625', '662 South Dogwood Way'],
 ['Heather', 'Albrecht', '548.326.4584', '919 Park Place']]

# Sub

re.sub(pattern, repl, string, count=0, flags=0)
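Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl. If the pattern isn’t found, string is returned unchanged. repl can be a string or a function; if it is a function, it is called for every non-overlapping occurrence of pattern, receives the match object as its single argument, and returns the replacement string.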

In [59]:
# Sample input: a phone number followed by a '#'-introduced trailing comment.
phone = "2004-959-559 # This is Phone Number"

In [63]:
# Delete everything from the first '#' up to the end of the string.
# The space that precedes the '#' is not part of the match, so it stays.
re.sub("#.*$", "", phone)

'2004-959-559 '

In [62]:
# \D matches any non-digit character, so replacing every match with ""
# leaves only the digits.
# Raw string: keeps the backslash for the regex engine and avoids the
# invalid-escape-sequence warning a plain '\D' literal produces.
re.sub(r'\D', "", phone)

'2004959559'

In [14]:
# Small HTML document used as input for the tag-stripping example.
data = '''<HTML>
    <HEAD>
        <TITLE>Mind Bending Blog</TITLE>
    </HEAD>

    <BODY>
        <H1>Bem Vindo!</H1>
        Bem vindo! Acesse o meu blog
        Visite também o portal de notícias
    </BODY>

</HTML>'''
print(data)

<HTML>
    <HEAD>
        <TITLE>Mind Bending Blog</TITLE>
    </HEAD>

    <BODY>
        <H1>Bem Vindo!</H1>
        Bem vindo! Acesse o meu blog
        Visite também o portal de notícias
    </BODY>

</HTML>


In [17]:
# '<.*?>' is non-greedy: each match stops at the first '>' after a '<', so
# every tag is removed individually and the text between tags is kept.
print(re.sub("<.*?>", "", data))


    
        Mind Bending Blog
    

    
        Bem Vindo!
        Bem vindo! Acesse o meu blog
        Visite também o portal de notícias
    




# Findall

In [3]:
# Find all adverbs, approximated here as words ending in "ly".
# The pattern is a raw string (as in the finditer example) so the backslash
# in \w reaches the regex engine unchanged instead of being an invalid
# string escape.
text = "He was carefully disguised but captured quickly by police."
re.findall(r"\w+ly", text)

['carefully', 'quickly']

# Finditer

In [67]:
# finditer() yields one match object per hit, so positions are available in
# addition to the matched text.
text = "He was carefully disguised but captured quickly by police."
for result in re.finditer(r"\w+ly", text):
    # span() is (start, end); unpack it straight into the format arguments.
    print('%02d-%02d: %s' % (*result.span(), result.group(0)))

07-16: carefully
40-47: quickly


# Flags

### IGNORECASE

In [18]:
# Without flags the class [a-f] is case-sensitive, so the uppercase 'B' is
# not treated as a separator.
re.split("[a-f]+", "0a3B9")

['0', '3B9']

In [19]:
# With IGNORECASE the class [a-f] also matches 'B', so it becomes a separator.
re.split("[a-f]+", "0a3B9", flags=re.IGNORECASE)

['0', '3', '9']

In [20]:
# Replace a whitespace-delimited "and", in any letter case, with " & ".
# Raw string: keeps the backslashes in \s for the regex engine and avoids the
# invalid-escape-sequence warning a plain '\s' literal produces.
re.sub(r'\sAND\s', ' & ', 'Baked Beans And Spam', flags=re.IGNORECASE)

'Baked Beans & Spam'

### MULTILINE

In [29]:
# Without MULTILINE, '^' only matches at the very start of the string, and
# the string starts with 'A', not 'X'.
print(re.search("^X", "A\nB\nX"))

None


In [30]:
# With MULTILINE, '^' also matches right after each newline, so the 'X' that
# begins the third line is found.
re.search("^X", "A\nB\nX", flags=re.MULTILINE)

<_sre.SRE_Match object; span=(4, 5), match='X'>

In [49]:
# Sample multi-line text used by the MULTILINE and DOTALL examples.
paragraph = '''<p>
This is a paragraph.
It has multiple lines.
</p>
'''

In [54]:
# Without MULTILINE, '^' is tied to the start of the whole string, so a
# pattern for a later line cannot match.
print(re.search("^It has.*", paragraph))

None


In [55]:
# With MULTILINE, '^' matches at the start of every line; '.*' then runs to
# the end of that line because '.' does not match a newline by default.
result = re.search("^It has.*", paragraph, flags=re.MULTILINE)
if result is not None:
    print(result.group(0))

It has multiple lines.


### DOTALL

In [59]:
# By default '.' does not match a newline, so '.*' cannot bridge the lines
# between the opening and closing tags.
print(re.search("<p>.*</p>", paragraph))

None


In [61]:
# With DOTALL, '.' matches newlines too, so '.*' can span every line between
# the opening and closing tags.
result = re.search("<p>.*</p>", paragraph, flags=re.DOTALL)
if result is not None:
    print(result.group(0))

<p>
This is a paragraph.
It has multiple lines.
</p>
