In [2]:
import re

# Transform to bytecode

In [5]:
pattern = re.compile(r'\bfoo\b')
pattern.match('foo bar')

<_sre.SRE_Match object; span=(0, 3), match='foo'>

# Match backslash

In [6]:
pattern = re.compile(r'\\')
pattern.match(r'\author')

<_sre.SRE_Match object; span=(0, 1), match='\\'>

# Directly call module method

In [7]:
re.match(r'<HTML>', '<HTML>')

<_sre.SRE_Match object; span=(0, 6), match='<HTML>'>

# `match` method

In [9]:
pattern = re.compile(r'<HTML>')
pattern.match('<HTML><head>')  # Successful match

<_sre.SRE_Match object; span=(0, 6), match='<HTML>'>

In [14]:
pattern.match(' <HTML>') is None  # Unsuccessful match

True

In [13]:
pattern.search(' <HTML>')  # Successful search

<_sre.SRE_Match object; span=(1, 7), match='<HTML>'>

## Position

In [15]:
pattern.match('  <HTML>', 2)

<_sre.SRE_Match object; span=(2, 8), match='<HTML>'>

Passing `pos` does not move the `^` anchor: `^` still matches only at the real start of the string

In [16]:
pattern = re.compile(r'^<HTML>')
# `pos` only changes where scanning starts; `^` still anchors to the real
# start of the string, so this fails even though '<HTML>' begins at index 2.
pattern.match('  <HTML>', 2) is None

True

With slicing

In [17]:
pattern.match('  <HTML>'[2:])

<_sre.SRE_Match object; span=(0, 6), match='<HTML>'>

End position

In [19]:
pattern.match('<HTML>'[:2]) is None

True

In [20]:
pattern.match('<HTML>', 0, 2) is None

True

End of the line

In [21]:
pattern.match('<HTML>  ', 0, 6)

<_sre.SRE_Match object; span=(0, 6), match='<HTML>'>

In [23]:
pattern.match('<HTML>  '[:6])

<_sre.SRE_Match object; span=(0, 6), match='<HTML>'>

# `search` method

In [24]:
pattern = re.compile(r'world')
pattern.search('hello world')

<_sre.SRE_Match object; span=(6, 11), match='world'>

In [25]:
pattern.search('hola mundo') is None

True

# `MULTILINE` Flag

In [36]:
pattern = re.compile(r'^<HTML>', re.MULTILINE)
pattern.search('<HTML>')

<_sre.SRE_Match object; span=(0, 6), match='<HTML>'>

In [28]:
pattern.search(' <HTML>') is None

True

In [29]:
pattern.search('  \n<HTML>')

<_sre.SRE_Match object; span=(3, 9), match='<HTML>'>

In [37]:
pattern.search('  \n<HTML>', 3)

<_sre.SRE_Match object; span=(3, 9), match='<HTML>'>

In [38]:
pattern.search('  \n<HTML>', 4) is None

True

In [43]:
pattern.search('</div></body>\n<HTML>', 4)

<_sre.SRE_Match object; span=(14, 20), match='<HTML>'>

# `findall` method

In [44]:
pattern = re.compile(r'\w+')
pattern.findall('hello world')

['hello', 'world']

Empty matches are included in the result as well

In [45]:
pattern = re.compile(r'a*')
pattern.findall('aba')

['a', '', 'a', '']

In [46]:
pattern = re.compile(r'a?')
pattern.findall('aba')

['a', '', 'a', '']

Tuples are returned when the pattern contains more than one group

In [54]:
pattern = re.compile(r'(\w+) (\w+)')
pattern.findall('Hello world hola mundo')

[('Hello', 'world'), ('hola', 'mundo')]

# `finditer` method

Works like `findall` but returns an iterator in which every element is a `MatchObject`

In [67]:
it = pattern.finditer("Hello world hola mundo")
match = next(it)
match.groups()

('Hello', 'world')

In [68]:
match.span()

(0, 11)

In [69]:
match = next(it)
match.groups()

('hola', 'mundo')

In [70]:
match.span()

(12, 22)

# `split` method

In [71]:
re.split(r'\n', 'Beautiful is better than ugly.\nExplicit is better than implicit')

['Beautiful is better than ugly.', 'Explicit is better than implicit']

In [75]:
pattern = re.compile(r'\W')
pattern.split('hello world')

['hello', 'world']

In [76]:
pattern.findall('hello world')

[' ']

## `maxsplit` parameter

In [77]:
pattern.split('Beautiful is better than ugly.', 2)

['Beautiful', 'is', 'better than ugly.']

## Capture the splitting pattern too

In [78]:
pattern = re.compile(r'(-)')
pattern.split('hello-world')

['hello', '-', 'world']

Split operation always returns the captured groups

If the group matches at the start of the string, the result begins with an empty string:

In [81]:
pattern = re.compile(r'(\W)')
pattern.split(' hello world')

['', ' ', 'hello', ' ', 'world']

# `sub` method

Returns string after replacement

In [3]:
pattern = re.compile(r'[0-9]+')
pattern.sub('-', 'order0 order1 order13')

'order- order- order-'

In [5]:
re.sub(r'00', '-', 'order00000')

'order--0'

Function usage:

In [11]:
def normalize_orders(matchobj):
    """Return the replacement text for one matched order prefix.

    Used as the `repl` callback of `re.sub`: a matched '-' is replaced
    by 'A', any other matched character is replaced by 'B'.
    """
    return 'A' if matchobj.group(1) == '-' else 'B'

# '|' is a literal inside a character class, so '[-|A-Z]' also matched pipes;
# '[-A-Z]' matches only a dash or an uppercase letter.
re.sub(r'([-A-Z])', normalize_orders, '-1234 A193 B123')

'A1234 B193 B123'

Backreferences (non-greedy thanks to `?`):

In [12]:
text = 'imagine a new *world*, a magic *world*'
pattern = re.compile(r'\*(.*?)\*')
pattern.sub(r'<b>\g<1><\\b>', text)

'imagine a new <b>world<\\b>, a magic <b>world<\\b>'

Use `count` argument to specify number of replacements

# `subn` method

In [13]:
pattern.subn(r'<b>\g<1><\\b>', text)

('imagine a new <b>world<\\b>, a magic <b>world<\\b>', 2)

# `MatchObject`

## `group` method

Returns specified subgroups

In [14]:
pattern = re.compile(r'(\w+) (\w+)')
match = pattern.search('Hello world')
match.groups()

('Hello', 'world')

In [15]:
match.group(0)

'Hello world'

In [16]:
match.group(1)

'Hello'

In [17]:
match.group(2)

'world'

In [18]:
match.group(3)  # the pattern defines only groups 1 and 2, so asking for group 3 raises IndexError

IndexError: no such group

In [19]:
match.group(0, 2)

('Hello world', 'world')

Named groups:

In [21]:
pattern = re.compile(r'(?P<first>\w+) (?P<second>\w+)')
match = pattern.search('Hello world')
match.group('first')

'Hello'

In [22]:
match.group(1)

'Hello'

In [24]:
match.group(0, 'first', 2)

('Hello world', 'Hello', 'world')

## `groups` method

In [25]:
match.groups()

('Hello', 'world')

Default arguments:

In [26]:
# Raw string so that '\w' reaches the regex engine instead of being an
# invalid escape sequence in an ordinary string literal.
pattern = re.compile(r'(\w+) (\w+)?')
match = pattern.search('Hello ')
# The argument is the default used for groups that did not participate.
match.groups('mundo')

('Hello', 'mundo')

In [27]:
match.groups()

('Hello', None)

## `groupdict` method

In [28]:
pattern = re.compile(r'(?P<first>\w+) (?P<second>\w+)')
match = pattern.search('Hello world')
match.groupdict()

{'first': 'Hello', 'second': 'world'}

## `start` method

In [30]:
pattern = re.compile(r'(?P<first>\w+) (?P<second>\w+)?')
match = pattern.search('Hello ')
match.start(1)

0

In [32]:
match.start(2)

-1

## `end` method

In [33]:
match.end(1)

5

## `span` method

Returns a `(start, end)` tuple for the given group

In [34]:
match.span(1)

(0, 5)

## `expand` method

In [36]:
text = 'imagine a new *world*, a magic *world*'
match = re.search(r'\*(.*?)\*', text)
match.expand(r'<b>\g<1><\\b>')

'<b>world<\\b>'

# `escape` method

In [37]:
re.findall(re.escape('^'), '^like^')

['^', '^']

# `purge` method

Clear regex cache

# Compilation flags

## `IGNORECASE`

In [38]:
pattern = re.compile(r'[a-z]+', re.I)
pattern.search('Felix')

<_sre.SRE_Match object; span=(0, 5), match='Felix'>

In [39]:
pattern.search('felix')

<_sre.SRE_Match object; span=(0, 5), match='felix'>

## `MULTILINE`

In [42]:
# Raw string avoids invalid str escapes; ':' needs no escaping in a regex.
# Without re.MULTILINE, '^' matches only at the very start of the text.
pattern = re.compile(r'^\w+: (\w+/\w+/\w+)')
pattern.findall('date: 12/01/2013 \ndate: 11/01/2013')

['12/01/2013']

In [43]:
# Raw string avoids invalid str escapes; ':' needs no escaping in a regex.
# With re.M, '^' also matches right after every newline.
pattern = re.compile(r'^\w+: (\w+/\w+/\w+)', re.M)
pattern.findall('date: 12/01/2013 \ndate: 11/01/2013')

['12/01/2013', '11/01/2013']

## `DOTALL`

In [44]:
# Raw pattern string ('\d' is an invalid str escape otherwise); the subject
# stays a normal string because its '\n' must be a real newline.
re.findall(r'^\d(.)', '1\ne')

[]

In [45]:
# Raw pattern string ('\d' is an invalid str escape otherwise); with re.S the
# dot also matches the newline that follows the digit.
re.findall(r'^\d(.)', '1\ne', re.S)

['\n']

## `LOCALE`

In [46]:
chars = ''.join(chr(i) for i in range(256))
' '.join(re.findall(r'\w', chars))

'0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z ª ² ³ µ ¹ º ¼ ½ ¾ À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø ù ú û ü ý þ ÿ'

In [48]:
import locale
locale.setlocale(locale.LC_ALL, '')

'LC_CTYPE=en_US.UTF-8;LC_NUMERIC=ru_RU.UTF-8;LC_TIME=ru_RU.UTF-8;LC_COLLATE=en_US.UTF-8;LC_MONETARY=ru_RU.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=ru_RU.UTF-8;LC_NAME=ru_RU.UTF-8;LC_ADDRESS=ru_RU.UTF-8;LC_TELEPHONE=ru_RU.UTF-8;LC_MEASUREMENT=ru_RU.UTF-8;LC_IDENTIFICATION=ru_RU.UTF-8'

In [49]:
# Since Python 3.6 re.LOCALE is rejected for str patterns (ValueError), so the
# locale-dependent \w is demonstrated with a bytes pattern over all 256 byte values.
locale_word_bytes = re.findall(rb'\w', bytes(range(256)), re.LOCALE)
b' '.join(locale_word_bytes)

b'0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z'

## `UNICODE`

In [50]:
# Raw string so '\w' is passed to the regex engine rather than parsed as a str escape.
re.findall(r'\w+', 'this is an example')

['this', 'is', 'an', 'example']

In [58]:
re.findall(ur'\w+', u'абвгд', re.UNICODE)  # Python 2 only: the ur'' prefix is not valid on the Python 3 kernel, hence the SyntaxError below

SyntaxError: invalid syntax (<ipython-input-58-beaec1d3a7ec>, line 1)

In [62]:
re.findall(r'\w+', 'абвг деёж')

['абвг', 'деёж']

## `VERBOSE`

In [60]:
# In VERBOSE mode unescaped whitespace and '#' comments are ignored, so the
# pattern can be laid out one token per line.
pattern = re.compile(r"""
    [#_] +   # one or more hash or underscore characters (no '|': it would be a literal pipe here)
    \ \#     # an escaped (significant) space followed by a literal hash
    \d+      # one or more digits
""", re.VERBOSE)
pattern.findall('# #2')

['# #2']

## `DEBUG`

In [61]:
# Inside [...] the '|' is not alternation: the debug dump lists it as `literal 124`.
re.compile(r'[a-f|3-8]', re.DEBUG)

in
  range (97, 102)
  literal 124
  range (51, 56)


re.compile(r'[a-f|3-8]', re.UNICODE|re.DEBUG)