In [2]:
import re

# Look Around

## Look Ahead

In [5]:
# Baseline for comparison: a plain literal match consumes the three
# characters of "fox", so the reported span has a non-zero width.
sentence = 'The quick brown fox jumps over the lazy dog'
pattern = re.compile(r'fox')
result = pattern.search(sentence)
result.span()

(16, 19)

In [6]:
# A look-ahead is a zero-width assertion: it succeeds at the position
# where "fox" begins but consumes no characters, so start == end.
sentence = 'The quick brown fox jumps over the lazy dog'
pattern = re.compile(r'(?=fox)')
result = pattern.search(sentence)
result.span()

(16, 16)

In [7]:
# Words that are followed by a comma; the look-ahead keeps the comma
# itself out of the returned match.
listing = 'They were three: Felix, Victor, and Carlos.'
pattern = re.compile(r'\w+(?=,)')
pattern.findall(listing)

['Felix', 'Victor']

In [9]:
# Contrast without look-ahead: the comma is consumed and therefore
# shows up as part of every match.
listing = 'They were three: Felix, Victor, and Carlos'
pattern = re.compile(r'\w+,')
pattern.findall(listing)

['Felix,', 'Victor,']

In [10]:
# The look-ahead may contain an alternation: accept a word followed by
# either a comma or a full stop, still without consuming the punctuation.
listing = 'They were three: Felix, Victor, and Carlos.'
pattern = re.compile(r'\w+(?=,|\.)')
pattern.findall(listing)

['Felix', 'Victor', 'Carlos']

### Negative Look Ahead

In [12]:
# Negative look-ahead: match "John" only when it is NOT followed by
# whitespace and "Smith". Prints the start and end offset of each hit.
sentence = 'I would rather go out with John McLane than with John Smith or John Bon Jovi'
pattern = re.compile(r'John(?!\sSmith)')
result = pattern.finditer(sentence)
for found in result:
    begin, stop = found.span()
    print(begin, stop)

27 31
63 67


## Look Around and Substitutions

In [14]:
# Naive grouping: greedily take up to three digits at a time from the
# left, which leaves the short group at the end rather than the front.
message = 'The number is: 12345567890'
pattern = re.compile(r'\d{1,3}')
pattern.findall(message)

['123', '455', '678', '90']

In [22]:
# Thousands grouping: take 1-3 digits only when what follows is one or
# more complete groups of three digits and then no further digit. The
# last group ("890") is never matched because nothing follows it.
digits = '1234567890'
pattern = re.compile(r'\d{1,3}(?=(\d{3})+(?!\d))')
results = pattern.finditer(digits)
for found in results:
    print(found.group())

1
234
567


In [24]:
# Insert thousands separators: every matched group is replaced by itself
# (\g<0> is the whole match) followed by a comma.
digits = '1234567890'
pattern = re.compile(r'\d{1,3}(?=(\d{3})+(?!\d))')
pattern.sub(r'\g<0>,', digits)

'1,234,567,890'

## Look Behind

In [25]:
# Look-behind: match "McLane" only when it is immediately preceded by
# "John" and one whitespace character; the prefix is not part of the match.
sentence = 'I would rather go out with John McLane than with John Smith or John Bon Jovi'
pattern = re.compile(r'(?<=John\s)McLane')
results = pattern.finditer(sentence)
for found in results:
    begin, stop = found.span()
    print(begin, stop)

32 38


Python's built-in `re` module only supports fixed-width patterns in look-behind. If a variable-length look-behind is required, use the third-party `regex` module instead.

In [26]:
# The alternatives "John" and "Jonathan" have different lengths, so the
# look-behind is variable-width and `re` refuses to compile it. The error
# is the point of this cell, so it is caught and displayed rather than
# left to abort a Restart & Run All.
try:
    pattern = re.compile(r'(?<=(John|Jonathan)\s)McLane')
except re.error as exc:
    lookbehind_error = str(exc)
    print(f'error: {lookbehind_error}')

error: look-behind requires fixed-width pattern

Extract names from a tweet:

In [28]:
# Find @mentions in a tweet. \B before "@" requires that the "@" is not
# glued to a preceding word character, so a bare "@Name" after a space
# qualifies. The "@" is part of the match here.
pattern = re.compile(r'\B@[\w_]+')
text = 'Know your Big Data = 5 for $50 on eBooks and 40% off all eBooks until Friday #bigdata #hadoop @HadoopNews packtpub.com/bigdataoffers'
mentions = pattern.findall(text)
mentions

['@HadoopNews']

In [29]:
# Same search, but the "@" is moved into a look-behind so that only the
# user name itself is returned. Reads `text` from the previous cell.
pattern = re.compile(r'(?<=\B@)[\w_]+')
user_names = pattern.findall(text)
user_names

['HadoopNews']

### Negative Look Behind

In [3]:
# Negative look-behind: match "Doe" only when it is NOT immediately
# preceded by "John" and one whitespace character.
people = 'John Doe, Calvin Doe, Hobbes Doe'
pattern = re.compile(r'(?<!John\s)Doe')
results = pattern.finditer(people)
for found in results:
    begin, stop = found.span()
    print(begin, stop)

17 20
29 32


## Look Around and Groups

In [4]:
# Capture the message of a log line that ends in "failed", unless the
# word right before "failed" is "authentication". With a single capturing
# group, findall returns the captured message text only.
log_line = 'INFO 2013-09-17 12:13:44,487 authentication failed'
pattern = re.compile(r'\w+\s[\d-]+\s[\d:,]+\s(.*(?<!authentication\s)failed)')
pattern.findall(log_line)

[]

In [5]:
# Reuses `pattern` from the previous cell: a failure that is not an
# authentication failure passes the negative look-behind and is captured.
log_line = 'INFO 2013-09-17 12:13:44,487 something else failed'
pattern.findall(log_line)

['something else failed']