# 18. Regular expression (2)

In [1]:
import re

### 5. Compile options

In [2]:
# re.DOTALL lets "." match a newline as well, so 'a.b' matches 'a\nb'.
pattern = re.compile('a.b', re.DOTALL)
pattern.match('a\nb')

<re.Match object; span=(0, 3), match='a\nb'>

In [3]:
# re.I (IGNORECASE): the lowercase class [a-z] also accepts uppercase letters.
pattern = re.compile('[a-z]', re.I)
for word in ('python', 'Python', 'PYTHON'):
    print(pattern.match(word))

<re.Match object; span=(0, 1), match='p'>
<re.Match object; span=(0, 1), match='P'>
<re.Match object; span=(0, 1), match='P'>


In [4]:
# Without re.MULTILINE, "^" anchors only at the very start of the string,
# so only the first line of `data` can match.
# The pattern is a raw string: \s and \w are regex escapes, not Python string
# escapes, and a non-raw literal triggers an invalid-escape warning.
pattern = re.compile(r"^python\s\w+")

data = """python one
life is too short
python two
you need python
python three"""

m = pattern.findall(data)

if m:
    print("Found : ", m)
else:
    print("No data found")

Found :  ['python one']


In [5]:
# multi line data
# With re.MULTILINE, "^" also matches right after every newline, so each line
# that starts with "python" is found.
# Raw string keeps \s and \w as regex escapes rather than Python string escapes.
pattern = re.compile(r"^python\s\w+", re.MULTILINE)
m = pattern.findall(data)

if m:
    print("Found : ", m)
else:
    print("No data found")

Found :  ['python one', 'python two', 'python three']


In [6]:
# Numeric character reference such as "&#65;", written as a compact one-liner:
# "&#" + (octal with a leading 0 | decimal | hex with a leading x) + ";".
charref = re.compile(r'&[#](0[0-7]+|[0-9]+|x[0-9a-fA-F]+);')

In [7]:
# Same pattern as the one-line form, spread over several lines.
# re.VERBOSE makes the regex engine ignore whitespace and newlines inside the
# pattern (outside of character classes such as [#]).
charref = re.compile(r"""
&[#]
(
0[0-7]+
|[0-9]+
|x[0-9a-fA-F]+
)
;
""",re.VERBOSE)

In [8]:
# escape character
# In a raw string, r'\\' reaches the regex engine as two backslashes, which
# match one literal backslash: this pattern finds the text "\section".
pattern = re.compile(r'\\section')

### 6. Group

Grouping
1. A group lets you search for a specific piece of text that repeats consecutively.
2. A group is declared with parentheses: ()

Related functions
1. group() or group(0): the entire matched string
2. group(1): the text matched by the 1st group
3. group(n): the text matched by the nth group

In [9]:
# (test)+ : the parentheses make "test" a group, and "+" repeats the whole group.
pattern = re.compile('(test)+')
data = pattern.search('testtesttest OK?')
data

<re.Match object; span=(0, 12), match='testtesttest'>

In [10]:
# group(0) is the entire matched text.
data.group(0)

'testtesttest'

In [11]:
# group(1) is the first parenthesised group; when a group repeats, only its
# last repetition is kept.
data.group(1)

'test'

In [12]:
# Exercise: write a regular expression for text of the form "name phone",
# e.g. "yoo 000-0000-0000".
# A stricter answer would be (\w+)\s\d{3}[-]\d{3,4}[-]\d{4}; the looser \d+
# form is used here.
# Raw string so that \w, \s and \d reach the regex engine without Python
# treating them as (invalid) string escapes.
pattern = re.compile(r"(\w+)\s\d+[-]\d+[-]\d+")
data = pattern.search("yoo 010-1111-2345")
data

<re.Match object; span=(0, 17), match='yoo 010-1111-2345'>

In [13]:
# group() with no argument is the same as group(0): the whole match.
data.group()

'yoo 010-1111-2345'

In [14]:
# group(0): the whole match.
data.group(0)

'yoo 010-1111-2345'

In [15]:
# group(1): the name captured by (\w+).
data.group(1)

'yoo'

In [16]:
# Demonstration of an error: the pattern defines only one group, so asking
# for group 2 raises IndexError. The exception is caught and displayed so the
# notebook still runs from top to bottom.
try:
    data.group(2)
except IndexError as exc:
    print(f"{type(exc).__name__}: {exc}")

IndexError: no such group

In [17]:
# group name
# (?P<name>...) gives a group a name; the captured text can then be fetched
# with group("name").
pattern = re.compile(r"(?P<name>\w+)\s+(\d+[-]\d+[-]\d+)")
data = pattern.search("park 010-1234-1234")

data.group("name")

'park'

In [18]:
# Wrapping the phone number in parentheses makes it group 2.
pattern = re.compile(r"(\w+)\s+(\d+[-]\d+[-]\d+)")
data = pattern.search("yoo 010-1111-2345")

data.group(2)

'010-1111-2345'

In [19]:
# Groups are numbered by their opening parenthesis, left to right.
# Here the whitespace is captured too, so it becomes group 2.
pattern = re.compile(r"(\w+)(\s+)(\d+[-]\d+[-]\d+)")
data = pattern.search("yoo 010-1111-2345")

data.group(2)

' '

In [20]:
# With the whitespace captured as group 2, the phone number is group 3.
data.group(3)

'010-1111-2345'

In [21]:
# Each part of the phone number gets its own group, giving five groups in total.
pattern = re.compile(r"(\w+)(\s+)(\d+)[-](\d+)[-](\d+)")
data = pattern.search("yoo 010-1111-2345")

data.group(5)

'2345'

In [22]:
# groups() returns every captured group (1, 2, ...) as a tuple.
data.groups()

('yoo', ' ', '010', '1111', '2345')

### 7. Lookahead assertions (forward search)

In [23]:
# ".+:" consumes the colon, so the colon is part of the match.
pattern = re.compile(".+:")
pattern.search("http://google.com")

<re.Match object; span=(0, 5), match='http:'>

In [24]:
# (?=:) is a positive lookahead: a colon must follow, but it is not consumed,
# so the match stops just before it.
pattern = re.compile(".+(?=:)")
pattern.search("http://google.com")

<re.Match object; span=(0, 4), match='http'>

In [25]:
# Attempt to reject the extension "bat" without lookahead: each alternative
# allows an extension that differs from "bat" in its first, second or third
# character.
pattern = re.compile(".*[.]([^b].?.?|.[^a]?.?|..?[^t]?)$")
pattern.search("sendmail.cf")

<re.Match object; span=(0, 11), match='sendmail.cf'>

In [26]:
# The extension "bat" is rejected, so search() returns None and nothing is shown.
pattern.search("autoexec.bat")

In [27]:
# Excluding "bat" with a negative lookahead: (?!bat$) fails the match when the
# extension is exactly "bat", so this search returns None as well.
pattern = re.compile(".*[.](?!bat$).*$")
pattern.search("autoexec.bat")

### 8. String replacement

In [28]:
# sub(replacement, string) replaces every match of the pattern.
pattern = re.compile('(blue|white|red)')
pattern.sub('colour', 'blue socks and red shoes')

'colour socks and colour shoes'

In [29]:
# count=1 limits the substitution to the first match.
pattern.sub('colour', 'blue socks and red shoes', count=1)

'colour socks and red shoes'

In [30]:
# subn() substitutes like sub() but returns a tuple:
# (new string, number of substitutions made).
pattern = re.compile('(blue|white|red)')
pattern.subn('colour', 'blue socks and red shoes')

('colour socks and colour shoes', 2)

In [31]:
# \g<name> in the replacement refers to a named group, so the two captured
# fields are swapped. The replacement is a raw string because "\g" is not a
# valid Python string escape.
pattern = re.compile(r"(?P<name>\w+)\s+(?P<phone>(\d+)[-]\d+[-]\d+)")
pattern.sub(r"\g<phone> \g<name>", "park 010-1234-1234")

'010-1234-1234 park'

In [32]:
def hexrepl(match):
    """Return the matched decimal digits rewritten as a hexadecimal literal."""
    return hex(int(match.group()))


pattern = re.compile(r'\d+')

In [33]:
# The replacement may be a function: it is called with each match object and
# its return value is substituted in.
pattern.sub(hexrepl, 'Call 65490 for printing, 49152 for user code.')

'Call 0xffd2 for printing, 0xc000 for user code.'

### 9. Greedy vs Non-greedy

In [34]:
s = '<html><head><title>Title</title>'

# Greedy: ".*" takes as much as it can, so the match runs to the last ">".
greedy = re.match('<.*>', s)
print(greedy.span())
print(greedy.group())

(0, 32)
<html><head><title>Title</title>


In [35]:
# Non-greedy: ".*?" takes as little as it can, so the match ends at the first ">".
lazy = re.match('<.*?>', s)
print(lazy.span())
print(lazy.group())

(0, 6)
<html>
