<a href="https://colab.research.google.com/github/kemaladamr/python_re/blob/main/RE_with_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re

In [2]:
pattern = re.compile(r'\bfoo\b')

In [3]:
pattern.match("foo bar")

<re.Match object; span=(0, 3), match='foo'>

## Backslash in string literals

In [4]:
# Plain (non-raw) literal: Python collapses "\\\\" to two backslash characters,
# which the regex engine then reads as one escaped, literal backslash.
# Likewise "\\author" is the 7-character text starting with a single backslash.
pattern = re.compile("\\\\")
pattern.match("\\author")

<re.Match object; span=(0, 1), match='\\'>

In [5]:
# Raw literals skip Python's own escape processing, so the two typed
# backslashes reach the regex engine unchanged and match one literal backslash.
pattern = re.compile(r"\\")
pattern.match(r"\author")

<re.Match object; span=(0, 1), match='\\'>

## Building blocks for Python regex

In [6]:
pattern = re.compile(r'fo+')

In [7]:
pattern = re.compile(r'<HTML>')
pattern.match("<HTML>")


<re.Match object; span=(0, 6), match='<HTML>'>

In [8]:
re.match(r'<HTML>', "<HTML>")

<re.Match object; span=(0, 6), match='<HTML>'>

### Searching

In [9]:
# match(string[, pos[, endpos]])
# match() only tries the pattern at the starting position (index 0 by default);
# text after the matched part, such as "<head>", is simply not consumed.
pattern = re.compile(r'<HTML>')
pattern.match("<HTML><head>")

<re.Match object; span=(0, 6), match='<HTML>'>

In [10]:
pattern.match(" <HTML>")

In [11]:
pattern.search("    <HTML>")

<re.Match object; span=(4, 10), match='<HTML>'>

In [12]:
pattern = re.compile(r'<HTML>')
pattern.match("     <HTML>")

In [13]:
pattern.match("     <HTML>", 5)

<re.Match object; span=(5, 11), match='<HTML>'>

In [14]:
pattern = re.compile(r'^<HTML>')
pattern.match("<HTML>")

<re.Match object; span=(0, 6), match='<HTML>'>

In [15]:
# `pos` does not move the `^` anchor: it still means the real start of the
# string (index 0), so starting the match at index 5 finds nothing (None).
pattern.match("     <HTML>", 5)

In [16]:
pattern.match("     <HTML>"[5:])

<re.Match object; span=(0, 6), match='<HTML>'>

In [17]:
pattern = re.compile(r'<HTML>')
pattern.match("<HTML>"[:5])

In [18]:
pattern.match("<HTML>", 0, 5)

In [19]:
# With endpos=6 the string is treated as if it were only 6 characters long,
# so `$` matches right after "<HTML>" and the trailing spaces are ignored.
pattern = re.compile(r'<HTML>$')
pattern.match("<HTML>   ", 0, 6)

<re.Match object; span=(0, 6), match='<HTML>'>

In [20]:
pattern.match("<HTML>   "[:6])

<re.Match object; span=(0, 6), match='<HTML>'>

In [21]:
# search(string[, pos[, endpos]])
pattern = re.compile(r"world")
pattern.search("hello   world")

<re.Match object; span=(8, 13), match='world'>

In [22]:
pattern.search("hola    mundo ")

In [23]:
# re.MULTILINE lets `^` match at the start of the string and right after
# every newline, instead of only at the start of the string.
pattern = re.compile(r'^<HTML>', re.MULTILINE)
pattern.search("<HTML>")

<re.Match object; span=(0, 6), match='<HTML>'>

In [24]:
pattern.search("    <HTML>")

In [25]:
pattern.search("        \n<HTML>")

<re.Match object; span=(9, 15), match='<HTML>'>

In [26]:
pattern.search("        \n<HTML>", 9)

<re.Match object; span=(9, 15), match='<HTML>'>

In [27]:
pattern.search('</div></body>\n<HTML>', 4)

<re.Match object; span=(14, 20), match='<HTML>'>

In [28]:
pattern.search("    \n<HTML>", 6)

In [29]:
# findall(string[, pos[, endpos]])
pattern = re.compile(r"\w+")
pattern.findall("hello  world")

['hello', 'world']

In [30]:
# `a*` also matches the empty string, so findall reports an empty match at
# every position where no "a" starts (before "b" and at the end of the string).
pattern = re.compile(r'a*')
pattern.findall("aba")

['a', '', 'a', '']

In [31]:
pattern = re.compile(r'a?')
pattern.findall("aba")

['a', '', 'a', '']

In [32]:
pattern = re.compile(r"(\w+) (\w+)")
pattern.findall("Hello world hola mundo")

[('Hello', 'world'), ('hola', 'mundo')]

In [33]:
# finditer(string[, pos[, endpos]])
pattern = re.compile(r"(\w+) (\w+)")
it = pattern.finditer("Hello world hola mundo")
match = next(it)
match.groups()

('Hello', 'world')

In [34]:
match.span()

(0, 11)

In [35]:
match = next(it)
match.groups()

('hola', 'mundo')

In [36]:
match.span()

(12, 22)

In [37]:
# The iterator is exhausted after two matches, so one more next(it) raises StopIteration:
# match = next(it)

StopIteration: ignored

### Modifying a string

In [38]:
# split(string, maxsplit=0)
re.split(r"\n", "Beautiful is better than ugly.\nExplicit is better than implicit.")

['Beautiful is better than ugly.', 'Explicit is better than implicit.']

In [39]:
pattern = re.compile(r"\W")
pattern.split("hello world")

['hello', 'world']

In [40]:
pattern = re.compile(r"\W")
pattern.findall("hello world")

[' ']

In [41]:
pattern = re.compile(r"\W")
pattern.split("Beautiful is better than ugly", 2)

['Beautiful', 'is', 'better than ugly']

In [42]:
# A capturing group in the separator pattern makes split() keep the matched
# separators in the result list.
pattern = re.compile(r"(-)")
pattern.split("hello-world")

['hello', '-', 'world']

In [43]:
pattern = re.compile(r"-")
pattern.split("hello-world")

['hello', 'world']

In [44]:
# The input starts with a separator, so the result begins with an empty
# string; the capturing group also keeps each matched separator.
pattern = re.compile(r"(\W)")
pattern.split(" hello world")

['', ' ', 'hello', ' ', 'world']

In [45]:
# sub(repl, string, count=0)
pattern = re.compile(r"[0-9]+")
pattern.sub("-", "order0 order1 order13")

'order- order- order-'

In [46]:
# Five 0s: matches are non-overlapping and taken left to right, so two "00"
# pairs are replaced and the fifth 0 is left over.
re.sub('00', '-', 'order00000') # leftmost, non-overlapping matches are substituted

'order--0'

In [47]:
def normalize_orders(matchobj):
    """Replacement callback for re.sub.

    Returns "A" when the first captured group is a hyphen and "B" for any
    other captured text.
    """
    return "A" if matchobj.group(1) == '-' else "B"

# The callable is invoked once per match and its return value is the replacement.
# Inside a character class `|` is a literal pipe, not alternation, so this
# class matches "-", "|" or an uppercase ASCII letter.
re.sub('([-|A-Z])', normalize_orders, '1234 A193 B123')

'1234 B193 B123'

In [48]:
text = "imagine a new *world*, a magic *world*"
# Non-greedy `.*?` stops at the nearest closing asterisk, so each *...* pair
# is matched separately.
pattern = re.compile(r'\*(.*?)\*')
# `\g<1>` inserts group 1. The trailing `<\\b>` emits "<", a literal
# backslash, "b", ">".
# NOTE(review): an HTML closing tag would be "</b>" — confirm the intent.
pattern.sub(r"<b>\g<1><\\b>", text)

'imagine a new <b>world<\\b>, a magic <b>world<\\b>'

In [49]:
text = "imagine a new *world*, a magic *world*"
pattern = re.compile(r'\*(.*?)\*')
# `\g<1>1` is group 1 followed by a literal "1"; the shorter `\11` would be
# read as a reference to group 11.
pattern.sub(r"<b>\g<1>1<\\b>", text)

'imagine a new <b>world1<\\b>, a magic <b>world1<\\b>'

In [50]:
# subn(repl, string, count=0)
# Performs the same substitution as sub(), but returns a tuple of
# (new_string, number_of_substitutions_made).
text = "imagine a new *world*, a magic *world*"
pattern = re.compile(r'\*(.*?)\*')
pattern.subn(r"<b>\g<1><\\b>", text)

('imagine a new <b>world<\\b>, a magic <b>world<\\b>', 2)

### MatchObject

In [51]:
#group([group1,...])
pattern = re.compile(r"(\w+) (\w+)")
match = pattern.search("Hello world")

In [52]:
match.group()

'Hello world'

In [53]:
match.group(0)

'Hello world'

In [54]:
match.group(1)

'Hello'

In [55]:
match.group(2)

'world'

In [None]:
# match.group(3)  # would raise IndexError: no such group (the pattern has only two groups)

In [57]:
match.group(0, 2)

('Hello world', 'world')

In [58]:
pattern = re.compile(r"(?P<first>\w+) (?P<second>\w+)")

In [59]:
match = pattern.search("Hello world")
match.group('first')

'Hello'

In [60]:
match.group(1)

'Hello'

In [61]:
match.group(0, 'first', 2)

('Hello world', 'Hello', 'world')

In [62]:
# groups([default])
# Raw string literal: "\w" in a plain string literal is an invalid escape
# sequence (a warning today, planned to become an error in a future Python).
pattern = re.compile(r"(\w+) (\w+)")
match = pattern.search("Hello world")
# groups() returns a tuple with the text of every capturing group.
match.groups()

('Hello', 'world')

In [63]:
# Raw string literal: "\w" in a plain string literal is an invalid escape
# sequence (a warning today, planned to become an error in a future Python).
pattern = re.compile(r"(\w+) (\w+)?")
match = pattern.search("Hello ")
# The optional second group did not take part in the match, so groups()
# fills its slot with the supplied default instead of None.
match.groups("mundo")

('Hello', 'mundo')

In [64]:
match.groups()

('Hello', None)

In [65]:
# groupdict([default])
pattern = re.compile(r"(?P<first>\w+) (?P<second>\w+)")
pattern.search("Hello world").groupdict()

{'first': 'Hello', 'second': 'world'}

In [66]:
# start([group])
pattern = re.compile(r"(?P<first>\w+) (?P<second>\w+)?")
match = pattern.search("Hello ")
match.start(1)

0

In [67]:
# Group 2 is optional and did not take part in this match, so start() returns -1.
match = pattern.search("Hello ")
match.start(2)

-1

In [68]:
# end([group])
pattern = re.compile(r"(?P<first>\w+) (?P<second>\w+)?")
match = pattern.search("Hello ")
match.end(1)

5

In [69]:
match = pattern.search("Hello ")
match.end(2)

-1

In [70]:
# span([group])
# The pattern is compiled under the same name the next line uses, so this
# cell does not depend on a `pattern` left over from an earlier cell.
pattern = re.compile(r"(?P<first>\w+) (?P<second>\w+)?")
match = pattern.search("Hello ")
# span(g) is the (start(g), end(g)) pair for the given group.
match.span(1)

(0, 5)

In [72]:
# expand(template)
text = "imagine a new *world*, a magic *world*"
match = re.search(r'\*(.*?)\*', text)
match.expand(r"<b>\g<1><\\b>")

'<b>world<\\b>'

### Module Operations

In [73]:
# escape
re.findall(re.escape("^"), "^like^")

['^', '^']

## Compilation flags

In [74]:
# re.IGNORECASE or re.I
pattern = re.compile(r"[a-z]+", re.I)
pattern.search("Felix")

<re.Match object; span=(0, 5), match='Felix'>

In [75]:
pattern.search("felix")

<re.Match object; span=(0, 5), match='felix'>

In [76]:
# re.MULTILINE or re.M
# Raw string literal: "\w" and "\:" in a plain string literal are invalid
# escape sequences. The colon needs no escaping in a regex, so it is written
# bare; the pattern matches exactly the same text as before.
pattern = re.compile(r"^\w+: (\w+/\w+/\w+)", re.M)
# With re.M, `^` anchors at the start of every line, so both dates are found.
pattern.findall("date: 12/01/2013 \ndate: 11/01/2013")

['12/01/2013', '11/01/2013']