<a href="https://colab.research.google.com/github/kemaladamr/python_re/blob/main/RE_with_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re

In [2]:
# Compile the regex once so it can be reused; \b anchors "foo" at word boundaries.
pattern = re.compile(r'\bfoo\b')

In [3]:
# "foo" starts the string and is followed by a space, so both \b boundaries hold.
pattern.match("foo bar")

<re.Match object; span=(0, 3), match='foo'>

## Backslash in string literals

In [4]:
# In a normal string literal each backslash is escaped twice: once for Python
# and once for the regex engine, so four backslashes match a single "\".
pattern = re.compile("\\\\")
pattern.match("\\author")

<re.Match object; span=(0, 1), match='\\'>

In [5]:
# Raw strings remove the Python-level escaping, so two backslashes are enough
# to match one literal "\".
pattern = re.compile(r"\\")
pattern.match(r"\author")

<re.Match object; span=(0, 1), match='\\'>

## Building blocks for Python regex

In [6]:
# "+" repeats the preceding "o" one or more times: fo, foo, fooo, ...
# NOTE(review): this pattern is reassigned in the next cell before it is ever used.
pattern = re.compile(r'fo+')

In [7]:
# Outside of group syntax "<" and ">" are ordinary characters, so the tag is
# matched literally.
pattern = re.compile(r'<HTML>')
pattern.match("<HTML>")


<re.Match object; span=(0, 6), match='<HTML>'>

In [8]:
# Module-level shortcut: gives the same result as compiling and calling pattern.match().
re.match(r'<HTML>', "<HTML>")

<re.Match object; span=(0, 6), match='<HTML>'>

### Searching

In [9]:
# match(string[, pos[, endpos]])
# match() only requires the pattern at the start of the string; the trailing
# "<head>" does not prevent a match.
pattern = re.compile(r'<HTML>')
pattern.match("<HTML><head>")

<re.Match object; span=(0, 6), match='<HTML>'>

In [10]:
# The leading space means the tag is not at index 0, so match() returns None.
pattern.match(" <HTML>")

In [11]:
# search() scans forward through the string, so the tag is found at index 4.
pattern.search("    <HTML>")

<re.Match object; span=(4, 10), match='<HTML>'>

In [12]:
# Five leading spaces and no pos argument: match() tries index 0 only and returns None.
pattern = re.compile(r'<HTML>')
pattern.match("     <HTML>")

In [13]:
# pos=5 starts matching at index 5; the span is still reported relative to the
# full string.
pattern.match("     <HTML>", 5)

<re.Match object; span=(5, 11), match='<HTML>'>

In [16]:
# "^" anchors the pattern to the real beginning of the string.
pattern = re.compile(r'^<HTML>')
pattern.match("<HTML>")

<re.Match object; span=(0, 6), match='<HTML>'>

In [17]:
# pos moves the starting index, but "^" still refers to the real start of the
# string, so this returns None.
pattern.match("     <HTML>", 5)

In [18]:
# Slicing creates a new string whose index 0 is "<", so "^" matches here.
pattern.match("     <HTML>"[5:])

<re.Match object; span=(0, 6), match='<HTML>'>

In [19]:
# The slice drops the final ">", leaving "<HTML", so the pattern cannot match.
pattern = re.compile(r'<HTML>')
pattern.match("<HTML>"[:5])

In [21]:
# endpos=5 limits matching to the first five characters, like matching against
# "<HTML>"[:5], so the result is None.
pattern.match("<HTML>", 0, 5)

In [22]:
# Unlike pos with "^", endpos is treated as the end of the string, so "$"
# matches at index 6 even though spaces follow.
pattern = re.compile(r'<HTML>$')
pattern.match("<HTML>   ", 0, 6)

<re.Match object; span=(0, 6), match='<HTML>'>

In [23]:
# Slicing to six characters gives the same result as passing endpos=6.
pattern.match("<HTML>   "[:6])

<re.Match object; span=(0, 6), match='<HTML>'>

In [25]:
# search(string[, pos[, endpos]])
# search() returns the first location where the pattern matches, here index 8.
pattern = re.compile(r"world")
pattern.search("hello   world")

<re.Match object; span=(8, 13), match='world'>

In [26]:
# "world" does not occur anywhere in this string, so search() returns None.
pattern.search("hola    mundo ")

In [28]:
# With re.MULTILINE, "^" matches at the start of the string and also right
# after every newline.
pattern = re.compile(r'^<HTML>', re.MULTILINE)
pattern.search("<HTML>")

<re.Match object; span=(0, 6), match='<HTML>'>

In [29]:
# The tag is preceded by spaces on the same line, so no line start lines up with it.
pattern.search("    <HTML>")

In [30]:
# The tag sits right after the newline (index 9), which counts as a line start.
pattern.search("        \n<HTML>")

<re.Match object; span=(9, 15), match='<HTML>'>

In [34]:
# pos=9 is the index right after the newline, which is still a valid line start.
pattern.search("        \n<HTML>", 9)

<re.Match object; span=(9, 15), match='<HTML>'>

In [35]:
# pos=4 starts in the middle of the first line; the search continues until the
# line start after "\n" (index 14).
pattern.search('</div></body>\n<HTML>', 4)

<re.Match object; span=(14, 20), match='<HTML>'>

In [37]:
# pos=6 is already past the "<" at index 5, so no remaining line start can match.
pattern.search("    \n<HTML>", 6)

In [38]:
# findall(string[, pos[, endpos]])
# With no groups in the pattern, findall() returns the matched substrings as a list.
pattern = re.compile(r"\w+")
pattern.findall("hello  world")

['hello', 'world']

In [39]:
# "a*" can match zero characters, so findall() also reports the empty matches
# at "b" and at the end of the string.
pattern = re.compile(r'a*')
pattern.findall("aba")

['a', '', 'a', '']

In [40]:
# "a?" is also allowed to match nothing, which yields the same empty matches
# as "a*" on this input.
pattern = re.compile(r'a?')
pattern.findall("aba")

['a', '', 'a', '']

In [43]:
# When the pattern has several groups, findall() returns one tuple of group
# values per match.
pattern = re.compile(r"(\w+) (\w+)")
pattern.findall("Hello world hola mundo")

[('Hello', 'world'), ('hola', 'mundo')]

In [46]:
# finditer(string[, pos[, endpos]])
# finditer() returns an iterator of match objects; next() pulls them one at a time.
pattern = re.compile(r"(\w+) (\w+)")
it = pattern.finditer("Hello world hola mundo")
match = next(it)
match.groups()

('Hello', 'world')

In [47]:
# span() gives the (start, end) indices of the whole match in the searched string.
match.span()

(0, 11)

In [49]:
# Advance the iterator to the second (and last) match.
match = next(it)
match.groups()

('hola', 'mundo')

In [50]:
# The second match starts at index 12, after "Hello world" and a space.
match.span()

(12, 22)

In [51]:
# StopIteration (error): both matches have been consumed, so a third next()
# raises instead of returning a match.
# NOTE(review): this uncaught exception stops a "Run All" at this cell;
# consider wrapping the call in try/except if the notebook must run end to end.
match = next(it)


StopIteration: ignored

### Modifying a string

In [52]:
# split(string, maxsplit=0)
# Split the text wherever the pattern (a newline) matches.
re.split(r"\n", "Beautiful is better than ugly.\nExplicit is better than implicit.")

['Beautiful is better than ugly.', 'Explicit is better than implicit.']

In [53]:
# \W matches any non-word character; the space is the only separator here.
pattern = re.compile(r"\W")
pattern.split("hello world")

['hello', 'world']

In [54]:
# findall() with the same pattern shows what split() used as the separator.
pattern = re.compile(r"\W")
pattern.findall("hello world")

[' ']

In [55]:
# maxsplit=2 performs at most two splits; the rest of the string is returned
# unsplit as the last element.
pattern = re.compile(r"\W")
pattern.split("Beautiful is better than ugly", 2)

['Beautiful', 'is', 'better than ugly']

In [56]:
# A capturing group in the pattern makes split() keep the separators in the result.
pattern = re.compile(r"(-)")
pattern.split("hello-world")

['hello', '-', 'world']

In [57]:
# Without a capturing group the separator is dropped from the result.
pattern = re.compile(r"-")
pattern.split("hello-world")

['hello', 'world']

In [58]:
# The string starts with a separator, so the first element is an empty string;
# the captured separators are kept because of the group.
pattern = re.compile(r"(\W)")
pattern.split(" hello world")

['', ' ', 'hello', ' ', 'world']

In [60]:
# sub(repl, string, count=0)
# Every run of digits is replaced by a single "-", regardless of its length.
pattern = re.compile(r"[0-9]+")
pattern.sub("-", "order0 order1 order13")

'order- order- order-'

In [62]:
re.sub('00', '-', 'order00000') # five 0s: non-overlapping "00" pairs are replaced left to right, so one 0 is left over

'order--0'

In [64]:
def normalize_orders(matchobj):
    """Replacement callback for ``re.sub``.

    Returns "A" when the first captured group is a dash and "B" for any
    other captured text.
    """
    captured = matchobj.group(1)
    return "A" if captured == '-' else "B"

# sub() calls normalize_orders once per match and inserts its return value.
# The input contains no "-", so only the "B" branch is exercised here.
# NOTE(review): inside [...] the "|" is a literal pipe, not alternation —
# confirm whether "[-A-Z]" was intended.
re.sub('([-|A-Z])', normalize_orders, '1234 A193 B123')

'1234 B193 B123'

In [65]:
# The non-greedy group captures the text between a pair of asterisks, and
# \g<1> re-inserts that text in the replacement.
# NOTE(review): the replacement emits "<\b>" (backslash), not the HTML closing
# tag "</b>" — confirm which one was intended.
text = "imagine a new *world*, a magic *world*"
pattern = re.compile(r'\*(.*?)\*')
pattern.sub(r"<b>\g<1><\\b>", text)

'imagine a new <b>world<\\b>, a magic <b>world<\\b>'

In [66]:
# \g<1> is required when a digit follows the reference: "\11" would be read as
# a reference to group 11 rather than group 1 followed by a literal "1".
text = "imagine a new *world*, a magic *world*"
pattern = re.compile(r'\*(.*?)\*')
pattern.sub(r"<b>\g<1>1<\\b>", text)

'imagine a new <b>world1<\\b>, a magic <b>world1<\\b>'

In [69]:
# subn(repl, string, count=0)
# Same substitution as sub(), but the result is a tuple of
# (new_string, number_of_substitutions_made).
# NOTE(review): the replacement emits "<\b>" (backslash), not "</b>" — confirm intent.
text = "imagine a new *world*, a magic *world*"
pattern = re.compile(r'\*(.*?)\*')
pattern.subn(r"<b>\g<1><\\b>", text)

('imagine a new <b>world<\\b>, a magic <b>world<\\b>', 2)

### MatchObject

In [70]:
#group([group1,...])
# Two capturing groups, one per word; the match object is reused by the cells below.
pattern = re.compile(r"(\w+) (\w+)")
match = pattern.search("Hello world")

In [71]:
# With no arguments, group() returns the entire matched text.
match.group()

'Hello world'

In [72]:
# Group 0 is the whole match, identical to calling group() with no arguments.
match.group(0)

'Hello world'

In [73]:
# Groups are numbered from 1 in order of their opening parenthesis.
match.group(1)

'Hello'

In [74]:
# The second capturing group holds the second word.
match.group(2)

'world'

In [75]:
# The pattern defines only two groups, so asking for group 3 raises IndexError.
# NOTE(review): this uncaught exception stops a "Run All" at this cell;
# consider wrapping the call in try/except if the notebook must run end to end.
match.group(3)

IndexError: ignored

In [76]:
# Passing several group ids returns a tuple with one entry per id.
match.group(0, 2)

('Hello world', 'world')

In [78]:
# (?P<name>...) defines a named group that can be looked up by name as well as by number.
pattern = re.compile(r"(?P<first>\w+) (?P<second>\w+)")

In [80]:
# Retrieve a group by the name given in the pattern.
match = pattern.search("Hello world")
match.group('first')

'Hello'

In [81]:
# Named groups are still numbered, so index 1 returns the same text as 'first'.
match.group(1)

'Hello'

In [82]:
# Numeric indices and group names can be mixed in a single call.
match.group(0, 'first', 2)

('Hello world', 'Hello', 'world')

In [84]:
# groups([default])
# groups() returns a tuple with the text of every capturing group.
# The pattern is a raw string so that "\w" reaches the regex engine untouched;
# in a normal string literal "\w" is an invalid escape sequence and triggers a
# warning on recent Python versions.
pattern = re.compile(r"(\w+) (\w+)")
match = pattern.search("Hello world")
match.groups()

('Hello', 'world')

In [85]:
# The second group is optional and does not participate for "Hello ", so
# groups("mundo") substitutes the supplied default for it.
# The pattern is a raw string so that "\w" is not treated as an (invalid)
# Python string escape.
pattern = re.compile(r"(\w+) (\w+)?")
match = pattern.search("Hello ")
match.groups("mundo")

('Hello', 'mundo')

In [86]:
# Without a default, groups that did not participate in the match are reported as None.
match.groups()

('Hello', None)