In [34]:
import re

# Sample text searched by the regex cells below via pattern.finditer(text_to_search).
# This is a regular (non-raw) string literal, so backslash escapes are processed:
#   - the lone backslash in the metacharacter list is written as `\\` so that it
#     yields exactly one literal backslash without relying on an unrecognized
#     escape sequence (a bare backslash followed by a space triggers a
#     SyntaxWarning on Python 3.12+);
#   - the `\n` line is a real newline escape (an actual line break), not the
#     two characters backslash + n.
# The literal starts with a line break, so index 0 of the string is '\n'.
text_to_search = '''
word1 word2
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
abc
Har -HaHaff

word -word 
sword
wording
word

MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \\ | ( )
\n
200$
Ha 
HaHarr

coreyms.com
coreyms.com

321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234

Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
abc
'''

https://github.com/CoreyMSchafer/code_snippets/blob/master/Python-Regular-Expressions/snippets.txt

In [3]:
print('Tab')
print('\tTab')  # in a normal string, \t is interpreted as a tab character
print(r'\tTab')  # the r prefix makes a raw string: backslashes are kept literally, so \t is printed as-is

Tab
	Tab
\tTab


### patterns

In [13]:
# Compile the literal pattern 'abc' and print every match found in text_to_search.
pattern = re.compile(r'abc')
matches = pattern.finditer(text_to_search)  # iterator of match objects; each carries its span (start, end)
for match in matches: print(match)

<re.Match object; span=(1, 4), match='abc'>
<re.Match object; span=(66, 69), match='abc'>
<re.Match object; span=(281, 284), match='abc'>


In [14]:
# Slice text_to_search with the spans reported by the match objects above:
# a span (start, end) can be used directly as text_to_search[start:end].
print(text_to_search[1:4])
print(text_to_search[66:69])
print(text_to_search[281:284])

abc
abc
abc


In [30]:
# . ^ $ * + ? { } [ ] \ | ( ) are metacharacters: each one needs to be escaped
# with a backslash to be matched literally. Here r'\.' matches a literal dot.
pattern = re.compile(r'\.')
matches = pattern.finditer(text_to_search)
for match in matches: print(match)

<re.Match object; span=(116, 117), match='.'>
<re.Match object; span=(158, 159), match='.'>
<re.Match object; span=(180, 181), match='.'>
<re.Match object; span=(184, 185), match='.'>
<re.Match object; span=(232, 233), match='.'>
<re.Match object; span=(263, 264), match='.'>
<re.Match object; span=(276, 277), match='.'>


In [31]:
# The dot needs to be escaped so that it matches a literal '.' rather than any character.
pattern = re.compile(r'coreyms\.com')
matches = pattern.finditer(text_to_search)
for match in matches: print(match)

<re.Match object; span=(151, 162), match='coreyms.com'>


In [36]:
# \D      - Any character that is not a decimal digit
#           (unlike '.', this also matches the newline character)
pattern = re.compile(r'\D')
matches = pattern.finditer(text_to_search)
for match in matches: print(match)

<re.Match object; span=(0, 1), match='\n'>
<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='b'>
<re.Match object; span=(3, 4), match='c'>
<re.Match object; span=(4, 5), match='d'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='f'>
<re.Match object; span=(7, 8), match='g'>
<re.Match object; span=(8, 9), match='h'>
<re.Match object; span=(9, 10), match='i'>
<re.Match object; span=(10, 11), match='j'>
<re.Match object; span=(11, 12), match='k'>
<re.Match object; span=(12, 13), match='l'>
<re.Match object; span=(13, 14), match='m'>
<re.Match object; span=(14, 15), match='n'>
<re.Match object; span=(15, 16), match='o'>
<re.Match object; span=(16, 17), match='p'>
<re.Match object; span=(17, 18), match='q'>
<re.Match object; span=(18, 19), match='u'>
<re.Match object; span=(19, 20), match='r'>
<re.Match object; span=(20, 21), match='t'>
<re.Match object; span=(21, 22), match='u'>
<re.Match object; span=(22, 23), match='v'>
<re.Ma

# `\b` (Word Boundary)

- `\b` matches a position **between** a word character (`\w`: letters, digits, or underscore) and a non-word character (`\W`: any character that is not a word character), or between a word character and the start or end of a string.
- It **detects word boundaries** (the transition between a word and a non-word).

#### Example:
```regex
\bword\b
```
This pattern will match the word "word" only when it is a standalone word in the text. It won't match if "word" is part of a longer word like "sword" or "wording".

`\b` matches the empty string, but only at the beginning or end of a word. A word is defined as a sequence of word characters. Note that formally, `\b` is defined as the boundary between a `\w` and a `\W` character (or vice versa), or between `\w` and the beginning or end of the string. This means that `r'\bat\b'` matches `'at'`, `'at.'`, `'(at)'`, and `'as at ay'` but not `'attempt'` or `'atlas'`.

In [30]:
# \b      - Word Boundary: matches at the beginning or at the end of a word
pattern = re.compile(r'\bword\b')    # \b on both sides: matches only the standalone word 'word'
matches = pattern.finditer(text_to_search)
for match in matches: print(match)

<re.Match object; span=(95, 99), match='word'>
<re.Match object; span=(101, 105), match='word'>
<re.Match object; span=(121, 125), match='word'>


In [31]:
# \b      - Word Boundary: matches at the beginning or at the end of a word
pattern = re.compile(r'\bword')    # \b only at the start: also matches the 'word' prefix of wording, word1, word2
matches = pattern.finditer(text_to_search)
for match in matches: print(match)

<re.Match object; span=(1, 5), match='word'>
<re.Match object; span=(7, 11), match='word'>
<re.Match object; span=(95, 99), match='word'>
<re.Match object; span=(101, 105), match='word'>
<re.Match object; span=(113, 117), match='word'>
<re.Match object; span=(121, 125), match='word'>


### `\B` (Non-Word Boundary)
- `\B` matches any position **that is not** a word boundary.
- It ensures that the match occurs **inside** a word or between two non-word characters, essentially the opposite of `\b`.

#### Example:
```regex
\Bword\B
```
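This pattern will only match `"word"` if it is **not** at the start or end of a word, i.e., it would match inside a longer word like "swordplay" or "passwords", but **not** when "word" is a standalone word. It also does **not** match in "password": there "word" sits at the end of the word, so the trailing `\B` fails (the leading-only pattern `\Bword` does match it).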


In [102]:
# \B      - Not a Word Boundary
pattern = re.compile(r'\Bword')  # matches only the 'word' in sword: its 'w' follows another word character, so no boundary precedes it
matches = pattern.finditer(text_to_search)
for match in matches: print(match)

<re.Match object; span=(102, 106), match='word'>


In [32]:
pattern = re.compile(r'\Bword')  # matches the 'word' inside "swordplay": 'w' follows 's', so there is no word boundary before it
matches = pattern.finditer("swordplay")
for match in matches: print(match)

<re.Match object; span=(1, 5), match='word'>


In [33]:
pattern = re.compile(r'\Bword')  # matches the 'word' at the end of "password": only the position before 'w' must be a non-boundary
matches = pattern.finditer("password")
for match in matches: print(match)

<re.Match object; span=(4, 8), match='word'>


# `^` & `$`

`^` (Caret.) Matches the start of the string, and in MULTILINE mode also matches immediately after each newline.

`$` Matches the end of the string or just before the newline at the end of the string, and in MULTILINE mode also matches before a newline.

In [None]:
# NOTE(review): this cell does not use ^ or $, which the section heading above
# introduces - looks like a placeholder; confirm the intended example.
pattern = re.compile(r'\d\d\d')  # three consecutive digits; "password" contains no digits, so nothing is printed
matches = pattern.finditer("password")
for match in matches: print(match)