In [1]:
import re

## Important Notes 

Period . - Any Character Except New Line  
\d      - Digit (0-9)  
\D      - Not a Digit (0-9)  
\w      - Word Character (a-z, A-Z, 0-9, _)  
\W      - Not a Word Character  
\s      - Whitespace (space, tab, newline)  
\S      - Not Whitespace (space, tab, newline)  

### Anchor elements
```
\b      - Word Boundary  
\B      - Not a Word Boundary  
^       - Beginning of a String  
$       - End of a String  

```

[]      - Matches Characters in brackets  
[^ ]    - Matches Characters NOT in brackets  
|       - Either Or  
( )     - Group  

### Quantifiers:  
```
*       - 0 or More  
+       - 1 or More  
?       - 0 or One  
{3}     - Exact Number  
{3,8}   - Range of Numbers (Minimum, Maximum)  
```


#### Sample Regexs to match all email addresses####

[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+  

In [2]:
text_to_search = '''
abcdefghijklmnopqurtuvwxyzabc
ABCDEFGHIJKLMNOPQRSTUVWXYZabc
1234567890abc_yahoo


Ha HaHa_HA

MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )

coreyms.com


321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234

Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T

cat 
mat
pat
bat

'''

emails = '''
CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
'''

sentence = 'Start a sentence and then bring it to an end'

In [3]:
urls = '''

https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

In [4]:
print('Ordinary String : \n' + '\tTab')


Ordinary String : 
	Tab


In [5]:
print('Raw String : \n' + r'\tTab')

Raw String : 
\tTab


In [6]:
pattern = re.compile(r'abc')
matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(1, 4), match='abc'>
<_sre.SRE_Match object; span=(27, 30), match='abc'>
<_sre.SRE_Match object; span=(57, 60), match='abc'>
<_sre.SRE_Match object; span=(71, 74), match='abc'>


In [7]:
print(text_to_search[1:4])

abc


In [8]:
pattern = re.compile(r'.')  # It will match almost everything
matches = pattern.finditer(text_to_search)

#for match in matches:
#    print(match)

In [9]:
# But if backslash (\) is used it will search only for period(.)
pattern = re.compile(r'\.')  
matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(132, 133), match='.'>
<_sre.SRE_Match object; span=(168, 169), match='.'>
<_sre.SRE_Match object; span=(191, 192), match='.'>
<_sre.SRE_Match object; span=(195, 196), match='.'>
<_sre.SRE_Match object; span=(243, 244), match='.'>
<_sre.SRE_Match object; span=(274, 275), match='.'>
<_sre.SRE_Match object; span=(287, 288), match='.'>


In [10]:
# To match the url  coreyms.com

pattern = re.compile(r'coreyms\.com')  
matches = pattern.finditer(text_to_search)
#  matches  -----> its prints as <callable_iterator at 0x7a4609e208> so better use for loop

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(161, 172), match='coreyms.com'>


In [11]:
# To match only the digits

pattern = re.compile(r'\d')  
matches = pattern.finditer(text_to_search)

# for match in matches:
#    print(match)

In [12]:
# To match only the digits 800-555-1234  and 900-555-1234

pattern = re.compile(r'[89]0*[-.]\d*[-.]\d*')  
matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(214, 226), match='800-555-1234'>
<_sre.SRE_Match object; span=(227, 239), match='900-555-1234'>


In [13]:
# To match the phone numbers

pattern = re.compile(r'\d*[-.]\d*[-.]\d*')  
matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(175, 187), match='321-555-4321'>
<_sre.SRE_Match object; span=(188, 200), match='123.555.1234'>
<_sre.SRE_Match object; span=(214, 226), match='800-555-1234'>
<_sre.SRE_Match object; span=(227, 239), match='900-555-1234'>


More accurate and better way of doing this is :

In [14]:
pattern = re.compile(r'\d{3}[-.]\d{3}[-.]\d{4}')  
matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(175, 187), match='321-555-4321'>
<_sre.SRE_Match object; span=(188, 200), match='123.555.1234'>
<_sre.SRE_Match object; span=(214, 226), match='800-555-1234'>
<_sre.SRE_Match object; span=(227, 239), match='900-555-1234'>


In [15]:
# To match the phone numbers from a text file

pattern = re.compile(r'\d*[-.]\d*[-.]\d*')

with open('data.txt', 'r') as f:
    contents = f.read()
    
    matches = pattern.finditer(contents)
    
    for match in matches:
        print(match)

<_sre.SRE_Match object; span=(12, 24), match='615-555-7164'>
<_sre.SRE_Match object; span=(102, 114), match='800-555-5669'>
<_sre.SRE_Match object; span=(191, 203), match='560-555-5153'>
<_sre.SRE_Match object; span=(281, 293), match='900-555-9340'>
<_sre.SRE_Match object; span=(378, 390), match='714-555-7405'>
<_sre.SRE_Match object; span=(467, 479), match='800-555-6771'>
<_sre.SRE_Match object; span=(557, 569), match='783-555-4799'>
<_sre.SRE_Match object; span=(647, 659), match='516-555-4615'>
<_sre.SRE_Match object; span=(740, 752), match='127-555-1867'>
<_sre.SRE_Match object; span=(831, 843), match='608-555-4938'>
<_sre.SRE_Match object; span=(917, 929), match='568-555-6051'>
<_sre.SRE_Match object; span=(1005, 1017), match='292-555-1875'>
<_sre.SRE_Match object; span=(1093, 1105), match='900-555-3205'>
<_sre.SRE_Match object; span=(1182, 1194), match='614-555-1166'>
<_sre.SRE_Match object; span=(1273, 1285), match='530-555-2676'>
<_sre.SRE_Match object; span=(1359, 1371), match=

In [16]:
# To match cat, mat and pat

pattern = re.compile(r'[^b]at')
matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(292, 295), match='cat'>
<_sre.SRE_Match object; span=(297, 300), match='mat'>
<_sre.SRE_Match object; span=(301, 304), match='pat'>


## To search for a word character
\w - Word Character (a-z, A-Z, 0-9, _)

In [17]:
text = 'I[_]am#_2*\\|'
pattern = re.compile(r'\w')  
matches = pattern.finditer(text)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(0, 1), match='I'>
<_sre.SRE_Match object; span=(2, 3), match='_'>
<_sre.SRE_Match object; span=(4, 5), match='a'>
<_sre.SRE_Match object; span=(5, 6), match='m'>
<_sre.SRE_Match object; span=(7, 8), match='_'>
<_sre.SRE_Match object; span=(8, 9), match='2'>


### To search for words that are not word characters

\W - Not a Word Character (includes metacharacters, tabs, spaces)

In [18]:
text = '''
I[_]am#_2*\\| sl99
df2'''
pattern = re.compile(r'\W')  
matches = pattern.finditer(text)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(0, 1), match='\n'>
<_sre.SRE_Match object; span=(2, 3), match='['>
<_sre.SRE_Match object; span=(4, 5), match=']'>
<_sre.SRE_Match object; span=(7, 8), match='#'>
<_sre.SRE_Match object; span=(10, 11), match='*'>
<_sre.SRE_Match object; span=(11, 12), match='\\'>
<_sre.SRE_Match object; span=(12, 13), match='|'>
<_sre.SRE_Match object; span=(13, 14), match=' '>
<_sre.SRE_Match object; span=(18, 19), match='\n'>


## To match
Mr. Schafer  
Mr Smith  
Ms Davis  
Mrs. Robinson  
Mr. T  

In [19]:
pattern = re.compile(r'Mr\.?\s[A-Z]\w*')  
matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(241, 252), match='Mr. Schafer'>
<_sre.SRE_Match object; span=(253, 261), match='Mr Smith'>
<_sre.SRE_Match object; span=(285, 290), match='Mr. T'>


In [20]:
# To match Mrs. and Ms also with the help of group
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*')  
matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(241, 252), match='Mr. Schafer'>
<_sre.SRE_Match object; span=(253, 261), match='Mr Smith'>
<_sre.SRE_Match object; span=(262, 270), match='Ms Davis'>
<_sre.SRE_Match object; span=(271, 284), match='Mrs. Robinson'>
<_sre.SRE_Match object; span=(285, 290), match='Mr. T'>


### To match the following email:
CoreyMSchafer@gmail.com  
corey.schafer@university.edu  
corey-321-schafer@my-work.net 

In [21]:
pattern = re.compile(r'\w*@\w*\.(com|edu|net)')  
matches = pattern.finditer(emails)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<_sre.SRE_Match object; span=(31, 53), match='schafer@university.edu'>


In [22]:
pattern = re.compile(r'[a-zA-Z0-9-.]+@[a-z-]+\.(com|edu|net)')  
matches = pattern.finditer(emails)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<_sre.SRE_Match object; span=(25, 53), match='corey.schafer@university.edu'>
<_sre.SRE_Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


In [23]:

pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+.[a-zA-Z0-9-.]+')  
matches = pattern.finditer(emails)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<_sre.SRE_Match object; span=(25, 53), match='corey.schafer@university.edu'>
<_sre.SRE_Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


### To capture information from groups -- from urls string
We only want to capture the domain name followed by the top-level domain.  

Eg: google.com,  
nasa.gov, etc 


In [24]:
# Matching urls done by me
pattern = re.compile(r'https?://\w{3}\.?\w+\.(com|gov)')  
matches = pattern.finditer(urls)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(2, 24), match='https://www.google.com'>
<_sre.SRE_Match object; span=(25, 43), match='http://coreyms.com'>
<_sre.SRE_Match object; span=(44, 63), match='https://youtube.com'>
<_sre.SRE_Match object; span=(64, 84), match='https://www.nasa.gov'>


In [25]:
# Matching urls done by corey
pattern = re.compile(r'https?://(www\.)?\w+\.\w+')  
matches = pattern.finditer(urls)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(2, 24), match='https://www.google.com'>
<_sre.SRE_Match object; span=(25, 43), match='http://coreyms.com'>
<_sre.SRE_Match object; span=(44, 63), match='https://youtube.com'>
<_sre.SRE_Match object; span=(64, 84), match='https://www.nasa.gov'>


In [26]:
# Matching urls done by corey
# By putting it as groups
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')  
matches = pattern.finditer(urls)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(2, 24), match='https://www.google.com'>
<_sre.SRE_Match object; span=(25, 43), match='http://coreyms.com'>
<_sre.SRE_Match object; span=(44, 63), match='https://youtube.com'>
<_sre.SRE_Match object; span=(64, 84), match='https://www.nasa.gov'>


So in the regular expression above :
> 'https?://(www\.)?(\w+)(\.\w+)'

there are 4 groups :
- __1st group : group(0)__ - The entire url refered by the url.
- __2nd group : group(1)__ - (www\.)
- __3rd group : group(2)__ - (\w+)   : Domain name
- __4th group : group(3)__ - (\.\w+) : Top level domain eg .com or .gov

The match object has a group method.

In [27]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')  
matches = pattern.finditer(urls)

for match in matches:
    print(match.group(0))

https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov


In [28]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')  
matches = pattern.finditer(urls)

for match in matches:
    print(match.group(1))

www.
None
None
www.


In [29]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')  
matches = pattern.finditer(urls)

for match in matches:
    print(match.group(2))

google
coreyms
youtube
nasa


In [30]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')  
matches = pattern.finditer(urls)

for match in matches:
    print(match.group(3))

.com
.com
.com
.gov


In [31]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')  
subbed_urls = pattern.sub(r'\2\3', urls)
print(subbed_urls)



google.com
coreyms.com
youtube.com
nasa.gov



In [32]:
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*')  
matches = pattern.findall(text_to_search)

for match in matches:
    print(match)

Mr
Mr
Ms
Mrs
Mr


In [33]:
pattern = re.compile(r'Start')
matches = pattern.match(sentence)
for match in matches:
    print(match)

TypeError: '_sre.SRE_Match' object is not iterable

In [34]:
pattern = re.compile(r'Start')
matches = pattern.search(sentence)
print(matches)

<_sre.SRE_Match object; span=(0, 5), match='Start'>


In [35]:
pattern = re.compile(r'dne')
matches = pattern.search(sentence)
print(matches)

None


In [36]:
pattern = re.compile(r'Start', re.IGNORECASE)
matches = pattern.search(sentence)
print(matches)

<_sre.SRE_Match object; span=(0, 5), match='Start'>


In [37]:
pattern = re.compile(r'Start', re.I)
matches = pattern.search(sentence)
print(matches)

<_sre.SRE_Match object; span=(0, 5), match='Start'>
