In [1]:
import re

### Raw strings

In [2]:
# Regular string literal: Python converts \n into a newline, so the text prints on two lines.
print('Hello\nworld')

Hello
world


In [3]:
# Raw string literal (r prefix): the backslash is kept as-is, so \n prints as two visible characters.
print(r'Hello\nworld')

Hello\nworld


Always use raw strings with regular expressions! We'll look at an example of why later on; it will make more sense once we've gotten our feet wet with regex.

### Basics

In [4]:
# Sample text searched by the pattern cells that follow.
# Written as a raw literal so the lone backslash in the metacharacter list is
# taken literally instead of forming the invalid escape sequence "\ " (which
# raises a DeprecationWarning / SyntaxWarning on recent Python versions).
# The lowercase alphabet line previously read "...pqurtuv..."; it now spells
# the full alphabet. Its length is unchanged, so every offset stays the same.
text_to_search = r'''
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890

Go GoGo

MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )

github.com
github*com  (don't match this)

321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234

Mr. Metz
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T

The cat in the hat sat on a mat
'''

In [5]:
# Literal search: "abc" contains no metacharacters, so it matches only that
# exact character sequence.
pattern = re.compile(r'abc')
for hit in pattern.finditer(text_to_search):
    # Each hit is a match object carrying the span and the matched text.
    print(hit)

# Slicing the text with the span bounds shows the characters the pattern matched.
text_to_search[1:4]

<_sre.SRE_Match object; span=(1, 4), match='abc'>


'abc'

In [6]:
# Matching a metacharacter literally by escaping it. Patterns tried in turn:
#   .            any character except a newline
#   \.           a literal dot
#   github\.com  the literal text "github.com" (compiled below)
pattern = re.compile(r'github\.com')
for hit in pattern.finditer(text_to_search):
    print(hit)

<_sre.SRE_Match object; span=(142, 152), match='github.com'>


In [7]:
# Special sequences explored in this cell:
#   .     any character except a newline
#   \w    a word character           \W    anything that is not a word character
#   \d    a digit                    \D    anything that is not a digit
#   \s    a whitespace character
#   \bGo  "Go" starting at a word boundary
#   \BGo  "Go" NOT starting at a word boundary (compiled below)
pattern = re.compile(r'\BGo')
for hit in pattern.finditer(text_to_search):
    print(hit)

<_sre.SRE_Match object; span=(72, 74), match='Go'>


In [8]:
# Anchors: ^ ties a pattern to the start of the string, $ to its end.
# Patterns tried: ^Begin, ^q, end$, q$ (compiled below). The phrase does not
# end with "q", so this final pattern finds nothing and the loop prints nothing.
phrase = "Begin a sentence and bring it to an end"
pattern = re.compile(r'q$')
for hit in pattern.finditer(phrase):
    print(hit)

In [9]:
# Matching phone numbers. The pattern was refined step by step:
#   \d\d\d.\d\d\d.\d\d\d\d       digit groups separated by any character
#   \d{3}.\d{3}.\d{4}            the same, using exact repetition counts
#   \d{3}-\d{3}-\d{4}            dash separators only
#   \d{3}\.\d{3}\.\d{4}          dot separators only
#   \d{3}[-.]\d{3}[-.]\d{4}      character set: dash or dot
#   800[-.]\d{3}[-.]\d{4}        numbers starting with 800
#   \d00[-.]\d{3}[-.]\d{4}       any digit followed by 00
#   [89]00[-.]\d{3}[-.]\d{4}     numbers starting with 800 or 900 (compiled below)
pattern = re.compile(r'[89]00[-.]\d{3}[-.]\d{4}')
for hit in pattern.finditer(text_to_search):
    print(hit)

<_sre.SRE_Match object; span=(193, 205), match='800-555-1234'>
<_sre.SRE_Match object; span=(206, 218), match='900-555-1234'>


In [10]:
# Character sets and negated character sets. Variants tried:
#   [1-5]    one digit from 1 through 5
#   [^1-5]   one character that is not a digit from 1 through 5
#   [chs]at  "cat", "hat" or "sat"
#   [^m]at   "at" preceded by any character other than "m" (compiled below)
pattern = re.compile(r'[^m]at')
for hit in pattern.finditer(text_to_search):
    print(hit)

<_sre.SRE_Match object; span=(272, 275), match='cat'>
<_sre.SRE_Match object; span=(283, 286), match='hat'>
<_sre.SRE_Match object; span=(287, 290), match='sat'>


In [11]:
# Quantifiers, groups and alternation. The pattern was built up as follows:
#   Mr\.                       "Mr" followed by a literal dot
#   Mr\.?                      the dot becomes optional
#   Mr\.?\s[A-Z]\w+            then whitespace, a capital, one or more word characters
#   Mr\.?\s[A-Z]\w*            zero or more word characters, so a one-letter name matches
#   M[rs]s?\.?\s[A-Z]\w*       other titles via a character set plus an optional "s"
#   M(r|s|rs)\.?\s[A-Z]\w*     titles via a group of alternatives after "M"
#   (Mr|Ms|Mrs)\.?\s[A-Z]\w*   whole titles as alternatives (compiled below)
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*')
for hit in pattern.finditer(text_to_search):
    print(hit)

<_sre.SRE_Match object; span=(220, 228), match='Mr. Metz'>
<_sre.SRE_Match object; span=(229, 237), match='Mr Smith'>
<_sre.SRE_Match object; span=(238, 246), match='Ms Davis'>
<_sre.SRE_Match object; span=(247, 260), match='Mrs. Robinson'>
<_sre.SRE_Match object; span=(261, 266), match='Mr. T'>


In [12]:
# Sample addresses for the email pattern: plain, dotted and hyphenated local
# parts, with .com, .edu and .net domains (one domain name contains a hyphen).
emails = '''
JeffPBezos@gmail.com
jeff.bezos@ucsd.edu
jeff-321-bezos@amazon-work.net
'''

In [13]:
# Matching emails. The pattern was widened step by step:
#   \w+@\w+\.com                     word characters around "@", ".com" only
#   [\w.]+@\w+\.(com|edu)            dots allowed before "@", two endings
#   [\w.-]+@[\w-]+\.(com|edu|net)    hyphens allowed on both sides, three endings
#   [\w.-]+@[\w-]+\.\w+              any word-character ending
#   actual: [\w.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+
# NOTE(review): the compiled pattern writes the domain label as [\w-]+, whereas
# the "actual" variant above uses [a-zA-Z0-9-]+ — confirm which one is intended.
pattern = re.compile(r'[\w.+-]+@[\w-]+\.[a-zA-Z0-9-.]+')
for hit in pattern.finditer(emails):
    print(hit)

<_sre.SRE_Match object; span=(1, 21), match='JeffPBezos@gmail.com'>
<_sre.SRE_Match object; span=(22, 41), match='jeff.bezos@ucsd.edu'>
<_sre.SRE_Match object; span=(42, 72), match='jeff-321-bezos@amazon-work.net'>


In [14]:
# Sample URLs: http and https schemes, with and without the "www." prefix.
urls = '''
https://www.google.com
http://jeffbezos.com
https://youtube.com
https://www.nasa.gov
'''

In [15]:
# Matching URLs and capturing groups. Patterns tried:
#   https?://(www\.)?\w+\.\w+        optional "s" and optional "www." prefix
#   https?://(www\.)?(\w+)(\.\w+)    same, with the name captured as group 2
#                                    and the ".xyz" ending as group 3
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')

# Alternative to substitution: inspect the groups of each match directly.
# for match in pattern.finditer(urls):
#     print(match)
#     print(match.group(3))
#     print(f'{match.group(2)}{match.group(3)}')

# Replace every URL with group 2 followed by group 3, which drops the scheme
# and any "www." prefix.
subbed_urls = pattern.sub(r'\2\3', urls)
print(subbed_urls)


google.com
jeffbezos.com
youtube.com
nasa.gov



### Other pattern methods and flags

In [16]:
# Sentence searched by the findall and search examples in the next cells.
sentence = 'The cat in the hat sat on a mat'

In [17]:
# findall returns the matched text itself (a list of strings) rather than
# match objects.
pattern = re.compile(r'[chs]at')
for word in pattern.findall(sentence):
    print(word)

cat
hat
sat


In [18]:
# search returns only the first match, or None when the pattern is not found.
# Patterns tried: [chs]at, qqq (compiled below).
pattern = re.compile(r'qqq')
print(pattern.search(sentence))

None


In [19]:
# Case-insensitive matching: passing the re.IGNORECASE flag (short form re.I)
# lets [chs]at match the capitalised "Cat" and "Hat" as well.
sentence = 'The Cat in the Hat sat on a mat'
pattern = re.compile(r'[chs]at', re.I)
for word in pattern.findall(sentence):
    print(word)

Cat
Hat
sat


### Why raw strings are required

In [20]:

# Regular (non-raw) literal: the doubled backslash is a single backslash in the
# resulting string, so the printed text shows \n as two visible characters.
example = 'in regex, the \\n denotes a new line'
print(example)

in regex, the \n denotes a new line


In [21]:
# With the r prefix the regex engine receives \\n unchanged: an escaped
# (literal) backslash followed by the letter n. Without the r prefix the same
# pattern would have to be written '\\\\n'.
pattern = re.compile(r'\\n')
for found in pattern.findall(example):
    print(found)

\n
