# Regular Expressions

In [1]:
# Libraries
import re

## Example 1

In [2]:
# Sample sentence that the Example 1 patterns are searched against
text = "That person wears marvelous trousers."

In [3]:
# A single plain character is a literal pattern: list every "a" in the text
pattern = "a"
re.compile(pattern).findall(text)

['a', 'a', 'a']

In [4]:
# A multi-character literal matches that exact sequence: list every "er"
pattern = "er"
re.compile(pattern).findall(text)

['er', 'er']

In [5]:
# "|" is alternation: list every single "e" or "r", in order of appearance
pattern = "e|r"
re.compile(pattern).findall(text)

['e', 'r', 'e', 'r', 'r', 'e', 'r', 'e', 'r']

## Example 2

In [6]:
# Two-line sample. The first line ends with a space before the newline, so
# both are written out explicitly instead of relying on invisible trailing
# whitespace inside a triple-quoted string.
text = (
    "My boss asked me to turn in my TPS reports. \n"
    "I told him they were done, but they are not."
)

In [7]:
# Show the string's repr so the embedded newline is visible as \n
text

'My boss asked me to turn in my TPS reports. \nI told him they were done, but they are not.'

In [8]:
# "." matches any single character except a newline, so every symbol of the
# text is returned apart from the line break
pattern = "."
result = re.compile(pattern).findall(text)
print(result)

['M', 'y', ' ', 'b', 'o', 's', 's', ' ', 'a', 's', 'k', 'e', 'd', ' ', 'm', 'e', ' ', 't', 'o', ' ', 't', 'u', 'r', 'n', ' ', 'i', 'n', ' ', 'm', 'y', ' ', 'T', 'P', 'S', ' ', 'r', 'e', 'p', 'o', 'r', 't', 's', '.', ' ', 'I', ' ', 't', 'o', 'l', 'd', ' ', 'h', 'i', 'm', ' ', 't', 'h', 'e', 'y', ' ', 'w', 'e', 'r', 'e', ' ', 'd', 'o', 'n', 'e', ',', ' ', 'b', 'u', 't', ' ', 't', 'h', 'e', 'y', ' ', 'a', 'r', 'e', ' ', 'n', 'o', 't', '.']


In [9]:
# "[^a-m]" is a negated character class: keep every character that is NOT a
# lowercase letter from "a" to "m" (uppercase letters, spaces, punctuation
# and the newline all qualify)
pattern = "[^a-m]"
result = re.compile(pattern).findall(text)
print(result)

['M', 'y', ' ', 'o', 's', 's', ' ', 's', ' ', ' ', 't', 'o', ' ', 't', 'u', 'r', 'n', ' ', 'n', ' ', 'y', ' ', 'T', 'P', 'S', ' ', 'r', 'p', 'o', 'r', 't', 's', '.', ' ', '\n', 'I', ' ', 't', 'o', ' ', ' ', 't', 'y', ' ', 'w', 'r', ' ', 'o', 'n', ',', ' ', 'u', 't', ' ', 't', 'y', ' ', 'r', ' ', 'n', 'o', 't', '.']


In [10]:
# "[a-m]" is a character class: keep only the lowercase letters "a" to "m"
pattern = "[a-m]"
result = re.compile(pattern).findall(text)
print(result)

['b', 'a', 'k', 'e', 'd', 'm', 'e', 'i', 'm', 'e', 'l', 'd', 'h', 'i', 'm', 'h', 'e', 'e', 'e', 'd', 'e', 'b', 'h', 'e', 'a', 'e']


In [11]:
# Two ranges inside one class: keep the letters "a" to "m" in either case
pattern = "[a-mA-M]"
result = re.compile(pattern).findall(text)
print(result)

['M', 'b', 'a', 'k', 'e', 'd', 'm', 'e', 'i', 'm', 'e', 'I', 'l', 'd', 'h', 'i', 'm', 'h', 'e', 'e', 'e', 'd', 'e', 'b', 'h', 'e', 'a', 'e']


## Example 3

In [12]:
# Sample sentence containing both spellings, "color" and "colour"
text = "Is the correct spelling color or colour?"

In [13]:
# "?" makes the preceding "u" optional, so both "color" and "colour" match
pattern = "colou?r"
result = re.compile(pattern).findall(text)
print(result)

['color', 'colour']


## Example 4

In [14]:
# Sample with "a" followed by one to five "w"
text = (
    "Let's see how we can match the following: "
    "aw, aww, awww, awwww, awwwww"
)

In [15]:
# Match "aw", "aww", "awww", "awwww" and "awwwww"
# Method 1: one mandatory "w" followed by up to four optional ones ("w?")
pattern = "aww?w?w?w?"
result = re.compile(pattern).findall(text)
print(result)

['aw', 'aww', 'awww', 'awwww', 'awwwww']


In [16]:
# Match "aw", "aww", "awww", "awwww" and "awwwww"
# Method 2: one mandatory "w", then "w*" (zero or more additional "w")
pattern = "aww*"
result = re.compile(pattern).findall(text)
print(result)

['aw', 'aww', 'awww', 'awwww', 'awwwww']


In [17]:
# Match "aw", "aww", "awww", "awwww" and "awwwww"
# Method 3: "w+" (one or more "w") expresses the same thing directly
pattern = "aw+"
result = re.compile(pattern).findall(text)
print(result)

['aw', 'aww', 'awww', 'awwww', 'awwwww']


In [18]:
# Match "aww", "awww", "awwww" and "awwwww"
# "ww+" needs at least two "w", so the plain "aw" is no longer matched
pattern = "aww+"
result = re.compile(pattern).findall(text)
print(result)

['aww', 'awww', 'awwww', 'awwwww']


In [20]:
# Append an "e" to the last sample, turning "awwwww" into "awwwwwe"
text = (
    "Let's see how we can match the following: "
    "aw, aww, awww, awwww, awwwwwe"
)

In [23]:
# Match "aww", "awww" and "awwww"
# "w{2,4}" allows two to four "w" and the trailing "\b" demands a word
# boundary right after them: "aw" has too few "w", and in "awwwwwe" no run
# of two to four "w" ends at a word boundary, so neither is matched
pattern = r"aw{2,4}\b"
result = re.compile(pattern).findall(text)
print(result)

['aww', 'awww', 'awwww']


## Example 5

In [24]:
# Airline phone directory, one "name number" entry per line. The empty
# first and last items reproduce the newline at the start and at the end
# of the text.
text = "\n".join([
    "",
    "Aeromexico 800-237-6639",
    "Air Canada 888-247-2262",
    "Air Canada Rouge 888-247-2262",
    "Air Creebec 800-567-6567",
    "Air Inuit 800-361-2965",
    "Air North 800-661-0407",
    "Air Tindi 888-545-6794",
    "Air Transat 866-847-1112",
    "",
])

In [25]:
# Show the string's repr: every line break appears as an explicit \n
text

'\nAeromexico 800-237-6639\nAir Canada 888-247-2262\nAir Canada Rouge 888-247-2262\nAir Creebec 800-567-6567\nAir Inuit 800-361-2965\nAir North 800-661-0407\nAir Tindi 888-545-6794\nAir Transat 866-847-1112\n'

In [26]:
# Extract the phone numbers
# Method 1: "\d{3}-\d{3}-\d{4}" spells out exactly 3, 3 and 4 digits
pattern = r"\d{3}-\d{3}-\d{4}"
result = re.compile(pattern).findall(text)
print(result)

['800-237-6639', '888-247-2262', '888-247-2262', '800-567-6567', '800-361-2965', '800-661-0407', '888-545-6794', '866-847-1112']


In [27]:
# Extract the phone numbers
# Method 2: "\d+" accepts one or more digits in each of the three groups
pattern = r"\d+-\d+-\d+"
result = re.compile(pattern).findall(text)
print(result)

['800-237-6639', '888-247-2262', '888-247-2262', '800-567-6567', '800-361-2965', '800-661-0407', '888-545-6794', '866-847-1112']


#### What's the difference between method 1 and method 2?

In [29]:
# Same directory, but with extra digits appended to the first phone number.
# The empty first and last items reproduce the newline at the start and at
# the end of the text.
text = "\n".join([
    "",
    "Aeromexico 800-237-663965748893",
    "Air Canada 888-247-2262",
    "Air Canada Rouge 888-247-2262",
    "Air Creebec 800-567-6567",
    "Air Inuit 800-361-2965",
    "Air North 800-661-0407",
    "Air Tindi 888-545-6794",
    "Air Transat 866-847-1112",
    "",
])

In [30]:
# Extract the phone numbers
# Method 1: the last group is exactly four digits, so the match stops there
# and the lengthened first number is reported truncated to "800-237-6639"
pattern = r"\d{3}-\d{3}-\d{4}"
result = re.compile(pattern).findall(text)
print(result)

['800-237-6639', '888-247-2262', '888-247-2262', '800-567-6567', '800-361-2965', '800-661-0407', '888-545-6794', '866-847-1112']


In [31]:
# Extract the phone numbers
# Method 2: "\d+" is greedy and takes every trailing digit, so the
# lengthened first number is reported in full
pattern = r"\d+-\d+-\d+"
result = re.compile(pattern).findall(text)
print(result)

['800-237-663965748893', '888-247-2262', '888-247-2262', '800-567-6567', '800-361-2965', '800-661-0407', '888-545-6794', '866-847-1112']
