# Regular Expression

In [19]:
import re
import os 
os.chdir(r'C:\Users\tanzh\Documents\Python')

# Character Match

In [20]:
"""
.       - Any Character Except New Line
\d      - Digit (0-9)
\D      - Not a Digit (0-9)
\w      - Word Character (a-z, A-Z, 0-9, _)
\W      - Not a Word Character
\s      - Whitespace (space, tab, newline)
\S      - Not Whitespace (space, tab, newline)

\b      - Word Boundary
\B      - Not a Word Boundary
^       - Beginning of a String
$       - End of a String

[]      - Matches Characters in brackets
[^ ]    - Matches Characters NOT in brackets
|       - Either Or
( )     - Group

Quantifiers:
*       - 0 or More
+       - 1 or More
?       - 0 or One
{3}     - Exact Number
{3,4}   - Range of Numbers (Minimum, Maximum)

#### Sample Regexs ####
[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+
"""

'\n.       - Any Character Except New Line\n\\d      - Digit (0-9)\n\\D      - Not a Digit (0-9)\n\\w      - Word Character (a-z, A-Z, 0-9, _)\n\\W      - Not a Word Character\n\\s      - Whitespace (space, tab, newline)\n\\S      - Not Whitespace (space, tab, newline)\n\n\x08      - Word Boundary\n\\B      - Not a Word Boundary\n^       - Beginning of a String\n$       - End of a String\n\n[]      - Matches Characters in brackets\n[^ ]    - Matches Characters NOT in brackets\n|       - Either Or\n( )     - Group\n\nQuantifiers:\n*       - 0 or More\n+       - 1 or More\n?       - 0 or One\n{3}     - Exact Number\n{3,4}   - Range of Numbers (Minimum, Maximum)\n\n#### Sample Regexs ####\n[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+\n'

In [21]:
text_to_search = '''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
coreyms.com
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

In [22]:
print('\n\nthere is a tab')
print(r'\n\nthere is a tab') # raw string will intrepret the inputted string iterally



there is a tab
\n\nthere is a tab


In [23]:
pattern = re.compile(r'abc')
matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

print(text_to_search[1:4])
# span refer to the start and end index of the string where the match is found
# refer to the matched string 

<re.Match object; span=(1, 4), match='abc'>
abc


# Escape Keys

In [24]:
pattern = re.compile(r'coreyms\.com') # the .(dot) need to be escaped
matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<re.Match object; span=(139, 150), match='coreyms.com'>


# Examples

In [25]:
sentence = 'Start a sentence and then bring it to an end'
pattern = re.compile(r"^Sta")
matches = pattern.finditer(sentence)

for match in matches:
    print(match)

<re.Match object; span=(0, 3), match='Sta'>


In [38]:
pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d\d')
matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(177, 189), match='123*555*1234'>
<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


In [27]:
with open('sample_file_for_reg_ex.txt', 'r') as f:
    content = f.read()

    pattern = re.compile(r'\d\d\d[.-]\d\d\d[.-]\d\d\d\d') 
    # in character set [], we do not need to escape them
    # also note that character set will only match one character, i.e. [.-] --> this will only match either . or -
    
    matches = pattern.finditer(content)
    matches_a = pattern.findall(content)

    for match in matches:
        print(match)

print('\n')
print(matches_a)

<re.Match object; span=(565, 577), match='615-555-7164'>
<re.Match object; span=(655, 667), match='800-555-5669'>
<re.Match object; span=(744, 756), match='560-555-5153'>
<re.Match object; span=(834, 846), match='900-555-9340'>
<re.Match object; span=(931, 943), match='714-555-7405'>
<re.Match object; span=(1020, 1032), match='800-555-6771'>
<re.Match object; span=(1110, 1122), match='783-555-4799'>
<re.Match object; span=(1200, 1212), match='516-555-4615'>
<re.Match object; span=(1293, 1305), match='127-555-1867'>
<re.Match object; span=(1384, 1396), match='608-555-4938'>
<re.Match object; span=(1470, 1482), match='568-555-6051'>
<re.Match object; span=(1558, 1570), match='292-555-1875'>
<re.Match object; span=(1646, 1658), match='900-555-3205'>
<re.Match object; span=(1735, 1747), match='614-555-1166'>
<re.Match object; span=(1826, 1838), match='530-555-2676'>
<re.Match object; span=(1912, 1924), match='470-555-2750'>
<re.Match object; span=(1996, 2008), match='800-555-6089'>
<re.Mat

In [28]:
# now we only want to match either a number that start with either 800 or 900

with open('sample_file_for_reg_ex.txt', 'r') as f:
    content = f.read()

    pattern = re.compile(r'[89]00[.-]\d\d\d[.-]\d\d\d\d') 
    
    matches = pattern.finditer(content)
    matches_a = pattern.findall(content)

    for match in matches:
        print(match)

print('\n')
print(matches_a)

<re.Match object; span=(655, 667), match='800-555-5669'>
<re.Match object; span=(834, 846), match='900-555-9340'>
<re.Match object; span=(1020, 1032), match='800-555-6771'>
<re.Match object; span=(1646, 1658), match='900-555-3205'>
<re.Match object; span=(1996, 2008), match='800-555-6089'>
<re.Match object; span=(2347, 2359), match='800-555-7100'>
<re.Match object; span=(2608, 2620), match='900-555-5118'>
<re.Match object; span=(3383, 3395), match='900-555-5428'>
<re.Match object; span=(3843, 3855), match='800-555-8810'>
<re.Match object; span=(4530, 4542), match='900-555-9598'>
<re.Match object; span=(5504, 5516), match='800-555-2420'>
<re.Match object; span=(6125, 6137), match='900-555-3567'>
<re.Match object; span=(6748, 6760), match='800-555-3216'>
<re.Match object; span=(7450, 7462), match='900-555-7755'>
<re.Match object; span=(8425, 8437), match='800-555-1372'>
<re.Match object; span=(9304, 9316), match='900-555-6426'>


['800-555-5669', '900-555-9340', '800-555-6771', '900-555-

In [29]:
with open('sample_file_for_reg_ex.txt', 'r') as f:
    content = f.read()

    pattern = re.compile(r'[1-5A-Za-z]') # the dash in a character set indicate the range 
    
    matches = pattern.finditer(content)
    matches_a = pattern.findall(content)

    for match in matches:
        print(match)

print('\n')
print(matches_a)

<re.Match object; span=(0, 1), match='S'>
<re.Match object; span=(1, 2), match='k'>
<re.Match object; span=(2, 3), match='i'>
<re.Match object; span=(3, 4), match='p'>
<re.Match object; span=(5, 6), match='t'>
<re.Match object; span=(6, 7), match='o'>
<re.Match object; span=(8, 9), match='c'>
<re.Match object; span=(9, 10), match='o'>
<re.Match object; span=(10, 11), match='n'>
<re.Match object; span=(11, 12), match='t'>
<re.Match object; span=(12, 13), match='e'>
<re.Match object; span=(13, 14), match='n'>
<re.Match object; span=(14, 15), match='t'>
<re.Match object; span=(16, 17), match='W'>
<re.Match object; span=(17, 18), match='h'>
<re.Match object; span=(18, 19), match='y'>
<re.Match object; span=(20, 21), match='G'>
<re.Match object; span=(21, 22), match='i'>
<re.Match object; span=(22, 23), match='t'>
<re.Match object; span=(23, 24), match='H'>
<re.Match object; span=(24, 25), match='u'>
<re.Match object; span=(25, 26), match='b'>
<re.Match object; span=(29, 30), match='T'>
<re

In [30]:
with open('sample_file_for_reg_ex.txt', 'r') as f:
    content = f.read()

    pattern = re.compile(r'[^b]at') # charat sign negate the character set 
    
    matches = pattern.finditer(content)
    matches_a = pattern.findall(content)

    for match in matches:
        print(match)

print('\n')
print(matches_a)

<re.Match object; span=(402, 405), match='dat'>
<re.Match object; span=(457, 460), match='Lat'>
<re.Match object; span=(1100, 1103), match='Pat'>
<re.Match object; span=(1158, 1161), match='pat'>
<re.Match object; span=(2292, 2295), match='wat'>
<re.Match object; span=(3875, 3878), match='wat'>
<re.Match object; span=(4866, 4869), match='Pat'>
<re.Match object; span=(4928, 4931), match='pat'>
<re.Match object; span=(5536, 5539), match='wat'>
<re.Match object; span=(5925, 5928), match='Pat'>
<re.Match object; span=(5973, 5976), match='wat'>
<re.Match object; span=(5988, 5991), match='pat'>
<re.Match object; span=(5994, 5997), match='iat'>
<re.Match object; span=(6110, 6113), match='Pat'>
<re.Match object; span=(6168, 6171), match='pat'>
<re.Match object; span=(7123, 7126), match='wat'>
<re.Match object; span=(7164, 7167), match='Pat'>
<re.Match object; span=(7224, 7227), match='pat'>
<re.Match object; span=(7230, 7233), match='iat'>
<re.Match object; span=(7612, 7615), match='Pat'>
<re.

In [31]:
with open('sample_file_for_reg_ex.txt', 'r') as f:
    content = f.read()

    pattern = re.compile(r'\d{3}.\d{3}.\d{4}') 
    
    matches = pattern.finditer(content)
    matches_a = pattern.findall(content)

    for match in matches:
        print(match)

print('\n')
print(matches_a)

<re.Match object; span=(565, 577), match='615-555-7164'>
<re.Match object; span=(655, 667), match='800-555-5669'>
<re.Match object; span=(744, 756), match='560-555-5153'>
<re.Match object; span=(834, 846), match='900-555-9340'>
<re.Match object; span=(931, 943), match='714-555-7405'>
<re.Match object; span=(1020, 1032), match='800-555-6771'>
<re.Match object; span=(1110, 1122), match='783-555-4799'>
<re.Match object; span=(1200, 1212), match='516-555-4615'>
<re.Match object; span=(1293, 1305), match='127-555-1867'>
<re.Match object; span=(1384, 1396), match='608-555-4938'>
<re.Match object; span=(1470, 1482), match='568-555-6051'>
<re.Match object; span=(1558, 1570), match='292-555-1875'>
<re.Match object; span=(1646, 1658), match='900-555-3205'>
<re.Match object; span=(1735, 1747), match='614-555-1166'>
<re.Match object; span=(1826, 1838), match='530-555-2676'>
<re.Match object; span=(1912, 1924), match='470-555-2750'>
<re.Match object; span=(1996, 2008), match='800-555-6089'>
<re.Mat

In [32]:
pattern = re.compile(r'Mr\.?\s[A-Z]\w*') 
    
matches = pattern.finditer(text_to_search)
matches_a = pattern.findall(text_to_search)

for match in matches:
    print(match)

print('\n')
print(matches_a)

<re.Match object; span=(216, 227), match='Mr. Schafer'>
<re.Match object; span=(228, 236), match='Mr Smith'>
<re.Match object; span=(260, 265), match='Mr. T'>


['Mr. Schafer', 'Mr Smith', 'Mr. T']


In [33]:
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*') # charat sign negate the character set 
    
matches = pattern.finditer(text_to_search)
matches_a = []

for match in matches:
    print(match)
    matches_a += [match.group(0)]

print('\n')
print(matches_a)

<re.Match object; span=(216, 227), match='Mr. Schafer'>
<re.Match object; span=(228, 236), match='Mr Smith'>
<re.Match object; span=(237, 245), match='Ms Davis'>
<re.Match object; span=(246, 259), match='Mrs. Robinson'>
<re.Match object; span=(260, 265), match='Mr. T'>


['Mr. Schafer', 'Mr Smith', 'Ms Davis', 'Mrs. Robinson', 'Mr. T']


In [44]:
with open('sample_file_for_reg_ex.txt', 'r') as f:
    content = f.read()

    pattern = re.compile(r'.+@.+\.(com|edu|net)') 
    
    matches = pattern.finditer(content)
    matches_lst = pattern.findall(content)
    matches_a = []

    for match in matches:
        print(match)
        matches_a += [match.group()]

print('\n')
print(matches_a)

<re.Match object; span=(613, 638), match='davemartin@bogusemail.com'>
<re.Match object; span=(700, 728), match='charlesharris@bogusemail.com'>
<re.Match object; span=(788, 816), match='laurawilliams@bogusemail.com'>
<re.Match object; span=(878, 907), match='coreyjefferson@bogusemail.com'>
<re.Match object; span=(978, 1006), match='jenniferwhite@bogusemail.com'>
<re.Match object; span=(1070, 1093), match='tomdavis@bogusemail.com'>
<re.Match object; span=(1154, 1182), match='neilpatterson@bogusemail.com'>
<re.Match object; span=(1248, 1277), match='laurajefferson@bogusemail.com'>
<re.Match object; span=(1340, 1367), match='mariajohnson@bogusemail.com'>
<re.Match object; span=(1426, 1454), match='michaelarnold@bogusemail.com'>
<re.Match object; span=(1517, 1544), match='michaelsmith@bogusemail.com'>
<re.Match object; span=(1604, 1631), match='robertstuart@bogusemail.com'>
<re.Match object; span=(1692, 1718), match='lauramartin@bogusemail.com'>
<re.Match object; span=(1782, 1810), match='b

In [35]:
for i in matches_a:
    print(i)

davemartin@bogusemail.com
charlesharris@bogusemail.com
laurawilliams@bogusemail.com
coreyjefferson@bogusemail.com
jenniferwhite@bogusemail.com
tomdavis@bogusemail.com
neilpatterson@bogusemail.com
laurajefferson@bogusemail.com
mariajohnson@bogusemail.com
michaelarnold@bogusemail.com
michaelsmith@bogusemail.com
robertstuart@bogusemail.com
lauramartin@bogusemail.com
barbaramartin@bogusemail.com
lindajackson@bogusemail.com
stevemiller@bogusemail.com
davearnold@bogusemail.com
jenniferjacobs@bogusemail.com
neilwilson@bogusemail.com
kurtjackson@bogusemail.com
maryjacobs@bogusemail.com
michaelwhite@bogusemail.com
jenniferjenkins@bogusemail.com
samwright@bogusemail.com
johndavis@bogusemail.com
neildavis@bogusemail.com
laurajackson@bogusemail.com
johnwilliams@bogusemail.com
michaelmartin@bogusemail.com
maggiebrown@bogusemail.com
kurtwilson@bogusemail.com
elizabetharnold@bogusemail.com
janemartin@bogusemail.com
travisjohnson@bogusemail.com
laurajefferson@bogusemail.com
tomwilliams@bogusemail.com


In [36]:
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
sub_url = pattern.sub(r'\2\3', urls) # the sub method of a patter allow us to extract specific group of patterns in our reg ex
print(sub_url)

matches = pattern.finditer(urls)
matches_list = []

for match in matches:
    print(match.group(0))
    matches_list += [match.group(0)]

print('\n')
print(matches_list)


google.com
coreyms.com
youtube.com
nasa.gov

https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov


['https://www.google.com', 'http://coreyms.com', 'https://youtube.com', 'https://www.nasa.gov']
