In [None]:
#Regular Expression(RE)
#A regular expression is a special sequence of characters that helps you match or find other strings or sets of strings, using a specialized syntax held in a pattern

In [4]:
import re

text_to_search = '''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
coreyms.com
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''
sentence = 'Start a sentence and then bring it to an end'

In [5]:
#rawstring
# Python raw string is created by prefixing a string literal with 'r' or 'R'. Python raw string treats backslash (\) as a literal character. This is useful when we want to have a string that contains backslash and don't want it to be treated as an escape character.
print("\tTab")  #simple string
print(r"\tTab") #raw_string

	Tab
\tTab


In [7]:
#re.compile Compile a regular expression pattern into a regular expression object, which can be used for matching using its match() and search() method
pattern = re.compile(r'abc')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(1, 4), match='abc'>


In [8]:
#if we wanna search for a Metacharacter like a period "." we need to use a escape character (\) 
pattern = re.compile(r'\.')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(111, 112), match='.'>
<re.Match object; span=(146, 147), match='.'>
<re.Match object; span=(167, 168), match='.'>
<re.Match object; span=(171, 172), match='.'>
<re.Match object; span=(218, 219), match='.'>
<re.Match object; span=(249, 250), match='.'>
<re.Match object; span=(262, 263), match='.'>


In [9]:
# #MetaCharacters

# .       - Any Character Except New Line
# \d      - Digit (0-9)
# \D      - Not a Digit (0-9)
# \w      - Word Character (a-z, A-Z, 0-9, _)
# \W      - Not a Word Character
# \s      - Whitespace (space, tab, newline)
# \S      - Not Whitespace (space, tab, newline)

# \b      - Word Boundary
# \B      - Not a Word Boundary
# ^       - Beginning of a String
# $       - End of a String

# []      - Matches Characters in brackets
# [^ ]    - Matches Characters NOT in brackets
# |       - Either Or
# ( )     - Group

# #Quantifiers:
# *       - 0 or More
# +       - 1 or More
# ?       - 0 or One
# {3}     - Exact Number
# {3,4}   - Range of Numbers (Minimum, Maximum)


In [10]:
pattern = re.compile(r'\d{3}.\d{3}.\d{3}')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(151, 162), match='321-555-432'>
<re.Match object; span=(164, 175), match='123.555.123'>
<re.Match object; span=(177, 188), match='123*555*123'>
<re.Match object; span=(190, 201), match='800-555-123'>
<re.Match object; span=(203, 214), match='900-555-123'>


In [11]:
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

In [13]:
#matching using groups
pattern = re.compile(r'https?://(www\.)?\w+\.\w+')
matches= pattern.finditer(urls)

for match in matches:
    print(match)

<re.Match object; span=(1, 23), match='https://www.google.com'>
<re.Match object; span=(24, 42), match='http://coreyms.com'>
<re.Match object; span=(43, 62), match='https://youtube.com'>
<re.Match object; span=(63, 83), match='https://www.nasa.gov'>


In [27]:
#matching domaing name and top level domain eg. facebook.com
#in this case group 0 is whole match, group 1 is www. group2 is domain name and group 3 is toplevel domain
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches= pattern.finditer(urls)

#accessing group2 domain name and group 3 tom level domain
for match in matches:
    print(match.group(2)+match.group(3))

google.com
coreyms.com
youtube.com
nasa.gov


In [28]:
#substituting using regex

pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')

#subsituting group2 and 3 with whole url
subbed_urls = pattern.sub(r'\2\3', urls)

print(subbed_urls)


google.com
coreyms.com
youtube.com
nasa.gov



In [30]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches= pattern.findall(urls)

for match in matches:
    print(match)

('www.', 'google', '.com')
('', 'coreyms', '.com')
('', 'youtube', '.com')
('www.', 'nasa', '.gov')


In [34]:
#match matches only one pattern and should be at begining of line
sentence = 'Start a sentence and then bring it to an end'

pattern = re.compile(r'Start')

matches = pattern.match(sentence)

print(matches)

<re.Match object; span=(0, 5), match='Start'>


In [37]:
#search the entire file and returns the first match
sentence = 'Start a sentence and then bring it to an end'

pattern = re.compile(r'start', re.I) 
#re.I or re.IGNORECASE is a flag for case insensitive 

matches = pattern.search(sentence)

print(matches)

<re.Match object; span=(0, 5), match='Start'>
