In [16]:
import re

# Sample text searched by the regex demos. A raw triple-quoted string keeps the
# embedded newlines and the backslash on the metacharacter line exactly as typed.
# Cells that print match spans index into this string, so keep line lengths
# stable when editing it.
text_to_search = r'''
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
123abc

Hello HelloHello

MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )

utexas.edu

321-555-4321
321.555.1234

daniel-mitchell@utexas.edu

Mr. Johnson
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

In [12]:
# E-mail shape: local part, "@", then a host made of one or more
# dot-separated labels. Repeating the "\.label" group (rather than capping the
# tail at 2-4 characters) keeps sub-domains and long TLDs intact, and leaving
# "." out of the label class means a sentence-ending period is not swallowed.
pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+')
matches = pattern.finditer(text_to_search)
for mat in matches:
    # Group 0 is always the entire match; higher indices would refer to
    # capturing parentheses, of which this pattern has none.
    start, end = mat.span(0)
    print(mat)
    # (start, end) offsets of the match inside text_to_search
    print((start, end))
    # the matched text itself
    print(mat.group(0))
    # the same text, recovered by slicing the searched string with the span
    print(text_to_search[start:end])
    

<re.Match object; span=(197, 223), match='daniel-mitchell@utexas.edu'>
(197, 223)
daniel-mitchell@utexas.edu
daniel-mitchell@utexas.edu


In [14]:
# Sample URLs for the grouping demos: one URL per line, wrapped in a leading
# and a trailing newline. Adjacent string literals are joined into one string.
urls = (
    '\n'
    'https://www.google.com\n'
    'http://yahoo.com\n'
    'https://www.whitehouse.gov\n'
    'https://craigslist.org\n'
)

In [21]:
# Match an http/https URL: optional "www." prefix, a name, one dot, a suffix.
pattern = re.compile(r'https?://(www\.)?\w+\.\w+')
# http      - match exactly "http"
# s?        - zero or one "s" (so both http and https match)
# ://       - match exactly "://"
# (www\.)?  - zero or one "www." (the parentheses make it group 1)
# \w+       - one or more word characters
# \.        - a literal dot
# \w+       - one or more word characters
matches = pattern.finditer(urls)
# Each item yielded by finditer is a match object; print its repr.
for mat in matches:
    print(mat)

<re.Match object; span=(1, 23), match='https://www.google.com'>
<re.Match object; span=(24, 40), match='http://yahoo.com'>
<re.Match object; span=(41, 67), match='https://www.whitehouse.gov'>
<re.Match object; span=(68, 90), match='https://craigslist.org'>


In [29]:
# Parentheses create numbered groups that group() can fetch by index:
#   group(0) - the whole match
#   group(1) - the optional "www." prefix (None when it is absent)
#   group(2) - the domain name
#   group(3) - the dot plus the top-level domain
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = pattern.finditer(urls)
for mat in matches:
    # group(2, 3) returns both captures as a tuple; join them into "name.tld"
    print(''.join(mat.group(2, 3)))

google.com
yahoo.com
whitehouse.gov
craigslist.org


In [33]:
# Same grouped pattern as before, but this time the domain is rebuilt by
# slicing the searched string with each group's start/end offsets.
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = pattern.finditer(urls)
for mat in matches:
    print(mat)
    # start(n)/end(n) are the two halves of span(n) for group n
    print(urls[mat.start(2):mat.end(2)] + urls[mat.start(3):mat.end(3)])

<re.Match object; span=(1, 23), match='https://www.google.com'>
google.com
<re.Match object; span=(24, 40), match='http://yahoo.com'>
yahoo.com
<re.Match object; span=(41, 67), match='https://www.whitehouse.gov'>
whitehouse.gov
<re.Match object; span=(68, 90), match='https://craigslist.org'>
craigslist.org


In [38]:
# With no argument, group() defaults to group 0, i.e. the entire matched URL.
# (start(), end() and span() default to group 0 in the same way.)
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = pattern.finditer(urls)
for mat in matches:
    print(mat.group(0))

https://www.google.com
http://yahoo.com
https://www.whitehouse.gov
https://craigslist.org
