# Regular Expressions in Python

Reference:  https://www.youtube.com/watch?v=K8L6KVGG-7o   
Corey Schafer YouTube Channel.

In [1]:
# import regex  - regular expression library
import re

In [2]:
# Raw strings: the r prefix tells Python not to interpret backslash escapes.
# First sample is a normal string (\t becomes a tab); second is raw (\t stays as two characters).
for sample in ('\tTab', r'\tTab'):
    print(sample)

	Tab
\tTab


# --------------------------------------------------------------------------

## Snippets  
  
.    $\qquad$     - Any Character Except New Line  
\d   $\qquad$    - Digit (0-9)  
\D   $\qquad$   - Not a Digit (0-9)  
\w   $\qquad$   - Word Character (a-z, A-Z, 0-9, _)  
\W   $\qquad$   - Not a Word Character  
\s   $\qquad$  - Whitespace (space, tab, newline)  
\S   $\qquad$   - Not Whitespace (space, tab, newline)  
  
\b   $\qquad$   - Word Boundary  
\B   $\qquad$   - Not a Word Boundary  
^    $\qquad$   - Beginning of a String  
\$   $\qquad$   - End of a String  
  
[]   $\qquad$  - Matches Characters in brackets  
[^ ]  $\qquad$   - Matches Characters NOT in brackets  
|    $\qquad$   - Either Or  
( )  $\qquad$   - Group  
  
Quantifiers:  
\*   $\qquad$   - 0 or More  
\+   $\qquad$   - 1 or More  
?    $\qquad$   - 0 or One  
{3}  $\qquad$   - Exact Number  
{3,4}$\qquad$   - Range of Numbers (Minimum, Maximum)  
  
  
#### Sample Regexes ####  
  
`[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+`  
  

#  -----------------------------------------------------------------------------------------

In [5]:
# Multi-line sample text that the regex cells search: alphabets, digits,
# the regex metacharacters, a domain, phone numbers and titled names.
# Declared as a raw string (r prefix) because the metacharacter line contains
# a lone backslash; in a normal string "\ " is an invalid escape sequence and
# triggers a DeprecationWarning/SyntaxWarning. The string value is unchanged.
text_to_search = r'''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
coreyms.com
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

In [7]:
# Short sample sentence for experiments; the cells visible below search `sentence1`, not this one.
sentence = 'Start a sentence and then bring it to an end'

In [8]:
# Literal pattern: matches only the exact characters 'abc', in this order and this case.
pattern = re.compile(r'abc')
# finditer yields one match object per hit; the iterator is exhausted after a single pass.
matches = re.finditer(pattern, text_to_search)

In [9]:
# Each match object shows its span (start/end index in the searched string) and the matched text.
for hit in matches:
    print(hit)

<re.Match object; span=(1, 4), match='abc'>


In [10]:
# Slicing the searched text with a match's span returns the matched substring.
start, end = 1, 4
text_to_search[start:end]

'abc'

# ------------------------------------------------------------------------

In [11]:
# Phone numbers: three digits, any single separator character, three digits,
# any single separator character, four digits.
pattern = re.compile(r'\d{3}.\d{3}.\d{4}')
matches = re.finditer(pattern, text_to_search)

In [12]:
# Show every phone-number match found in text_to_search.
for phone_match in matches:
    print(phone_match)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(177, 189), match='123*555*1234'>
<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


# ------------------------------------------------------

In [16]:
# Restrict to numbers whose area code is 800 or 900:
# the character set [89] accepts exactly one of the two leading digits.
pattern1 = re.compile(r'[89]00.\d{3}.\d{4}')
matches1 = re.finditer(pattern1, text_to_search)

In [17]:
# Show the 800/900 numbers that were matched.
for toll_match in matches1:
    print(toll_match)

<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


# --------------------------------------------

In [13]:
# Read data.txt into memory, then report every match of `pattern`
# (the compiled regex from an earlier cell) found in its contents.

with open('data.txt', 'r') as f:
    contents = f.read()

# The file is only needed for the read; matching works on the in-memory string.
matches = pattern.finditer(contents)
for file_match in matches:
    print(file_match)

<re.Match object; span=(12, 24), match='615-555-7164'>
<re.Match object; span=(102, 114), match='800-555-5669'>
<re.Match object; span=(191, 203), match='560-555-5153'>
<re.Match object; span=(281, 293), match='900-555-9340'>
<re.Match object; span=(378, 390), match='714-555-7405'>
<re.Match object; span=(467, 479), match='800-555-6771'>
<re.Match object; span=(557, 569), match='783-555-4799'>
<re.Match object; span=(647, 659), match='516-555-4615'>
<re.Match object; span=(740, 752), match='127-555-1867'>
<re.Match object; span=(831, 843), match='608-555-4938'>
<re.Match object; span=(917, 929), match='568-555-6051'>
<re.Match object; span=(1005, 1017), match='292-555-1875'>
<re.Match object; span=(1093, 1105), match='900-555-3205'>
<re.Match object; span=(1182, 1194), match='614-555-1166'>
<re.Match object; span=(1273, 1285), match='530-555-2676'>
<re.Match object; span=(1359, 1371), match='470-555-2750'>
<re.Match object; span=(1443, 1455), match='800-555-6089'>
<re.Match object; spa

# ------------------------------------------------------------------

In [25]:
# Match the titled names near the end of text_to_search.
# Pattern: 'M' + one of r / s / rs (grouped alternation), an optional '.',
# one whitespace character, a capital letter, then any further word characters.
# Variant that only handles "Mr": r'Mr\.?\s[A-Z]\w*'
pattern2 = re.compile(r'M(r|s|rs)\.?\s[A-Z]\w*')
matches2 = re.finditer(pattern2, text_to_search)

for name_match in matches2:
    print(name_match)

<re.Match object; span=(216, 227), match='Mr. Schafer'>
<re.Match object; span=(228, 236), match='Mr Smith'>
<re.Match object; span=(237, 245), match='Ms Davis'>
<re.Match object; span=(246, 259), match='Mrs. Robinson'>
<re.Match object; span=(260, 265), match='Mr. T'>


# ---------------------------------------------------------------------------------------   
  
## Emails

In [26]:
# Sample e-mail addresses, one per line, used as the search text for the e-mail pattern.
emails = '''
CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
'''

In [31]:
# Match complete e-mail addresses: local part, '@', domain, a literal dot,
# then the top-level domain (which may contain further dots or hyphens).
# Alternative with capture groups for domain and TLD:
#   r'[A-Za-z0-9-.]*@([\w-]+)\.(\w+)'
# The dot before the TLD must be escaped (\.): an unescaped '.' matches ANY
# character, so a string with no dot after the '@' (e.g. 'user@hostcom')
# would wrongly be accepted as an address.
pattern3 = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
matches3 = pattern3.finditer(emails)

for match in matches3:
    print(match)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 53), match='corey.schafer@university.edu'>
<re.Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


# -----------------------------------------------------------------------------------

# Urls  
   

In [33]:
# Sample URLs, one per line: http and https schemes, with and without the "www." prefix.
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

In [41]:
# Three capture groups: (www\.)? optional prefix, (\w+) domain name
# (google, nasa, ...), and (\.\w+) top-level domain (.com, .gov, ...).
pattern4 = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches4 = re.finditer(pattern4, urls)

for url_match in matches4:
    # group(0) is the entire match; groups() yields groups 1-3 in order,
    # with None for the optional "www." group when it did not take part.
    for piece in (url_match.group(0), *url_match.groups()):
        print(piece)

# To display a single group only, print url_match.group(1) (or 2, 3) in the outer loop instead.

https://www.google.com
www.
google
.com
http://coreyms.com
None
coreyms
.com
https://youtube.com
None
youtube
.com
https://www.nasa.gov
www.
nasa
.gov


In [42]:
# To list just the domain names, one per line: replace each matched URL with
# back-references to group 2 (domain) and group 3 (top-level domain).
subbed_urls = re.sub(pattern4, r'\2\3', urls)
print(subbed_urls)


google.com
coreyms.com
youtube.com
nasa.gov



# -------------------------------------------------------------------------------

## pattern.findall()  
  
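Returns all non-overlapping matches as a list of strings rather than match objects; if the pattern contains capture groups, only the groups are returned.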

In [46]:
# findall returns the matched strings themselves (a list), not match objects.
pattern1 = re.compile(r'[89]00.\d{3}.\d{4}')
matches1 = re.findall(pattern1, text_to_search)

for number in matches1:
    print(number)

800-555-1234
900-555-1234


In [47]:
# With capture groups in the pattern, findall returns one tuple per match that
# holds only the groups; the separator characters are left out because they
# sit outside the groups defined here.
pattern1 = re.compile(r'([89]00).(\d{3}).(\d{4})')
matches1 = re.findall(pattern1, text_to_search)

for number_parts in matches1:
    print(number_parts)

('800', '555', '1234')
('900', '555', '1234')


### .search    -- searches entire string but returns only first matched value
### .match  -- searches only at start of string


In [49]:
# search scans the whole string but hands back only the first match
# (or None when nothing matches), even though 'sentence' occurs twice here.
sentence1 = 'Start sentence and sentence end'
pattern5 = re.compile(r'sentence')
matches5 = re.search(pattern5, sentence1)

print(matches5)

<re.Match object; span=(6, 14), match='sentence'>


In [54]:
# match only tests the beginning of the string. re.I is the short alias of
# re.IGNORECASE, so lowercase 'start' still matches the capitalised 'Start'.
pattern6 = re.compile(r'start', re.I)
matches6 = re.match(pattern6, sentence1)

print(matches6)

<re.Match object; span=(0, 5), match='Start'>


# --------------------------------------------------------------------------------------------------

In [57]:
# Python's re module also offers more advanced regular-expression features beyond those covered here