<a href="https://colab.research.google.com/github/kaushanr/python3-docs/blob/main/Section_34.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Regular Expressions (REGEX)

In [None]:
# What are they? 
  
  # a way of describing patterns within search strings

# Validating emails example

  # Syntax conditions 

    # step 1 - starts with one or more letters, numbers, +, _, - or . signs then
    # step 2 - a single @ sign then
    # step 3 - one or more letters, numbers or - then
    # step 4 - a single dot then
    # step 5 - ends with one or more letters, numbers, - or .

  # to implement these rules using conditional logic is going to be tedious, 
  # hence an alternative is to use a regular expression here to enforce these checks

  # Corresponding regex syntax for the conditions above

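    # step 1 - ^[a-zA-Z0-9_.+-]+   (the - is placed last so it is a literal hyphen; written as _-. it would be read as an invalid range)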
    # step 2 - @
    # step 3 - [a-zA-Z0-9-]+
    # step 4 - \.[a-zA-Z0-9-.]+$

      # regex syntax

        # ^ - designates expression must be at the start (NOTE** - check keyboard settings! - US Qwerty style symbol)
        # [a-z] - designates, character is in lower case a-z range
        # [0-9] -  designates, digit in range 0-9
        # + - designates for one or more characters or digits
        # \. - matches a literal . character - the backslash escapes the special meaning of . (general form - \char)
        # $ - designates expression must end with this
    
    # assembled regex syntax - (^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)

  # Use cases

    # credit card number validating
    # phone number validating
    # advanced find/replace in text
    # formatting text/output 
    # syntax highlighting


In [None]:
# Writing REGEX

  # https://pythex.org/
  # http://www.rexegg.com/regex-quickstart.html

  # commonly used regex syntax - /insert_your_regex_expression/

    # \d - digit 0-9
    # \w - letter digit or underscore
    # \s - whitespace character
    # \D - not a digit
    # \W - not a word character
    # \S - not a whitespace character
    # . - any character except line break
    # note - regex is case sensitive

  # regex quantifier syntax

    # + - one or more
    # {3} - exactly 3 times (general form {n} - exactly n times)
    # {3,5} - 3 to 5 times
    # {4,} - 4 or more times
    # * - 0 or more times
    # ? - once or none (optional)

  # character classes

    # allows to specify groups or ranges of characters

    # syntax - [enter ranges] - [aeiou] searches for any instances of items in the range occurring within a test text
                              # [a-z] range of all letters from a through z
                              # [^value] - finds anything that is NOT the specified value 
  
  # anchors and boundaries

    # ^ - indicates start of a string or line 
    # $ - indicates end of a string or line
    # \b - word boundary
    
  # logical OR - '|'

    # example - (\d{3}|\(\d{3}\)) \d{3} \d{4}
                # for test case - 415 345 0998 and (415) 345 0998

    # example - (Mr\.|Mrs\.) ([A-Za-z]+) ([A-Za-z]+) # parentheses return each matched part as a separate capture group
                # for test case - Mr. Luca Guadagnino and Mrs. Tilda Swinton

    # example - https?://([A-Za-z_-]+\.[A-Za-z_-]+) - matches the url pattern and returns the domain data as a matched group
                # for test case - https://pythex.org and http://google.com 

In [None]:
# REGEX with Python

import re

# Compile the pattern once so the same object can be reused for several searches.
pattern = re.compile(r'\d{3} \d{3}-\d{4}')
res = pattern.search('Call me at 415 555-4242!')
print(res) # search() returns a match object if the pattern is found, else returns None
print(res.group(),'\n') # res.group() returns the matched text itself

res = pattern.search('Call me at 415 555-4242 or 310 455-2632') # search() only reports the first location where the pattern matches
print(res)
print(res.group(),'\n')

res = pattern.findall('Call me at 415 555-4242 or 310 455-2632')
print(res,'\n')  # findall() returns a list of strings, one per matched instance

  # alternative - call the module-level function re.search() instead of a method on a saved pattern object

print(re.search(r'\d{3} \d{3}-\d{4}','Call me at 310 455-2632').group()) # downside - the pattern string must be passed in on every call
                                                                         # (the re module caches recently compiled patterns, so this is
                                                                         # mostly a readability cost), whereas the compiled object above is reusable

<re.Match object; span=(11, 23), match='415 555-4242'>
415 555-4242 

<re.Match object; span=(11, 23), match='415 555-4242'>
415 555-4242 

['415 555-4242', '310 455-2632'] 

310 455-2632


In [None]:
#Validation using regex in Python

import re

def extract_phone(input):
  """Return the first phone number found in ``input``, or None if there is none.

  A phone number is three digits, a space, three digits, an optional
  '-' or ' ' separator, then four digits, with a word boundary on both sides.
  """
  found = re.compile(r'\b\d{3} \d{3}(-| )?\d{4}\b').search(input)
  # .group() with no argument is the whole match, so the capturing (-| ) group does not affect the result
  return found.group() if found else None


# Demo - print the first phone number found in each sentence (None when nothing matches).
for sentence in (
  'My number is 415 555-5661',
  'My number is 415 555 4700',
  'My number is 415 555 47004334',
  'My number is 415 5554711',
):
  print(extract_phone(sentence))

print()

def extract_all_phones(input):
  """Return a list of every phone number found in ``input``, or None if there are none.

  The separator group is non-capturing - (?:...) - so findall yields the
  full matched text for each number rather than just the separator.
  """
  numbers = re.findall(r'\b\d{3} \d{3}(?:-| )?\d{4}\b', input)
  if not numbers:
    return None
  return numbers
  

# Demo - one sentence with several numbers, one with no complete number.
for sentence in (
  'Call me at 415 555 4242 or 310 455-2632 or 410 1552662',
  'Call me at 415 55',
):
  print(extract_all_phones(sentence))

print()

def is_valid_phone(input):
  """Return True only when the whole of ``input`` is a single phone number.

  This version enforces "whole string" with anchors inside the pattern:
  ^ pins the match to the start and \\Z pins it to the very end.

  Args:
    input: the string to validate.

  Returns:
    True if ``input`` is exactly ddd ddd-dddd, ddd ddd dddd or ddd ddddddd, else False.
  """
  # ^ is the plain ASCII caret (check the keyboard layout if it comes out as a different symbol).
  # \Z is used instead of $ because $ also matches just before a trailing newline,
  # which would let '415 555-4242\n' pass validation.
  phone_regex = re.compile(r'^\d{3} \d{3}(?:-| )?\d{4}\Z')
  return phone_regex.search(input) is not None


# Demo - an exact number, a number buried in text, and a number followed by text.
for candidate in (
  '415 555-4242',
  'sdfsdf 415 555 4242 sfsdsd',
  '415 555 4242 dfds',
):
  print(is_valid_phone(candidate))

print()

def is_valid_phone(input):
  """Return True only when the whole of ``input`` is a single phone number.

  Same check as the anchored version, but using fullmatch(): it returns a
  match object only if the entire string matches the pattern (None otherwise),
  so no ^ / $ anchors are needed in the regex.
  """
  return bool(re.fullmatch(r'\d{3} \d{3}(?:-| )?\d{4}', input))


# Demo - same three inputs as for the anchored version, now checked with fullmatch().
for candidate in (
  '415 555-4242',
  'sdfsdf 415 555 4242 sfsdsd',
  '415 555 4242 dfds',
):
  print(is_valid_phone(candidate))

415 555-5661
415 555 4700
None
415 5554711

['415 555 4242', '310 455-2632', '410 1552662']
None

True
False
False

True
False
False


In [None]:
# Coding exercise

'''
is_valid_time("10:45")       #True
is_valid_time("1:23")        #True
is_valid_time("10.45")       #False
is_valid_time("1999")        #False
is_valid_time("145:23")      #False
is_valid_time("it is 12:15") #False
is_valid_time("12:15")       #True
is_valid_time("34:55") #True
'''

import re

def is_valid_time(input):
    """Return True when ``input`` is exactly one or two digits, a colon, then two digits.

    Only the shape is checked, not the numeric range - "34:55" is accepted,
    as the exercise requires.
    """
    return re.fullmatch(r'(?:\d|\d{2}):\d{2}', input) is not None


# Demo - run every case listed in the exercise description, in the same order.
for candidate in (
    "10:45",
    "1:23",
    "10.45",
    "1999",
    "145:23",
    "it is 12:15",
    "12:15",
    "34:55",
):
    print(is_valid_time(candidate))

True
True
False
False
False
False
True
True


In [None]:
# Parsing URLs 

import re

# Three capture groups: (1) protocol - http or https, (2) domain starting with www., (3) everything after the domain
url_regex = re.compile(r'(https?)://(www\.[A-Za-z-]{2,256}\.[a-z]{2,6})([-a-zA-Z0-9@:%_\+.~#?&//=]*)')
match = url_regex.search('https://www.google.com/search?q=cat+videos')
print(match.group()) # the whole matched text
print(match.group(0)) # same as .group() - group 0 is the entire match
print(f'Protocol : {match.group(1)}') # capture groups are numbered from 1, in order of their opening parenthesis
print(f'Domain : {match.group(2)}')
print(f'Everthing Else : {match.group(3)}','\n')
print(match.groups()) # returns a tuple of the matched captures (groups 1 onwards; the whole match is not included)

https://www.google.com/search?q=cat+videos
https://www.google.com/search?q=cat+videos
Protocol : https
Domain : www.google.com
Everthing Else : /search?q=cat+videos 

('https', 'www.google.com', '/search?q=cat+videos')


In [None]:
# Coding exercise

import re

def parse_bytes(input):
  """Return a list of every standalone run of exactly eight 0/1 characters in ``input``.

  The word boundaries stop shorter or longer digit runs from matching;
  an empty list is returned when nothing matches.
  """
  return re.findall(r'(\b[0-1]{8}\b)', input)

# Demo - one byte among other numbers, two bytes, and no bytes at all.
for sample in (
  "11010101 101 323",
  "my data is: 10101010 11100010",
  "asdsa",
):
  print(parse_bytes(sample))

['11010101']
['10101010', '11100010']
[]


In [None]:
# Symbolic group names

import re

def parse_name(input):
  """Split a "<title> <first> <last>" string into its parts.

  The title must be one of Mr., Mrs., Ms. or Mdme.; the first and last names
  are runs of ASCII letters. The first-name group carries the symbolic
  name "first" via (?P<first>...), so it can be read by label instead of by number.

  Args:
    input: the string to parse.

  Returns:
    A tuple (title, first, last) when ``input`` matches, otherwise None.
    On a match the first name is also printed.
  """
  name_regex = re.compile(r'^(Mr\.|Mrs\.|Ms\.|Mdme\.) (?P<first>[A-Za-z]+) ([A-Za-z]+)$') # label assigned to first name match group
  match = name_regex.search(input)
  if match is None:
    # search() returns None on no match; calling .group() on it would raise AttributeError
    return None
  print(f'First Name : {match.group("first")}') # calling the match group using the assigned label
  return match.groups()

# Demo - parse_name prints the 'First Name' line itself; this print then shows the tuple it returns.
print(parse_name('Mrs. Tilda Swinton'))


First Name : Tilda
('Mrs.', 'Tilda', 'Swinton')


In [None]:
# Coding exercise

import re

def parse_date(input):
  """Parse a date written as two digits, two digits and four digits separated by '/', '.' or ','.

  Args:
    input: the string to parse, e.g. "01/22/1999".

  Returns:
    A dict with keys 'd', 'm' and 'y' holding the first, second and third
    number as strings (in order of appearance), or None when ``input`` is
    not entirely a date in that shape.
  """
  input_regex = re.compile(r'^(\d{2})(?:/|\.|,)(\d{2})(?:/|\.|,)(\d{4})$') # another solution - ^(\d\d)[,/.](\d\d)[,/.](\d{4})$ 
  # fullmatch() rather than search(): with search(), $ also matches just before
  # a trailing newline, so "12.11.2003\n" would have been accepted.
  match = input_regex.fullmatch(input)
  if match is None:
    return None
  first, second, year = match.groups()
  return {'d': first, 'm': second, 'y': year}

# Demo - each supported separator, then a string with too many trailing digits.
for sample in (
  "01/22/1999",
  "12,04,2003",
  "12.11.2003",
  "12.11.200312",
):
  print(parse_date(sample))

{'d': '01', 'm': '22', 'y': '1999'}
{'d': '12', 'm': '04', 'y': '2003'}
{'d': '12', 'm': '11', 'y': '2003'}
None


In [6]:
# Compilation Flags

  # Verbose - re.VERBOSE or re.X

import re

# One-line form of the email regex, kept for comparison with the verbose form below (pat is not used afterwards).
pat = re.compile(r'^([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$') # we can break this regex up into commentable/discernible blocks

# With re.VERBOSE, whitespace in the pattern is ignored and '#' starts a comment that runs to the end of the line,
# so the same regex can be spread over several annotated lines.
pattern = re.compile(r'''
  ^([a-z0-9_\.-]+)  # username of email
  @                 # single @ sign
  ([\da-z\.-]+)     # email provider
  \.                # single period
  ([a-z\.]{2,6})$   # com, org, net, etc
''',re.VERBOSE)

match = pattern.search('thomas.123@yahoo.com')
print(match.group())
print(match.groups(),'\n')

  # Ignore case - re.IGNORECASE or re.I - makes letters in the pattern match both upper and lower case

pattern = re.compile(r'''
  ^([a-z0-9_\.-]+)  # username of email
  @                 # single @ sign
  ([\da-z\.-]+)     # email provider
  \.                # single period
  ([a-z\.]{2,6})$   # com, org, net, etc
''',re.VERBOSE | re.IGNORECASE) # '|' here is Python's bitwise OR operator (not regex alternation) - it combines both flags into the single flags argument

match = pattern.search('ThomaS.123@yahoo.com')
print(match.group())
print(match.groups())

thomas.123@yahoo.com
('thomas.123', 'yahoo', 'com') 

ThomaS.123@yahoo.com
('ThomaS.123', 'yahoo', 'com')


In [25]:
# REGEX Substitution

import re

# Part 1 - replace every "<title> <name>" with a fixed word.
text = 'Last night Mrs. Daisy and Mr. White murdered Ms. Chow'
pattern = re.compile(r'(Mr\.|Mrs\.|Ms\.) [a-z]+', re.IGNORECASE)
match = pattern.findall(text)
print(match) # findall returns only the capture group (the title) - even though the whole name is what matched
censored = pattern.sub('REDACTED',text) # sub replaces the whole match, not just the capture group
print(censored,'\n')

  # backreferences in the replacement string - \g<group_number> or \g<group_name>

# Part 2 - keep the title and the first letter of the name, drop the rest.
text = 'Last night Mrs. Daisy and Mr. White murdered Ms. Chow'
pattern = re.compile(r'(Mr\.|Mrs\.|Ms\.) ([a-z])[a-z]+', re.IGNORECASE) # group 1 - title, group 2 - first letter of the name
match = pattern.findall(text)
print(match) # with two capture groups, findall returns a list of (group 1, group 2) tuples
# Raw string: '\g' is not a valid Python string escape, so a non-raw literal
# triggers an invalid-escape warning and is slated to become an error.
censored = pattern.sub(r'\g<1> \g<2>',text)
print(censored)


['Mrs.', 'Mr.', 'Ms.']
Last night REDACTED and REDACTED murdered REDACTED 

[('Mrs.', 'D'), ('Mr.', 'W'), ('Ms.', 'C')]
Last night Mrs. D and Mr. W murdered Ms. C


In [5]:
# Coding exercise

import re

def censor(input):
  """Return ``input`` with every word fragment starting with "frack" replaced by CENSORED.

  Matching is case-insensitive and swallows any letters that follow
  "frack" (so "fracking" is replaced as a whole).
  An alternative pattern for the same job is \\bfrack\\w*\\b.
  """
  return re.sub(r'(frack)[a-z]*', 'CENSORED', input, flags=re.IGNORECASE)

# Demo - capitalised, suffixed, repeated and punctuated occurrences.
for sentence in (
  "Frack you",
  "I hope you fracking die",
  "you fracking Frack",
  "I don't give a flying frack!",
):
  print(censor(sentence))

CENSORED you
I hope you CENSORED die
you CENSORED CENSORED
I don't give a flying CENSORED!


In [15]:
# Swapping file names

from pprint import pprint
import re

# Input: book titles in the form "<title> (<year>)".
titles = [
    'Significant Others (1987)',
    'Tales of the City (1978)',
    'The Days of Anna Madrigal (2014)',
    'Mary Ann in Autumn (2010)',
    'Further Tales of the City (1982)',
    'Babycakes (1984)',
    'More Tales of the City (1980)',
    'Sure of You (1989)',
    'Michael Tolliver Lives (2007)'
]

# group 1 - the title text, group 2 - the digits inside the trailing parentheses.
# (Inside a character class \b means backspace and + is a literal plus sign, so the
# earlier [ \b\w+\b]* did not express "words separated by spaces"; .+ captures the
# title directly and also allows punctuation in it.)
pattern = re.compile(r'^(.+) \((\d+)\)$')

# Rewrite each entry as "<year> - <title>" and sort; the years all have the same
# number of digits, so plain string sorting orders them chronologically.
# The replacement is a raw string because '\g' is not a valid Python string escape.
ord_titles = sorted(pattern.sub(r'\g<2> - \g<1>', title) for title in titles)
pprint(ord_titles)

['1978 - Tales of the City',
 '1980 - More Tales of the City',
 '1982 - Further Tales of the City',
 '1984 - Babycakes',
 '1987 - Significant Others',
 '1989 - Sure of You',
 '2007 - Michael Tolliver Lives',
 '2010 - Mary Ann in Autumn',
 '2014 - The Days of Anna Madrigal']
