In [3]:
import re

In [4]:
# In a normal string literal the sequence \t is an escape for a tab character,
# so printing s shows a tab between "a" and "b".
s = "a\tb"
print(s)

a	b


In [5]:
# The r prefix makes this a raw string: the backslash is kept as a literal
# character, so printing raw_s shows a\tb instead of a tab.
raw_s = r"a\tb"
print(raw_s)

a\tb


## re.match - match at the beginning of a string
Looks for a match only at the beginning of a string.
Useful for validating user input.

In [6]:
# \d+ matches a run of one or more digits.
pattern = r"\d+"
# Sample text that starts with digits, so a match at the beginning succeeds.
text= "42 is my luky number"

In [7]:
# re.match only tries the pattern at the start of the text; it returns a match
# object on success and None otherwise.
match = re.match(pattern, text)

In [8]:
# Show the matched text, or a fallback message when nothing matched.
print(match.group(0) if match else 'no match')


42


## Input validation

In [9]:
def is_integer(text):
    """Return True when ``text`` consists solely of one or more digits.

    Args:
        text: The string to validate.

    Returns:
        bool: True if every character of ``text`` is a digit and ``text`` is
        not empty, otherwise False.

    Note:
        ``re.fullmatch`` is used instead of ``re.search`` with ``^...$``
        because ``$`` also matches just before a trailing newline, which
        would let input such as ``"12\\n"`` pass validation.
    """
    return re.fullmatch(r"\d+", text) is not None

In [10]:
# The letters after the digit mean the string is not made up of digits only.
is_integer("2df")

False

In [11]:
def test_is_integer():
    """Run is_integer over sample strings and print every misclassification.

    Prints one tab-indented line for each string that is_integer classifies
    differently from the list it appears in, then prints 'Test complete'.
    Returns nothing.
    """
    # NOTE(review): "qa" contains no digits, so it is always reported as a
    # failure; presumably left in on purpose to show the report - confirm.
    expected_integers = ["123", "456", "900", "0991",  "qa"]
    expected_non_integers = ["a123","124a","1 2 3","1\t2"," 12","45 "]

    for candidate in expected_integers:
        if is_integer(candidate):
            continue
        print('\tFailed to detect an integer',candidate)

    for candidate in expected_non_integers:
        if not is_integer(candidate):
            continue
        print('\tIncorrectly classified as an integer',candidate)

    print('Test complete')
    

In [12]:
# Run the checks; any misclassified string is printed before 'Test complete'.
test_is_integer()

	Failed to detect an integer qa
Test complete


## re.search - Find the first match anywhere

In [15]:
# re.search scans the text and reports the first place the pattern matches.
text = "42 is my lucky number"
pattern = r"\d+"  # \d+ : a run of one or more digits

match = re.search(pattern, text)

# A failed search yields None, so deal with that case first.
if match is None:
    print("No match")
else:
    print('Found a match:', match.group(0), 'at index:', match.start())

Found a match: 42 at index: 0


In [None]:
# search() is not anchored to the start: the digits may sit anywhere in the text.
text = "my lucky number is 42"
pattern = r"\d+"  # \d = digit, + = one or more, i.e. a run of digits

match = re.search(pattern, text)

if not match:
    print("No Match")
else:
    print('Found a match:', match.group(0), 'at index:', match.start())

In [None]:
# Limitation: search() stops at the first match, even when the text holds more.
text = "my lucky numbers are 42 and 24"
pattern = r"\d+"  # \d = digit, + = one or more, i.e. a run of digits

# Only "42" is reported; the later "24" is never looked at.
match = re.search(pattern, text)

if not match:
    print("No Match")
else:
    print('Found a match:', match.group(0), 'at index:', match.start())

## re.findall - Find all the matches
This method returns only after scanning the entire text.

In [16]:
# findall walks the whole text and collects every match into a list of strings.
text = "NY postal code are 10001, 10002, 10003, 10004"
pattern = r"\d+"

print('Pattern', pattern)
match = re.findall(pattern, text)

# With no matches the list is empty (falsy), so handle that case first.
if not match:
    print("No Match")
else:
    print('Found matches:', match)

Pattern \d+
Found matches: ['10001', '10002', '10003', '10004']


## re.finditer - Iterator
This method returns an iterator that yields one match object at a time, so you control how many matches to consume.

In [19]:
# finditer hands back match objects one by one instead of a finished list.
text = "NY Postal Codes are 10001, 10002, 10003, 10004"
pattern = r"\d+"

print('Pattern', pattern)
match_iter = re.finditer(pattern, text)

# Each item is a full match object, so the position is available as well.
print('Matches')
for match in match_iter:
    print('\t', match.group(0), 'at index:', match.start())

Pattern \d+
Matches
	 10001 at index: 20
	 10002 at index: 27
	 10003 at index: 34
	 10004 at index: 41


## groups - find sub matches
Group 0 refers to the entire text that matched the pattern.
Groups 1..n refer to the sub-groups captured by parentheses, numbered from left to right.


In [20]:
# Split a compact YYYYMMDD date into year, month and day.
# The pattern is built up in three steps:
#   1. r"\d+"                    - any run of digits
#   2. r"\d{4}\d{2}\d{2}"        - exactly eight digits
#   3. r"(\d{4})(\d{2})(\d{2})"  - the same, with each part captured in a group
text = "Start Date: 20200920"
pattern = r"(\d{4})(\d{2})(\d{2})"

print("Pattern", pattern)
match = re.search(pattern, text)

if not match:
    print("No Match")
else:
    print('Found a match', match.group(0), 'at index:', match.start())
    print('Groups', match.groups())

    # groups() is 0-based as a tuple, but group numbers start at 1
    # (group 0 is the whole match), hence the idx+1.
    for idx, value in enumerate(match.groups()):
        print('\tGroup', idx+1, value, '\tat index', match.start(idx+1))

Pattern (\d{4})(\d{2})(\d{2})
Found a match 20200920 at index: 12
Groups ('2020', '09', '20')
	Group 1 2020 	at index 12
	Group 2 09 	at index 16
	Group 3 20 	at index 18


## named groups

In [21]:
# Same date split as before, but each group gets a name via (?P<name>...).
text = "Start Date: 20200920"
pattern = r"(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})"

print("Pattern", pattern)
match = re.search(pattern, text)

if not match:
    print("No Match")
else:
    print('Found a match', match.group(0), 'at index:', match.start())
    # groupdict() maps every group name to the text it captured.
    print('\t', match.groupdict())

Pattern (?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})
Found a match 20200920 at index: 12
	 {'year': '2020', 'month': '09', 'day': '20'}


## access by group name

In [22]:
# Named groups can be read individually by passing the name to group().
text = "Start Date: 20200920"
pattern = r"(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})"

print("Pattern", pattern)
match = re.search(pattern, text)

if not match:
    print("No Match")
else:
    print('Found a match', match.group(0), 'at index:', match.start())
    print('\tYear:', match.group('year'))
    print('\tMonth:', match.group('month'))
    print('\tDay:', match.group('day'))

Pattern (?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})
Found a match 20200920 at index: 12
	Year: 2020
	Month: 09
	Day: 20


## re.sub - find and replace

re.sub takes two patterns: one that finds the text to replace, and a replacement pattern that describes the new text.

In [24]:
# Reformat dates: 20200920 => 09-20-2020
text = "Start Date: 20200920, End Date: 20210920"
pattern = r"(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})"

# \g<name> inserts the text captured by the named group, so this replacement
# reorders the captured parts as month-day-year.
replacement_pattern = r"\g<month>-\g<day>-\g<year>"

print('original text\t', text)
print()

# Every date found in the text is rewritten, not just the first one.
new_text = re.sub(pattern, replacement_pattern, text)

print('new text\t', new_text)

original text	 Start Date: 20200920, End Date: 20210920

new text	 Start Date: 09-20-2020, End Date: 09-20-2021


In [25]:
# Make this an exercise
# Goal: insert a space between an amount and the word "dollars".
text = "movie ticket: 15dollars. popcorn: 8dollars"

# One or more digits directly followed by "dollars"; the digits are captured
# in the group named value.
pattern = r"(?P<value>\d+)dollars"

# Re-emit the captured digits, then a space, then the word dollars.
replacement_pattern = r"\g<value> dollars"

print('original text\t', text)
print()

# find and replace every occurrence
new_text = re.sub(pattern, replacement_pattern, text)

print('new text\t', new_text)

original text	 movie ticket: 15dollars. popcorn: 8dollars

new text	 movie ticket: 15 dollars. popcorn: 8 dollars


## custom function to generate replacement text

In [26]:
# Format 
#   20200920 => Sep-20-2020
import datetime

In [27]:
def format_date(match):
    """Turn a matched YYYYMMDD date into ``Mon-DD-YYYY`` text for ``re.sub``.

    Args:
        match: A match object with named groups ``year``, ``month`` and
            ``day``, each holding a string of digits.

    Returns:
        str: The date formatted with ``'%b-%d-%Y'`` (for example
        ``Sep-20-2020``). When the digits do not form a real calendar date
        (for example month 13), the matched text is returned unchanged so
        that one bad value does not abort the whole substitution.
    """
    year, month, day = (int(part) for part in match.group('year', 'month', 'day'))

    try:
        parsed = datetime.date(year, month, day)
    except ValueError:
        # The regex only checks digit counts; datetime.date rejects values
        # that are not a valid date, so leave that text as it was.
        return match.group(0)

    # https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
    return parsed.strftime('%b-%d-%Y')

In [28]:
# Rewrite every YYYYMMDD date in the text using the format_date callback.
text = "Start Date: 20200920, End Date: 20210920"
pattern = r"(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})"

print('original text\t', text)
print()

# When the replacement is a function, re.sub calls it with each match object
# and substitutes the string it returns.
new_text = re.sub(pattern, format_date, text)

print('new text\t', new_text)

original text	 Start Date: 20200920, End Date: 20210920

new text	 Start Date: Sep-20-2020, End Date: Sep-20-2021


In [29]:
# Make this an assignment
def celsius_to_fahrenheit(match):
    """Convert the temperature captured in a match to Fahrenheit text.

    Args:
        match: A match object with a named group ``celsius`` holding a
            number as text.

    Returns:
        str: The Fahrenheit value followed by ``°F``, e.g. ``77.0°F``.
    """
    celsius = float(match.group("celsius"))
    fahrenheit = (celsius * 9.0 / 5.0) + 32.0
    return '{0}°F'.format(fahrenheit)

In [30]:
def substitution_example_custom_logic():
    """Demonstrate re.sub with a callback that computes the replacement.

    Prints the pattern, the sample sentence, and the sentence after each
    Celsius reading has been replaced by the text that
    celsius_to_fahrenheit returns. Returns nothing.
    """
    # Digits captured as "celsius", followed by the degree sign (\u00B0) and C.
    celsius_pattern = r"(?P<celsius>\d+)\u00B0C"
    before = "Today's temperature is 25°C"

    print('Pattern: {0}'.format(celsius_pattern))
    print('Text before: {0}'.format(before))

    after = re.sub(celsius_pattern, celsius_to_fahrenheit, before)

    print('Text after:  {0}'.format(after))

In [31]:
# Run the demo; it prints the pattern and the text before and after conversion.
substitution_example_custom_logic()

Pattern: (?P<celsius>\d+)\u00B0C
Text before: Today's temperature is 25°C
Text after:  Today's temperature is 77.0°F


## re.split - split text based on specified pattern

In [32]:
# Split on commas only; the hyphen in "a-c" is not a separator here.
text = "a-c,x,y,1"
pattern = r","

# The list of pieces is the cell's displayed result.
re.split(pattern, text)

['a-c', 'x', 'y', '1']