## RegEx (Regular Expressions) ##

In [1]:
import re

## Matching Expressions ##

+ r = raw string
+ finditer() ==> returns an iterator of match objects, each with a span() method (index range of the matched pattern)
+ findall() ==> returns all instances of matched pattern
+ match() ==> returns match of pattern at BEGINNING of string
+ search() ==> scans through the string and returns the FIRST match of the specified pattern (None if there is no match)

In [2]:
## Sample text: digits with one lowercase and one uppercase "abc" embedded.
test_string = "38639824abc89032ABC"

## Compile once; IGNORECASE lets the lowercase pattern hit "ABC" as well.
pattern = re.compile(r"abc", re.IGNORECASE)

## Run the same compiled pattern through each of the four lookup methods.
matches_iter = pattern.finditer(test_string)    ## lazy iterator of match objects
matches_find = pattern.findall(test_string)     ## list of the matched substrings
matches_match = pattern.match(test_string)      ## anchored at index 0 -> None for this text
matches_search = pattern.search(test_string)    ## first match anywhere in the text

## Methods on Match Object ##

+ start() ==> start index of matched pattern (inclusive)
+ end() ==> end index of matched pattern (exclusive)
+ span() ==> tuple of index range (start, stop) [stop is exclusive]
+ group() ==> returns actual string of match object

In [43]:
## print(matches_search)
## Accessors on the match object returned by pattern.search(). These are bare
## expressions, so in a notebook only the value of the final one is echoed.
matches_search.end()    ## index just past the matched text (exclusive)
matches_search.start()  ## index where the matched text begins (inclusive)
matches_search.span()   ## (start, end) packed into a tuple
matches_search.group()  ## the matched text itself

<re.Match object; span=(8, 11), match='abc'>


'abc'

## Meta Characters ##

+ '.' ==>  Any character (except newline)
+ '^' ==> Starts with specified pattern
+ '$' ==> Ends with specified pattern
+ '\x' ==> Escapes the meta character x so it is matched literally (e.g. '\.' matches a literal dot)
+ '*' ==> Zero or more occurrences
+ '+' ==> One or more occurrences
+ {N} ==> Exactly N occurrences
+ '|' ==> Either or
+ '()' ==> Capture and group

## Special Characters ##
+ '\d' ==> Matches any decimal digit; [0-9]
+ '\D' ==> Matches any non-digit character
+ '\s' ==> Matches any whitespace character
+ '\S' ==> Matches any non-whitespace character
+ '\w' ==> Matches any word character (alphanumeric or underscore); [a-zA-Z0-9_]
+ '\W' ==> Matches any non-word character (anything other than alphanumeric or underscore)
+ '\b' ==> Matches the specified characters at the beginning or end of a word (word boundary)
    + '\b\w+' ==> captures all words
+ '\B' ==> Matches the specified characters, but NOT at the beginning or end of a word (non-word-boundary)

In [22]:
## Inputs for the meta-character / special-character demos.
new_test_string = "hfiuehew.iu93293"
boundary_test_string = "hihgrue ih9394-3209"

## Compiled patterns, one per concept.
character_starts = re.compile(r'^hi')        ## '^' anchors the match to the start of the string
meta_character_esc = re.compile(r'\.')       ## escaped dot: a literal '.'
meta_character = re.compile(r'.')            ## bare dot: any character except newline
boundaries = re.compile(r'\b[a-z0-9]+')      ## run of [a-z0-9] that starts at a word boundary
digit_characters = re.compile(r'(hf|iu)')    ## despite the name: alternation, 'hf' or 'iu', captured

## Apply two of the patterns; only the '^hi' result is printed.
## NOTE: boundary_pattern holds the result of character_starts, not of boundaries.
boundary_pattern = character_starts.findall(boundary_test_string)
digit_pattern = digit_characters.findall(new_test_string)
print(boundary_pattern)

['hi']


## Quantifiers ##

+ Quantify how many times a part of regular expression should be repeated

+ '*': Matches pattern 0 or more
+ '+': Matches pattern 1 or more
+ '?': 0 or 1 -> optional character
+ {N}: Matches an exact number
+ {N,M}: range numbers (min, max)

In [8]:
quantifier_test_string = "389_hello_1234x89000"

## All patterns below are raw strings (r'...'): in a normal string literal
## '\d' and '\D' are invalid escape sequences, which newer Python versions
## report with a warning. The compiled patterns are unchanged.

## Digit is repeated one or more times
num_pattern_digit = re.compile(r'\d+')

## Non-Digit is repeated one or more times
num_pattern_nondigit = re.compile(r'\D+')

## Non-Digit is repeated exactly 3 times
num_range_nondigit = re.compile(r'\D{3}')

## Literal 'hello_' followed by exactly 2 digits
num_pattern = re.compile(r'hello_\d{2}')

## Digit is repeated 1 or 2 times
num_pattern_range = re.compile(r'\d{1,2}')

## Every maximal run of digits in the test string.
num_matches = num_pattern_digit.findall(quantifier_test_string)
print(num_matches)

['389', '1234', '89000']


## Sets & Capture Groups ##

+ A set [...] matches any ONE character listed inside the brackets; findall() returns every instance of such a character
+ [a-z] ==> returns all characters from a-z
+ [a-zA-Z] ==> returns all characters from a-z and A-Z
+ [a-zA-Z0-9] ==> returns all characters from a-z and A-Z and 0-9

In [36]:
## Three sample e-mail addresses, one per line (the literal starts with a newline).
test_set = """
Python-engineer@gmx.de
python-engineer123@my-domain.org
pythonengineer@gmail.com"""

set_pattern = re.compile(r'[hd8i]')          ## any one of the characters h, d, 8, i
set_pattern_range = re.compile(r'[a-z]')     ## any one lowercase ASCII letter
## Capture groups: 1 = local part, 2 = domain name, 3 = top-level domain.
set_pattern_capital = re.compile(r'([a-zA-Z0-9-]+)@([a-zA-Z-]+)\.(com|org|de)')

## finditer() returns a one-shot iterator: after the first comprehension has
## consumed it, every further pass yields []. Materialise the matches in a
## list so each group can be inspected as often as needed.
set_matches = list(set_pattern_capital.finditer(test_set))

print([setMatch.group(0) for setMatch in set_matches])
## print([setMatch.group(1) for setMatch in set_matches])
## print([setMatch.group(2) for setMatch in set_matches])
## print([setMatch.group(3) for setMatch in set_matches])

['Python-engineer', 'python-engineer123', 'pythonengineer']
[]


## Modifications ##

+ split() ==> splits a string at specified pattern
+ sub() ==> replaces a string at specified pattern (can be used to replace whole pattern with specific groups)

In [42]:
## Text containing two occurrences of the separator "ddg" (one at the very end).
modification_test_string = "320932ddg93i0ddg"
pattern = re.compile(r'ddg')

## sub() swaps every occurrence for "abc"; split() cuts the text at every occurrence.
sub_match = pattern.sub("abc", modification_test_string)
split_match = pattern.split(modification_test_string)

320932abc93i0abc


In [54]:
## Three sample URLs, one per line (the literal starts and ends with a newline).
test_url = """
http://python-engineer.com
https://www.python-engineer.com
http://www.pyeng.net
"""

## Groups: 1 = optional "www.", 2 = domain name, 3 = dot + TLD, 4 = bare TLD.
url_pattern = re.compile(r'https?://(w{3}\.)?([a-zA-Z-]+)(\.(com|net))')

## Rewrite every URL as "<domain><.tld>" by keeping only groups 2 and 3.
formatted_url = url_pattern.sub(r"\2\3", test_url)

## Match iterator kept for ad-hoc inspection of individual groups. It is
## single-use, so enable at most one of the prints below per run.
url_matches = url_pattern.finditer(test_url)
##print([match.group(0) for match in url_matches])
##print([match.group(1) for match in url_matches])
##print([match.group(3) for match in url_matches])

print(formatted_url)

['.com', '.com', '.net']

python-engineer.com
python-engineer.com
pyeng.net



## Assertions ##

+ (?<={str}) ==> Positive LookBehind
+ (?<!{str}) ==> Negative LookBehind (pattern that is NOT preceded by str)
+ (?={str}) ==> Positive LookAhead
+ (?!{str}) ==> Negative LookAhead (pattern that is NOT followed by str)

In [38]:
## The same phrase twice: once after "1: " and before a comma, once after "2: " at the end.
assertion_test_string = "1: Hello World, 2: Hello World"

## Lookbehind variants: the phrase must (or must not) be preceded by "1: ".
assertion_pattern_behind = re.compile(r'(?<=1: )hello world', re.IGNORECASE)
assertion_pattern_nb = re.compile(r'(?<!1: )hello world', re.IGNORECASE)

## Lookahead variants: the phrase must (or must not) be followed by a comma.
assertion_pattern_ahead = re.compile(r'hello world(?=,)', re.IGNORECASE)
assertion_pattern_na = re.compile(r'hello world(?!,)', re.IGNORECASE)

## Negative lookbehind drops the occurrence that follows "1: ".
match = assertion_pattern_nb.findall(assertion_test_string)
print(match)

['Hello World']


In [24]:
## Names separated by assorted non-letter characters; one carries a trailing digit.
names_str = "Michael~Steve-Moses Bob9 Bartholomew"

## A run of letters starting at a word boundary, optionally followed by one digit.
new_pattern = re.compile(r'\b[a-zA-Z]+\d?')

match = new_pattern.findall(names_str)
print(match)

['Michael', 'Steve', 'Moses', 'Bob9', 'Bartholomew']
