### Source

- https://www.debuggex.com/cheatsheet/regex/python
- https://www.w3schools.com/python/python_regex.asp
- https://www.geeksforgeeks.org/regular-expression-python-examples-set-1/
- https://regex101.com/

In [1]:
# Import Python's standard-library regular-expression module, `re`.
import re 
  
# Build a reusable pattern from the character class [a-e], which is
# shorthand for [abcde]: it matches one lowercase letter 'a' through 'e'.
p = re.compile('[a-e]')

# findall() scans the string left to right and returns every match as a list.
# The leading 'A' is not reported because the class is case-sensitive.
sample = "Aye, said Mr. Gibenson Stark"
matches = p.findall(sample)
print(matches)

['e', 'a', 'd', 'b', 'e', 'a']


In [2]:
# One-off search: the module-level findall() takes the pattern as a string,
# so no explicit compile() step is needed.
re.findall(pattern='[a-e]', string="Aye, said Mr. Gibenson Stark")

['e', 'a', 'd', 'b', 'e', 'a']

In [5]:
# \d matches a single decimal digit (equivalent to [0-9] for ASCII text).
# The pattern is written as a raw string (r'...') so the backslash reaches
# the regex engine unchanged and Python does not warn about an invalid
# string escape sequence.
p = re.compile(r'\d')
print(p.findall("I went to him at 11 A.M. on 4th July 1886"))

# \d{4} matches a run of exactly four digits, so only the year is found;
# '11' and '4' are too short to match.
p = re.compile(r'\d{4}')
print(p.findall("I went to him at 11 A.M. on 4th July 1886"))

['1', '1', '4', '1', '8', '8', '6']
['1886']


In [6]:
# \w matches a single "word" character: a letter, a digit, or the
# underscore. Raw strings (r'...') keep the backslash intact and avoid
# Python's invalid-escape-sequence warning.
p = re.compile(r'\w')
print(p.findall("He said * in some_lang."))

# \w+ matches a run of one or more word characters, i.e. whole tokens.
# Punctuation such as '.', ',' and '*' separates the tokens.
p = re.compile(r'\w+')
print(p.findall("I went to him at 11 A.M., he said *** in some_language."))

# \W is the complement of \w: any single character that is NOT a letter,
# digit, or underscore (spaces and punctuation here).
p = re.compile(r'\W')
print(p.findall("he said *** in some_language."))

['H', 'e', 's', 'a', 'i', 'd', 'i', 'n', 's', 'o', 'm', 'e', '_', 'l', 'a', 'n', 'g']
['I', 'went', 'to', 'him', 'at', '11', 'A', 'M', 'he', 'said', 'in', 'some_language']
[' ', ' ', '*', '*', '*', ' ', ' ', '.']


In [5]:
# 'ab*' means: one 'a' followed by zero or more 'b' characters.
# The '*' applies only to the 'b', so a lone 'a' is also a match.
p = re.compile('ab*')
text = "ababbaabbb"
print(p.findall(text))

['ab', 'abb', 'a', 'abbb']


In [8]:
"wqw eqe".split(" ")

['wqw', 'eqe']

In [9]:
# re.split() cuts the string wherever the pattern matches. \W+ is a run of
# one or more non-word characters, so ", " and " , " each act as a single
# delimiter. Raw strings (r'...') keep the backslash intact and avoid
# Python's invalid-escape-sequence warning.
print(re.split(r'\W+', 'Words, words , Words'))

# The apostrophe is a non-word character too, so "Word's" becomes 'Word', 's'.
print(re.split(r'\W+', "Word's words Words"))

# Here ':', ' ' and ',' are non-word characters, so splitting occurs at each
# run of them.
print(re.split(r'\W+', 'On 12th Jan 2016, at 11:02 AM'))

# \d+ is a run of one or more digits.
# Splitting occurs at '12', '2016', '11' and '02' only.
print(re.split(r'\d+', 'On 12th Jan 2016, at 11:02 AM'))

['Words', 'words', 'Words']
['Word', 's', 'words', 'Words']
['On', '12th', 'Jan', '2016', 'at', '11', '02', 'AM']
['On ', 'th Jan ', ', at ', ':', ' AM']


In [7]:
# Pre-compile the "run of non-word characters" delimiter so it can be reused.
# Written as a raw string so the backslash is not treated as a (invalid)
# Python string escape.
p = re.compile(r'\W+')

In [8]:
# Split with the pattern object `p`, which must already be defined by an
# earlier cell (re.compile of a non-word-character delimiter).
p.split(string='Words, words , Words')

['Words', 'words', 'Words']

In [12]:
# maxsplit=1 stops after the first split, leaving the rest of the string
# intact. It is passed by keyword because positional maxsplit/flags are
# deprecated in recent Python versions; the raw string avoids the
# invalid-escape-sequence warning for \d.
print(re.split(r'\d+', 'On 12th Jan 2016, at 11:02 AM', maxsplit=1))

# With flags=re.IGNORECASE the class [a-f] also matches 'A'-'F', so 'Boy' and
# 'boy' are split the same way and the leading 'Ae' is consumed as a
# delimiter (hence the empty first element).
print(re.split('[a-f]+', 'Aey, Boy oh boy, come here', flags=re.IGNORECASE))

# Without the flag only lowercase a-f are delimiters: 'A' and 'B' survive.
print(re.split('[a-f]+', 'Aey, Boy oh boy, come here'))

['On ', 'th Jan 2016, at 11:02 AM']
['', 'y, ', 'oy oh ', 'oy, ', 'om', ' h', 'r', '']
['A', 'y, Boy oh ', 'oy, ', 'om', ' h', 'r', '']


In [10]:
# Remove anything that starts with "http" (in any letter case) and runs up to
# the next whitespace, by replacing it with the empty string.
re.sub(
    pattern=r"http\S+",
    repl="",
    string="t Brussels conference httP://t.co/Ge9Lp7hpyG",
    flags=re.IGNORECASE,
)

't Brussels conference '

In [17]:
subject = 'Subject has Uber booked already'

# Case-insensitive: both 'ub' in "Subject" and 'Ub' in "Uber" are replaced.
print(re.sub('ub', '~*', subject, flags=re.IGNORECASE))

# Case-sensitive (the default): 'Ub' in "Uber" is not replaced.
print(re.sub('ub', '~*', subject))

# count=1 limits the call to at most one replacement (the leftmost match).
print(re.sub('ub', '~*', subject, count=1, flags=re.IGNORECASE))

# The r prefix makes the pattern a raw string literal. \s matches a single
# whitespace character, so this finds "and" (any case) with whitespace on
# both sides and replaces the whole span with ' & '.
print(re.sub(r'\sAND\s', ' & ', 'Baked Beans And Spam', flags=re.IGNORECASE))

S~*ject has ~*er booked already
S~*ject has Uber booked already
S~*ject has Uber booked already
Baked Beans & Spam


In [20]:
# Compiled, case-sensitive version of the substitution: only the lowercase
# 'ub' in "Subject" matches, so "Uber" is left alone.
p = re.compile(r'ub')
p.sub(repl='~*', string='Subject has Uber booked already')

'S~*ject has Uber booked already'

In [None]:
# (Scratch cell intentionally left without code: a lone '~' is not a valid
# Python statement and raised a SyntaxError when the notebook was run.)

In [11]:
# subn() performs the same replacement as sub() but returns a 2-tuple:
# (new_string, number_of_substitutions_made).
t = re.subn(
    pattern='ub',
    repl='~*',
    string='Subject has Uber booked already',
    flags=re.IGNORECASE,
)

In [12]:
# Display `t`; it must already be defined by an earlier cell (there it is the
# tuple returned by re.subn()).
t

('S~*ject has ~*er booked already', 2)

In [11]:
subject = 'Subject has Uber booked already'

# Case-sensitive: only the 'ub' in "Subject" is replaced, so the count is 1.
print(re.subn('ub', '~*', subject))

# Case-insensitive: 'Ub' in "Uber" is replaced as well, so the count is 2.
t = re.subn('ub', '~*', subject, flags=re.IGNORECASE)
print(t)

# The result is a (new_string, count) pair, so its length is 2.
print(len(t))

# The first element is the string that sub() would have returned.
print(t[0])

('S~*ject has Uber booked already', 1)
('S~*ject has ~*er booked already', 2)
2
S~*ject has ~*er booked already
