# Regex

In [1]:
import re

In [2]:
# Print the built-in help text for the `re` module imported above.
help(re)

Help on module re:

NAME
    re - Support for regular expressions (RE).

MODULE REFERENCE
    https://docs.python.org/3.8/library/re
    
    The following documentation is automatically generated from the Python
    source files.  It may be incomplete, incorrect or include features that
    are considered implementation detail and may vary between Python
    implementations.  When in doubt, consult the module reference at the
    location listed above.

DESCRIPTION
    This module provides regular expression matching operations similar to
    those found in Perl.  It supports both 8-bit and Unicode strings; both
    the pattern and the strings being processed can contain null bytes and
    characters outside the US ASCII range.
    
    Regular expressions can contain both special and ordinary characters.
    Most ordinary characters, like "A", "a", or "0", are the simplest
    regular expressions; they simply match themselves.  You can
    concatenate ordinary characters, so last mat

### Compile
Regular expressions are compiled into pattern objects, which have methods for various operations such as searching for pattern matches or performing string substitutions.

In [4]:
# Bring the standard-library regular-expression module into scope.
import re

# Compile the character class [a-e] (shorthand for [abcde]) into a reusable
# pattern object; it matches any single lowercase letter from 'a' to 'e'.
p = re.compile('[a-e]')

# findall() scans left to right and returns every match as a list.
matches = p.findall("Aye, said Mr. Gibenson Stark")
print(matches)

['e', 'a', 'd', 'b', 'e', 'a']


### Escape
    Escape special characters in a string.

In [9]:
# Sample text that contains regex metacharacters ('.' and '*').
string = "Himanshu@ Goyal!! How are, You.*"

# re.escape() backslash-escapes the characters that are special in a
# pattern, so the escaped text matches the original text literally.
esc_str = re.escape(string)
print(esc_str)

Himanshu@\ Goyal!!\ How\ are,\ You\.\*


### Findall
The findall() function returns a list containing all matches.



In [10]:
import re

# Collect every non-overlapping occurrence of the literal text "ai".
txt = "The rain in Spain"
x = re.findall("ai", txt)
print(x)

['ai', 'ai']


### Finditer

In [14]:
import re

s1 = ' The Blue Berries is good for health'
pattern = 'Blue Berries'

# finditer() yields one Match object per non-overlapping occurrence.
for match in re.finditer(pattern, s1):
    # span() is the (start, end) pair; end is exclusive, so s1[s:e] equals
    # the matched text returned by group().
    s, e = match.span()
    print('String match "%s" at %d:%d' % (match.group(), s, e))

String match "Blue Berries" at 5:17


### Fullmatch

In [31]:
# fullmatch() succeeds only when the pattern matches the entire string;
# each '.' stands for exactly one arbitrary character.
m = re.fullmatch('h...o', 'hello')

# Show the Match object, then its start and end offsets, one per line.
print(m, m.start(), m.end(), sep="\n")

<re.Match object; span=(0, 5), match='hello'>
0
5


### Match

In [41]:
# match() only tries the pattern at the very beginning of the string.
text = 'More with less'
m = re.match('More', text)
print(m)

<re.Match object; span=(0, 4), match='More'>


### Search

In [61]:
import re

txt = "The rain in Spain"

# search() scans the whole string and returns a Match object for the first
# place the pattern matches (None when there is no match).
x = re.search("Spain", txt)

# The pattern searched for is the word "Spain", not white space, so the
# heading names what was actually located.
print('The first occurrence of "Spain" is located in position:')
print("Start Position:- ",x.start())            # index of the first matched character
print("End Position:- ",x.end())                # index just past the last matched character
print("Start and end Position:- ",x.span())     # (start, end) tuple
print("String is:- ",x.string)                  # the string that was searched
print("String at index start to end position:- ",x.group())  # the matched text

The first white-space character is located in position:
Start Position:-  12
End Position:-  17
Start and end Position:-  (12, 17)
String is:-  The rain in Spain
String at index start to end position:-  Spain


### Split

In [53]:
import re

txt = "The rain in Spain"

# Split the text at every white-space character.
# The pattern is a raw string: "\s" in an ordinary string literal is an
# invalid escape sequence (it triggers a warning in recent Python versions),
# whereas r"\s" hands the backslash to the regex engine unchanged.
x = re.split(r"\s", txt)
print(x)

['The', 'rain', 'in', 'Spain']


### Sub
The sub() function replaces the matches with the text of your choice:



In [54]:
import re

txt = "The rain in Spain"

# Replace every white-space character with the digit "9".
# Raw string avoids the invalid "\s" escape sequence in a normal literal.
x = re.sub(r"\s", "9", txt)
print(x)

The9rain9in9Spain


### Subn
The subn() function works like sub(), but returns a tuple containing the new string followed by the total number of replacements made.



In [64]:
import re

# subn() substitutes like sub() but returns a 2-tuple:
# (new_string, number_of_substitutions).
print(re.subn('i', '*' , 'movie tickets booking in online'))

# Same replacement with case-insensitive matching; the sentence has no
# upper-case "I", so the result is the same as above.
t = re.subn('i', '*', 'movie tickets booking in online', flags=re.IGNORECASE)

# Show the tuple, its length, and the substituted string, one per line.
print(t, len(t), t[0], sep="\n")

('mov*e t*ckets book*ng *n onl*ne', 5)
('mov*e t*ckets book*ng *n onl*ne', 5)
2
mov*e t*ckets book*ng *n onl*ne


### Template
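Compile a template pattern, returning a Pattern object. Note: re.template() is undocumented, was deprecated in Python 3.11, and was removed in Python 3.13.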

In [65]:
# re.template() was an undocumented helper that was deprecated in Python 3.11
# and removed in Python 3.13, so calling it fails on current interpreters.
# re.compile() returns a Pattern object that matches the same text for this
# pattern and works on every Python version.
a = re.compile('[a-e]')

In [66]:
# Use the pattern object `a` built in the previous cell to list every
# match it finds in the sentence.
found = a.findall("Aye, said Mr. Gibenson Stark")
print(found)


['e', 'a', 'd', 'b', 'e', 'a']


## Metacharacters


### []
    A set of characters	

In [68]:
txt = "The rain in Spain"

# [a-m] is a character set: any one lower-case letter from "a" through "m".
x = re.findall("[a-m]", txt)
print(x)

['h', 'e', 'a', 'i', 'i', 'a', 'i']


### \	
    Signals a special sequence (can also be used to escape special characters)

In [69]:
txt = "That will be 59 dollars"

# Find all digit characters.
# Raw string: "\d" in an ordinary literal is an invalid escape sequence;
# r"\d" passes the backslash through to the regex engine.
x = re.findall(r"\d", txt)
print(x)

['5', '9']


### .
    Any character (except newline character)

In [70]:
txt = "hello world"

# "he", then any two characters (each "." is one character), then "o".
x = re.findall("he..o", txt)
print(x)

['hello']


### ^
    Starts with	

In [73]:
txt = "hello world"

# "^" anchors the pattern at the start of the string, so the result list is
# non-empty only when the text begins with 'hello'.
x = re.findall("^hello", txt)
print("Yes, the string starts with 'hello'" if x else "No match")

Yes, the string starts with 'hello'


### $
    Ends with

In [74]:
txt = "hello world"

# "$" anchors the pattern at the end of the string, so the result list is
# non-empty only when the text finishes with 'world'.
x = re.findall("world$", txt)
print("Yes, the string ends with 'world'" if x else "No match")

Yes, the string ends with 'world'


### *
    Zero or more occurrences	

In [82]:
# Define the sample text inside this cell so the result does not depend on
# whichever cell happened to assign `txt` last.
txt = "The rain in Spain falls mainly in the plain!"

# "x*" means zero or more "x" characters, so "aix*" matches every "ai"
# whether or not any "x" follows it.
x = re.findall("aix*", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['ai', 'ai', 'ai', 'ai']
Yes, there is at least one match!


### +
    One or more occurrences	

In [87]:
import re

txt = "The rain in Spain falls mainly in the plain!"

# "+" means one or more of the preceding item, so "ai+" matches an "a"
# followed by at least one "i". A bare "a" (as in "falls") does not match.
x = re.findall("ai+", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['ai', 'ai', 'ai', 'ai']
Yes, there is at least one match!


### {}
    Exactly the specified number of occurrences	

In [98]:
import re

txt = "The rain in Spain falls mainly in the plain!"

# "l{2}" requires exactly two consecutive "l" characters after the "a".
x = re.findall("al{2}", txt)
print(x)
print("Yes, there is at least one match!" if x else "No match")

['all']
Yes, there is at least one match!


### |
    Either or	

In [102]:
txt = "The rain in Spain falls mainly in the plain!"

# "|" is alternation: match either "falls" or "stays".
x = re.findall("falls|stays", txt)
print(x)
print("Yes, there is at least one match!" if x else "No match")

['falls']
Yes, there is at least one match!


### \A
    Returns a match if the specified characters are at the beginning of the string.

In [107]:
txt = "The rain in Spain"

# Check if the string starts with "The": \A matches only at the very start.
# Raw string: "\A" in an ordinary literal is an invalid escape sequence.
x = re.findall(r"\AThe", txt)

print(x)

if x:
    print("Yes, there is a match!")
else:
    print("No match")

['The']
Yes, there is a match!


### \b
    Returns a match where the specified characters are at the beginning or at the end of a word (the "r" in the beginning is making sure that the string is being treated as a "raw string")

In [108]:
txt = "The rain in Spain"

# \b anchors at a word boundary. First look for "ain" at the BEGINNING of a
# word, then for "ain" at the END of a word, reporting both the same way.
for boundary_pattern in (r"\bain", r"ain\b"):
    x = re.findall(boundary_pattern, txt)
    print(x)
    print("Yes, there is at least one match!" if x else "No match")

[]
No match
['ain', 'ain']
Yes, there is at least one match!


### \B
    Returns a match where the specified characters are present, but NOT at the beginning (or at the end) of a word(the "r" in the beginning is making sure that the string is being treated as a "raw string")

In [110]:
txt = "The rain in Spain"

# \B is the opposite of \b: it matches only where there is NO word boundary.
# First "ain" NOT at the beginning of a word, then "ain" NOT at the end.
for non_boundary_pattern in (r"\Bain", r"ain\B"):
    x = re.findall(non_boundary_pattern, txt)
    print(x)
    print("Yes, there is at least one match!" if x else "No match")

['ain', 'ain']
Yes, there is at least one match!
[]
No match


### \d
    Returns a match where the string contains digits (numbers from 0-9)	"\d"	

In [140]:
txt = "The rain in Spain12"

# Check if the string contains any digits (0-9).
# Raw string: "\d" in an ordinary literal is an invalid escape sequence.
x = re.findall(r"\d", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['1', '2']
Yes, there is at least one match!


### \D
    Returns a match where the string DOES NOT contain digits	"\D"	

In [112]:
txt = "The rain in Spain"

# Return a match at every non-digit character.
# Raw string: "\D" in an ordinary literal is an invalid escape sequence.
x = re.findall(r"\D", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['T', 'h', 'e', ' ', 'r', 'a', 'i', 'n', ' ', 'i', 'n', ' ', 'S', 'p', 'a', 'i', 'n']
Yes, there is at least one match!


### \s 
    Returns a match where the string contains a white space character	"\s"	

In [113]:
txt = "The rain in Spain"

# Return a match at every white-space character.
# Raw string: "\s" in an ordinary literal is an invalid escape sequence.
x = re.findall(r"\s", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

[' ', ' ', ' ']
Yes, there is at least one match!


### \S
    Returns a match where the string DOES NOT contain a white space character	"\S"	

In [114]:
txt = "The rain in Spain"

# Return a match at every NON white-space character.
# Raw string: "\S" in an ordinary literal is an invalid escape sequence.
x = re.findall(r"\S", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['T', 'h', 'e', 'r', 'a', 'i', 'n', 'i', 'n', 'S', 'p', 'a', 'i', 'n']
Yes, there is at least one match!


### \w 
    Returns a match where the string contains any word characters (letters a-z and A-Z, digits 0-9, and the underscore _ character)	"\w"	

In [117]:
txt = "The rain in Spain"

# Return a match at every word character (letters, digits 0-9, and the
# underscore _ character).
# Raw string: "\w" in an ordinary literal is an invalid escape sequence.
x = re.findall(r"\w", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['T', 'h', 'e', 'r', 'a', 'i', 'n', 'i', 'n', 'S', 'p', 'a', 'i', 'n']
Yes, there is at least one match!


### \W
    Returns a match where the string DOES NOT contain any word characters	"\W"	

In [118]:
import re

txt = "The rain in Spain"

# Return a match at every NON word character (anything that is not a letter,
# digit or underscore, e.g. "!", "?" or white space).
# Raw string: "\W" in an ordinary literal is an invalid escape sequence.
x = re.findall(r"\W", txt)

print(x)

if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

[' ', ' ', ' ']
Yes, there is at least one match!


### \Z
    Returns a match if the specified characters are at the end of the string	"Spain\Z"

In [119]:
txt = "The rain in Spain"

# Check if the string ends with "Spain": \Z matches only at the very end.
# Raw string: "\Z" in an ordinary literal is an invalid escape sequence.
x = re.findall(r"Spain\Z", txt)

print(x)

if x:
    print("Yes, there is a match!")
else:
    print("No match")

['Spain']
Yes, there is a match!


## Sets

### [arn]	
    Returns a match where one of the specified characters (a, r, or n) is present	

In [120]:
txt = "The rain in Spain"

# [arn] matches any single character that is "a", "r" or "n".
x = re.findall("[arn]", txt)
print(x)
print("Yes, there is at least one match!" if x else "No match")

['r', 'a', 'n', 'n', 'a', 'n']
Yes, there is at least one match!


### [a-n]	
    Returns a match for any lower case character, alphabetically between a and n	

In [121]:
txt = "The rain in Spain"

# [a-n] matches any single lower-case letter from "a" through "n".
x = re.findall("[a-n]", txt)
print(x)
print("Yes, there is at least one match!" if x else "No match")

['h', 'e', 'a', 'i', 'n', 'i', 'n', 'a', 'i', 'n']
Yes, there is at least one match!


### [^arn]	
    Returns a match for any character EXCEPT a, r, and n	

In [122]:
txt = "The rain in Spain"

# A leading "^" inside a set negates it: [^arn] matches any single character
# EXCEPT "a", "r" and "n".
x = re.findall("[^arn]", txt)
print(x)
print("Yes, there is at least one match!" if x else "No match")


['T', 'h', 'e', ' ', 'i', ' ', 'i', ' ', 'S', 'p', 'i']
Yes, there is at least one match!


### [0123]	
    Returns a match where any of the specified digits (0, 1, 2, or 3) are present	

In [133]:
txt = "The1 rain2 in3 Sp3ain"

# [0123] matches any single character that is one of the digits 0, 1, 2, 3.
x = re.findall("[0123]", txt)
print(x)
print("Yes, there is at least one match!" if x else "No match")

['1', '2', '3', '3']
Yes, there is at least one match!


### [0-9]	
    Returns a match for any digit between 0 and 9	

In [134]:
txt = "The rain21 in22 Spain11"

# [0-9] matches any single digit from 0 through 9.
x = re.findall("[0-9]", txt)
print(x)
print("Yes, there is at least one match!" if x else "No match")

['2', '1', '2', '2', '1', '1']
Yes, there is at least one match!


### [0-5][0-9]	
    Returns a match for any two-digit number from 00 to 59	

In [127]:
txt = "The12 rain2 in32 Spain44"

# Two sets in a row: a digit 0-5 followed by a digit 0-9, i.e. any
# two-digit number from 00 to 59.
x = re.findall("[0-5][0-9]", txt)
print(x)
print("Yes, there is at least one match!" if x else "No match")

['12', '32', '44']
Yes, there is at least one match!


### [a-zA-Z]	
    Returns a match for any character alphabetically between a and z, lower case OR upper case	

In [135]:
txt = "The rain in Spain"

# [a-zA-Z] combines two ranges: any single letter a-z in lower OR upper case.
x = re.findall("[a-zA-Z]", txt)
print(x)
print("Yes, there is at least one match!" if x else "No match")

['T', 'h', 'e', 'r', 'a', 'i', 'n', 'i', 'n', 'S', 'p', 'a', 'i', 'n']
Yes, there is at least one match!


### [+]	
    In sets, +, *, ., |, (), $, {} have no special meaning, so [+] means: return a match for any + character in the string

In [132]:
import re

txt = "8 times before 11+45 A+M"

# Inside a set "+" loses its quantifier meaning, so [+] matches a literal
# plus sign.
x = re.findall("[+]", txt)
print(x)
print("Yes, there is at least one match!" if x else "No match")

['+', '+']
Yes, there is at least one match!
