# Python RegEx
#A RegEx, or Regular Expression, is a sequence of characters that forms a search pattern.
#RegEx can be used to check if a string contains the specified search pattern.

In [2]:
#RegEx Module
#Python has a built-in package called re, which can be used to work with Regular Expressions.
#Import the re module:

import re

In [4]:
# Once the re module is imported (see the import cell above), regular
# expressions can be used directly; re-importing it in every cell is unnecessary.
#
# Example: check whether the string starts with "The" and ends with "Spain".
# "^" anchors the pattern at the start of the string, ".*" matches any run of
# characters, and "$" anchors it at the end.

txt = "The rain in Spain"
x = re.search("^The.*Spain$", txt)

# re.search returns a Match object (truthy) on success and None otherwise.
if x:
    print("YES! We have a match!")
else:
    print("No match")

YES! We have a match!


In [5]:
#RegEx Functions

#The re module offers a set of functions that allows us to search a string for a match:

# findall  - Returns a list containing all matches
# search   - Returns a Match object if there is a match anywhere in the string
# split    - Returns a list where the string has been split at each match
# sub      - Replaces one or many matches with a string

In [6]:
# Metacharacters
# Metacharacters are characters with a special meaning:


#    []     A set of characters                                                                   "[a-m]"
#    \      Signals a special sequence (can also be used to escape special characters)             "\d"
#    .      Any character (except newline character)                                              "he..o" 
#    ^      Starts with                                                                           "^hello"
#    $      Ends with                                                                             "world$"
#    *      Zero or more occurrences                                                               "aix*"
#    +      One or more occurrences                                                                "aix+"
#    {}     Exactly the specified number of occurrences, also {min,max}                      "al{2}" or "al{1,3}"
#    |      Either or                                                                            "falls|stays"
#    ()     Capture and group

In [7]:
# Collect every lowercase letter that falls alphabetically between "a" and "m".

txt = "The rain in Spain"

# Compile the set once, then ask it for all non-overlapping matches.
x = re.compile("[a-m]").findall(txt)
print(x)

['h', 'e', 'a', 'i', 'i', 'a', 'i']


In [8]:
# Find all digit characters.
# The pattern is written as a raw string so the backslash reaches the regex
# engine unchanged; in a normal string literal "\d" is an invalid escape
# sequence that newer Python versions warn about.

txt = "That will be 59 dollars"

x = re.findall(r"\d", txt)
print(x)

['5', '9']


In [9]:
# Look for "he", then any two characters, then "o" ("." matches any single
# character except a newline).

txt = "hello world"

# The pattern has no groups, so each match's full text is what findall would return.
x = [hit.group() for hit in re.finditer("he..o", txt)]
print(x)

['hello']


In [10]:
# Test whether the text begins with 'hello' ("^" anchors the match at the start).

txt = "hello world"

x = re.compile("^hello").findall(txt)

# findall yields an empty (falsy) list when nothing matched.
print("Yes, the string starts with 'hello'" if x else "No match")

Yes, the string starts with 'hello'


In [11]:
# Test whether the text finishes with 'world' ("$" anchors the match at the end).

txt = "hello world"

x = re.compile("world$").findall(txt)

# Handle the empty-result case first, then the success case.
if not x:
    print("No match")
else:
    print("Yes, the string ends with 'world'")

Yes, the string ends with 'world'


In [16]:
# Look for "ai" followed by zero or more "x" characters ("*" = zero or more).

txt = "The rain in Spain falls mainly in the plain!"

x = re.compile("aix*").findall(txt)

print(x)

# A non-empty list means the pattern occurred at least once.
print("Yes, there is at least one match!" if x else "No match")

['ai', 'ai', 'ai', 'ai']
Yes, there is at least one match!


In [17]:
# Look for "ai" followed by one or more "x" characters ("+" = one or more).

txt = "The rain in Spain falls mainly in the plain!"

x = [hit.group() for hit in re.finditer("aix+", txt)]

print(x)

# An empty list is falsy, so the "No match" branch runs when nothing was found.
if not x:
    print("No match")
else:
    print("Yes, there is at least one match!")

[]
No match


In [18]:
# Look for "a" followed by exactly two "l" characters ("{2}" = exactly two).

txt = "The rain in Spain falls mainly in the plain!"

x = re.compile("al{2}").findall(txt)

print(x)

# Report whether anything was found.
print("Yes, there is at least one match!" if x else "No match")

['all']
Yes, there is at least one match!


In [19]:
# Look for either "falls" or "stays" ("|" separates the alternatives).

txt = "The rain in Spain falls mainly in the plain!"

x = [hit.group() for hit in re.finditer("falls|stays", txt)]

print(x)

# Report whether either alternative occurred.
print("Yes, there is at least one match!" if x else "No match")

['falls']
Yes, there is at least one match!


In [20]:
# Special Sequences
# A special sequence is a \ followed by one of the characters in the list below, and has a special meaning:


#   \A   Returns a match if the specified characters are at the beginning of the string                     "\AThe"

#   \b   Returns a match where the specified characters are at the beginning or at the end of a word        r"\bain"
#        (the "r" in the beginning is making sure that the string is being treated as a "raw string")        r"ain\b"

#   \B   Returns a match where the specified characters are present, but NOT at the beginning                r"\Bain"
#        (or at the end) of a word                                                                           r"ain\B"
#        (the "r" in the beginning is making sure that the string is being        
#        treated as a "raw string") 

#   \d   Matches any digit character (0-9)                                                                     "\d"
#   \D   Matches any character that is NOT a digit                                                             "\D"
#   \s   Matches any white-space character                                                                     "\s"
#   \S   Matches any character that is NOT a white-space character                                             "\S"

#   \w   Matches any word character                                                                            "\w"
#        (letters such as a-z and A-Z, digits 0-9, and the underscore _ character)

#   \W   Matches any character that is NOT a word character                                                    "\W"
#   \Z   Returns a match if the specified characters are at the end of the string                            "Spain\Z"

In [21]:
# Check whether the string starts with "The" ("\A" matches only at the very
# beginning of the string).
# The pattern is a raw string: in a normal string literal "\A" is an invalid
# escape sequence that newer Python versions warn about.

txt = "The rain in Spain"

x = re.findall(r"\AThe", txt)

print(x)

# findall returns an empty (falsy) list when there is no match.
if x:
    print("Yes, there is a match!")
else:
    print("No match")

['The']
Yes, there is a match!


In [22]:
# Is "ain" found at the beginning of a WORD? ("\b" before the text requires a
# word boundary at that position.)

txt = "The rain in Spain"

x = re.compile(r"\bain").findall(txt)

print(x)

# An empty result list means no word starts with "ain".
print("Yes, there is at least one match!" if x else "No match")

[]
No match


In [23]:
# Is "ain" found at the end of a WORD? ("\b" after the text requires a word
# boundary at that position.)

txt = "The rain in Spain"

x = [hit.group() for hit in re.finditer(r"ain\b", txt)]

print(x)

# Handle the empty-result case first, then the success case.
if not x:
    print("No match")
else:
    print("Yes, there is at least one match!")

['ain', 'ain']
Yes, there is at least one match!


In [28]:
# Is "ain" present somewhere other than the beginning of a word? ("\B" is the
# opposite of "\b": it matches only where there is NO word boundary.)

txt = "The rain in Spain"

x = re.compile(r"\Bain").findall(txt)

print(x)

# Report whether anything was found.
print("Yes, there is at least one match!" if x else "No match")

['ain', 'ain']
Yes, there is at least one match!


In [29]:
# Is "ain" present somewhere other than the end of a word? ("\B" after the
# text requires that no word boundary follows it.)

txt = "The rain in Spain"

x = [hit.group() for hit in re.finditer(r"ain\B", txt)]

print(x)

# An empty list is falsy, so "No match" is printed when nothing was found.
print("Yes, there is at least one match!" if x else "No match")

[]
No match


In [30]:
# Check whether the string contains any digits (0-9).
# Raw string: keeps "\d" intact for the regex engine and avoids the
# invalid-escape-sequence warning a normal string literal would trigger.

txt = "The rain in Spain"

x = re.findall(r"\d", txt)

print(x)

# findall returns an empty (falsy) list when there is no match.
if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

[]
No match


In [31]:
# Return a match at every non-digit character.
# Raw string: keeps "\D" intact for the regex engine and avoids the
# invalid-escape-sequence warning a normal string literal would trigger.

txt = "The rain in Spain"

x = re.findall(r"\D", txt)

print(x)

# findall returns an empty (falsy) list when there is no match.
if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['T', 'h', 'e', ' ', 'r', 'a', 'i', 'n', ' ', 'i', 'n', ' ', 'S', 'p', 'a', 'i', 'n']
Yes, there is at least one match!


In [32]:
# Return a match at every white-space character.
# Raw string: keeps "\s" intact for the regex engine and avoids the
# invalid-escape-sequence warning a normal string literal would trigger.

txt = "The rain in Spain"

x = re.findall(r"\s", txt)

print(x)

# findall returns an empty (falsy) list when there is no match.
if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

[' ', ' ', ' ']
Yes, there is at least one match!


In [33]:
# Return a match at every NON white-space character.
# Raw string: keeps "\S" intact for the regex engine and avoids the
# invalid-escape-sequence warning a normal string literal would trigger.

txt = "The rain in Spain"

x = re.findall(r"\S", txt)

print(x)

# findall returns an empty (falsy) list when there is no match.
if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['T', 'h', 'e', 'r', 'a', 'i', 'n', 'i', 'n', 'S', 'p', 'a', 'i', 'n']
Yes, there is at least one match!


In [34]:
# Return a match at every word character (letters such as a-z and A-Z,
# digits 0-9, and the underscore _ character).
# Raw string: keeps "\w" intact for the regex engine and avoids the
# invalid-escape-sequence warning a normal string literal would trigger.

txt = "The rain in Spain"

x = re.findall(r"\w", txt)

print(x)

# findall returns an empty (falsy) list when there is no match.
if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

['T', 'h', 'e', 'r', 'a', 'i', 'n', 'i', 'n', 'S', 'p', 'a', 'i', 'n']
Yes, there is at least one match!


In [35]:
# Return a match at every NON word character, i.e. anything that is not a
# letter, digit, or underscore (for example "!", "?", or white-space).
# Raw string: keeps "\W" intact for the regex engine and avoids the
# invalid-escape-sequence warning a normal string literal would trigger.

txt = "The rain in Spain"

x = re.findall(r"\W", txt)

print(x)

# findall returns an empty (falsy) list when there is no match.
if x:
    print("Yes, there is at least one match!")
else:
    print("No match")

[' ', ' ', ' ']
Yes, there is at least one match!


In [36]:
# Check whether the string ends with "Spain" ("\Z" matches only at the very
# end of the string).
# The pattern is a raw string: in a normal string literal "\Z" is an invalid
# escape sequence that newer Python versions warn about.

txt = "The rain in Spain"

x = re.findall(r"Spain\Z", txt)

print(x)

# findall returns an empty (falsy) list when there is no match.
if x:
    print("Yes, there is a match!")
else:
    print("No match")

['Spain']
Yes, there is a match!


In [37]:
# Sets
# A set is a set of characters inside a pair of square brackets [] with a special meaning:

#     [arn]        Returns a match where one of the specified characters (a, r, or n) are present
#     [a-n]        Returns a match for any lower case character, alphabetically between a and n
#     [^arn]       Returns a match for any character EXCEPT a, r, and n
#     [0123]       Returns a match where any of the specified digits (0, 1, 2, or 3) are present
#     [0-9]        Returns a match for any digit between 0 and 9
#   [0-5][0-9]     Returns a match for any two-digit number from 00 to 59
#    [a-zA-Z]      Returns a match for any character alphabetically between a and z, lower case OR upper case
#      [+]         In sets, +, *, ., |, (), $, and {} have no special meaning, so [+] means: return a match for
#                  any + character in the string

In [38]:
# Does the text contain any of the characters a, r, or n?

txt = "The rain in Spain"

x = re.compile("[arn]").findall(txt)

print(x)

# A non-empty list means at least one of the listed characters occurred.
print("Yes, there is at least one match!" if x else "No match")

['r', 'a', 'n', 'n', 'a', 'n']
Yes, there is at least one match!


In [39]:
# Does the text contain any lowercase character in the range a through n?

txt = "The rain in Spain"

x = [hit.group() for hit in re.finditer("[a-n]", txt)]

print(x)

# Handle the empty-result case first, then the success case.
if not x:
    print("No match")
else:
    print("Yes, there is at least one match!")

['h', 'e', 'a', 'i', 'n', 'i', 'n', 'a', 'i', 'n']
Yes, there is at least one match!


In [40]:
# Does the text contain characters other than a, r, or n? (A leading "^"
# inside the brackets negates the set.)

txt = "The rain in Spain"

x = re.compile("[^arn]").findall(txt)

print(x)

# Report whether anything was found.
print("Yes, there is at least one match!" if x else "No match")

['T', 'h', 'e', ' ', 'i', ' ', 'i', ' ', 'S', 'p', 'i']
Yes, there is at least one match!


In [41]:
# Does the text contain any of the digits 0, 1, 2, or 3?

txt = "The rain in Spain"

x = [hit.group() for hit in re.finditer("[0123]", txt)]

print(x)

# An empty list is falsy, so "No match" is printed when nothing was found.
print("Yes, there is at least one match!" if x else "No match")

[]
No match


In [42]:
# Does the text contain any digit in the range 0 through 9?

txt = "8 times before 11:45 AM"

x = re.compile("[0-9]").findall(txt)

print(x)

# Handle the empty-result case first, then the success case.
if not x:
    print("No match")
else:
    print("Yes, there is at least one match!")

['8', '1', '1', '4', '5']
Yes, there is at least one match!


In [43]:
# Does the text contain any two-digit number from 00 to 59? (First digit 0-5,
# second digit 0-9.)

txt = "8 times before 11:45 AM"

x = [hit.group() for hit in re.finditer("[0-5][0-9]", txt)]

print(x)

# Report whether anything was found.
print("Yes, there is at least one match!" if x else "No match")

['11', '45']
Yes, there is at least one match!


In [44]:
# Does the text contain any letter, lowercase a-z or uppercase A-Z?

txt = "8 times before 11:45 AM"

x = re.compile("[a-zA-Z]").findall(txt)

print(x)

# A non-empty list means at least one letter occurred.
print("Yes, there is at least one match!" if x else "No match")

['t', 'i', 'm', 'e', 's', 'b', 'e', 'f', 'o', 'r', 'e', 'A', 'M']
Yes, there is at least one match!


In [45]:
# Does the text contain a literal "+" character? (Inside brackets "+" has no
# special meaning.)

txt = "8 times before 11:45 AM"

x = [hit.group() for hit in re.finditer("[+]", txt)]

print(x)

# Handle the empty-result case first, then the success case.
if not x:
    print("No match")
else:
    print("Yes, there is at least one match!")

[]
No match


In [47]:
# findall()
#
# findall() gives back a list of every match, in the order the matches are
# found. Here: every occurrence of "ai".

txt = "The rain in Spain"
x = re.compile("ai").findall(txt)
print(x)

['ai', 'ai']


In [48]:
# search()
#
# The search() function scans the string and returns a Match object for the
# first occurrence of the pattern. If nothing matches, it returns None.
#
# The pattern is a raw string so "\s" reaches the regex engine unchanged (in a
# normal string literal it is an invalid escape sequence).

txt = "The rain in Spain"
x = re.search(r"\s", txt)

# Guard against None before calling .start(); calling a method on None would
# raise AttributeError.
if x is not None:
    print("The first white-space character is located in position:", x.start())
else:
    print("No match")

The first white-space character is located in position: 3


In [49]:
# split()
#
# The split() function returns a list where the string has been split at each
# match. Here: split the string at every white-space character.
#
# The pattern is a raw string so "\s" reaches the regex engine unchanged (in a
# normal string literal it is an invalid escape sequence).

txt = "The rain in Spain"
x = re.split(r"\s", txt)
print(x)

['The', 'rain', 'in', 'Spain']


In [50]:
# You can limit the number of splits with the maxsplit parameter.
# It is passed by keyword: supplying maxsplit positionally is deprecated in
# recent Python versions, and the keyword form is self-documenting.
# The pattern is a raw string so "\s" reaches the regex engine unchanged.

txt = "The rain in Spain"
x = re.split(r"\s", txt, maxsplit=1)
print(x)

['The', 'rain in Spain']


In [51]:
# sub()
#
# The sub() function replaces the matches with the text of your choice.
# Here: replace every white-space character with the digit 9.
#
# The pattern is a raw string so "\s" reaches the regex engine unchanged (in a
# normal string literal it is an invalid escape sequence).

txt = "The rain in Spain"
x = re.sub(r"\s", "9", txt)
print(x)

The9rain9in9Spain


In [52]:
# You can limit the number of replacements with the count parameter.
# Here: replace only the first 2 occurrences.
# count is passed by keyword: supplying it positionally is deprecated in recent
# Python versions and is easy to confuse with the flags argument.
# The pattern is a raw string so "\s" reaches the regex engine unchanged.

txt = "The rain in Spain"
x = re.sub(r"\s", "9", txt, count=2)
print(x)

The9rain9in Spain


In [53]:
# Match Object
#
# A Match object carries information about a search and its result.
# Note: when nothing matches, search() hands back None rather than a Match object.

txt = "The rain in Spain"
x = re.compile("ai").search(txt)
print(x)  # prints the Match object's repr, not the matched text

<re.Match object; span=(5, 7), match='ai'>


In [54]:
# The Match object has properties and methods used to retrieve information about the search, and the result:

#  .span()    returns a tuple containing the start-, and end positions of the match.
#  .string    returns the string passed into the function
#  .group()   returns the part of the string where there was a match

In [55]:
# Example
# Show the (start, end) position of the first match.
# The pattern looks for a word that starts with an upper case "S": a word
# boundary, then "S", then one or more word characters.

txt = "The rain in Spain"
x = re.compile(r"\bS\w+").search(txt)
print(x.span())

(12, 17)


In [56]:
# Show the original string that was handed to the search (the Match object's
# .string attribute).

txt = "The rain in Spain"
x = re.compile(r"\bS\w+").search(txt)
print(x.string)

The rain in Spain


In [57]:
# Show the portion of the string that matched.
# The pattern looks for a word that starts with an upper case "S".
# Note: when nothing matches, search() hands back None rather than a Match object.

txt = "The rain in Spain"
x = re.compile(r"\bS\w+").search(txt)
print(x.group())

Spain
