https://docs.python.org/3/library/re.html

In [1]:
# re.compile

# re.match
# re.search - returns a Match object if there is a match anywhere in the string
# re.fullmatch 

# re.findall - returns a list containing all matches

# re.sub - replaces one or many matches with a string

# re.split - returns a list where the string has been split at each match

# re.finditer

# re.Pattern - compiled regular expression object (returned by re.compile)

In [2]:
import re

# re.match()

In [3]:
# Try to apply the pattern at the start of the string
# Returning a match object, or None if no match was found

In [4]:
# 'abc' is found right at index 0, so a Match covering span (0, 3) comes back.
re.match(pattern='abc', string='abcdefg')

<re.Match object; span=(0, 3), match='abc'>

In [5]:
# Raw string: '\w' in a plain literal is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
# \w+ consumes word characters only, so the match stops before the space.
re.match(r'\w+', 'hi there')

<re.Match object; span=(0, 2), match='hi'>

# re.split()

In [6]:
# Split the source string by the occurrences of the pattern
# Returning a list containing the resulting substrings.

In [7]:
# Raw string avoids the invalid '\s' string escape; \s matches one whitespace character,
# so the sentence is cut at every single space.
re.split(r'\s', 'Split on spaces.')

['Split', 'on', 'spaces.']

In [8]:
# Raw string avoids the invalid '\d' string escape; each single digit acts as a
# separator and is dropped from the result.
re.split(r'\d', 'hello 3 kittens, 2 are fine, 5th is missing')

['hello ', ' kittens, ', ' are fine, ', 'th is missing']

In [9]:
# Raw string avoids the invalid '\w' string escape.
# Every word is a separator here, so only the gaps between words survive;
# the empty strings at both ends come from the words at the start and end.
re.split(r'\w+', '75 model car is 45 year old')

['', ' ', ' ', ' ', ' ', ' ', ' ', '']

# re.search()

In [10]:
# Scan through string looking for a match to the pattern
# Returning a match object, or None if no match was found.

In [11]:
# Unlike re.match, the search is not anchored to the start: 'cd' is located at span (2, 4).
re.search(pattern='cd', string='abcde')

<re.Match object; span=(2, 4), match='cd'>

In [12]:
text = "we need help"

# re.search yields a Match (truthy) when the word occurs anywhere, otherwise None (falsy).
print(
    "the word 'help' is the string"
    if re.search("help", text)
    else "The word 'help' is not in the string"
)

the word 'help' is the string


# re.findall()

In [13]:
# Return a list of all non-overlapping matches in the string.

In [14]:
# "inform" is counted twice: once as a word and once as the prefix of "information".
re.findall(pattern="inform", string="you need to inform all the information")

['inform', 'inform']

In [15]:
# The character class [hmp] accepts exactly one of h, m or p in front of "at".
re.findall(
    '[hmp]at',
    '1at bank pat hat cat bat yes dart',
)

['pat', 'hat']

In [16]:
# The range [a-z] accepts any lowercase ASCII letter in front of "at" ('1at' is excluded).
re.findall(
    '[a-z]at',
    '1at bank pat hat cat bat yes dart',
)

['pat', 'hat', 'cat', 'bat']

In [17]:
# The dot accepts any single character (except a newline), so '1at' qualifies as well.
re.findall(
    '.at',
    '1at bank pat hat cat bat yes dart',
)

['1at', 'pat', 'hat', 'cat', 'bat']

In [18]:
# Raw string avoids the invalid '\d' string escape; only a digit may precede "at".
re.findall(r'\dat', '1at bank pat hat cat bat yes dart')

['1at']

In [19]:
# Raw string avoids the invalid '\d' / '\w' string escapes.
# The single group wraps the whole alternation, so findall returns each number or word;
# the final '.' matches neither alternative and is left out.
re.findall(r'(\d+|\w+)', 'He has 11 cats.')

['He', 'has', '11', 'cats']

# re.sub()

In [20]:
# re.sub(pattern, repl, string, count=0, flags=0)
# Return the string obtained by replacing the leftmost non-overlapping 
# occurrences of the pattern in string by the replacement repl.

In [21]:
# Only the leading 'he' matches, so 'hello' becomes 'app' + 'llo'.
re.sub(pattern='he', repl='app', string='hello')

'appllo'

In [22]:
text = "hey rat , fat cat sat on the mat, but the bat is fast"
# Every three-character run ending in "at" is masked, whatever its first character is.
re.sub(pattern='.at', repl='***', string=text)

'hey *** , *** *** *** on the ***, but the *** is fast'

# re.finditer()

In [23]:
text = "you inform all the information"
# finditer yields one Match per occurrence; span() is its (start, end) offset pair.
for match in re.finditer(pattern="inform", string=text):
    print(match.span())

(4, 10)
(19, 25)


# Group

>* Entire expression has to match  
>* But only the grouped expression is returned

In [24]:
# Sample sentence for the grouping examples: it contains three "<name> has <number> <noun>" phrases.
text = "Clary has 2 friends who she spends a lot time with. Susan has 3 brothers while John has 4 sisters."

In [25]:
# Raw string avoids the invalid '\s' / '\w' / '\d' string escapes.
# No group in the pattern: findall returns every whole "<name> <word> <number> <word>" match.
re.findall(r'[A-Za-z]+\s\w+\s\d+\s\w+', text)

['Clary has 2 friends', 'Susan has 3 brothers', 'John has 4 sisters']

In [26]:
# Raw string avoids the invalid '\s' / '\w' / '\d' string escapes.
# The whole pattern still has to match, but with one group findall returns only that group (the name).
re.findall(r'([A-Za-z]+)\s\w+\s\d+\s\w+', text)

['Clary', 'Susan', 'John']

In [27]:
# Raw string avoids the invalid '\s' / '\w' / '\d' string escapes.
# Same pattern, but the group is now around the number, so only the number is returned.
re.findall(r'[A-Za-z]+\s\w+\s(\d+)\s\w+', text)

['2', '3', '4']

In [28]:
# Raw string avoids the invalid '\s' / '\w' / '\d' string escapes.
# Two groups: findall returns one (name, number) tuple per match.
re.findall(r'([A-Za-z]+)\s\w+\s(\d+)\s\w+', text)

[('Clary', '2'), ('Susan', '3'), ('John', '4')]

In [29]:
# Sample text with two digit runs: 8755 (which contains the doubled digit 55) and 33.
text = 'My lucky numbers 8755 and 33'

In [30]:
# \1 must repeat exactly what group 1 captured, so only a digit run that is
# immediately doubled matches ("55" inside 8755, and "33").
# findall reports the captured group rather than the whole match, hence '5' and '3'.
re.findall(r"(\d+)\1", text)

['5', '3']

In [31]:
# No group in the pattern: findall returns each whole run of digits.
re.findall(r"\d+", text)

['8755', '33']

In [32]:
# The quantifier sits outside the group, so the group is re-captured for every digit
# and only its last capture is kept: '5' (from 8755) and '3' (from 33).
re.findall(r"(\d)+", text)

['5', '3']

In [33]:
# The quantifier sits inside the group, so the group captures the whole digit run.
re.findall(r"(\d+)", text)

['8755', '33']

# Pipe or Or

In [34]:
my_string = "I want to have a pet. But I don't know if I want a cat, a dog or a bird."
# | is alternation: any one of the three animal names counts as a match.
re.findall(pattern=r"cat|dog|bird", string=my_string)

['cat', 'dog', 'bird']

In [35]:
my_string = "I want to have a pet. But I don't know if I want 2 cats, 1 dog or a bird."
# The group confines the alternation to the animal name; a number and one whitespace
# character must come first, so "a bird" is skipped. findall returns only the group.
re.findall(pattern=r"\d+\s(cat|dog|bird)", string=my_string)

['cat', 'dog']

In [36]:
my_string = "I want to have a pet. But I don't know if I want 2 cats, 1 dog or a bird."
# Without a group, | splits the entire pattern: "\d+\scat" OR "dog" OR "bird",
# so the number prefix is only required (and returned) for "cat".
re.findall(pattern=r"\d+\scat|dog|bird", string=my_string)


['2 cat', 'dog', 'bird']

# Capturing group

In [52]:
my_date = "Today is 23rd May 2019. Tomorrow is 24th May 19."
# Two capturing groups -> findall yields one (number, suffix) tuple per match.
information = re.findall(pattern=r'(\d+)(th|rd)', string=my_date)
information

[('23', 'rd'), ('24', 'th')]

In [54]:
# re.search stops at the first match and returns it as a Match object (here '23rd').
# Uses my_date as defined in the preceding cell.
information = re.search(r'(\d+)(th|rd)', my_date)
information

<re.Match object; span=(9, 13), match='23rd'>

In [56]:
# Group 0 is the entire matched text.
information.group(0)

'23rd'

In [59]:
# Group 1 is the first parenthesised sub-pattern, (\d+).
information.group(1)

'23'

In [61]:
# Group 2 is the second parenthesised sub-pattern, (th|rd).
information.group(2)

'rd'

# Non capturing groups

> * Match but not capture a group
> * When group is not backreferenced
> * Syntax: add ?: right after the opening parenthesis: (?:regex)

In [39]:
my_date = "Today is 23rd May 2019. Tomorrow is 24th May 19."
# (?:th|rd) has to match but is not captured, so only the number group is returned.
re.findall(pattern=r'(\d+)(?:th|rd)', string=my_date)

['23', '24']

In [50]:
my_string = "John Smith: 34-34-34-042-980, Rebeca Smith: 10-10-10-434-425"
# (?:\d{2}-){3} consumes the three leading two-digit groups without capturing
# them, so findall returns only the captured trailing "ddd-ddd" part.
# The result gets its own name instead of overwriting `information` (which holds
# the Match object from the earlier re.search cell), and it is the cell's last
# expression so that the list is actually displayed.
phone_suffixes = re.findall(r"(?:\d{2}-){3}(\d{3}-\d{3})", my_string)
phone_suffixes

['042-980', '434-425']

In [104]:
# Inspect the current value of `information`; the recorded output is the Match
# produced by the earlier re.search(r'(\d+)(th|rd)', my_date) cell.
information

<re.Match object; span=(9, 13), match='23rd'>

# Backreferences

In [72]:
sentence = "I wish you a happy happy birthday!"
# \1 re-matches the text captured by group 1, which detects a word that is
# repeated after a single whitespace character.
re.findall(pattern=r"(\w+)\s\1", string=sentence)

['happy']

# Numbered groups

In [40]:
text = "Python 3.0 was released on 12-03-2008."
# Raw string avoids the invalid '\d' string escape.
# Groups are numbered left to right by their opening parenthesis:
# 1 -> '12', 2 -> '03', 3 -> '2008'.
information = re.search(r'(\d{1,2})-(\d{2})-(\d{4})', text)
information.group(3)

'2008'

In [41]:
# Group 1 is the leading (\d{1,2}) field of the match stored in `information`.
information.group(1)

'12'

In [42]:
# Group 2 is the middle (\d{2}) field of the match stored in `information`.
information.group(2)

'03'

# Named groups

In [43]:
# (?P<group_name>[regular_expression])

In [73]:
sentence = "Your new code number is 23434. Please, enter 23434 to open the door."
# (?P=code) is a backreference by name: it must repeat the five digits captured as "code".
re.findall(pattern=r"(?P<code>\d{5}).*?(?P=code)", string=sentence)

['23434']

In [44]:
text = "Austin, 78701"
# Each group carries a label, so it can be read back by name instead of by position.
cities = re.search(
    pattern=r"(?P<city>[A-Za-z]+).*?(?P<zipcode>\d{5})",
    string=text,
)
cities.group("city")

'Austin'

In [45]:
# A named group is retrieved by passing its name to group().
cities.group("zipcode")

'78701'

# Look around

In [None]:
# Allow us to confirm that a sub-pattern is "ahead of or behind" the main pattern

In [100]:
my_text = "tweets.txt transferred, mypass.txt transferred, keywords.txt error"
# The trailing space belongs to the pattern, so it is included in every match.
re.findall(pattern=r"\w+\.txt ", string=my_text)

['tweets.txt ', 'mypass.txt ', 'keywords.txt ']

In [103]:
# Positive look-ahead   =>  (?=regex)
# Negative look-ahead   =>  (?!regex)

# Positive look-behind  =>  (?<=regex)
# Negative look-behind  =>  (?<!regex)

## 1. Look ahead

In [88]:
# Non-capturing group
# Checks that the first part of the expression "is followed or not" by the lookahead expression
# Return only the first part of the expression

### 1.1 Positive look ahead

In [89]:
# Non-capturing group
# Checks that the first part of the expression "is followed" by the lookahead expression
# Return only the first part of the expression

In [90]:
# Keep a .txt file name only when a whitespace character and the word
# 'transferred' come right after it; the lookahead text is not part of the result.
my_text = "tweets.txt transferred, mypass.txt transferred, keywords.txt error"
re.findall(pattern=r"\w+\.txt(?=\stransferred)", string=my_text)

['tweets.txt', 'mypass.txt']

### 1.2 Negative look-ahead

In [91]:
# Non-capturing group
# Checks that the first part of the expression "is not followed" by the lookahead expression
# Return only the first part of the expression

In [99]:
# Keep a .txt file name only when it is NOT directly followed by a whitespace
# character and the word 'transferred'.
my_text = "tweets.txt transferred, mypass.txt transferred, keywords.txt error"
re.findall(pattern=r"\w+\.txt(?!\stransferred)", string=my_text)

['keywords.txt']

## 2. Look-Behind

In [93]:
# Non-capturing group
# Get all the matches that are "preceded or not" by a specific pattern.
# Return pattern after look-behind expression

### 2.1 Positive look-behind

In [94]:
# Non-capturing group
# Get all the matches that are "preceded" by a specific pattern.
# Return pattern after look-behind expression

In [95]:
# Pick up two consecutive words, but only where 'Member:' plus one whitespace
# character sits immediately before them; the prefix itself is not returned.
my_text = "Member: Angus Young, Member: Chris Slade, Past: Malcolm Young, Past: Cliff Williams."
re.findall(pattern=r"(?<=Member:\s)\w+\s\w+", string=my_text)

['Angus Young', 'Chris Slade']

### 2.2 Negative look-behind

In [96]:
# Non-capturing group
# Get all the matches that are "not preceded" by a specific pattern.
# Return pattern after look-behind expression

In [97]:
# 'cat' or 'dog', but only where it is NOT immediately preceded by 'brown' plus
# one whitespace character ("brown dog" is therefore rejected).
my_text = "My white cat sat at the table. However, my brown dog was lying on the couch."
re.findall(pattern=r"(?<!brown\s)(cat|dog)", string=my_text)

['cat']