In [1]:
# An extension to dealing with strings is the concept of regular expressions. 
# The broad concept is that we can do advanced searches beyond the simple ones.

text = "Hello in this is string"
"Hello" in text

True

In [2]:
# The first examples look for a literal sequence of characters, such as "ob", within the string Rob Mastrodomenico.
# We then find all the characters a to m: to do this we pass a string containing a-m within square brackets
# into the 'findall' method together with the string of interest, which has been called name.
# The result from this is a list containing all the characters in a-m which appear in the string.

import re

name = 'Rob Mastrodomenico'
x = re.findall("ob",name)
x

['ob']

In [3]:
x = re.findall("ab",name)
x

[]

In [4]:
x = re.findall("at",name)
x

[]

In [5]:
x = re.findall("[a-m]",name)
x

['b', 'a', 'd', 'm', 'e', 'i', 'c']

In [6]:
# Next, we see how we can find the integer values 0–9 within a sequence. 
# We can do this in two ways; the first is by mimicking what we used in the previous example with the list convention 
# but also by using \d, which matches any digit between 0 and 9. 
# Both return a list of values that occur within the string.

txt = 'Find all numerical values like 1, 2, 3'
x = re.findall("[0-9]",txt)
x

['1', '2', '3']

In [7]:
# Raw string so the backslash in \d reaches the regex engine unchanged
# (a plain "\d" is an invalid string escape and triggers a warning).
x = re.findall(r"\d", txt)
x

['1', '2', '3']

In [8]:
txt = 'Find all numerical values like 1, 2, 3, 3'
x = re.findall("[0-9]",txt)
x

['1', '2', '3', '3']

In [9]:
# Raw string so the backslash in \d reaches the regex engine unchanged
# (a plain "\d" is an invalid string escape and triggers a warning).
x = re.findall(r"\d", txt)
x

['1', '2', '3', '3']

In [10]:
# In the next example, we use the standard text hello world and look for a specific pattern. 
# We can pass the string “he..o” into the findall method and what this does is search for a sequence which starts
# with he and has any two characters and is followed by an o, which fits nicely with the word hello. 
# So in passing this in we get back the list containing the string hello. 

txt = "hello world"
x = re.findall("he..o",txt)
x

['hello']

In [11]:
x = re.findall("he.o",txt)
x

[]

In [12]:
x = re.findall("he....o",txt)
x

[]

In [13]:
x = re.findall("ho..o",txt)
x

[]

In [14]:
# We can expand on this by changing the string to “hello helpo hesoo” in doing so we see that all these words are passed 
# back from the findall. 
# In using a different example like this, we can see how this could be applied across a bigger piece of text to see all 
# the words that match this sequence.

txt = "hello helpo hesoo"
x = re.findall("he..o",txt)
x

['hello', 'helpo', 'hesoo']

In [15]:
# Next, we look at how to search specifically on the start of the string. 
# To do so you use the ^ symbol prefixed to the string of interest, 
# in this case we look for a string that starts with the string start. 
# What the result of this gives is a list containing the word that is found 
# so in the first example we get back the list containing the string start 
# and in the second example we get an empty list.

txt ='starts at the end'
x = re.findall("^start",txt)
x

['start']

In [16]:
txt ='ends at the start'
x = re.findall("^start",txt)
x

[]

In [17]:
# We can achieve the same thing for the end of the
# string by ending the search pattern with the $ sign. 
# In this example, we show what we get when searching for the last part of a given string 
# and in a similar way to the previous example we return a list containing that string if it 
# does exist and an empty list if it doesn't.

txt = 'the last word is end'
x = re.findall("end$",txt)
x

['end']

In [18]:
txt = 'the last word is end sometimes'
x = re.findall("end$",txt)
x

[]

In [19]:
# The last two examples look at finding something specific at the start or end of a given string.
# In the next example we look for all instances of a given string followed by an optional repeated character.
# What we are looking to do here is find the occurrences of ai followed by 0 or more x values.
# So the first example shows that there are four instances of the string ai within the string when we search for aix*. 
# As in the previous examples, if we don't have any instances then we get back an empty list.

txt = "The rain in Spain falls mainly in the plain!"
x = re.findall("aix*",txt)
x

['ai', 'ai', 'ai', 'ai']

In [20]:
txt = 'This isnt like the other'

x = re.findall("aix*",txt)
x

[]

In [21]:
# Expanding on the previous example you can find the instances of the string ai followed 
# by one or more x by using the + symbol in place of the * symbol. 
# Applying that to the same string as before gives us an empty list as we don't have aix within it.

txt = "The rain in Spain falls mainly in the plain!"
x = re.findall("aix+",txt)
x

[]

In [22]:
x = re.findall("ainl+",txt)
x

['ainl']

In [23]:
# If we are after a specified number of repetitions of a character we can use curly
# brackets containing the number of instances we are interested in. 
# So in the next example we want to find moo within the string so we can do it as mo{2} or moo, 
# with each returning a list containing the string we have searched for.

txt = 'The cow said moo'
x = re.findall("mo{2}",txt)
x

['moo']

In [24]:
x = re.findall("mo{3}",txt)
x

[]

In [25]:
x = re.findall("mo{1}",txt)
x

['mo']

In [26]:
x = re.findall("moo",txt)
x

['moo']

In [27]:
# If we want to find one or another value we can do so by using the | symbol between the two strings 
# that we are interested in searching for. 
# In the example that we show we are looking for the strings avengers or heroes in our string. 
# As we have the string Avengers with a capitalised A we only have an exact match on heroes. 
# The second example uses Avengers with a capital A and therefore as that is exactly matched within the 
# string we get back a list containing both strings. 
# The last example shows what happens if we have multiple instances of one of the words that we are searching 
# for giving us the number of instances in the order that we see them.

txt = "The Avengers are earths mightiest heroes"
x = re.findall("avengers|heroes",txt)
x

['heroes']

In [28]:
x = re.findall("Avengers|heroes",txt)
x

['Avengers', 'heroes']

In [29]:
txt = "The Avengers are earths mightiest heroes go Avengers"
x = re.findall("Avengers|heroes",txt)
x

['Avengers', 'heroes', 'Avengers']

In [30]:
# We can also use special sequences like the one below which returns the whitespace in a given string:
# \s matches any white space character r"\s"
# (the pattern is written as a raw string so the backslash is passed to the regex engine unchanged)

txt = "Is there whitespace_1_2_3"
x = re.findall(r"\s", txt)
x

[' ', ' ']

In [31]:
# \S matches any character that is NOT a white space character, so findall
# returns every non-whitespace character of the string r"\S"

x = re.findall(r"\S", txt)
x

['I',
 's',
 't',
 'h',
 'e',
 'r',
 'e',
 'w',
 'h',
 'i',
 't',
 'e',
 's',
 'p',
 'a',
 'c',
 'e',
 '_',
 '1',
 '_',
 '2',
 '_',
 '3']

In [32]:
# \A: This matches if the characters defined are at the beginning of the string r"\AIs"

x = re.findall(r"\AIs", txt)
x

['Is']

In [33]:
# "th" occurs in the text but not at its very beginning, so \A prevents a match.
x = re.findall(r"\Ath", txt)
x

[]

In [34]:
# \b: This matches if the characters defined are at the beginning or at the end of a word r"\bain" r"ain\b"

txt = "The rain in Spain falls mainly in the plain!"
x = re.findall(r"\bain",txt)
x

[]

In [35]:
x = re.findall(r"ain\b",txt)
x

['ain', 'ain', 'ain']

In [36]:
# \B Returns a match where the specified characters are present, but NOT at the beginning (or at the end) 
# of a word (the "r" in the beginning is making sure that the string is being treated as a "raw string") 
# r"\Bain" r"ain\B"

x = re.findall(r"\Bain",txt)
x

['ain', 'ain', 'ain', 'ain']

In [37]:
x = re.findall(r"ain\B",txt)
x

['ain']

In [38]:
# \d matches any digit (numbers from 0-9) r"\d"
# (raw string so the backslash is passed to the regex engine unchanged)

txt = "Is there whitespace_1_2_3"
x = re.findall(r"\d", txt)
x

['1', '2', '3']

In [39]:
# \D matches any character that is NOT a digit, so findall returns every
# non-digit character of the string r"\D"

x = re.findall(r"\D", txt)
x

['I',
 's',
 ' ',
 't',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'w',
 'h',
 'i',
 't',
 'e',
 's',
 'p',
 'a',
 'c',
 'e',
 '_',
 '_',
 '_']

In [40]:
# \w matches any word character (letters a-z and A-Z,
# digits from 0-9, and the underscore _ character) r"\w"

x = re.findall(r"\w", txt)
x

['I',
 's',
 't',
 'h',
 'e',
 'r',
 'e',
 'w',
 'h',
 'i',
 't',
 'e',
 's',
 'p',
 'a',
 'c',
 'e',
 '_',
 '1',
 '_',
 '2',
 '_',
 '3']

In [41]:
# \W matches any character that is NOT a word character r"\W"

x = re.findall(r"\W", txt)
x

[' ', ' ']

In [42]:
# \Z Returns a match if the specified characters are at the end of the string.
# The pattern below contains a space ("2 3"), so it only matches text that ends in exactly those characters.

x = re.findall(r"2 3\Z", txt)
x

[]

In [43]:
# The split method: split the string at every match of the pattern,
# here at every white space character, and return the pieces as a list.

txt = "The rain in Spain"
x = re.split(r"\s", txt)
x

['The', 'rain', 'in', 'Spain']

In [44]:
# Expanding on this we can specify the number of times we want the split to be done by using the maxsplit argument. 
# The below examples set the value to 1, 2, and 3. In each example, we see that the number of splits increases, 
# so setting the value to 1 provides us with a list containing the results of a single split. 
# As this increases we get more and more splits included.

x = re.split(r"\s", txt, maxsplit=1)
x

['The', 'rain in Spain']

In [45]:
# At most two splits: the remainder of the string is returned as the last item.
x = re.split(r"\s", txt, maxsplit=2)
x

['The', 'rain', 'in Spain']

In [46]:
# At most three splits.
x = re.split(r"\s", txt, maxsplit=3)
x

['The', 'rain', 'in', 'Spain']

In [47]:
# The next method we demonstrate is the 'sub' method which behaves in a similar way to replace on a string. 
# In the below example we replace each white space character with an underscore:

x = re.sub(r"\s", "_", txt)
x

'The_rain_in_Spain'

In [48]:
# As with the previous example of split we have an extra argument that can be used here namely count,  
# and again we apply it with values 1, 2, and 3. 
# The result of this is the number of values that are replaced in the string with 1 giving only the 
# first space being replaced by _, 2 giving the first 2 spaces being replaced and so on.
# count is passed by keyword: passing it positionally is deprecated in recent Python versions.

x = re.sub(r"\s", "_", txt, count=1)
x

'The_rain in Spain'

In [49]:
# Replace only the first two white space characters (count passed by keyword).
x = re.sub(r"\s", "_", txt, count=2)
x

'The_rain_in Spain'

In [50]:
# Replace only the first three white space characters (count passed by keyword).
x = re.sub(r"\s", "_", txt, count=3)
x

'The_rain_in_Spain'

In [51]:
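# The last example that we look at is using the 'span' method from a search result. 
# Here, we search for a set of characters in the string; 'search' returns a match object for the
# first occurrence, and its 'span' method gives the start and end positions of that match.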

x = re.search("ai",txt)
x

<re.Match object; span=(5, 7), match='ai'>

In [52]:
x.span()

(5, 7)