# Regular Expressions

Regular expressions are a compact pattern language for searching text and pulling out specific matches. Some examples are included below, but to be honest you'll probably need to keep a reference table of what each special character means close at hand while building these search patterns for a while.

In [22]:
# Stand-in for the lines of a text file that has already been read in;
# every entry deliberately keeps its trailing newline character
multiLineEx = [
    'Hello!-this: is line one\n',
    'line two line two\n',
    'third line, mighty fine\n',
    'Hey-this: is line four\n',
    'this is the -- fifth -- line\n',
    'Hi!- this: is line six\n',
    'Hi!-this: is line seven\n',
]

# Each entry already ends in \n, so print() leaves a blank line after it
for entry in multiLineEx:
    print(entry)

Hello!-this: is line one

line two line two

third line, mighty fine

Hey-this: is line four

this is the -- fifth -- line

Hi!- this: is line six

Hi!-this: is line seven



In [23]:
import re   # Import regex library

# Find the lines that begin with "th"
# In regex, ^ anchors the pattern to the very start of the string
startsWithTh = re.compile('^(th.*)')

for rawLine in multiLineEx:
    # Entries in the example list still carry their trailing \n chars
    stripped = rawLine.rstrip()

    # search() gives back a match object on success and None otherwise
    if startsWithTh.search(stripped) is not None:
        print(stripped)

third line, mighty fine
this is the -- fifth -- line


In [24]:
# . matches any character (except a newline) - wildcard.
# * means "repeat the previous item any number of times", specifically 0 or more
# So, .* = any number of any characters
# Patterns are written as raw strings (r'...') so that regex escapes such as \S
# reach the re module untouched instead of being read as (invalid) Python
# string escapes.
# Example of searching for f*** anywhere in line
for line in multiLineEx:
    line = line.rstrip() # Included \n chars in example list

    # Search line for the specified string
    if re.search(r'f.*', line):
        print('First query: ' + line)

    # Similarly, could look only for f*** --; the * applies only to the single
    # item right before it (the .), so more literal text can follow the .*
    # and the pattern can still be built up from there
    if re.search(r'f.* --', line):
        print('Second query: ' + line)

    # Can put together a more complicated search. This searches for, in plain language,
    # strings that start with H, followed by any number of any character, followed by a -,
    # followed by one or more non-whitespace characters, followed by a colon!
    # NOTE especially that this does not pick up line six due to space after - !
    # It does however pick up line seven, which is nearly identical except for lack
    # of space. So, regex are pretty powerful, if weird to read.
    if re.search(r'^H.*-\S+:', line):
        print('Third query: ' + line)

Third query: Hello!-this: is line one
First query: third line, mighty fine
First query: Hey-this: is line four
Third query: Hey-this: is line four
First query: this is the -- fifth -- line
Second query: this is the -- fifth -- line
Third query: Hi!-this: is line seven


In [42]:
# Up to now every search only answered yes/no; findall() instead pulls the
# matching text out of the string and hands it back as a list.
exString = '''The fine flower bloomed near the flooded riverbank; it took 21 days, 2 gentle hands, 
              nurturing its growth, and 230 carefully-applied drops of the purest water.'''

# [0-9]+ = one or more characters in the range 0-9 in a row, i.e. each run of digits
digitRuns = re.findall('[0-9]+', exString)
print(digitRuns)

# Runs of vowels first, then only the spots where two vowels sit side by side.
# Everything inside [] is taken literally: no commas are needed between
# entries, and a space placed in there would be searched for as well.
vowelRuns = re.findall('[aeiou]+', exString)
vowelPairs = re.findall('[aeiou]{2}', exString)
print(vowelRuns)
print(vowelPairs)

# Uppercase vowels: matching is case sensitive, and findall() simply
# returns an empty list when nothing in the string matches
upperVowels = re.findall('[AEIOU]+', exString)
print(upperVowels)

['21', '2', '230']
['e', 'i', 'e', 'o', 'e', 'oo', 'e', 'ea', 'e', 'oo', 'e', 'i', 'e', 'a', 'i', 'oo', 'a', 'e', 'e', 'a', 'u', 'u', 'i', 'i', 'o', 'a', 'a', 'e', 'u', 'a', 'ie', 'o', 'o', 'e', 'u', 'e', 'a', 'e']
['oo', 'ea', 'oo', 'oo', 'ie']
[]


In [58]:
# Example of what greedy matching means - basically, a quantifier such as + or *
# grabs as much text as it can while still letting the whole pattern match.
# This comes into play when there are two or more possible subsets of the string
# that would match a search, like below.
# Also note that the match still begins at the leftmost possible position;
# greediness only decides how far to the right the match extends.
# Patterns are raw strings (r'...') so escapes like \S are handed to re as-is
# rather than being treated as (invalid) Python string escapes.
exStr = 'From: boaty@gmail.com: subject: boats'

# If I do nothing and allow greedy matching, it grabs until the last colon
# So, it looks for longest result, it does not stop at the first colon it finds
print('Greedy: ' + str(re.findall(r'^F.+:', exStr)))
# A ? right after the quantifier makes it non-greedy: stop at the first colon
print('Non-greedy: ' + str(re.findall(r'^F.+?:', exStr)))

# If I want up through the second colon, can just refine search; note
# that it would still grab through the last colon if I leave the first
# .+ as greedy!
print('Through second colon: ' + str(re.findall(r'^F.+?: .+?:', exStr)))

# Grab only email address; \S+ = at least one non-whitespace
# (the trailing colon is non-whitespace too, so it comes along for the ride)
print('Email: ' + str(re.findall(r'\S+@\S+', exStr)))

# Use parentheses to tell findall which part to extract; can match a longer
# string and then only extract the subset in parentheses, if desired
# Can stop getting the colon after by keeping it outside the parentheses.
print('Email, with more specific search: ' + str(re.findall(r'^F.*?: (\S+@\S+):', exStr)))

# Get only 'gmail.com'. For [], if the first character is ^, that means
# NOT, i.e., here get only those characters which are NOT spaces
print('Email, only domain: ' + str(re.findall(r'@([^ ]*):', exStr)))

Greedy: ['From: boaty@gmail.com: subject:']
Non-greedy: ['From:']
Through second colon: ['From: boaty@gmail.com:']
Email: ['boaty@gmail.com:']
Email, with more specific search: ['boaty@gmail.com']
Email, only domain: ['gmail.com']


In [59]:
# Example of escaping to find an actual character that is otherwise used for searches in regex,
# e.g. the $ (which normally means "end of string")
exStr = 'Girl scout cookies cost $10.00, which is quite high I think.'

# Can do this by escaping the $ with \ ; the pattern is a raw string (r'...')
# so Python hands the backslash to re untouched instead of treating \$ as an
# (invalid) string escape.
# [0-9.] is a character class: any digit or a literal . (inside [] the . is
# not a wildcard), so [0-9.]+ picks up the whole decimal amount
print(re.findall(r'\$[0-9.]+', exStr))

['$10.00']


In [64]:
# Running through HW from course; good example of reading a file as well using with
import re

# Pull every line of the homework file into a list; the with-block
# closes the file again as soon as the read is done
with open('regex-homework.txt') as f:
    content = f.readlines()

# Every run of digits on every line counts as one integer; total them all
sumVals = sum(
    int(digits)
    for row in content
    for digits in re.findall('[0-9]+', row)
)

print(sumVals)

299461
