Python for Everybody
## Chapter 11:  Regular Expressions

The following page shows a complete list of the special characters used in regular expressions:

https://docs.python.org/3/library/re.html

Don't try to read and remember everything at once.
It is best to learn regular expressions as you need them.


In [2]:
# Some examples

# Searching and extracting text is such a common task that Python has a very powerful library for it:
# the 're' module, which implements 'regular expressions'.

import re

POEM_PATH = "text/a_dream_within_a_dream.txt"


def print_matching_lines(heading, pattern, path=POEM_PATH):
    """Print `heading`, then every line of the file at `path` that matches `pattern`.

    Each line is stripped of surrounding whitespace and lower-cased before it is
    searched, so a lower-case pattern matches regardless of capitalisation.
    Matching lines are printed stripped, but with their original capitalisation.

    Args:
        heading: Text printed before the matching lines.
        pattern: Regular expression passed to re.search().
        path: File to scan; defaults to POEM_PATH.
    """
    print(heading)
    with open(path) as fhand:                      # 'with' closes the file even if an error occurs
        for line in fhand:
            text = line.strip()
            if re.search(pattern, text.lower()):   # search() returns None when nothing matches
                print(text)


# Patterns are written as raw strings (r'...') so that every backslash reaches the
# regex engine unchanged. In a normal string, '\?' is an invalid escape sequence
# and triggers a warning on recent Python versions.

# 'in' matches the two letters anywhere in the line. It does not check what comes
# before or after, so lines with words that merely contain 'in' are printed too.
print_matching_lines("# if there is at least one 'in' in the line", r'in')

# '^in' : '^' anchors the match to the start of the line.
print_matching_lines("\n# if the line starts with 'in'", r'^in')

# '\?$' : a literal question mark ('?' must be escaped) followed by the end of the line.
print_matching_lines("\n# if the line ends with '?'", r'\?$')

# '\sin\s' : 'in' with a whitespace character on both sides.
print_matching_lines("\n# if 'in' is surrounded by white space characters", r'\sin\s')

# '\Sin\S' : 'in' with a NON-whitespace character on both sides, i.e. inside a longer word.
print_matching_lines("\n# if 'in' is surrounded by NON white space characters", r'\Sin\S')


# if there is at least one 'in' in the line
A Dream Within A Dream
And, in parting from you now,
In a night, or in a day,
In a vision, or in none,
Is but a dream within a dream.
And I hold within my hand
Grains of the golden sand—
Through my fingers to the deep,
But a dream within a dream?

# if the line starts with 'in'
In a night, or in a day,
In a vision, or in none,

# if the line ends with '?'
Is it therefore the less gone?
Them with a tighter clasp?
One from the pitiless wave?
But a dream within a dream?

# if 'in' is surrounded by white space characters
And, in parting from you now,
In a night, or in a day,
In a vision, or in none,

# if 'in' is surrounded by NON white space characters
And, in parting from you now,
Grains of the golden sand—
Through my fingers to the deep,


In [147]:
# 11.1 Character matching in regular expressions

# Special characters let you build more powerful expressions
# The character '.' matches any single character except the newline character
# For example, 'F..m' would match any four character string starting with 'F' and ending with 'm'
# Such as 'From', 'Foam', 'F12m', 'F!@m', etc

# Search for lines that start with 'F', followed by
# 2 characters, followed by 'm:'

import re
with open('text/mbox-short.txt') as hand:    # 'with' closes the file when the block ends
    for line in hand:
        line = line.rstrip()
        if re.search('^F..m:', line):        # each '.' (dot) matches exactly one character, except the newline character
            print(line)


From: stephen.marquard@uct.ac.za
From: louis@media.berkeley.edu
From: zqian@umich.edu
From: rjlowe@iupui.edu
From: zqian@umich.edu
From: rjlowe@iupui.edu
From: cwen@iupui.edu
From: cwen@iupui.edu
From: gsilver@umich.edu
From: gsilver@umich.edu
From: zqian@umich.edu
From: gsilver@umich.edu
From: wagnermr@iupui.edu
From: zqian@umich.edu
From: antranig@caret.cam.ac.uk
From: gopal.ramasammycook@gmail.com
From: david.horwitz@uct.ac.za
From: david.horwitz@uct.ac.za
From: david.horwitz@uct.ac.za
From: david.horwitz@uct.ac.za
From: stephen.marquard@uct.ac.za
From: louis@media.berkeley.edu
From: louis@media.berkeley.edu
From: ray@media.berkeley.edu
From: cwen@iupui.edu
From: cwen@iupui.edu
From: cwen@iupui.edu


In [252]:
# More examples

import re

# Disabled example: extracting URLs from HTML.
#url = '<p>Hello World</p><a href="http://example.com">More Examples</a><a href="http://example2.com">Even More Examples</a>'
#urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', url)
#print(urls)

# All patterns below are raw strings (r'...') so that backslashes such as '\(' , '\w' and '\s'
# reach the regex engine unchanged instead of being treated as (invalid) string escapes.

s = '(412) 234-5678'
print(re.findall(r'\([0-9]+\)', s))   # area code: digits enclosed in literal parentheses
print(re.findall(r'\s[0-9]+', s))     # exchange: a whitespace character followed by digits (the match includes that space)
print(re.findall(r'[0-9]+$', s))      # line number: the run of digits at the very end of the string

q = "Two things are infinite: the universe and human stupidity; and I'm not sure about the universe. ― Albert Einstein"

# one or more word characters followed by a whitespace character then 'universe'. (i.e., bi-gram ends with 'universe')
print(re.findall(r'\w+\suniverse', q))

# 'universe' followed by a space, a word, a space, a word (i.e., tri-gram starts with 'universe')
print(re.findall(r'universe\s\w+\s\w+', q))

# '.*' matches any number of characters. (i.e., a pattern begins and ends with 'universe')
print(re.findall(r'universe.*universe', q))

# '|' (vertical bar) can be used to list options. (i.e., 'infinite' or 'human')
# Because the pattern has one capture group, findall() returns the text of that group.
print(re.findall(r'(infinite|human)', q))

# You can construct a query in your code.
# re.escape() makes sure the words are matched literally even if they ever
# contain characters that are special in a regular expression.
w1 = 'infinite'
w2 = 'human'
query1 = re.escape(w1) + "|" + re.escape(w2)       # either word
query2 = re.escape(w1) + ".*" + re.escape(w2)      # everything from the first word to the second
query3 = r"\w+\s" + re.escape(w2) + r"\s\w+"       # the second word with one word on each side
print(re.findall(query1, q))
print(re.findall(query2, q))
print(re.findall(query3, q))


['(412)']
[' 234']
['5678']
['the universe', 'the universe']
['universe and human']
["universe and human stupidity; and I'm not sure about the universe"]
['infinite', 'human']
['infinite', 'human']
['infinite: the universe and human']
['and human stupidity']


In [150]:
# 11.1 (cont.)

# Search for lines that start with 'From:' and contain an at sign ('@') later in the line.
# '.+' requires at least one character between 'From:' and the '@'.

import re
with open('text/mbox-short.txt') as hand:    # 'with' closes the file when the block ends
    for line in hand:
        line = line.rstrip()
        if re.search('^From:.+@', line):
            print(line)
        

From: stephen.marquard@uct.ac.za
From: louis@media.berkeley.edu
From: zqian@umich.edu
From: rjlowe@iupui.edu
From: zqian@umich.edu
From: rjlowe@iupui.edu
From: cwen@iupui.edu
From: cwen@iupui.edu
From: gsilver@umich.edu
From: gsilver@umich.edu
From: zqian@umich.edu
From: gsilver@umich.edu
From: wagnermr@iupui.edu
From: zqian@umich.edu
From: antranig@caret.cam.ac.uk
From: gopal.ramasammycook@gmail.com
From: david.horwitz@uct.ac.za
From: david.horwitz@uct.ac.za
From: david.horwitz@uct.ac.za
From: david.horwitz@uct.ac.za
From: stephen.marquard@uct.ac.za
From: louis@media.berkeley.edu
From: louis@media.berkeley.edu
From: ray@media.berkeley.edu
From: cwen@iupui.edu
From: cwen@iupui.edu
From: cwen@iupui.edu


In [261]:
# 11.2 Extracting data using regular expressions

# the function 'findall()' extracts all the substrings which match a regular expression
# and returns them as a list

import re
s = 'A message from csev@umich.edu to cwen@iupui.edu about meeting @2PM'

# '\S+@\S+' : an '@' with at least one non-whitespace character on each side.
# '@2PM' is not returned because the character before its '@' is a space.
lst = re.findall(r'\S+@\S+', s)
print(lst)

# '(?=@)' is a lookahead: the match must be followed by '@', but the '@' itself
# is not part of the returned text. Here it yields the two words that end right before each '@'.
lst = re.findall(r'\w+\s\w+(?=@)', s)
print(lst)


['csev@umich.edu', 'cwen@iupui.edu']
['from csev', 'to cwen']


In [109]:
# 11.3 Combining Searching and Extracting

# Search for lines that start with 'X' followed by zero or more non-whitespace characters,
# then the character ':' followed by a space and a number.
# The number can include a decimal point.

import re
with open('text/mbox-short.txt') as hand:        # 'with' closes the file when the block ends
    for line in hand:
        line = line.rstrip()
        # search() only tells us whether the line matches (it returns a match object or None),
        # so we print the whole line ourselves.
        if re.search(r'^X\S*: [0-9.]+', line):
            print(line)
        

X-DSPAM-Confidence: 0.8475
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.6178
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.6961
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.7565
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.7626
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.7556
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.7002
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.7615
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.7601
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.7605
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.6959
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.7606
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.7559
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.7605
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.6932
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.7558
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.6526
X-DSPAM-Probability: 0.0000
X-DSPAM-Confidence: 0.6948
X-DSPAM-Probability: 0.0000
X-DSPAM-Co

In [112]:
# 11.3 (cont. #2)

# Search for lines that start with 'X' followed by zero or more non-whitespace characters,
# then the character ':' followed by a space and a number.
# The number can include a decimal point.
# Then print the extracted number for every line that matches.

import re
with open('text/mbox-short.txt') as hand:        # 'with' closes the file when the block ends
    for line in hand:
        line = line.rstrip()
        # The parentheses form a capture group: findall() returns only the text
        # matched inside them (the number), not the whole line.
        x = re.findall(r'^X\S*: ([0-9.]+)', line)
        if x:                                    # an empty list means the line did not match
            print(x)

['0.8475']
['0.0000']
['0.6178']
['0.0000']
['0.6961']
['0.0000']
['0.7565']
['0.0000']
['0.7626']
['0.0000']
['0.7556']
['0.0000']
['0.7002']
['0.0000']
['0.7615']
['0.0000']
['0.7601']
['0.0000']
['0.7605']
['0.0000']
['0.6959']
['0.0000']
['0.7606']
['0.0000']
['0.7559']
['0.0000']
['0.7605']
['0.0000']
['0.6932']
['0.0000']
['0.7558']
['0.0000']
['0.6526']
['0.0000']
['0.6948']
['0.0000']
['0.6528']
['0.0000']
['0.7002']
['0.0000']
['0.7554']
['0.0000']
['0.6956']
['0.0000']
['0.6959']
['0.0000']
['0.7556']
['0.0000']
['0.9846']
['0.0000']
['0.8509']
['0.0000']
['0.9907']
['0.0000']


In [114]:
# 11.3 (cont. #3)

# Search for lines that start with 'Details:' and contain 'rev='
# followed by digits and '.'
# Then print the captured number for every line that matches.

import re
with open('text/mbox-short.txt') as hand:        # 'with' closes the file when the block ends
    for line in hand:
        line = line.rstrip()
        # '.*' skips whatever lies between 'Details:' and 'rev='; only the group in
        # parentheses (the revision number) is returned by findall().
        x = re.findall('^Details:.*rev=([0-9.]+)', line)
        if x:                                    # an empty list means the line did not match
            print(x)
        

['39772']
['39771']
['39770']
['39769']
['39766']
['39765']
['39764']
['39763']
['39762']
['39761']
['39760']
['39759']
['39758']
['39757']
['39756']
['39755']
['39754']
['39753']
['39752']
['39751']
['39750']
['39749']
['39746']
['39745']
['39744']
['39743']
['39742']


In [115]:
# 11.3 (cont. #4)

# Search for lines that start with 'From ' (with a trailing space)
# and contain a space, a two digit number between 00 and 99, and then the character ':'
# Then print the two digit number for every line that matches.
# NOTE: in mbox-short.txt this is presumably the hour of the message timestamp - confirm against the file.

import re
with open('text/mbox-short.txt') as hand:        # 'with' closes the file when the block ends
    for line in hand:
        line = line.rstrip()
        # Only the two digits inside the parentheses are returned by findall().
        x = re.findall('^From .* ([0-9][0-9]):', line)
        if x:                                    # an empty list means the line did not match
            print(x)
        

['09']
['18']
['16']
['15']
['15']
['14']
['11']
['11']
['11']
['11']
['11']
['11']
['10']
['10']
['10']
['09']
['07']
['06']
['04']
['04']
['04']
['19']
['17']
['17']
['16']
['16']
['16']


In [117]:
# 11.4 Escape Character

# Some characters have a special meaning in a regular expression
# (for example, '^' and '$' match the beginning and the end of a line).
# To match the actual character (for example, the dollar sign $),
# put a backslash ('\') before it,
# such as '\$' to indicate that you want to find the '$' character.
# Write the pattern as a raw string (r'...') so that Python passes the backslash
# to the regex engine unchanged instead of treating '\$' as a string escape.

import re
x = 'We just received $10.00 for cookies.'
y = re.findall(r'\$[0-9.]+', x)
print(y)


['$10.00']


In [5]:
# 11.5 Summary



In [None]:
# 11.6 Bonus Section for Unix / Linux users

# This section is optional