# Regular Expressions (for string or bytes-like object only)

#

## The basic rules of regular expression search for a pattern within a string are:
##    The search proceeds through the string from start to end, stopping at the first match found
##    All of the pattern must be matched, but not all of the string
##    If match = re.search(pat, str) is successful, match is not None and in particular match.group() is the matching text 

In [2]:
import re

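## \w -- (lowercase w) matches a single "word" (alphanumeric) character: a letter or digit or underbar [a-zA-Z0-9_]. In Python 3 str patterns \w is Unicode-aware, so it also matches non-ASCII letters and digits unless the re.ASCII flag is used. Note that although "word" is the mnemonic for this, it only matches a single word char, not a whole word. 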
## \W (upper case W) matches any non-ALPHANUMERIC character.

In [9]:
# Strip every non-ASCII character from a (mis-decoded) tweet.
# encode("ascii", "ignore") drops any character that has no ASCII encoding
# and returns a bytes object, which is why the result is shown as b'...'.
# The variable is called `text` so the built-in `str` type stays usable.
text = "Fri 10th ÃƒÂ¢Ã‚Â€Ã‚Â“ BarclaysÃƒÂ¢Ã‚Â€Ã‚Â™ bonus announcement. Wanted: Barclays customers wanting to #moveyourmoney 8.30am, Holborn. http://moveyourmoney.org.uk/blog... - http://twitter.com/Naomi_F..."
temp = text.encode("ascii", "ignore")
temp

b'Fri 10th  Barclays bonus announcement. Wanted: Barclays customers wanting to #moveyourmoney 8.30am, Holborn. http://moveyourmoney.org.uk/blog... - http://twitter.com/Naomi_F...'

In [10]:


# Same clean-up on a tweet whose trailing emoji were mis-decoded:
# every non-ASCII character is discarded, leaving only the ASCII text as bytes.
text = "Hope <e>Obama</e> bring His A Game 2nite!! #PresidentialDebateðŸ‡ºðŸ‡¸ðŸ‡ºðŸ‡¸ðŸ‡ºðŸ‡"
temp = text.encode("ascii", "ignore")
temp

b'Hope <e>Obama</e> bring His A Game 2nite!! #PresidentialDebate'

In [7]:
# The explicit class [a-zA-Z0-9_] is ASCII-only, and this string contains no
# ASCII letter, digit or underscore, so the search finds nothing.
# (r'\w' would behave differently here: for str patterns it is Unicode-aware
# and would accept the non-ASCII letter at the start.)
text = 'â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–’â–’'

match = re.search(r'[a-zA-Z0-9_]', text)

if match:
    print('found')
    print(match.group())
else:
    print('not found')


not found


In [4]:
# Three \w in a row need three consecutive word characters; 'cat' supplies them.
text = 'cat'
match = re.search(r'\w\w\w', text)

if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
cat


In [5]:
# The whole pattern must match: two characters cannot satisfy three \w.
text = 'ca'
match = re.search(r'\w\w\w', text)

if match:
    print('found')
    print(match.group())
else:
    print('not found')


not found


In [6]:
# Digits count as word characters, so 'ca9' satisfies \w\w\w.
text = 'ca9'
match = re.search(r'\w\w\w', text)

if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
ca9


In [7]:
# The underscore is a word character too, so 'ca_' satisfies \w\w\w.
text = 'ca_'
match = re.search(r'\w\w\w', text)

if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
ca_


In [8]:
# Literal text 'word:' followed by exactly three word characters.
# The match may start anywhere in the string and stops after 'cat'.
text = 'an example word:cat!!'
match = re.search(r'word:\w\w\w', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
word:cat


In [9]:
# \W needs a non-word character right after 'word:', but the next character
# is 'c' (a word character), so the search fails.
text = 'an example word:cat!!'
match = re.search(r'word:\W', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')
    print('Not a non-ALPHANUMERIC character')


not found
Not a non-ALPHANUMERIC character


In [10]:
# Three word characters after 'word:' and then one non-word character:
# the first '!' is consumed, the second is left over.
text = 'an example word:cat!!'
match = re.search(r'word:\w\w\w\W', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
word:cat!


## . (a period) -- matches any single character except newline '\n' 

In [8]:
# Each '.' consumes exactly one character of any kind (except newline),
# so 'word...' takes 'word' plus the next three characters ':ca'.
text = 'an example word:cat!!'
match = re.search(r'word...', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
word:ca


In [9]:
# '.' does not match '\n' by default, so the second dot cannot consume the
# newline after 'word:' and the whole pattern fails.
text = 'an example word:\ncat!!'
match = re.search(r'word...', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')
    print("new line character is present")


not found
new line character is present


In [10]:
# Four arbitrary characters followed by a literal 'g': 'piii' + 'g'.
text = 'piiig'
match = re.search(r'....g', text)

if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
piiig


In [11]:
# Only three characters precede the 'g', so four dots cannot all be satisfied.
text = 'piig'
match = re.search(r'....g', text)

if match:
    print('found')
    print(match.group())
else:
    print('not found')


not found


## \d -- decimal digit [0-9] (some older regex utilities do not support \d, but they all support \w and \s) 

In [12]:
# \d\d needs two consecutive decimal digits; the subject is the *string* '99'.
text = '99'
match = re.search(r'\d\d', text)

if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
99


In [13]:
# Deliberate failure: the subject passed to re.search() is the integer 99,
# not a string. The re functions only accept str or bytes-like subjects,
# so the search() call below raises TypeError before any matching happens
# and the if/else underneath never runs.
# To search the digits of a number, convert it to text first.
str = 99
match = re.search(r'\d\d', str)

if match:
    print('found')
    print(match.group())
else:
    print('not found')


TypeError: expected string or bytes-like object

## a, X, 9, < -- ordinary characters just match themselves exactly. The meta-characters which do not match themselves because they have special meanings are: . ^ $ * + ? { [ ] \ | ( ) (details below)

In [14]:
# Ordinary characters have no special meaning: the pattern 'ax9' matches
# only the exact text 'ax9'.
text = 'ax9'
match = re.search(r'ax9', text)

if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
ax9


In [15]:
# Deliberate failure: '[' and ']' are meta-characters, so they do not match
# themselves. In the pattern '[][' the first '[' opens a character set, the
# ']' directly after it is taken as a literal member of that set, and the set
# is never closed -- re.search() raises re.error while compiling the pattern.
# To match the literal text, escape the brackets: r'\[\]\['.
str = '[]['
match = re.search(r'[][', str)

if match:
    print('found')
    print(match.group())
else:
    print('not found')


error: unterminated character set at position 0

## \b -- boundary between word and non-word 

In [18]:
# \b is zero-width: it matches a position, not a character. The first word
# boundary is at index 0 (just before 'H'), so the search succeeds but
# match.group() is the empty string -- hence the blank line in the output.
text = "Hello, world!"
match = re.search(r'\b', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')
    print('No word boundary found')


found



In [5]:
# findall() returns a list of every non-overlapping match.
# 'llo' must be followed by a word boundary; that happens once, at the end of 'Hello'.
text = "Hello, world!"
matches = re.findall(r'llo\b', text)
print(matches)

['llo']


In [6]:
# \b\w picks the word character that directly follows a word boundary,
# i.e. the first letter of each word.
text = "Hello, world!"
matches = re.findall(r'\b\w', text)
print(matches)

['H', 'w']


In [20]:
# \w\b needs a word character immediately followed by a word boundary.
# In 'Hello' the only such position is the final 'o' (end of string is a boundary).
text = "Hello"
match = re.search(r'\w\b', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')



found
o


In [26]:
# search() stops at the first hit: the word character after the first
# word boundary is 'H'.
text = "Hello, world!"
match = re.search(r'\b\w', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
H


In [29]:
# Reads left to right: word char 'o', zero-width boundary, non-word ',',
# whitespace ' ', word char 'w'.
text = "Hello, world!"
match = re.search(r'\w\b\W\s\w', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
o, w


## \s -- (lowercase s) matches a single whitespace character -- space, newline, return, tab, form feed, vertical tab [ \n\r\t\f\v]. \S (upper case S) matches any non-whitespace character. 

In [31]:
# The first whitespace character is the space after the comma, so the
# printed match is a single (invisible) space.
text = "Hello, world!"
match = re.search(r'\s', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
 


In [14]:
# '#' followed by one or more non-whitespace characters: \S+ is greedy and
# runs until the first space, so the trailing comma is part of the match.
# The parentheses form group 1, which here equals the whole match.
text = "#Hello, world!"
match = re.search(r'(#\S+)', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')
    print('No hashtag found')


found
#Hello,


In [37]:
# A newline counts as whitespace: \s matches the '\n' between the words,
# which is why the output shows an empty-looking line after 'found'.
text = "Hello\nworld!"
match = re.search(r'\s', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found




In [38]:
# A tab counts as whitespace as well: \s matches the '\t' between the words.
text = "Hello\tworld!"
match = re.search(r'\s', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
	


In [44]:
# There is no whitespace anywhere in "Hello,world!", so \s\w cannot match.
text = "Hello,world!"
match = re.search(r'\s\w', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')
    print('No space character found')


not found
No space character found


In [33]:
# word char, zero-width boundary, non-word char, whitespace, word char:
# the only place all five line up is 'o, w'.
text = "Hello, world!"
match = re.search(r'\w\b\W\s\w', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
o, w


In [40]:
# \S is the complement of \s: the first non-whitespace character is 'H'.
text = "Hello\tworld!"
match = re.search(r'\S', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
H


In [41]:
# The string is a single space, so there is no non-whitespace character to find.
text = " "
match = re.search(r'\S', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


not found


In [43]:
# A lone tab is whitespace, so \S has nothing to match here either.
text = "\t"
match = re.search(r'\S', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')
    print('No non-space character found')


not found
No non-space character found


## \t, \n, \r -- tab, newline, return 

In [48]:
# \t in the pattern matches a literal tab character; it is followed by 'P'.
text = "\tP"
match = re.search(r'\t\w', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
	P


In [49]:
# \t matches only a tab: the subject starts with a newline, so nothing matches.
text = "\nP"
match = re.search(r'\t\w', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


not found


In [50]:
# \n in the pattern matches the newline itself; printing the match therefore
# emits a line break before the 'P'.
text = "\nP"
match = re.search(r'\n\w', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found

P


## ^ = start, $ = end -- match the start or end of the string 

In [54]:
# ^ pins the match to the start of the string. The string starts with 'H',
# not whitespace, so the tab further along is never considered.
text = "Hello\tworld!"
match = re.search(r'^\s', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')
    print('No space character found')


not found
No space character found


In [57]:
# With a leading space the anchored pattern succeeds: whitespace at the very
# start, then one word character -> ' H'.
text = " Hello\tworld!"
match = re.search(r'^\s\w', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')
    print('No space character found')


found
 H


In [8]:
# $ pins the match to the end of the string: \w\S$ needs a word character
# followed by one non-whitespace character that is the last one in the string.
# Here that is 'd!' ('d' is \w, '!' is \S), so the search succeeds.
text = " Hello world!"
match = re.search(r'\w\S$', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')
    print('No non-space character found')


found
d!


In [75]:

## ^ anchors the pattern to the start of the string; 'foobar' starts with
## 'f', so a pattern that must begin with 'b' finds nothing:
anchored = re.search(r'^b\w+', 'foobar')
print(anchored) # None
## Without the ^ the search may start further in and reaches the 'b':
unanchored = re.search(r'b\w+', 'foobar')
print(unanchored.group()) # "bar"


None
bar


## Repetition

## Things get more interesting when you use + and * to specify repetition in the pattern

###    + -- 1 or more occurrences of the pattern to its left, e.g. 'i+' = one or more i's
###    * -- 0 or more occurrences of the pattern to its left
###   ? -- match 0 or 1 occurrences of the pattern to its left 

## Leftmost & Largest

### First the search finds the leftmost match for the pattern, and second it tries to use up as much of the string as possible -- i.e. + and * go as far as possible (the + and * are said to be "greedy").

In [62]:
# \w+ takes one or more word characters, greedily: it starts at 'H' and
# keeps going until the comma stops it.
text = "Hello, world!"
match = re.search(r'\w+', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
Hello


In [63]:
# \w* allows zero or more word characters; at position 0 it can take five,
# so the result is the same as with \w+.
text = "Hello, world!"
match = re.search(r'\w*', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
Hello


In [65]:
# Because * accepts zero occurrences, \w* always succeeds: against a single
# space it matches the empty string at position 0 ('found', then a blank line).
text = " "
match = re.search(r'\w*', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found



In [70]:
# One mandatory word character plus an optional second one; ? is greedy,
# so the optional character is taken when available -> 'He'.
text = "Hello, world!"
match = re.search(r'\w\w?', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
He


In [9]:
# A run of word characters followed by at most one non-word character:
# 'Hello' plus the comma.
text = "Hello, world!"
match = re.search(r'\w+\W?', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
Hello,


In [71]:
# \S takes any one non-whitespace character (here the leading '!'),
# then \w+ takes the word that follows.
text = "!Hello, world!"
match = re.search(r'\S\w+', text)
# search() returns None on failure, so the match object doubles as the test.
if match:
    print('found')
    print(match.group())
else:
    print('not found')


found
!Hello


In [78]:
## "Leftmost and largest": the scan stops at the first position where i+
## can match (index 1) and + then stretches that match as far as it can.
## The longer run of i's later in the string is never reached.

re.compile(r'i+').search('piigiiii') # match.group() == "ii"


<re.Match object; span=(1, 3), match='ii'>

In [79]:
## 'p' followed by as many i's as are available (at least one).
re.compile(r'pi+').search('piiig') # match.group() == "piii"

<re.Match object; span=(0, 4), match='piii'>

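## Logical operators (`and` / `or` return one of their operands, not True/False) and division (`//` floors, `/` is true division)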

In [80]:
# Both operands are truthy, so `and` evaluates to the last operand.
3 and 1

1

In [81]:
# Same operands in the other order: the result is again the last operand.
1 and 3

3

In [82]:
# The result is the operand itself (5), not a boolean.
2 and 5

5

In [83]:
# `and` short-circuits on the first falsy operand and returns it.
0 and 1

0

In [84]:
# The first operand is truthy, so `and` returns the second operand (falsy 0).
1 and 0

0

In [85]:
# Neither operand is truthy, so `or` returns the last operand.
0 or 0

0

In [86]:
# The first operand is falsy, so `or` returns the second operand.
0 or 1

1

In [87]:
# `or` short-circuits on the first truthy operand and returns it.
1 or 0

1

In [1]:
# The first operand is already truthy, so the second is never evaluated.
2 or 5

2

In [3]:
# // is floor division: the quotient is rounded down to an integer.
9//5

1

In [4]:
# / is true division and yields a float, even for two ints.
9/5

1.8