'''
REFERENCE GUIDE: Regular Expressions
'''

'''
Rules for Searching:
Search proceeds through string from start to end, stopping at first match
All of the pattern must be matched
Basic Patterns:
Ordinary characters match themselves exactly
. matches any single character except newline \n
\w matches a word character (letter, digit, underscore)
\W matches any non-word character
\b matches boundary between word and non-word
\s matches single whitespace character (space, newline, return, tab, form feed)
\S matches single non-whitespace character
\d matches single digit (0 through 9)
\t matches tab
\n matches newline
\r matches return
\ escapes a special character so it is matched literally, such as period: \.
Basic Python Usage:
match = re.search(r'pattern', string_to_search)
Returns match object
If there is a match, access the matched text using match.group()
If there is no match, match is None
Use 'r' in front of pattern to designate a raw string

In [1]:
import re

In [3]:
s='my 1st string!!'

In [4]:
match=re.search(r'my',s)
if match:
    print match.group()

my


# or, as a single expression:

In [6]:
re.search(r'my',s).group()

'my'

In [8]:
re.search(r'st',s).group()

'st'

In [9]:
re.search(r'sta',s).group()

AttributeError: 'NoneType' object has no attribute 'group'

In [10]:
re.search(r'\w\w\w',s).group()

'1st'

In [11]:
re.search(r'\W',s).group()

' '

In [12]:
re.search(r'\W\W',s).group()

'!!'

In [13]:
re.search(r'\s',s).group()

' '

In [14]:
re.search(r'\s\s',s).group()

AttributeError: 'NoneType' object has no attribute 'group'

In [15]:
re.search(r'..t',s).group()

'1st'

In [17]:
re.search(r'\s\St',s).group()

' st'

In [18]:
re.search(r'\bst',s).group()

'st'

In [19]:
s='sid is missing class'

In [20]:
re.search(r'miss\w+',s).group()

'missing'

In [22]:
re.search(r'miss\w?',s).group()

'missi'

In [24]:
re.search(r'miss\w*',s).group()

'missing'

In [25]:
re.search(r'is\w+',s).group()

'issing'

In [26]:
re.search(r'is\w*',s).group()

'is'

In [27]:
s='<h1>my heading </h1>'

In [28]:
re.search(r'<.+>',s).group()

'<h1>my heading </h1>'

In [30]:
re.search(r'<.+?>',s).group()

'<h1>'

In [33]:
s="sid is missing class"

In [35]:
re.search(r'^miss',s).group()

AttributeError: 'NoneType' object has no attribute 'group'

In [34]:
re.search(r'..ss',s).group()

'miss'

In [36]:
re.search(r'..ss$',s).group()

'lass'

In [37]:
s="missing class"

In [38]:
re.search(r'^miss',s).group()

'miss'

In [39]:
s = 'my email is kaushik-s90@gmail.com'

In [40]:
re.search(r'\w+@\w+',s).group()

's90@gmail'

In [41]:
re.search(r'[\w.-]+@[\w.-]+',s).group()

'kaushik-s90@gmail.com'

In [54]:
s = 'emails: joe@gmail.com, bob@gmail.com'

In [56]:
re.findall(r'[\w.]+@[\w.]+',s)

['joe@gmail.com', 'bob@gmail.com']

In [59]:
re.findall(r'([\w.]+)@([\w.]+)',s)

[('joe', 'gmail.com'), ('bob', 'gmail.com')]

In [60]:
s = 'emails: nicole@ga.co, joe@gmail.com, PAT@GA.CO'

In [62]:
re.findall(r'\w+@ga\.co',s)

['nicole@ga.co']

In [64]:
re.findall(r'\w+@ga\.co',s,re.I)

['nicole@ga.co', 'PAT@GA.CO']

In [65]:
s="sid is missing class"

In [67]:
re.sub(r'is',r'was',s)

'sid was mwassing class'

In [68]:
s = 'emails: joe@gmail.com, bob@gmail.com'

In [70]:
re.sub(r'([\w.-]+)@([\w.-]+)', r'\1@yahoo.com', s) 

'emails: joe@yahoo.com, bob@yahoo.com'

In [72]:
s="$100 $200"
# Find all instances that match the exact character $

In [74]:
re.findall(r'\$',s)

['$', '$']

In [86]:
text = 'Microsoft™'
# Find runs of non-word characters (here, the UTF-8 bytes of the ™ symbol)

In [88]:
re.findall(r'\W+',text)

['\xe2\x84\xa2']

In [90]:
text = 'The quick brown fox jumped over the lazy brown bear.'
# Find any word of three letters

In [99]:
re.findall(r'\b...\b',text)

['The', 'fox', 'the']

In [101]:
# Find anything with a 'T' and then the next two characters
re.findall(r'T..',text)

['The']

In [102]:
# Find all instances of any vowel
re.findall(r'[aeiou]',text)

['e', 'u', 'i', 'o', 'o', 'u', 'e', 'o', 'e', 'e', 'a', 'o', 'e', 'a']

In [104]:
# Find any of fox, snake, or bear
re.findall(r'fox|snake|bear',text)

['fox', 'bear']

In [105]:
# Find any of fox, snake, or bear (same search, using a capturing group)
re.findall(r'(fox|snake|bear)',text)

['fox', 'bear']

In [107]:
text = 'My birthday is 09/15/1983. My brother\'s birthday is 01/01/01. My other two brothers have birthdays of 9/3/2001 and 09/1/83.'

In [114]:
re.findall(r'[0-9]+\/[0-9]+\/[0-9]+',text)

['09/15/1983', '01/01/01', '9/3/2001', '09/1/83']

In [115]:
text =  'My email is chris@hotmail.com, thanks! No, I am at bob@data.ninja.'
# Find all email addresses

In [118]:
re.findall(r'\w+@\w+\.\w+',text)

['chris@hotmail.com', 'bob@data.ninja']

In [119]:
text = 'The quick brown fox jumped over the lazy brown bear.'

In [121]:
# Find all instances of the exact match 'The'
re.findall(r'The',text)

['The']

In [123]:
text='21 scouts and 3 tanks fought against 4,003 protestors.'

In [130]:
# Find any character block that is an integer of any length (note: the comma splits '4,003' into '4' and '003')
re.findall(r'\d+',text)

['21', '3', '4', '003']

In [131]:
text = '<p>The quick brown fox.</p><p>The lazy brown bear.</p>'

In [135]:
re.findall(r'<p>(.*?)</p>',text)

['The quick brown fox.', 'The lazy brown bear.']

In [2]:
text = 'Chris: 12:34am. Steve: 16:30'

In [7]:
# Find times written as HH:MM; a trailing am/pm is matched but not captured
re.findall(r'([0-1]\d:[0-5]\d)\s*(?:am|pm)?',text)

['12:34', '16:30']

In [9]:
text = 'My blog is http://www.chrisalbon.com and not http://chrisalbon.com'

In [20]:
# Find any URL
re.findall(r'\w+:\/\/\w+\.*\w+\.\w+',text)

['http://www.chrisalbon.com', 'http://chrisalbon.com']

In [22]:
text = 'My phone number is 415-333-3922. His phone number is 4239389283'

In [31]:

newtxt=re.sub(r'-',r'',text)
newtxt
re.findall(r'[0-9]+',newtxt)

['4153333922', '4239389283']

In [40]:
text = 'Capitalism, Communism, Neorealism, Liberalism,Kaushik,fffismnbbb'

In [44]:
re.findall(r'\b\w*ism\b',text)

['Capitalism', 'Communism', 'Neorealism', 'Liberalism']