## Regex part 2

- Understand important regex metacharacters
- Use groups to extract the target strings

In [2]:
import re

### More about `[]`

In [3]:
# Character class: exactly one ASCII letter (upper or lower case) or one digit,
# immediately followed by the literal text "foo".
regex = r'[a-zA-Z0-9]foo'

m = re.compile(regex).search('afoo')
print(m)

<_sre.SRE_Match object; span=(0, 4), match='afoo'>


## Metacharacters for special matching

`\w`  - match a word character. Same as [a-zA-Z0-9_]

`\d`  - match a digit character. Same as [0-9]

`\s`  - match a whitespace character (space, tab, newline, etc.)


In [4]:
# Each pair is (pattern, text): one metacharacter followed by the literal "foo".
# The results are printed one per line, in the order listed.
print(
    *(
        re.search(pattern, text)
        for pattern, text in [
            (r"\wfoo", "afoo"),  # \w: a letter is a word character
            (r"\wfoo", "1foo"),  # \w: a digit also counts as a word character
            (r"\wfoo", "$foo"),  # \w: "$" is not a word character -> None
            (r"\dfoo", "1foo"),  # \d: a digit
            (r"\sfoo", " foo"),  # \s: a whitespace character
        ]
    ),
    sep="\n"
)


<_sre.SRE_Match object; span=(0, 4), match='afoo'>
<_sre.SRE_Match object; span=(0, 4), match='1foo'>
None
<_sre.SRE_Match object; span=(0, 4), match='1foo'>
<_sre.SRE_Match object; span=(0, 4), match=' foo'>


In [5]:
id_1 = "My id is abcd1234"
id_2 = "The id: efgh5678"

# Raw string: keeps the backslashes in \w and \d intact for the regex engine
# and avoids Python's "invalid escape sequence" warning for normal strings.
regex = r"\w{4}\d{4}"

print(re.search(regex, id_1))  # this is good
print(re.search(regex, id_2))  # this is good
print(re.search(regex, "12341234"))  # but I don't want this: \w also matches digits


<_sre.SRE_Match object; span=(9, 17), match='abcd1234'>
<_sre.SRE_Match object; span=(8, 16), match='efgh5678'>
<_sre.SRE_Match object; span=(0, 8), match='12341234'>


In [6]:
# [a-z] accepts lower-case letters only, so an all-digit string no longer matches.
# Raw string: keeps the backslash in \d intact and avoids the
# "invalid escape sequence" warning that a normal string literal triggers.
regex = r"[a-z]{4}\d{4}"  # this works better :)

print(re.search(regex, "My id is abcd1234"))
print(re.search(regex, "12341234"))

<_sre.SRE_Match object; span=(9, 17), match='abcd1234'>
None


### Escape character

Use backslash `\` to escape metacharacters.

In [7]:
string = 'gmail.com'

# "\." matches a literal dot; an unescaped "." would match any character.
# Raw string so the backslash is passed to the regex engine unchanged.
regex = r'gmail\.com'  # match exact "."

re.search(regex, string)

<_sre.SRE_Match object; span=(0, 9), match='gmail.com'>

In [8]:
string = "2+4=3"
# Raw string so that \d and \+ reach the regex engine unchanged.
regex = r"\d\+\d=\d"  # use \ to match literal "+" (escape "+" as a quantifier metacharacter)
re.search(regex, string)

<_sre.SRE_Match object; span=(0, 5), match='2+4=3'>

In [9]:
string = 'my email address is abc123@gmail.com'

# Simplified email shape: word characters, "@", word characters, a literal ".",
# then word characters. Raw string so the backslashes are passed through as-is.
regex = r'\w+@\w+\.\w+'
re.search(regex, string)

<_sre.SRE_Match object; span=(20, 36), match='abc123@gmail.com'>

##  Groups

`()` - specify groups

This is very helpful for extracting the string you want.

In [10]:
# The parentheses capture "one word character followed by one digit" as group 1.
regex = r'(\w\d)foo'
m = re.search(pattern=regex, string='a2foo')
m  # the bare match object displays the result for the entire pattern

<_sre.SRE_Match object; span=(0, 5), match='a2foo'>

In [11]:
# Call .group(n) on the match object to get the text captured by the n-th
# parenthesised group; group 1 is the first "(...)" in the pattern.
m.group(1)

'a2'

In [12]:
m.group(0)  # group zero is the whole match, not just a parenthesised group

'a2foo'

### Use `re.I` to ignore case

In [16]:
# With no flags the search is case-sensitive, so the capital "A" in "Apple"
# keeps the lower-case pattern from matching.
regex = "ap{2}le"
m = re.compile(regex).search('I like Apple')
print(m)

None


In [18]:
# Rather than writing [aA] in the pattern, pass the re.I (re.IGNORECASE) flag
# so that matching ignores letter case.
regex = "ap{2}le"
m = re.search(regex, 'I like Apple', flags=re.I)  # the upper-case "A" now matches
print(m)

<_sre.SRE_Match object; span=(7, 12), match='Apple'>
