In [1]:
text = "my phone number is 408-555-1234. call soon"

In [2]:
"408-555-1234" in text

True

In [3]:
"408-123-1234" in text

False

### Regular expressions allow pattern searching in a text document
### 408-555-1234 : xxx-xxx-xxxx : r'\d{3}-\d{3}-\d{4}'
### Every character type has a corresponding pattern code
### Digits : \d (the backslash '\' tells the regex engine that 'd' is a special code, not the literal letter d)

In [4]:
import re

In [5]:
pattern = "phone"

In [6]:
re.search(pattern, text)

<re.Match object; span=(3, 8), match='phone'>

In [7]:
my_match = re.search(pattern, text)

In [8]:
my_match.span()

(3, 8)

In [9]:
# (3, 8)
#  |  |__ end index 8 (exclusive: the match covers indices 3 through 7)
#  |_____ start index 3 (inclusive)

In [10]:
my_match.start()

3

In [11]:
my_match.end()

8

In [12]:
text = "my phone number is new phone"
pattern = "phone"

In [13]:
my_match = re.search(pattern, text)
my_match.span() # re.search stops at the first match, so only the first 'phone' is reported

(3, 8)

In [14]:
all_matches = re.findall(pattern, text)
all_matches

['phone', 'phone']

In [15]:
len(all_matches)

2

In [16]:
for match in re.finditer(pattern, text):
    print(match.span())

(3, 8)
(23, 28)


## Pattern searching

In [17]:
text = "my phone numbers are 408-555-1234 and 408-123-4567. call soon"

In [18]:
pattern = r'\d\d\d-\d\d\d-\d\d\d\d'

In [19]:
phone_number = re.search(pattern, text)

In [20]:
phone_number

<re.Match object; span=(21, 33), match='408-555-1234'>

In [21]:
phone_number.group()

'408-555-1234'

## with quantifiers

In [22]:
pattern = r'\d{3}-\d{3}-\d{4}'
phone_number = re.search(pattern, text)
phone_number.group()

'408-555-1234'

In [23]:
all_matches = re.findall(pattern, text)
all_matches

['408-555-1234', '408-123-4567']

In [24]:
for match in re.finditer(pattern, text):
    print(match.span())

(21, 33)
(38, 50)


## Groups

In [25]:
text = "my phone numbers are 408-555-1234 and 408-123-4567. call soon"

In [26]:
pattern = r'(\d{3}-)(\d{3})-(\d{4})'
#              |       |      |
#  Group ()    1       2      3
# Note: group 1 includes its trailing hyphen because the '-' sits inside the
# parentheses; the hyphen after group 2 is outside any group, so it is matched
# but not captured.

In [27]:
mymatch = re.search(pattern, text)

In [28]:
mymatch.group(1)

'408-'

In [29]:
mymatch.group(2)

'555'

In [30]:
mymatch.group(3)

'1234'

In [31]:
mymatch.group()

'408-555-1234'

## Pipe operator

In [32]:
# Or '|'
re.search(r"man|woman", "This man was here.")

<re.Match object; span=(5, 8), match='man'>

In [33]:
re.search(r"man|woman", "This woman was here.")

<re.Match object; span=(5, 10), match='woman'>

## Wildcard characters

In [34]:
re.findall(r".at", "The cat in the hat sat splat.")
#            |_ Wildcard character

['cat', 'hat', 'sat', 'lat']

In [35]:
re.findall(r"..at", "The cat in the hat sat splat.")

[' cat', ' hat', ' sat', 'plat']

## Start or end with ( ^ : starts with, $ : ends with ); inside [ ], a leading ^ means "not" instead

In [36]:
re.findall(r"\d$", "This ends with number 2") # $ anchors to the end of the entire string, not the end of each word

['2']

In [37]:
re.findall(r"^\d", "1 is here and, this ends with number 2")

['1']

In [38]:
phrase = "there are 3 numbers 34 inside 5 this sentence"

In [39]:
re.findall(r"[^\d]", phrase) # [^...] : negated character set - matches any single character that is NOT a digit

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e']

In [40]:
re.findall(r"\d", phrase)

['3', '3', '4', '5']

In [41]:
re.findall(r"\d+", phrase)

['3', '34', '5']

In [42]:
re.findall(r"[\d]+", phrase)

['3', '34', '5']

In [43]:
re.findall(r"[^\d]+", phrase)

['there are ', ' numbers ', ' inside ', ' this sentence']

In [44]:
test_phrase = "there is a string! but it has punctuation. How to remove it?"

In [45]:
re.findall(r"[^ ]+", test_phrase)
#              |_ space (exclude space)

['there',
 'is',
 'a',
 'string!',
 'but',
 'it',
 'has',
 'punctuation.',
 'How',
 'to',
 'remove',
 'it?']

In [46]:
mylist = re.findall(r"[^!.? ]+", test_phrase)
#                       ||||__ excluded characters: '!', '.', '?', ' ' (the leading '^' negates the set; it is not itself excluded)

In [47]:
mylist

['there',
 'is',
 'a',
 'string',
 'but',
 'it',
 'has',
 'punctuation',
 'How',
 'to',
 'remove',
 'it']

In [48]:
' '.join(mylist) # join the list with spaces between.

'there is a string but it has punctuation How to remove it'

In [49]:
text = "Only find the hyphen-words. Were are the long-ish dash words."

In [50]:
re.findall(r"\w+-\w+", text)

['hyphen-words', 'long-ish']

In [51]:
re.findall(r"[\w]+-[\w]+", text)

['hyphen-words', 'long-ish']