In [2]:
import re
# re.search()    -> returns the first match object, or None if nothing matches
# re.findall()   -> returns a list of all matched strings (or group tuples)
# re.finditer()  -> returns an iterator that yields match objects
# match.group()  -> returns the text matched by a match object

## No grouping


In [3]:
# Find the first substring shaped like ddd-ddd-dddd and print it.
text = 'This is the text with 415-555-5874 please seach in it.'
pattern = r'\d{3}-\d{3}-\d{4}'
match = re.compile(pattern).search(text)
if match is not None:
    # index 0 of a match object is the whole matched text
    print(match[0])

415-555-5874


## grouping with ()


In [4]:
# Same phone number, now split into two capture groups:
# group 1 is the first three digits, group 2 is the remaining ddd-dddd part.
text = 'This is the text with 415-555-5874 please seach in it.'
pattern = r'(\d{3})-(\d{3}-\d{4})'
match = re.search(pattern, text)
if match is not None:
    print(match)
    # group() with no argument is the same as group(0): the whole match
    print('match.group()  =>', match.group())
    for index in range(3):
        print(f'match.group({index}) =>', match.group(index))

<re.Match object; span=(22, 34), match='415-555-5874'>
match.group()  => 415-555-5874
match.group(0) => 415-555-5874
match.group(1) => 415
match.group(2) => 555-5874


## pipe with |


In [5]:
# Alternation: "Bat" followed by any one of the listed suffixes.
text = 'This is the text: Batman and Batwoman are in the text'
pattern = r'Bat(man|woman|mobile)'
matches = re.compile(pattern).finditer(text)
for match in matches:
    # print the whole match, not just the captured suffix
    print(match[0])

Batman
Batwoman


## ? zero or one


In [6]:
# `?` makes the (wo) group optional: it may appear zero times or once.
text = 'This is the text with: Batman Batwoman Batwowoman Batwowowoman'
pattern = r'Bat(wo)?man'
matches = re.compile(pattern).finditer(text)
for match in matches:
    print(match[0])

Batman
Batwoman


## \* zero or more


In [7]:
# `*` lets the (wo) group repeat any number of times, including zero.
text = 'Batman Batwoman Batwowoman Batwowowoman'
pattern = r'Bat(wo)*man'
matches = re.compile(pattern).finditer(text)
for match in matches:
    print(match[0])

Batman
Batwoman
Batwowoman
Batwowowoman


## + one or more


In [17]:
# `+` requires the (wo) group at least once, so plain "Batman" is skipped.
text = 'Batman Batwoman Batwowoman Batwowowoman'
pattern = r'Bat(wo)+man'
matches = re.compile(pattern).finditer(text)
for match in matches:
    print(match[0])

Batwoman
Batwowoman
Batwowowoman


## {min, max}


In [18]:
# {3,5} is greedy by default: each match takes up to five word characters,
# so longer words are cut into chunks (e.g. "sente" then "nse").
text = 'When you have a long sentense and you want to find all 3-5 letters.'
pattern = r'\w{3,5}'
matches = re.compile(pattern).finditer(text)
for match in matches:
    print(match[0], end=',')

When,you,have,long,sente,nse,and,you,want,find,all,lette,

In [19]:
# A trailing `?` makes {3,5} lazy: each match stops at the minimum of three characters.
text = 'When you have a long sentense and you want to find all 3-5 letters.'
pattern = r'\w{3,5}?'
matches = re.compile(pattern).finditer(text)
for match in matches:
    print(match[0], end=",")

Whe,you,hav,lon,sen,ten,and,you,wan,fin,all,let,ter,

In [21]:
# {3,} has no upper bound: runs of three or more word characters match in full.
text = 'When you have a long sentense and we want to find all 3-5 letters.'
pattern = r'\w{3,}'
matches = re.compile(pattern).finditer(text)
for match in matches:
    print(match[0], end=",")

When,you,have,long,sentense,and,want,find,all,letters,

In [22]:
# Repeating a three-character group gives matches whose length is a multiple of three (3, 6, 9, ...).
text = 'When you have a long sentense and we wanto to find all 3-5 letters.'
pattern = r'(\w{3})+'
matches = re.compile(pattern).finditer(text)
for match in matches:
    # index 0 is the whole match; the group itself only holds its last repetition
    print(match[0], end=",")

Whe,you,hav,lon,senten,and,wan,fin,all,letter,

## example


In [32]:
# Collect every "<number> <word>" pair from the sentence.
text = 'There are some patters in the text: 1 name 2 family 10 class 236 yard please find them all.'
# A literal space separates the two parts here; r'\d+\s\w+' is an alternative
# that accepts any whitespace character instead.
pattern = r'\d+ \w+'
match_list = re.compile(pattern).findall(text)
print(match_list)

['1 name', '2 family', '10 class', '236 yard']


## User-defined character set `[]`


In [34]:
# A character set with no quantifier matches one character at a time,
# so the result is a list of single ASCII letters.
text = 'There are some patters in the text: 1 name 2 family 10 class 236 yard please find them all.'
pattern = r'[a-zA-Z]'
match_list = re.compile(pattern).findall(text)
print(match_list)

['T', 'h', 'e', 'r', 'e', 'a', 'r', 'e', 's', 'o', 'm', 'e', 'p', 'a', 't', 't', 'e', 'r', 's', 'i', 'n', 't', 'h', 'e', 't', 'e', 'x', 't', 'n', 'a', 'm', 'e', 'f', 'a', 'm', 'i', 'l', 'y', 'c', 'l', 'a', 's', 's', 'y', 'a', 'r', 'd', 'p', 'l', 'e', 'a', 's', 'e', 'f', 'i', 'n', 'd', 't', 'h', 'e', 'm', 'a', 'l', 'l']


In [40]:
# Adding `+` to the letter set matches whole runs of letters, i.e. words without digits or punctuation.
text = 'There are some patters in the text: 1 name 2 family 10 class 236 yard please find them all.'
pattern = r'[a-zA-Z]+'
match_list = re.compile(pattern).findall(text)
# print's default separator is a single space, same as joining with " "
print(*match_list)

There are some patters in the text name family class yard please find them all


In [44]:
# The set lists both lower- and upper-case vowels; each vowel is returned as its own item.
text = 'There are some patters in the text: 1 name 2 family 10 class 236 yard please find them all.'
pattern = r'[aeiouAEIOU]'
match_list = re.compile(pattern).findall(text)
print(match_list)

['e', 'e', 'a', 'e', 'o', 'e', 'a', 'e', 'i', 'e', 'e', 'a', 'e', 'a', 'i', 'a', 'a', 'e', 'a', 'e', 'i', 'e', 'a']


## Negated character set `[^...]`


In [47]:
# A leading `^` inside the brackets inverts the set: every character that is NOT a vowel matches.
text = 'There are some patters in the text: 1 name 2 family 10 class 236 yard please find them all.'
pattern = r'[^aeiouAEIOU]'
match_list = re.compile(pattern).findall(text)
# an empty separator glues the single characters back together
print(*match_list, sep="")

Thr r sm pttrs n th txt: 1 nm 2 fmly 10 clss 236 yrd pls fnd thm ll.


## start with `^`


In [50]:
# Compare "a digit anywhere" with "a digit at the very start" (`^` anchor).
textlist = ['01 file', 'file', '02 file', 'file 03']
patterns = [r'\d', r'^\d']
for pattern in patterns:
    print('pattern: ', pattern)
    for text in textlist:
        match = re.search(pattern, text)
        if not match:
            print('No match found')
            continue
        # echo the whole input string when the pattern is found in it
        print(text)
    print('-' * 30)

pattern:  \d
01 file
No match found
02 file
file 03
------------------------------
pattern:  ^\d
01 file
No match found
02 file
No match found
------------------------------


## end with `$`


In [51]:
# Compare ".py anywhere" with ".py at the very end" (`$` anchor).
textlist = ['main.py', 'text.txt', 'run.py', 'hello.pynb']
# the backslash escapes the dot so it matches a literal "."
patterns = [r'\.py', r'\.py$']
for pattern in patterns:
    print('pattern: ', pattern)
    for text in textlist:
        match = re.search(pattern, text)
        print(text if match else 'No match found')
    print('-' * 30)

pattern:  \.py
main.py
No match found
run.py
hello.pynb
------------------------------
pattern:  \.py$
main.py
No match found
run.py
No match found
------------------------------


## start with `^` and end with `$`


In [52]:
# Anchor both ends: digits at the start, whitespace, then a word;
# the last pattern additionally requires the string to end in "e".
textlist = ['01 file', 'file', '02 file', 'file 03', '04 filenames']
patterns = [r'^\d\s\w+', r'^\d+\s\w+', r'^\d+\s\w+e$']
for pattern in patterns:
    print('pattern: ', pattern)
    for text in textlist:
        match = re.search(pattern, text)
        # show the matched text itself rather than the input string
        print(match.group() if match else 'No match found')
    print('-' * 30)

pattern:  ^\d\s\w+
No match found
No match found
No match found
No match found
No match found
------------------------------
pattern:  ^\d+\s\w+
01 file
No match found
02 file
No match found
04 filenames
------------------------------
pattern:  ^\d+\s\w+e$
01 file
No match found
02 file
No match found
No match found
------------------------------


## using `.` to match any character except newline


In [54]:
# `.*` matches any run of non-newline characters, including the empty run,
# so for the '\n' entry the match is an empty string.
textlist = ['main.py', '\n', 'run py', '  ', 'hello friends']
pattern = r'.*'
print('pattern: ', pattern)
for text in textlist:
    match = re.search(pattern, text)
    print(match.group() if match else 'Not match found')

pattern:  .*
main.py

run py
  
hello friends


In [55]:
# `.` does not cross a newline, so the match stops right before '\n'.
text = 'Hello my name is Ali \nI am from Iran.'
pattern = r'.*'
print('pattern: ', pattern)
match = re.compile(pattern).search(text)
print(match.group() if match else 'Not match found')

pattern:  .*
Hello my name is Ali 


In [56]:
# Three ways to pull <tags> out of a sentence:
#   <.*>   greedy: runs from the first '<' to the last '>'
#   <.*?>  lazy: stops at the nearest '>'
#   <\w+>  only word characters allowed between the brackets
text = 'Everythins that is <Ali> and <Hossein> and with> and to continue'
patterns = [r'<.*>', r'<.*?>', r'<\w+>']
for pattern in patterns:
    print('pattern: ', pattern)
    match = re.findall(pattern, text)
    # an empty list is falsy, so the fallback message is printed instead
    print(match or 'No match found')
    print('-' * 30)

pattern:  <.*>
['<Ali> and <Hossein> and with>']
------------------------------
pattern:  <.*?>
['<Ali>', '<Hossein>']
------------------------------
pattern:  <\w+>
['<Ali>', '<Hossein>']
------------------------------


## escape all special characters with `\`


### escape `()`


In [57]:
# \( and \) match literal parentheses; the unescaped ( ) still create capture groups.
text = '(415) 555-5874'
pattern = r'(\(\d{3}\)) (\d{3}-\d{4})'
match = re.search(pattern, text)
if match is not None:
    print(match)
    # index 0 is the whole match, followed by each captured group in order
    for index, captured in enumerate((match.group(0), *match.groups())):
        print(f'index={index} =>', captured)

<re.Match object; span=(0, 14), match='(415) 555-5874'>
index=0 => (415) 555-5874
index=1 => (415)
index=2 => 555-5874


### escape `?`


In [58]:
# \? matches a literal question mark: three word characters directly followed by "?".
text = 'Batman and? Batwoman are? in the text'
pattern = r'\w{3}\?'
matches = re.compile(pattern).findall(text)
print(matches)

['and?', 'are?']


## `re.sub()` -> returns a new string


In [61]:
# Show what the pattern finds, then the text with every hit replaced by a fixed string.
text = 'person John give person Jane a number'
pattern = r'person \w+'
print(
    re.findall(pattern, text),
    re.sub(pattern, 'person ***', text),
    sep='\n',
)

['person John', 'person Jane']
person *** give person *** a number


In [62]:
# Capture the first and last letter of each name and mask everything in between.
text = 'person Khosro give person Hossein a number'
pattern = r'person (\w)\w*(\w)'
# with two groups in the pattern, findall yields one (first, last) tuple per match
print(re.compile(pattern).findall(text))
# \1 and \2 in the replacement re-insert the text captured by groups 1 and 2
print(re.compile(pattern).sub(r'person \1****\2', text))

[('K', 'o'), ('H', 'n')]
person K****o give person H****n a number


In [63]:
# Keep the first seven digits of each 11-digit number and mask the last four.
text = 'The winner phone numbers are 09367415689, 09157894512, 09126547414'
pattern = r'(\d{7})\d{4}'
print(
    re.findall(pattern, text),
    text,
    # \1 re-inserts the seven captured digits ahead of the mask
    re.sub(pattern, r'\1****', text),
    sep='\n',
)

['0936741', '0915789', '0912654']
The winner phone numbers are 09367415689, 09157894512, 09126547414
The winner phone numbers are 0936741****, 0915789****, 0912654****


## example of cleaning a text


In [64]:
def clean_text(raw):
    """Strip a string down to ASCII letters and single spaces.

    Steps, in order:
      1. every character that is not an ASCII letter or a space becomes a space;
      2. runs of whitespace collapse to one space;
      3. the standalone word "and" becomes "&".

    Leading/trailing spaces left by step 2 are kept, not stripped.

    Args:
        raw: the string to clean.

    Returns:
        The cleaned string.
    """
    letters_only = re.sub(r'[^a-zA-Z ]', ' ', raw)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    # \b anchors keep "and" inside longer words (e.g. "hand", "sand") untouched;
    # a bare r'and' would turn "hand" into "h&".
    return re.sub(r'\band\b', '&', single_spaced)


raw_text = "I dont /*want 123  8//8$%^# 155 numbers-=@# 47 and 874 puctuations ()== and multiple +-*5   spaces in this text 2 "
x = clean_text(raw_text)
x

'I dont want numbers & puctuations & multiple spaces in this text '