In [1]:
import re

In [2]:
# * - Star
# The star symbol * matches zero or more occurrences of the pattern left to it.
# + - Plus
# The plus symbol + matches one or more occurrences of the pattern left to it.
# ? - Question Mark
# The question mark symbol ? matches zero or one occurrence of the pattern left to it.

# * -  0 and more
# + -  1 and more
# ? -  0 and 1

In [3]:
re.findall(r'a.?a', 'ana amma  ata aa')

['ana', 'ata', 'aa']

In [4]:
re.findall(r'a.+a', 'ana amma  ata aa')

['ana amma  ata aa']

In [5]:
re.findall(r'a.+a', 'tana amma  ata aa')

['ana amma  ata aa']

In [6]:
re.findall(r'a.+a', 'aa')

[]

In [7]:
re.findall(r'a.*a', 'aa')

['aa']

In [8]:
re.findall(r'a.*a', 'ana amma  ata aa')

['ana amma  ata aa']

In [18]:
# {} - Braces
# Consider this code: {n,m}. This means at least n, and at most m repetitions of the pattern left to it.
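# Matching is greedy: the engine first tries to take m repetitions (the maximum)
# and only backs off towards n (the minimum) when the rest of the pattern cannot match.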

In [16]:
re.findall(pattern=r'[0-9]{3}', string='3223423424')

['322', '342', '342']

In [19]:
re.findall(pattern=r'[0-9]{2,4}', string='3223423424')

['3223', '4234', '24']

In [24]:
re.findall(pattern=r'[0-9]{2,4}', string='322342342')

['3223', '4234']

In [28]:
re.findall(pattern=r'[0-9]{3,4}', string='32234234211')

['3223', '4234', '211']

In [29]:
re.findall(pattern=r'[0-9]{3,4}', string='3223423421')

['3223', '4234']

In [30]:
re.findall(r'a.*a', 'ana amma  ata aa')

['ana amma  ata aa']

In [7]:
re.findall(r'a.{2}a', 'ana amma  ata aa')

['amma', 'a aa']

In [8]:
re.findall(r'a.{2}a', 'ana  amma  ata aa')

['a  a', 'a  a', 'a aa']

In [33]:
re.findall(r'a.{2}a', 'anaa')

['anaa']

In [36]:
re.findall(r'[0-9]{2,4}', 'ab123csde')

['123']

In [37]:
re.findall(r'[0-9]{2,4}', '12 and 345673')

['12', '3456', '73']

In [40]:
re.findall(r'[0-9]{2,4}', '12 and 456a73')

['12', '456', '73']

In [41]:
# | - Alternation
# Vertical bar | is used for alternation (or operator).

In [42]:
re.findall(r'a|b', 'ana ban can')

['a', 'a', 'b', 'a', 'a']

In [43]:
re.findall(r'[ab]', 'ana ban can')

['a', 'a', 'b', 'a', 'a']

In [45]:
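# () - Group
# Parentheses () are used to group sub-patterns. For example, (a|b|c)xz matches any string
# that contains either a or b or c followed by xz.
# Note: when the pattern contains a capturing group, re.findall() returns only the text
# captured by the group; use a non-capturing group (?:...) to get the whole match instead.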

In [53]:
re.findall(r'[ab]na', 'ana ban can')

['ana']

In [54]:
re.findall(r'(a|b)na', 'ana ban can')

['a']

In [55]:
re.findall(r'(c|b)an', 'ana ban can dan')

['b', 'c']

In [56]:
re.findall(r'(?:c|b)an', 'ana ban can dan')

['ban', 'can']

In [58]:
re.findall(r'(c|ban)', 'ana ban can dan')

['ban', 'c']

In [60]:
# \ - Backslash
# Backslash \ is used to escape various characters including all metacharacters.

# For example, \$a matches if a string contains $ followed by a. Here, $ is not interpreted by a RegEx engine in a special way.

# If you are unsure if a character has special meaning or not, you can put \ in front of it.
# This makes sure the character is not treated in a special way.

In [118]:
re.findall(r'(an)?', 'cann? ban? dan?') # aside (off-topic): (an)? makes the whole group 'an' optional, so empty matches are returned as well

['', 'an', '', '', '', '', 'an', '', '', '', 'an', '', '']

In [119]:
re.findall('an', 'an an anta') # test code (aside, off-topic)

['an', 'an', 'an']

In [120]:
re.findall('(an)', 'an an anta') # test code (aside, off-topic)

['an', 'an', 'an']

In [121]:
re.findall(r'an?', 'cann? ban? dan?')

['an', 'an', 'an']

In [122]:
re.findall(r'an\?', 'cann? ban? dan?')

['an?', 'an?']

In [123]:
print(re.findall(r'.', 'alfa . beta . qamma'))

['a', 'l', 'f', 'a', ' ', '.', ' ', 'b', 'e', 't', 'a', ' ', '.', ' ', 'q', 'a', 'm', 'm', 'a']


In [124]:
print(re.findall(r'\.', 'alfa . beta . qamma'))

['.', '.']


In [128]:
re.findall('[alfa]', '[alfa] [beta] [qamma]')

['a', 'l', 'f', 'a', 'a', 'a', 'a']

In [131]:
# Raw string: \[ and \] must reach the regex engine as escapes; in a plain string
# literal '\[' is an invalid escape sequence and triggers a warning.
re.findall(r'\[alfa\]', '[alfa] [beta] [qamma]')

['[alfa]']

In [132]:
# Raw string: \[ and \] must reach the regex engine as escapes; in a plain string
# literal '\[' is an invalid escape sequence and triggers a warning.
re.findall(r'\[alfa\]', '[alfa] [beta] [qamma] [alfa]')

['[alfa]', '[alfa]']

## Special Sequences

In [133]:
# Special sequences make commonly used patterns easier to write.
# Here's a list of special sequences:

# \A - Matches if the specified characters are at the start of a string.

In [134]:
re.findall(r'\Aa', 'alfa beta')

['a']

In [139]:
re.findall(r'^a', 'alfa beta') # ^ and \A both anchor at the start of the string; they differ only with re.MULTILINE, where ^ also matches right after each newline

['a']

In [140]:
re.findall(r'^apple', 'alfa beta')

[]

In [143]:
re.findall(r'^apple', 'apple alfa beta')

['apple']

In [151]:
# \b - Matches if the specified characters are at the beginning or end of a word.

In [161]:
re.findall(r'\Aa', 'alfa beta ana')

['a']

In [166]:
re.findall(r'\ba', 'alfa beta ana')

['a', 'a']

In [167]:
re.findall(r'\ba', 'alfa beta?ana')

['a', 'a']

In [170]:
re.findall(r'\ba', 'alfa beta ?ana')

['a', 'a']

In [173]:
re.findall(r'a\b', 'alfa beta ?ana')

['a', 'a', 'a']

In [178]:
re.findall(r'a\b', 'alfa beta ?ana alfa')

['a', 'a', 'a', 'a']

In [180]:
re.findall(r'a\b', 'alfa beta ?a? ?ana alfa')

['a', 'a', 'a', 'a', 'a']

In [181]:
# \B - Opposite of \b. Matches if the specified characters are not at the beginning or end of a word.

In [183]:
re.findall(r'\Ba', 'alfa beta ana alfa')

['a', 'a', 'a', 'a']

In [185]:
re.findall(r'\Ba', 'laf beat an laf')

['a', 'a', 'a']

In [193]:
re.findall(r'\Baf', 'laf beat an laf')

['af', 'af']

In [9]:
re.findall(r'\Ba.', 'laf beat an laf')

['af', 'at', 'af']

In [10]:
re.findall(r'\bball', 'ballman football voleyball')

['ball']

In [16]:
re.findall(r'ball\b', 'ballman football voleyball')

['ball', 'ball']

In [17]:
re.findall(r'\Bball', 'ballman football voleyball noballyes')

['ball', 'ball', 'ball']

In [18]:
# \d - Matches any decimal digit. Equivalent to [0-9]
# \D - Matches any non-decimal digit. Equivalent to [^0-9]

# \d is same as [0-9]
# \D is same as [^0-9]

In [19]:
re.findall(r'\d{3}', '1242 232 122 438 ')

['124', '232', '122', '438']

In [20]:
re.findall(r'\d{4}', '1242 232 122 438 ')

['1242']

In [21]:
re.findall(r'[0-9]{4}', '1242 232 122 438 ')

['1242']

In [22]:
re.findall(r'\D{4}', ' cb 1242 ab 232 db 122 mb 438 ')

[' cb ', ' ab ', ' db ', ' mb ']

In [32]:
# \s - Matches where a string contains any whitespace character. Equivalent to [ \t\n\r\f\v] for ASCII text.
# \S - Matches where a string contains any non-whitespace character. Equivalent to [^ \t\n\r\f\v] for ASCII text.
# \S is the opposite of \s

In [27]:
re.findall(r'\s', ' cb 1242 ab 232 db 122 mb 438 ')

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']

In [30]:
print(re.findall(r'\S', ' cb 1242 ab 232 db 122 mb 438 '))

['c', 'b', '1', '2', '4', '2', 'a', 'b', '2', '3', '2', 'd', 'b', '1', '2', '2', 'm', 'b', '4', '3', '8']


In [31]:
print(re.findall(r'\S+', ' cb 1242 ab 232 db 122 mb 438 '))

['cb', '1242', 'ab', '232', 'db', '122', 'mb', '438']


In [53]:
print(re.findall(r'\d\S+', ' cb 1242 ab 232 db 122 mb 438 '))

['1242', '232', '122', '438']


In [58]:
print(re.findall(r'[a-z]\S+', ' cb 1242 ab 232 db 122 mb 438 '))

['cb', 'ab', 'db', 'mb']


In [59]:
# \w - Matches any alphanumeric character (digits and alphabets). Equivalent to [a-zA-Z0-9_].
# By the way, underscore _ is also considered an alphanumeric character.

In [60]:
print(re.findall(r'\w', ' cb 1242 ab 232 db 122 mb 438 '))

['c', 'b', '1', '2', '4', '2', 'a', 'b', '2', '3', '2', 'd', 'b', '1', '2', '2', 'm', 'b', '4', '3', '8']


In [61]:
print(re.findall(r'\w+', ' cb 1242 ab 232 db 122 mb 438 '))

['cb', '1242', 'ab', '232', 'db', '122', 'mb', '438']


In [62]:
print(re.findall(r'\d+', ' cb 1242 ab 232 db 122 mb 438 '))

['1242', '232', '122', '438']


In [63]:
print(re.findall(r'[a-z]+', ' cb 1242 ab 232 db 122 mb 438 '))

['cb', 'ab', 'db', 'mb']


In [64]:
# \W - Matches any non-alphanumeric character. Equivalent to [^a-zA-Z0-9_]

In [68]:
print(re.findall(r'\W+', ' cb 1242 .,> ab 232 db 122 mb 438 '))

[' ', ' ', ' .,> ', ' ', ' ', ' ', ' ', ' ', ' ']


In [72]:
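# \Z - Matches if the specified characters are at the end of a string.
# \Z is almost the same as $ (an unescaped dollar sign; \$ would match a literal '$' character).
# The difference: $ also matches just before a trailing newline, and before every newline with re.MULTILINE.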

In [71]:
re.findall(r'a$', 'alfa beta')

['a']

In [74]:
re.findall(r'a\Z', 'alfa beta')

['a']

## Functions and Constants
## To work with RegEx

In [133]:
list_name = re.findall(r'[A-Za-z]+', 'Celil 1 Arif 2 Mehemmed 3 Lale 14')
list_name

['Celil', 'Arif', 'Mehemmed', 'Lale']

In [134]:
list_id = re.findall(r'[0-9]+', 'Celil 1 Arif 2 Mehemmed 3 Lale 14')
list_id

['1', '2', '3', '14']

In [135]:
dict(zip(list_name, list_id))

{'Celil': '1', 'Arif': '2', 'Mehemmed': '3', 'Lale': '14'}

In [136]:
result = list(zip(list_name, list_id))
result

[('Celil', '1'), ('Arif', '2'), ('Mehemmed', '3'), ('Lale', '14')]

In [137]:
result = list(map(lambda x, y: (x,y), list_name, list_id))
result

[('Celil', '1'), ('Arif', '2'), ('Mehemmed', '3'), ('Lale', '14')]

In [138]:
dict_result = dict(result)
dict_result

{'Celil': '1', 'Arif': '2', 'Mehemmed': '3', 'Lale': '14'}

In [139]:
import pandas as pd

In [141]:
pd.DataFrame(list(dict_result.items()), columns=['Name', 'Id'])

Unnamed: 0,Name,Id
0,Celil,1
1,Arif,2
2,Mehemmed,3
3,Lale,14


In [142]:
list(dict_result.items())

[('Celil', '1'), ('Arif', '2'), ('Mehemmed', '3'), ('Lale', '14')]

In [143]:
pd.DataFrame(dict_result.items(), columns=['Name', 'Id'])

Unnamed: 0,Name,Id
0,Celil,1
1,Arif,2
2,Mehemmed,3
3,Lale,14


In [144]:
dict_result.items()

dict_items([('Celil', '1'), ('Arif', '2'), ('Mehemmed', '3'), ('Lale', '14')])

In [169]:
re.findall(r'\bIST\d{4}', 'IST2354 BS2327 IST321 AQR3627 IST123') # consumer-credit code: the numeric part has 4 digits

['IST2354']

In [170]:
re.findall(r'\bIST[0-9]{4}', 'IST2354 BS2327 IST321 AQR3627 IST123')

['IST2354']

In [210]:
re.findall(r'\b[A-Z]{1}\d{2}-\d{2}\b', 'AT32-42 B123-14 D42-23 31-12 F22-444 32-13') # expected format: one capital letter, 2 digits, '-', 2 digits (e.g. K32-45)

['D42-23']

In [243]:
re.findall(r'\b\d{3}\.\d{2,3}\.\d{2,3}\.\d{2}\b', '12.43.54.23 121.223.121.22 44.232.13.24 121.211.23.34 121.211.23.343')
# ip address 3num.2-3num.2-3num.2num

['121.223.121.22', '121.211.23.34']

## Functions and Constants

In [244]:
# re.search()
# The re.search() method takes two arguments: a pattern and a string. The method looks
# for the first location where the RegEx pattern produces a match with the string.

# If the search is successful, re.search() returns a match object; if not, it returns None.

In [249]:
re.search(r'\d{2}', '23 434 32')

<re.Match object; span=(0, 2), match='23'>

In [250]:
a = re.search(r'\d{2}', '23 434 32')

In [266]:
print(a.start())
print(a.end())
print(a.span())
print(a.re.pattern)
print(a.string)

0
2
(0, 2)
\d{2}
23 434 32


In [267]:
# re.finditer(pattern, string, flags=0)
# Return an iterator yielding Match objects over all non-overlapping matches for the RE pattern in string. 
# The string is scanned left-to-right, and matches are returned in the order found. Empty matches are included in the result.

In [281]:
a = re.finditer(r'\d{2}', '23 434 32')
a

<callable_iterator at 0xebc48ce680>

In [282]:
print(next(a))
print(next(a))
print(next(a))

<re.Match object; span=(0, 2), match='23'>
<re.Match object; span=(3, 5), match='43'>
<re.Match object; span=(7, 9), match='32'>


In [317]:
a = re.finditer(r'\d{2}', '23 434 32')

In [318]:
for i in a:
    print(i)

<re.Match object; span=(0, 2), match='23'>
<re.Match object; span=(3, 5), match='43'>
<re.Match object; span=(7, 9), match='32'>


In [360]:
def credit_it(x):
    """Return every consumer-credit code found in ``x``.

    A code is ``IST`` starting at a word boundary, followed by four digits.
    The pattern has no trailing boundary, so a longer digit run such as
    ``'IST23232'`` still yields its first four digits (``'IST2323'``).

    Returns a list of matched strings; the list is empty (and therefore
    falsy, which is what ``filter`` relies on) when nothing matches.
    """
    return re.findall(r'\bIST\d{4}', x)

In [361]:
credit_it('IST2354')

['IST2354']

In [362]:
credit_it('BS2323')

[]

In [363]:
credits = ['IST2323', 'IST23232', 'BS2323', 'AQR3627', 'IS2323']

In [364]:
list(map(credit_it, credits))

[['IST2323'], ['IST2323'], [], [], []]

In [365]:
list(filter(credit_it, credits))

['IST2323', 'IST23232']

In [382]:
# re.compile(pattern, flags=0) - compiles a regular expression pattern into a reusable pattern object,
# whose methods (e.g. findall()) can then be called without passing the pattern again.

In [383]:
# The trailing \b requires the code to end right after the four digits,
# so a longer value such as 'IST23233' is rejected instead of being truncated.
alfa = re.compile(pattern=r'\bIST\d{4}\b')

In [384]:
alfa.findall('IST2323')

['IST2323']

In [385]:
alfa.findall('IST23233')

[]

In [390]:
#  re.split(pattern, string, maxsplit=0, flags=0)
# method splits the string where there is a match and returns a list of strings where the splits have occurred.
# If the pattern is not found, re.split() returns a list containing the original string.

# You can pass maxsplit argument to the re.split() method. It's the maximum number of splits that will occur.

In [394]:
string = "Twelve: 12 Eighty nine: 89. "
pattern = r"\d+"
result = re.split(pattern, string)
print(result)

['Twelve: ', ' Eighty nine: ', '. ']


In [396]:
string = "Twelve: 12 Eighty nine: 89 Nine:9. "
pattern = r"\d+"
# maxsplit=1: split only at the first occurrence.
# Passed by keyword because a positional maxsplit is deprecated since Python 3.13.
result = re.split(pattern, string, maxsplit=1)
print(result)

['Twelve: ', ' Eighty nine: 89 Nine:9. ']


In [397]:
re.split(r'\d', 'Namiq 1 Vusal 2 Ramiz 3 Namin 4')

['Namiq ', ' Vusal ', ' Ramiz ', ' Namin ', '']

In [404]:
re.split(r'\d', 'Namiq 1 Vusal 2 Ramiz 3 Namin 4', maxsplit=2)

['Namiq ', ' Vusal ', ' Ramiz 3 Namin 4']

In [418]:
re.split(r'(\d)', 'Namiq 1 Vusal 2 Ramiz 3 Namin 4')

['Namiq ', '1', ' Vusal ', '2', ' Ramiz ', '3', ' Namin ', '4', '']

In [427]:
re.split(r'(\d)', 'Namiq 1 Vusal 2 Ramiz 3 Namin 4')

['Namiq ', '1', ' Vusal ', '2', ' Ramiz ', '3', ' Namin ', '4', '']

In [429]:
re.split(r'-', '13-32-234-212-242')

['13', '32', '234', '212', '242']

In [430]:
# maxsplit is passed by keyword: the positional form is deprecated since Python 3.13.
re.split(r'-', '13-32-234-212-242', maxsplit=2)

['13', '32', '234-212-242']

In [431]:
re.split(r'(-)', '13-32-234-212-242')

['13', '-', '32', '-', '234', '-', '212', '-', '242']

In [432]:
# re.sub()
# The syntax of re.sub() is:
#                 re.sub(pattern, repl, string, count=0, flags=0)

# The method returns a string where matched occurrences are replaced with the content of replace variable.

In [15]:
# String literal continued over two source lines: the backslash at the end of the
# first line joins them, so the only real newline in the value is the explicit \n.
string = "abc 12\
de 23 \n f45 6"
# matches every run of whitespace characters
# (raw string, so \s reaches the regex engine instead of being an invalid string escape)
pattern = r"\s+"
# single space: each whitespace run is collapsed to one space
replace = " "
new_string = re.sub(pattern, replace, string)
print(new_string)

abc 12de 23 f45 6


In [16]:
re.sub(pattern=r'-', repl=' ', string='13-32-234-212-242')

'13 32 234 212 242'

In [17]:
re.sub(pattern=r'-', repl='', string='13-32-234-212-242')

'1332234212242'

In [19]:
re.sub(r'-', '', '13-32-234-212-242', count=2)

'1332234-212-242'

In [24]:
re.sub('-','_', 'Ramiz-Mehdi Ramin-Kamalov')

'Ramiz_Mehdi Ramin_Kamalov'

In [27]:
re.split(r' ', re.sub('-','_', 'Ramiz-Mehdi Ramin-Kamalov'))

['Ramiz_Mehdi', 'Ramin_Kamalov']

In [186]:
txt = 'id-1 Namin-Mehdiyev id-2 Lala-Mammadov'

In [187]:
# \d+ keeps a multi-digit id (e.g. 'id-14') together as one match; a bare \d would
# split it into single digits and misalign the ids with the names when zipped.
list_id = re.findall(r'\d+', txt)
list_id

['1', '2']

In [188]:
list_name = re.findall(r'([A-Z][a-z]+)-', txt)
list_name

['Namin', 'Lala']

In [189]:
list_surname = re.findall(r'-([A-Z][a-z]+)', txt)
list_surname

['Mehdiyev', 'Mammadov']

In [190]:
list_ = list(zip(list_id, list_name, list_surname))
list_

[('1', 'Namin', 'Mehdiyev'), ('2', 'Lala', 'Mammadov')]

In [191]:
re.findall(r'([A-Za-z][a-z]+)-', 'id-1 namin-Mehdiyev id-2 lala-Mammadov') # undesired result: 'id' is captured too, because 'id-' is also letters followed by '-'

['id', 'namin', 'id', 'lala']

In [192]:
re.findall(r'[A-Za-z]+-([A-Za-z]+)', 'id-1 namin-Mehdiyev id-2 lala-Mammadov')

['Mehdiyev', 'Mammadov']

In [193]:
re.findall(r'([A-Za-z]+)-[A-Za-z]+', 'id-1 namin-Mehdiyev id-2 lala-Mammadov')

['namin', 'lala']

In [194]:
re.findall(r'[A-Za-z]+-([A-Za-z]+)', 'id-1 namin-mehdiyev id-2 lala-mammadov')

['mehdiyev', 'mammadov']

In [195]:
re.findall(r'([A-Za-z]+)-[A-Za-z]+', 'id-1 namin-Mehdiyev id-2 lala-mammadov')

['namin', 'lala']

In [196]:
txt = '11-22 33-44 55-77'
txt

'11-22 33-44 55-77'

In [197]:
a = re.sub(r'-', '', txt)
a

'1122 3344 5577'

In [198]:
re.split(' ', a)

['1122', '3344', '5577']