# Regex Examples

In [3]:
import re

In [54]:
# re.IGNORECASE (short form: re.I) lets [a-z] match upper-case letters too.
pattern_ignore_case = re.compile("[a-z]*", flags=re.IGNORECASE)

In [55]:
pattern_ignore_case.findall("Hello kusumakar how are you")

['Hello', '', 'kusumakar', '', 'how', '', 'are', '', 'you', '']

## re.S / re.DOTALL: lets `.` match newlines (it has no effect on the dot-free pattern below)

In [56]:
# re.DOTALL (short form: re.S) only changes what "." matches.
pattern_ignore_new_line = re.compile("[a-z]+", flags=re.DOTALL)

In [58]:
pattern_ignore_new_line.findall("hello kusumakar  whats up")

['hello', 'kusumakar', 'whats', 'up']

## Date Logic

In [42]:
# A date is three runs of digits joined by "-", "/" or ".", plus any trailing
# whitespace.  Digits and explicit separators are required so that ordinary
# text such as "how are you" is no longer reported as a date.
# Raw string: "\d" and "\s" are regex escapes, not Python string escapes.
pattern_date = re.compile(r"\d+[-/.]\d+[-/.]\d+\s*")

In [43]:
pattern_date.findall("2020-20-12 20/20/2020 ")

['2020-20-12 ', '20/20/2020 ']

## re.DOTALL or re.S
### `.` (dot): in the default mode, this matches any character except a newline. If the DOTALL flag has been specified, it matches any character including a newline.

## re.LOCALE or re.L
### re.LOCALE makes \w, \W, \b, \B and case-insensitive matching depend on the current locale settings. In Python 3 this flag can be used only with bytes patterns.

In [72]:
# Raw string so that "\w" reaches the regex engine untouched; in a plain
# literal "\w" is an invalid escape sequence and triggers a warning.
patter_locale = re.compile(r"\w*")

In [75]:
# Pattern.findall(string, pos, endpos) takes no flags: the re.LOCALE that used
# to be passed here (integer value 4) was read as the start offset `pos`, so
# the search began at the "o" of "hello".  Flags belong in re.compile(), and
# re.LOCALE is only valid for bytes patterns, so it is dropped.
patter_locale.findall("hello kusumakar how are you")

['hello', '', 'kusumakar', '', 'how', '', 'are', '', 'you', '']

# re.VERBOSE
##  This flag allows you to write regular expressions that look nicer and are more readable by allowing you to visually separate logical sections of the pattern and add comments.

In [165]:
# Matches a sentence that opens with "A", "The" or "That" (any case) and runs
# up to the next full stop.  Whitespace and "#" comments inside the pattern are
# ignored because of re.VERBOSE.
verbose_pattern = re.compile(r"""
    \b(?:A|The|That)\b   # opening word; a group, because [A|The] is only a character class
    [\w\s]+              # rest of the sentence: word characters and whitespace
    \.                   # the full stop that ends the sentence
""", re.VERBOSE | re.IGNORECASE | re.S)

In [166]:
verbose_pattern.findall("A cat that was running.That cat has fallen into the river.")

['A cat that was running.', 'That cat has fallen into the river.']

# Grouping

## Grouping is a powerful operation that allows operations such as :
- Creating sub expressions to apply quantifiers
- Limiting the scope of alternation
- Extracting information from the matched text
- Using extracted information in the regex

### Grouping is achieved by using ( ). A pattern written inside ( ) is treated as one unit.

#### i.  Regular expression for ababababc

In [173]:
# "(ab)" is one unit, so "+" repeats the whole "ab" before the closing "c".
pattern1 = re.compile("(ab)+c", flags=0)

In [176]:
pattern1.search("abababababc")

<re.Match object; span=(0, 11), match='abababababc'>

## Capturing
### Capturing is another important feature of grouping: groups capture the text they match, which can then be used in other operations such as sub, or within the regex itself.

In [210]:
# Three capturing groups, numbered 1-3 from left to right, each repeatable.
pattern1 = re.compile(r"(ab)+(cd)+(ef)+", flags=0)

In [211]:
it=pattern1.finditer(r"ababcdefefef")

In [213]:
groups=next(it)

In [217]:
print ( groups.group(1), groups.group(2), groups.group(3))

ab cd ef


# Named groups
## Syntax : (?P<name>pattern)

In [386]:
# Named groups: "first" and "second" are the words on either side of the "-".
# Raw string so that "\w" is not read as an (invalid) Python string escape.
patter = re.compile(r"(?P<first>\w+)-(?P<second>\w+)")

In [387]:
match=patter.search("hello-world")

In [388]:
match.group('first')

'hello'

In [389]:
match.group('second')

'world'

In [420]:
logfile = "VolDis_log_23-10-2020.log"

In [428]:
# <file> is the name part and <date> the date part of "<file>_<date>.log".
# The dot is escaped so that only a literal ".log" suffix is accepted; a bare
# "." would match any character (e.g. "...2020xlog").
# Raw string so that "\w", "\d" and "\W" are not read as Python string escapes.
patt = re.compile(r"(?P<file>\w+_+\w+)_(?P<date>\d+\W+\d+\W+\d+)\.log")

In [429]:
match=patt.search(logfile)

In [430]:
match.group('file')

'VolDis_log'

In [431]:
match.group('date')

'23-10-2020'

In [434]:
new_file= "Voldis_log_24-10-2020.log"

In [437]:
# Swap the two named groups.  Raw string so that "\g<name>" is not read as an
# (invalid) Python string escape before re.sub() sees it.
patt.sub(r"\g<date>-\g<file>",new_file)

'24-10-2020-Voldis_log'

# Atomic Groups
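## Atomic groups are special groups, written (?>...), designed to improve performance: once the group has matched, the regex engine does not backtrack into it to try other ways of matching when the rest of the pattern fails. Python's built-in re module supports them from version 3.11.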

## Zero width assertions:
#### Metacharacters that indicate a position rather than actual content. Example: ^ or $
#### Look-around assertions: these are powerful assertions that check what comes before or after the current position. They perform the check without consuming characters, and simply report whether it succeeded or failed.

### Positive Look Ahead: 
#### This mechanism is written as a question mark and an equals sign followed by an expression, all inside parentheses: (?=regex). It succeeds if regex matches the forthcoming input.

### Negative Look Ahead:
#### Written as (?!regex). It succeeds if regex does not match the forthcoming input.

### Positive Look Behind
#### Written as (?<=regex). It succeeds if regex matches the input immediately before the current position.


### Negative Look behind
#### Written as (?<!regex). It succeeds if regex does not match the input immediately before the current position.

# Examples:


In [12]:
# Search the same sentence twice, first with a zero-width look-ahead and then
# with the plain literal, printing the resulting match object each time.
for label, regex in (
    ("With look ahead mechanism ", "(?=fox)"),
    ("Without look ahead mechanism ", "fox"),
):
    pattern = re.compile(regex)
    position = pattern.search("A fox sat on the wall")
    print(label, position)

# Conclusion: a look-around consumes no characters, so it can be used to
# restrict where an expression is allowed to match.

With look ahead mechanism  <re.Match object; span=(2, 2), match=''>
Without look ahead mechanism  <re.Match object; span=(2, 5), match='fox'>


## Using positive look-ahead to find the words that are followed by a comma

In [21]:
# Look-ahead: "\w+" must be followed by a comma, but the comma is not consumed.
# Raw strings keep "\w" from being read as an (invalid) Python string escape.
pattern = re.compile(r"\w+(?=,)")
positions = pattern.findall("I ate noodles,pakodas,maggi and tea")
print("With look ahead approach ", positions)
# No look-ahead: the comma is part of the pattern, so it is part of each match.
pattern = re.compile(r"\w+,")
positions = pattern.findall("I ate noodles,pakodas,maggi and tea")
print("Without look ahead approach ", positions)

# Conclusion -> the look-ahead version leaves "," out of the result, whereas
# the plain pattern returns each word together with its ",".

With look ahead approach  ['noodles', 'pakodas']
Without look ahead approach  ['noodles,', 'pakodas,']


## Negative Look ahead Examples:


In [40]:
# Negative look-ahead: match "John" only when it is NOT followed by " Smith".
# Raw string keeps "\s" from being read as an (invalid) Python string escape.
pattern = re.compile(r"John(?!\sSmith)")
positions = pattern.findall("I went with John Cena not with John Smith")
print("With look ahead approach ", positions)
# Same sentence without the assertion, so the two results can be compared
# (this half previously reused an unrelated pattern and sentence).
pattern = re.compile(r"John")
positions = pattern.findall("I went with John Cena not with John Smith")
print("Without look ahead approach ", positions)

# Output:
# With look ahead approach  ['John']
# Without look ahead approach  ['John', 'John']


## Adding thousands separators to a number, i.e. formatting 123456789 as 123,456,789

In [82]:
# Match 1-3 digits that are followed by one or more complete groups of three
# digits and then no further digit, i.e. every spot after which a "," belongs.
# The inner group is non-capturing, so findall() returns the matched digits
# rather than the contents of the group.
# Raw string so that "\d" is not read as an (invalid) Python string escape.
pattern = re.compile(r'\d{1,3}(?=(?:\d{3})+(?!\d))')

In [83]:
result=pattern.finditer('12345678901')


In [84]:
string="12345678901"
for i in result:
    print(i.start(),i.end())

0 2
2 5
5 8


In [86]:
# "\g<0>" is the whole match; append a comma to each one.  Raw string so that
# "\g" is not read as an (invalid) Python string escape.
pattern.sub(r"\g<0>,",'12345678901')

'12,345,678,901'

# Matching twitter usernames @kusumakar_shukla

In [103]:
# Look-behind for "@", then the handle itself.  "\w+" rather than "[\w+]+":
# inside a character class "+" is a literal plus sign, not a quantifier, so
# the old pattern also accepted "+" as part of a username.
pattern = re.compile(r"(?<=@)\w+")

In [105]:
pattern.findall(" my username is @kusumakar_00_012_shukla")

['kusumakar_00_012_shukla']

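# Remember: in Python's re module, look-behind assertions only accept fixed-width patterns; look-ahead assertions may be of any width (as in the thousands-separator example above).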

## Q. Keep only the log lines that do not report an error, i.e. discard the lines whose level is ERROR

In [106]:
# Log format: dd-mm-yyyy:LEVEL:message, e.g. 10-10-2020:ERROR:Authentication failed

In [181]:
# Capture (level, message) from "dd-mm-yyyy:LEVEL:message", rejecting the line
# when the level ends in ERROR (negative look-behind just before the colon).
# "[^:]*" stops the level at the first colon; the previous greedy "(.*)" could
# backtrack to a later colon, so an ERROR line with ":" in its message matched.
# Raw string so that "\d" is not read as an (invalid) Python string escape.
pattern = re.compile(r"\d{2}-\d{2}-\d{4}:([^:]*)(?<!ERROR):(.*)")

In [182]:
pattern.findall("10-10-2020:ERROR:Failed to login")

[]

In [183]:
pattern.findall("10-10-2020:INFO:Data Downloaded")

[('INFO', 'Data Downloaded')]