In [52]:
'''
Regular Expression in Python (RegEx)
Covers:
* Python RE module
* Regular expressions and their syntax
* Regex methods and objects
* Regex Metacharacters, special sequences, and character classes
* Regex option flags
* Capturing groups
* Extension notations and assertions
* A real-world example of regular expression
'''

'''
What is Regular Expression in Python?
- Uses regular expression to match, search, replace, manipulate strings (like String class in Java)
- RegEx to validate Passwords
- RegEx to extract info from text, spreadsheets, textual documents
- RegEx to search and replace text in files
- RegEx to validate text input, such as passwords and email addresses
- Regex to rename a hundred files at a time
'''

#### RE module (for handling pattern and regular expressions)
import re
#print(help(re))     # Print Info of regEx https://docs.python.org/3.10/library/re.html#regular-expression-syntax

### Example 1: Collect every digit that appears inside a string
target_str = "My roll number is 25"
# \d matches one decimal digit; findall returns each match as its own list item
res = re.compile(r"\d").findall(target_str)
print(res)
# Output ['2', '5']

## Use a raw string to define a regex
# The note itself is a raw string so that the \d it shows is kept literally
# instead of being parsed as an (invalid) Python escape sequence.
r'''
    r"\d" --> a raw (r) string holding the two characters \ and d.
    In a raw string Python leaves backslashes untouched, so the regex engine
    receives the pattern exactly as it is written.
    This avoids clashes between Python string escapes and regex escapes.
'''
# Text to search. Without the r prefix, "\t" and "\n" in this path would be
# turned into a tab and a newline by Python.
target_string = r"c:\example\task\new\exercises\session1"

# Regex pattern WITHOUT a raw string (does not work):
# pattern = "^c:\\example\\task\\new"
# Python collapses each \\ to a single \, so the regex engine sees
# ^c:\example\task\new -- \e is rejected as a bad escape (re.error), and
# \t / \n would mean tab / newline rather than a backslash plus a letter.

# Regex pattern WITH a raw string: each \\ reaches the regex engine unchanged
# and matches one literal backslash.
pattern = r"^c:\\example\\task\\new"        # ^ anchors the match at the start of the string

# Output
res = re.search(pattern, target_string)
print(res.group())
# Output c:\example\task\new

## Python regex methods
'''
re.compile('pattern')       Compile a regular expression pattern provided as a string into a re.Pattern object.
re.search(pattern, str)     Search for occurrences of the regex pattern inside the target string and return only the first match.
re.match(pattern, str)      Try to match the regex pattern at the start of the string. It returns a match only if the pattern is located at the beginning of the string.
re.fullmatch(pattern, str)	Match the regular expression pattern to the entire string from the first to the last character.
re.findall(pattern, str)	Scans the regex pattern through the entire string and returns all matches.
re.finditer(pattern, str)	Scans the regex pattern through the entire string and returns an iterator yielding match objects.
re.split(pattern, str)      Splits the string at every occurrence of the regex pattern and returns the resulting pieces as a list.

re.sub(pattern, replacement, str)	Replace one or more occurrences of a pattern in the string with a replacement.

re.subn(pattern, replacement, str)	Same as re.sub(). The difference is it will return a tuple of two elements.
First, a new string after all replacement, and second the number of replacements it has made.
'''


### Example 2: How to use regular expression in Python (Using All RegEx Methods)
# every function used below lives in the re module
import re

target_string = "Jessa salary is 8000$"

# --- re.compile + fullmatch: validate a phone number shaped like (xxx)xxx-xxxx ---
# [(] and [)] match literal parentheses; \d{3} matches exactly three digits
str_pattern = r'[(]\d{3}[)]\d{3}[-]\d{4}'
pattern = re.compile(str_pattern)

# fullmatch called as a method of the compiled pattern object
print("It matches" if pattern.fullmatch("(202)403-4055") else "No matches")
# fullmatch called as a module function that receives the compiled pattern;
# the area code has four digits here, so the whole string cannot match
print("It matches" if re.fullmatch(pattern, "(2230)403-4055") else "No matches")


# --- match: \w is one word character (letter, digit or underscore) ---
str_pattern = r"\w"
pattern = re.compile(str_pattern)
# match() only tries the pattern at the very start of the string
res = pattern.match(target_string)
print(res[0])  # res[0] is the whole match, same as res.group()
# Output 'J'

# --- search: first digit found anywhere in the string ---
res = re.compile(r"\d").search(target_string)
print(res[0])
# Output 8

# --- findall: every digit, returned as a list of strings ---
res = re.compile(r"\d").findall(target_string)
print(res)
# Output ['8', '0', '0', '0']

# --- split: cut the string at each whitespace character ---
res = re.compile(r"\s").split(target_string)
print("All tokens:", res)
# Output ['Jessa', 'salary', 'is', '8000$']

# --- sub: replace each whitespace character with a hyphen ---
res = re.compile(r"\s").sub("-", target_string)
print(res)
# Output Jessa-salary-is-8000$


## The Match object methods
'''
A successful regex match returns a Match object. Use these methods to extract values from it:
    group()	    Return the string matched by the regex pattern. See capturing groups.
    groups()	Returns a tuple containing the strings for all matched subgroups.
    start()	    Return the start position of the match.
    end()	    Return the end position of the match.
    span()	    Return a tuple containing the (start, end) positions of the match.
'''

### RegEx Metacharacters
# Raw string: the note shows a bare backslash, which a normal string literal
# would try to read as the start of an escape sequence.
r'''
    . (DOT)	Matches any character except a newline.

    ^ (Caret)	Matches pattern only at the start of the string.

    $ (Dollar)	Matches pattern at the end of the string.

    * (asterisk)	Matches 0 or more repetitions of the regex.

    + (Plus)	Match 1 or more repetitions of the regex.

    ? (Question mark)	Match 0 or 1 repetition of the regex.

    [] (Square brackets)	Used to indicate a set of characters. Matches any single character in brackets. For example, [abc] will match either an a, b, or c character.

    | (Pipe)	Used to specify multiple patterns. For example, P1|P2, where P1 and P2 are two different regexes.

    \ (backslash)	Used to escape special characters or to signal a special sequence. For example, if you are searching for one of the special characters, you can put a \ in front of it to escape it.

    [^...]	Matches any single character not in brackets.

    (...)	Matches whatever regular expression is inside the parentheses. For example, (abc) will match the substring 'abc'
'''

### RegEx Special Sequences
# Raw string: in a normal string literal \b, \t, \n, \x0b, \r and \f below
# would turn into control characters and \A, \d, \s, \w ... are invalid
# Python escapes, so the note would not contain the text it displays.
r'''
    \A	    Matches pattern only at the start of the string.

    \Z	    Matches pattern only at the end of the string.

    \d	    Matches any digit. Short for character class [0-9].

    \D	    Matches any non-digit. Short for [^0-9].

    \s	    Matches any whitespace character. Short for character class [ \t\n\x0b\r\f].

    \S	    Matches any non-whitespace character. Short for [^ \t\n\x0b\r\f].

    \w	    Matches any word character (alphanumeric or underscore). Short for character class [a-zA-Z_0-9].

    \W	    Matches any non-word character. Short for [^a-zA-Z_0-9]

    \b	    Matches the empty string, but only at the beginning or end of a word. Matches a word boundary where a word character is [a-zA-Z0-9_].
            For example, '\bJessa\b' matches 'Jessa', 'Jessa.', '(Jessa)', 'Jessa Emma Kelly' but not 'JessaKelly' or 'Jessa5'.

    \B	    Opposite of \b. Matches the empty string, but only when it is not at the beginning or end of a word.

    Note: the bracketed equivalents above are exact for ASCII matching (re.ASCII or bytes patterns).
          For str patterns without re.ASCII, \d, \s and \w also match the corresponding Unicode characters.
'''

### Regex Quantifiers
'''
    *	    Match 0 or more repetitions of the preceding regex. For example, a* matches any string that contains zero or more occurrences of 'a'.

    +	    Match 1 or more repetitions of the preceding regex. For example, a+ matches any string that contains at least one a, i.e., a, aa, aaa, or any number of a's.

    ?	    Match 0 or 1 repetition of the preceding regex. For example, a? matches any string that contains zero or one occurrence of a.

    {2}	    Matches exactly 2 copies of the preceding regex. For example, p{2} matches exactly two 'p' characters, but not one.

    {2,4}	Match 2 to 4 repetitions of the preceding regex (write it without a space after the comma). For example, a{2,4} matches 2 to 4 consecutive 'a' characters.

    {3,}	Matches minimum 3 copies of the preceding regex. It will try to match as many repetitions as possible.
            For example, p{3,} matches a minimum of three 'p' characters.
'''

### Regex flags
'''

    re.A	re.ASCII	        Perform ASCII-only matching instead of full Unicode matching.

    re.I	re.IGNORECASE	    Perform case-insensitive matching.

    re.M	re.MULTILINE	    This flag is used with metacharacter ^ (caret) and $ (dollar).
                                When this flag is specified, the metacharacter ^ matches the pattern at beginning of the string and each newline’s beginning (\n).
                                And the metacharacter $ matches pattern at the end of the string and the end of each new line (\n)

    re.S	re.DOTALL	        Make the DOT (.) special character match any character at all, including a newline. Without this flag, DOT(.) will match anything except a newline.

    re.X	re.VERBOSE	        Allow comment in the regex. This flag is useful to make regex more readable by allowing comments in the regex.

    re.L	re.LOCALE	        Make word-character classes, word boundaries, and case-insensitive matching dependent on the current locale. Use only with bytes patterns.
'''

['2', '5']
c:\example\task\new
It matches
No matches
J
8
['8', '0', '0', '0']
All tokens: ['Jessa', 'salary', 'is', '8000$']
Jessa-salary-is-8000$


'\n\n    re.A\tre.ASCII\t        Perform ASCII-only matching instead of full Unicode matching.\n\n    re.I\tre.IGNORECASE\t    Perform case-insensitive matching.\n\n    re.M\tre.MULTILINE\t    This flag is used with metacharacter ^ (caret) and $ (dollar).\n                                When this flag is specified, the metacharacter ^ matches the pattern at beginning of the string and each newline’s beginning (\n).\n                                And the metacharacter $ matches pattern at th   e end of the string and the end of each new line (\n)\n\n    re.S\tre.DOTALL\t        Make the DOT (.) special character match any character at all, including a newline. Without this flag, DOT(.) will match anything except a newline.\n\n    re.X\tre.VERBOSE\t        Allow comment in the regex. This flag is useful to make regex more readable by allowing comments in the regex.\n\n    re.L\tre.LOCALE\t        Perform case-insensitive matching dependent on the current locale. Use only with bytes pa