<h1 align="center" style="color: orange">Regular Expressions in Python</h1>

In [1]:
import re

In [4]:
text_to_search = '''

abcdefghijklmnopqurtuvwxyz

ABCDEFGHIJKLMNOPQRSTUVWXYZ

1234567890
$2347.43

Ha HaHa Ha Hii Haa

MetaCharacters :
. ^ $ * + ? { } [ ] | ( )

Google.com
Amazon.com
Facebook.com

$97.43

abcd
This is abcd

123-456-789
123.456.789
123*555*1234
123/456/789

877-500-1234
980-555-1234
930-234-3455
9830234345

Total due
$670.54

Mr. Peter Griffin
Mr Stewie Griffin
Ms Glen Quagmire
Mrs. Griffin
Ms. Meg Griffin
Mrs. U
Mr. Joe Swanson
'''

sentence = "Finally some more things to test the RegEx using Python re module"

In [6]:
# Extract phone numbers
phone_numbers_v1 = re.search(r'\d{3}-\d{3}-\d{4}', text_to_search)

if phone_numbers_v1:
    print("Phone numbers found : ", phone_numbers_v1.group(0))

Phone numbers found :  877-500-1234


In [11]:
pattern = re.compile(r'abcd')

hits = pattern.finditer(text_to_search)
for hit in hits:
    print(hit)

<re.Match object; span=(2, 6), match='abcd'>
<re.Match object; span=(170, 174), match='abcd'>
<re.Match object; span=(183, 187), match='abcd'>


In [12]:
# we need to escape meta characters as they hold different meanings in regex
p2 = re.compile(r'\.')
m1 = p2.finditer(text_to_search)

for x in m1:
    print(x)

# typical use cases are URLs
p3 = re.compile(r'Google\.com')
m2 = p3.finditer(text_to_search)

for x in m2:
    print(x)

<re.Match object; span=(107, 108), match='.'>
<re.Match object; span=(140, 141), match='.'>
<re.Match object; span=(151, 152), match='.'>
<re.Match object; span=(164, 165), match='.'>
<re.Match object; span=(204, 205), match='.'>
<re.Match object; span=(208, 209), match='.'>
<re.Match object; span=(292, 293), match='.'>
<re.Match object; span=(346, 347), match='.'>
<re.Match object; span=(358, 359), match='.'>
<re.Match object; span=(375, 376), match='.'>
<re.Match object; span=(381, 382), match='.'>
<re.Match object; span=(134, 144), match='Google.com'>


In [29]:
# matching phone numbers
ph_num = re.compile(r'[89][78]\d[-.]')

valid_numbers = ph_num.finditer(text_to_search)

for num in valid_numbers:
    print(num.group(), ':', num.span())

877- : (239, 243)
980- : (252, 256)


In [15]:
# finding all the occurrences without upper case letters
p5 = re.compile(r'[^a-zA-Z\d]')

no_upper = p5.finditer(text_to_search)
no_upper_count = 0
for lower_case in no_upper:
    no_upper_count += 1

print(no_upper_count)

104


In [27]:
string2 = """
pun
bun
one
won
"""

# Anything except p then 'un'
p6 = re.compile(r'[^p]un')

result= p6.finditer(string2)

for x in result:
    print(x.group())

bun


In [25]:
# using quantifiers
ph_num = re.compile(r'\d{3}.\d{3}.\d{4}')

valid_numbers = ph_num.finditer(text_to_search)

for num in valid_numbers:
    print(num.group())

123*555*1234
877-500-1234
980-555-1234
930-234-3455


In [24]:
# finding all with the `MR.` prefix
pattern = re.compile(r'(Mr|Mrs|Ms)\.?\s[A-Z]\w*')

result = pattern.finditer(text_to_search)

# for r in result:
#     print(r)

for val in result:
    print(val.group(), ':', val.span())

Mr. Peter : (290, 299)
Mr Stewie : (308, 317)
Ms Glen : (326, 333)
Mrs. Griffin : (343, 355)
Ms. Meg : (356, 363)
Mrs. U : (372, 378)
Mr. Joe : (379, 386)


In [30]:
# Simple e-mail validator
def validate_email(email: str) -> bool:

    pattern = re.compile(r"""
                                ^                # Start of the string
                                [a-zA-Z0-9_.+-]+   # One or more of any alphanumeric character, underscore, dot, plus, or hyphen
                                @                # The "@" symbol
                                [a-zA-Z0-9-]+      # One or more of any alphanumeric character or hyphen
                                \.               # The literal dot character (escaped with backslash)
                                [a-zA-Z0-9-.]+     # One or more of any alphanumeric character, dot, or hyphen
                                $                # End of the string
                         """, re.VERBOSE)
    
    validator = pattern.search(email)

    return validator


result = validate_email('asdf@one.two.in')

if result:
    print("success")
else:
    print("Enter a valid email..")

success


In [32]:
# Masking Email address and Phone number
def mask_email(email: str) -> str:
    if validate_email(email):
        name, domain = email.split('@')
        return f"Your Email address is {name[0]}#####{name[-1]}@{domain}"
    

print(mask_email("abcd@mit.edu.in"))

Your Email address is a#####d@mit.edu.in


In [2]:
def mask_phone(num: str) -> str:
    """
    Masks part of the phone number, preserving the country code and last three digits.

    Args:
        num (str): The phone number as a string.

    Returns:
        str: The masked phone number with the format "<country_code>#####<last_3_digits>",
             or 'Invalid phone number' if the input doesn't match the pattern.
    """
    
    # Regex pattern to match country code and phone number parts
    pattern = re.compile(r"""
        ^                    # Start of string
        (\+?\d{1,3})         # Captures country code (1-3 digits with optional '+')
        [\s.-]?              # Optional separator (space, hyphen, or dot)
        \(?\d{1,4}\)?        # Area code (1-4 digits with optional parentheses)
        [\s.-]?              # Optional separator
        \d{1,4}              # First part of the phone number (1-4 digits)
        [\s.-]?              # Optional separator
        (\d{1,9})            # Last part of the phone number (1-9 digits)
        $                    # End of string
    """, re.VERBOSE)
    
    # Try to match the pattern
    matching = pattern.match(num)
    
    if matching:
        country_code, last_digits = matching.group(1), matching.group(2)
        # Mask middle part with '#####'
        return f"{country_code}##### {last_digits}"
    
    return "Invalid phone number"


print(mask_phone("+91 123-456-7890"))  # Output: +91#####7890
print(mask_phone("+1-800-555-1234"))   # Output: +1#####1234
print(mask_phone("1234567890"))        # Output: Invalid phone number
print(mask_phone("+44 (1234) 567 890"))  # Output: +44#####890


+91##### 7890
+1##### 1234
123##### 0
+44##### 890


In [22]:
# extracting price in dollar

all_price = r'\$[0-9]*\.[0-9]*'
total_due = r'Total due\n\$[0-9]*\.[0-9]*'

pattern = re.compile(all_price)
amounts = pattern.findall(text_to_search)
if amounts:
   print(amounts)


pattern = re.compile(total_due)
total_due = pattern.findall(text_to_search)
if total_due:
   print(total_due[0].split('\n')[1])

['$2347.43', '$97.43', '$670.54']
$670.54
