In [1]:
# Regular Expressions (regex) are patterns used to match character
# combinations in strings. Think of them as advanced "Find" functionality on steroids


import re # Python's regex module


# Basic example: finding out whether a string contains a pattern

text = "Hello, my email is nikhil@example.com"
# "@", then word characters, a literal dot, then more word characters
pattern = r"@\w+\.\w+"


# re.search scans the whole string and returns None when nothing matches
match = re.search(pattern, text)
if match is not None:
  print("Found:", match[0])    # Found: @example.com



Found: @example.com


In [2]:
# LITERAL CHARACTERS

import re

# Literal matching - the pattern is looked for as exact text

text = "The cat sat on the mat."
# The 'r' prefix makes this a raw string, so backslashes reach the regex
# engine untouched (recommended for every regex pattern)
pattern = r"cat"


match = re.search(pattern, text)
if match:
  print(match.group())
else:
  print("Not found")

cat


In [4]:
# SPECIAL CHARACTERS (METACHARACTERS)

# . (dot) - Matches ANY single character except newline

text = "cat bat hat rat"
pattern = r".at"    # Any one character followed by "at"
matches = re.findall(pattern, text)
print(matches)      # ['cat', 'bat', 'hat', 'rat']


# ^ - Start of string
# $ - End of string

text = "Hello World"

# Anchored to the start: matches because the string begins with "Hello"
start_match = re.search(r"^Hello", text)
# Anchored to the end: matches because the string finishes with "World"
end_match = re.search(r"World$", text)
# "World" is in the string but not at the start, so the anchored search fails
misplaced_match = re.search(r"^World", text)

print(start_match)
print(end_match)
print(misplaced_match)

['cat', 'bat', 'hat', 'rat']
<re.Match object; span=(0, 5), match='Hello'>
<re.Match object; span=(6, 11), match='World'>
None


In [7]:
# CHARACTER CLASSES


# [] - Character class (matches any ONE character listed inside)
text = "The rain in Spain falls mainly on the plain"


# Find all lowercase vowels, one list entry per occurrence
vowels = re.findall(r"[aeiou]", text)
print("Vowels:", vowels)


# [a-z] - Range of characters
# [0-9] - Range of digits

text = "Room 101, Building 2A, Floor 3"
# No quantifier, so every digit is returned on its own ("101" -> '1', '0', '1')
numbers = re.findall(r"[0-9]", text)
print("Numbers:", numbers)

# [^...] - Negation (matches any ONE character NOT listed in the brackets)
text = "abc123def"
# Raw string, like every other pattern here: an f-string would treat any
# "{...}" in the pattern as a placeholder instead of handing it to the regex engine
non_digits = re.findall(r"[^0-9]", text)
print("Non-digits:", non_digits)

Vowels: ['e', 'a', 'i', 'i', 'a', 'i', 'a', 'a', 'i', 'o', 'e', 'a', 'i']
Numbers: ['1', '0', '1', '2', '3']
Non-digits: ['a', 'b', 'c', 'd', 'e', 'f']


In [12]:
# QUANTIFIERS - How many times?

# *     - Zero or more times
# +     - One or more times
# ?     - Zero or one time
# {n}   - Exactly n times
# {n,}  - n or more times
# {n,m} - Between n and m times
# Write the brace forms without spaces: "{n, }" is not parsed as a quantifier.



text = "ct cat caat caaat caaaat"


# Each printed label is the pattern itself, so label and pattern cannot drift apart.
#   ca*t   -> 'ct', 'cat', 'caat', 'caaat', 'caaaat'
#   ca+t   -> 'cat', 'caat', 'caaat', 'caaaat'
#   ca?t   -> 'ct', 'cat'
#   ca{2}t -> 'caat'
quantifier_matches = {
  candidate: re.findall(candidate, text)
  for candidate in (r"ca*t", r"ca+t", r"ca?t", r"ca{2}t")
}
for quantifier_pattern, found in quantifier_matches.items():
  print(f"{quantifier_pattern}:", found)

c*t: ['ct', 'cat', 'caat', 'caaat', 'caaaat']
ca+t: ['cat', 'caat', 'caaat', 'caaaat']
ca?t: ['ct', 'cat']
ca{2}t: ['caat']


In [20]:
# Shorthand character classes

text = "Hello World 123! Email: test@example.com"


# (printed label, shorthand pattern) pairs, reported in this order
shorthand_demos = (
  ("Digits:", r"\d"),            # \d - digits
  ("Non-digits:", r"\D"),        # \D - anything that is not a digit
  ("Words chars:", r"\w"),       # \w - word characters (letters, digits, underscore)
  ("Non-word chars:", r"\W"),    # \W - anything that is not a word character
  ("Whitespace:", r"\s"),        # \s - whitespace (spaces, tabs, newlines)
  ("Non-whitespace:", r"\S"),    # \S - anything that is not whitespace
)

# Every shorthand matches a single character, so findall returns one entry per character
for label, shorthand in shorthand_demos:
  print(label, re.findall(shorthand, text))





Digits: ['1', '2', '3']
Non-digits: ['H', 'e', 'l', 'l', 'o', ' ', 'W', 'o', 'r', 'l', 'd', ' ', '!', ' ', 'E', 'm', 'a', 'i', 'l', ':', ' ', 't', 'e', 's', 't', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm']
Words chars: ['H', 'e', 'l', 'l', 'o', 'W', 'o', 'r', 'l', 'd', '1', '2', '3', 'E', 'm', 'a', 'i', 'l', 't', 'e', 's', 't', 'e', 'x', 'a', 'm', 'p', 'l', 'e', 'c', 'o', 'm']
Non-word chars: [' ', ' ', '!', ' ', ':', ' ', '@', '.']
Whitespace: [' ', ' ', ' ', ' ']
Non-whitespace: ['H', 'e', 'l', 'l', 'o', 'W', 'o', 'r', 'l', 'd', '1', '2', '3', '!', 'E', 'm', 'a', 'i', 'l', ':', 't', 'e', 's', 't', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm']


In [24]:
# GROUPING AND CAPTURING

# () - Capturing groups: each parenthesised part of the pattern is captured separately

text = "Jhon: 30, Jane: 25, Bob: 35"

# One pattern serves both demos: group 1 is the name, group 2 is the age
pattern = r"(\w+): (\d+)"

# Extract name-age pairs: with groups present, findall returns one tuple per match
matches = re.findall(pattern, text)
print("Name-Age pairs:", matches)

# Access individual groups: search stops at the first pair;
# index 0 is the whole match, 1 and 2 are the captured groups
match = re.search(pattern, text)
if match is not None:
  print("Full match:", match[0])
  print("Name:", match[1])
  print("Age:", match[2])


Name-Age pairs: [('Jhon', '30'), ('Jane', '25'), ('Bob', '35')]
Full match: Jhon: 30
Name: Jhon
Age: 30


In [35]:
# COMMON REGEX METHODS


text = "The quick brown fox jumps over 124 lazy dogs. Email: user@domain.com 567 nikhiladhikari1@gmail.com"


# re.search() - first match anywhere in the string, or None when there is none
match = re.search(r"\d+", text)
if match is None:
  print("First number:", None)
else:
  print("First number:", match.group())

# re.findall() - every non-overlapping match, returned as a list of strings
emails = re.findall(r"\w+@\w+\.\w+", text)
print("Emails:", emails)

# re.finditer() - yields Match objects one at a time, so positions are available too.
# The loop rebinds `match`; afterwards it holds the last 4-letter word found.
for match in re.finditer(r"\b\w{4}\b", text):
  print(f"4-letter word: {match.group()} at position {match.start()}")


# re.sub() - returns a copy of the string with every match replaced
new_text = re.sub(pattern=r"fox", repl="cat", string=text)
print("After replacement:", new_text)

# re.split() - cuts the string wherever the pattern matches
words = re.split(pattern=r"\W+", string=text)
print("Words:", words)



First number: 124
Emails: ['user@domain.com', 'nikhiladhikari1@gmail.com']
4-letter word: over at position 26
4-letter word: lazy at position 35
4-letter word: dogs at position 40
4-letter word: user at position 53
After replacement: The quick brown cat jumps over 124 lazy dogs. Email: user@domain.com 567 nikhiladhikari1@gmail.com
Words: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', '124', 'lazy', 'dogs', 'Email', 'user', 'domain', 'com', '567', 'nikhiladhikari1', 'gmail', 'com']
