## 1. Capturing Groups

In [None]:
import re

# Each pair of parentheses captures the text matched by its sub-pattern.
text = "John Smith (john@example.com) - 555-123-4567"

# Group 1 -> "First Last", group 2 -> the address between the literal
# parentheses, group 3 -> the dash-separated phone number.
pattern = r'(\w+ \w+) \(([\w.@]+)\) - ([\d-]+)'
match = re.search(pattern, text)

# re.search() returns None when nothing matches, so report only on success.
if match is not None:
    print(
        f"Full match: {match.group(0)}",  # group 0 is the whole match
        f"Name: {match.group(1)}",
        f"Email: {match.group(2)}",
        f"Phone: {match.group(3)}",
        f"All groups: {match.groups()}",  # tuple holding groups 1..n
        sep="\n",
    )

In [None]:
# The shape of re.findall()'s result depends on the groups in the pattern.
text = "Price: $10.99, Tax: $0.88, Total: $11.87"

# No groups: every list item is the complete matched string.
amount_only = re.compile(r'\$\d+\.\d{2}')
prices = amount_only.findall(text)
print(f"Without groups: {prices}")

# Two groups: every list item is a tuple of the captured parts
# (the label before the colon, the amount after the dollar sign).
label_and_amount = re.compile(r'(\w+): \$(\d+\.\d{2})')
prices = label_and_amount.findall(text)
print(f"With groups: {prices}")

In [None]:
# (?:...) groups a sub-pattern without adding it to the captured results.
text = "http://example.com and https://secure.example.com"

# Both the scheme and the host are captured -> list of (scheme, host) tuples.
scheme_and_host = re.compile(r'(https?)://([\w.]+)')
result1 = scheme_and_host.findall(text)
print(f"Capturing: {result1}")

# The scheme is grouped but not captured -> list of host strings only.
host_only = re.compile(r'(?:https?)://([\w.]+)')
result2 = host_only.findall(text)
print(f"Non-capturing: {result2}")

## 2. Named Groups

In [None]:
# (?P<name>...) lets a group be fetched by name instead of by position.
text = "2024-01-15 10:30:45"

# One named group per component of the timestamp.
pattern = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2}) (?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})'
match = re.search(pattern, text)

if match is not None:
    report = [
        f"Year: {match.group('year')}",
        f"Month: {match.group('month')}",
        f"Day: {match.group('day')}",
        f"Time: {match.group('hour')}:{match.group('minute')}:{match.group('second')}",
        # groupdict() maps every group name to the text it captured.
        f"\nAs dict: {match.groupdict()}",
    ]
    print("\n".join(report))

In [None]:
# Practical example: break "[LEVEL] date time - message" log lines into fields.
logs = [
    "[ERROR] 2024-01-15 10:30:45 - Database connection failed",
    "[INFO] 2024-01-15 10:31:00 - Server started",
    "[WARNING] 2024-01-15 10:32:15 - High memory usage"
]

# level: word inside the brackets; date / time: runs of digits with "-" / ":";
# message: everything after the " - " separator.
pattern = r'\[(?P<level>\w+)\] (?P<date>[\d-]+) (?P<time>[\d:]+) - (?P<message>.+)'

print("Parsed logs:")
for log in logs:
    match = re.search(pattern, log)
    if match is None:
        # Lines that do not follow the expected layout are skipped silently.
        continue
    d = match.groupdict()
    print(f"  [{d['level']}] {d['date']} {d['time']}")
    print(f"    Message: {d['message']}")

## 3. Lookahead and Lookbehind

In [None]:
# Lookahead assertions check what follows the match without consuming it.
text = "apple pie, apple juice, orange juice, apple cider"

# Positive lookahead (?=...): 'apple' only where ' juice' comes next.
result = [m.group() for m in re.finditer(r'apple(?= juice)', text)]
print(f"Positive lookahead: {result}")

# Negative lookahead (?!...): 'apple' only where ' juice' does NOT come next.
result = [m.group() for m in re.finditer(r'apple(?! juice)', text)]
print(f"Negative lookahead: {result}")

In [None]:
# Lookbehind (?<=...) - match only if preceded by
text = "$10 USD, €20 EUR, £30 GBP"

# Positive lookbehind: digits that come directly after a literal "$".
result = re.findall(r'(?<=\$)\d+', text)
print(f"Positive lookbehind (after $): {result}")

# Negative lookbehind (?<!...) - match only if NOT preceded by.
# The class also rejects a preceding digit: with a plain (?<!\$) the engine
# restarts inside "$10" and reports the trailing "0" as a match.
result = re.findall(r'(?<![\$\d])\d+', text)
print(f"Negative lookbehind (not after $): {result}")

In [None]:
# Practical: Password validation
def validate_password(password):
    """Check *password* against a fixed set of strength rules.

    Rules, checked in this order (the first failure is reported):
    length of at least 8, an uppercase ASCII letter, a lowercase ASCII
    letter, a digit, and one character from ``!@#$%^&*``.

    Returns:
        A ``(is_valid, message)`` tuple; the message is ``"Valid"`` on
        success, otherwise a short reason for the first rule that failed.
    """
    if len(password) < 8:
        return False, "Too short"

    # Each rule pairs a lookahead pattern with the message for its failure.
    requirements = (
        (r'(?=.*[A-Z])', "No uppercase"),
        (r'(?=.*[a-z])', "No lowercase"),
        (r'(?=.*\d)', "No digit"),
        (r'(?=.*[!@#$%^&*])', "No special char"),
    )
    for lookahead, failure in requirements:
        if re.search(lookahead, password) is None:
            return False, failure

    return True, "Valid"

# Sample passwords, each one satisfying one more rule than the previous.
passwords = ["abc", "abcdefgh", "Abcdefgh", "Abcdefg1", "Abcdefg1!"]

print("Password validation:")
for pwd in passwords:
    valid, msg = validate_password(pwd)
    # Check mark for an accepted password, ballot cross for a rejected one.
    status = "✓" if valid else "✗"
    print(f"  {status} '{pwd}': {msg}")

## 4. Substitution and Splitting

In [None]:
# re.sub() - find and replace
text = "The quick brown fox jumps over the lazy dog"

# Simple replacement: every occurrence of the literal "fox".
result = re.sub(r'fox', 'cat', text)
print(f"Simple: {result}")

# Replace all words starting with vowels.
# \w* (not \w+) so one-letter words such as "a" or "I" are replaced as well.
result = re.sub(r'\b[aeiouAEIOU]\w*', 'VOWEL', text)
print(f"Vowel words: {result}")

# Limited replacements: count=2 stops after the first two matches.
text2 = "one two one three one four"
result = re.sub(r'one', 'ONE', text2, count=2)
print(f"Limited (2): {result}")

In [None]:
# The replacement string can refer back to what the groups captured.
text = "Smith, John and Doe, Jane"

# "Last, First" -> "First Last": \1 and \2 are the numbered groups.
last_first = re.compile(r'(\w+), (\w+)')
result = last_first.sub(r'\2 \1', text)
print(f"Swapped: {result}")

# Named groups are referenced with \g<name> in the replacement.
text = "2024-01-15"
iso_date = re.compile(r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})')
result = iso_date.sub(r'\g<month>/\g<day>/\g<year>', text)
print(f"Date format: {result}")

In [None]:
# Using function in replacement
def convert_temp(match):
    """Return the Fahrenheit text for one matched Celsius reading.

    Args:
        match: A match object whose group 1 holds the Celsius value as text
            in a form ``float()`` accepts.

    Returns:
        The converted temperature with one decimal place, e.g. ``"77.0°F"``.
    """
    celsius = float(match.group(1))
    fahrenheit = celsius * 9 / 5 + 32
    return f"{fahrenheit:.1f}°F"


# Group 1 is the number in front of "°C": optional minus sign, digits and an
# optional decimal part. (?<!\d) keeps the "-" of a range such as "10-15°C"
# from being read as a minus sign.
celsius_pattern = r'(?<!\d)(-?\d+(?:\.\d+)?)°C'

text = "The temperature is 25°C today, but was 18°C yesterday."
# re.sub() calls convert_temp once per match and inserts its return value.
result = re.sub(celsius_pattern, convert_temp, text)
print(f"Converted: {result}")

In [None]:
# re.split() - split by pattern
text = "apple, banana; cherry : grape"

# Split by multiple delimiters.
# \s* on BOTH sides of the delimiter, so the space before ":" does not end
# up attached to the previous item ('cherry ').
result = re.split(r'\s*[,;:]\s*', text)
print(f"Split: {result}")

# Keep the delimiters: a capturing group makes re.split() return the matched
# delimiter between the pieces; the surrounding whitespace is still dropped.
result = re.split(r'\s*([,;:])\s*', text)
print(f"With delimiters: {result}")

# Limited splits: maxsplit=2 leaves the rest of the string as the last item.
result = re.split(r'\s*[,;:]\s*', text, maxsplit=2)
print(f"Limited (2): {result}")

## 5. Complete Example: Text Processor

In [None]:
class TextProcessor:
    """Regex-based helpers for pulling data out of, or rewriting, free text.

    Every helper is a static method that takes a string and returns either a
    list of matches or a transformed copy of the string; no state is kept.
    """

    @staticmethod
    def extract_emails(text):
        """Return every e-mail-like substring of *text*, in order of appearance.

        The pattern is intentionally simple (``local@domain.tld`` with a
        top-level part of at least two ASCII letters), not a full validator.
        """
        pattern = r'[\w.-]+@[\w.-]+\.[a-zA-Z]{2,}'
        return re.findall(pattern, text)

    @staticmethod
    def extract_urls(text):
        """Return every ``http://`` / ``https://`` URL found in *text*.

        A match covers the host and an optional path made of word characters,
        ``.``, ``/`` and ``-``; query strings and fragments are not included.
        """
        pattern = r'https?://[\w.-]+(?:/[\w./-]*)?'
        return re.findall(pattern, text)

    @staticmethod
    def extract_phone_numbers(text):
        """Return North-American style phone numbers found in *text*.

        Accepts an optional ``1`` / ``+1`` country prefix, an optionally
        parenthesised area code, and ``-``, ``.`` or a single space between
        the parts, e.g. ``(555) 123-4567``, ``1-800-555-0123``,
        ``555.123.4567``.
        """
        pattern = (
            r'(?<!\d)'                  # do not start in the middle of a number
            r'(?:\+?1[-. ]?)?'          # optional country prefix
            r'\(?\d{3}\)?[-. ]?'        # area code; a space allows "(555) 123"
            r'\d{3}[-. ]?\d{4}'         # exchange and line number
            r'(?!\d)'                   # do not stop in the middle of a number
        )
        return re.findall(pattern, text)

    @staticmethod
    def extract_dates(text):
        """Return dates written as YYYY-MM-DD, MM/DD/YYYY or ``Month DD, YYYY``.

        Results are grouped by format in the order listed above, not by their
        position in *text*. The values are matched by shape only; they are not
        checked for being real calendar dates.
        """
        patterns = [
            r'\d{4}-\d{2}-\d{2}',      # YYYY-MM-DD
            r'\d{2}/\d{2}/\d{4}',      # MM/DD/YYYY
            r'\w+ \d{1,2}, \d{4}'      # Month DD, YYYY
        ]
        dates = []
        for pattern in patterns:
            dates.extend(re.findall(pattern, text))
        return dates

    @staticmethod
    def mask_sensitive(text):
        """Return *text* with card and SSN-shaped numbers masked.

        Sixteen-digit numbers (optionally split into groups of four by ``-``
        or a space) become ``****-****-****-NNNN``; 3-2-4 digit numbers become
        ``***-**-NNNN``. In both cases only the last four digits are kept.
        """
        # Card numbers first: only the final group of four is captured.
        text = re.sub(r'\b(?:\d{4}[- ]?){3}(\d{4})\b',
                      r'****-****-****-\1', text)
        # Then SSNs; the masked cards no longer contain a 3-2-4 digit run.
        text = re.sub(r'\b\d{3}[- ]?\d{2}[- ]?(\d{4})\b',
                      r'***-**-\1', text)
        return text

    @staticmethod
    def clean_whitespace(text):
        """Return *text* with runs of spaces collapsed and lines trimmed.

        Runs of spaces become a single space, leading and trailing whitespace
        is removed from every line, and blank lines at the very start and end
        of the text are dropped. Line breaks between lines, including interior
        blank lines, are preserved.
        """
        # Replace multiple spaces with a single one.
        text = re.sub(r' +', ' ', text)
        # Trim each line. [^\S\n] is "whitespace except newline": using \s
        # here would swallow the newlines around a blank line and glue the
        # neighbouring lines together ("a\n\nb" -> "ab").
        text = re.sub(r'^[^\S\n]+|[^\S\n]+$', '', text, flags=re.MULTILINE)
        # Drop leading/trailing blank lines of the text as a whole.
        return text.strip()

    @staticmethod
    def extract_hashtags(text):
        """Return every ``#tag`` (``#`` followed by word characters) in *text*."""
        return re.findall(r'#\w+', text)

    @staticmethod
    def camel_to_snake(name):
        """Convert a camelCase / PascalCase identifier to snake_case.

        Runs of capitals are kept together, so ``HTTPResponseCode`` becomes
        ``http_response_code``.
        """
        # First pass: underscore before each capitalised word ("XMLParser" ->
        # "XML_Parser").
        s1 = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', name)
        # Second pass: underscore between a lowercase letter or digit and the
        # capital that follows it, then lowercase everything.
        return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

# Demo: run every TextProcessor helper over one sample document.
print("📝 TEXT PROCESSOR DEMO")
print("=" * 50)

sample_text = """
Contact us at support@example.com or sales@company.org
Visit https://www.example.com/products or http://shop.example.com
Call us: (555) 123-4567 or 1-800-555-0123
Valid until: 2024-12-31 or January 15, 2025
CC: 4111-1111-1111-1234, SSN: 123-45-6789
#Python #RegEx #Programming
"""

processor = TextProcessor()

print("\n📧 Emails:")
for email in processor.extract_emails(sample_text):
    print(f"  {email}")

print("\n🔗 URLs:")
for url in processor.extract_urls(sample_text):
    print(f"  {url}")

print("\n📞 Phones:")
for phone in processor.extract_phone_numbers(sample_text):
    print(f"  {phone}")

print("\n📅 Dates:")
for date in processor.extract_dates(sample_text):
    print(f"  {date}")

# Keycap "#" emoji, written with escapes because two of its three code points
# (U+FE0F, U+20E3) are invisible combining characters.
print("\n#\ufe0f\u20e3 Hashtags:")
print(f"  {processor.extract_hashtags(sample_text)}")

print("\n🔒 Masked sensitive data:")
masked = processor.mask_sensitive(sample_text)
# Show only the lines that contained sensitive numbers.
for line in masked.strip().split('\n'):
    if 'CC:' in line or 'SSN:' in line:
        print(f"  {line}")

print("\n🔄 CamelCase conversions:")
names = ["getUserById", "HTTPResponseCode", "XMLParser"]
for name in names:
    print(f"  {name} → {processor.camel_to_snake(name)}")

## Summary

### Groups:

| Syntax | Purpose |
|--------|----------|
| `(...)` | Capturing group |
| `(?:...)` | Non-capturing group |
| `(?P<name>...)` | Named group |

### Lookaround:

| Syntax | Meaning |
|--------|----------|
| `(?=...)` | Positive lookahead |
| `(?!...)` | Negative lookahead |
| `(?<=...)` | Positive lookbehind |
| `(?<!...)` | Negative lookbehind |

### Substitution:

| Feature | Syntax |
|---------|----------|
| Group reference | `\1`, `\2`, ... |
| Named group | `\g<name>` |
| Function | `re.sub(pattern, func, text)` |

### Best Practices:
1. Use raw strings (r'pattern')
2. Test patterns incrementally
3. Use named groups for clarity
4. Compile frequently used patterns
5. Don't overuse regex - sometimes simple string methods are better

### 🎉 Congratulations!
You've completed the Python Learning Course!