In [1]:
import re

# FUNCTIONS

In [11]:
# re.search: scan the whole string and return the first place the pattern occurs

text = "This is a sample text."
pattern = "sample"

match = re.search(pattern, text)

# A Match object is truthy; None (no match) is falsy.
print("Found a match!" if match else "No match found.")

Found a match!


In [8]:
# re.match: succeeds only when the pattern matches at the very start of the string

text = "This is a sample text."
pattern = "This"

re.match(pattern, string=text)

<re.Match object; span=(0, 4), match='This'>

In [9]:
# re.fullmatch: the pattern has to cover the entire string, start to end

date_string = "2023-06-09"
# Raw string literal: "\d" inside a normal string is an invalid escape
# sequence and triggers a SyntaxWarning on recent Python versions.
pattern = r"\d{4}-\d{2}-\d{2}"

re.fullmatch(pattern, date_string)

<re.Match object; span=(0, 10), match='2023-06-09'>

In [12]:
# re.findall: return every non-overlapping match as a list of strings

text = "Price of book: $19.99, movie: $12.50"
# Raw string literal so the backslash reaches the regex engine untouched
# (a plain "\d" is an invalid string escape).
pattern = r"\d+"

re.findall(pattern, text)

['19', '99', '12', '50']

In [14]:
# re.sub / re.subn: replace every match of the pattern with a replacement string

text = "Price of book: $19.99, movie: $12.50"
# Exactly four word characters delimited by word boundaries.
pattern = r"\b\w{4}\b"

re.sub(pattern, repl='****', string=text)

'Price of ****: $19.99, movie: $12.50'

In [15]:
# re.subn performs the same replacement as re.sub but returns a
# (new_string, number_of_substitutions) tuple.
re.subn(pattern, '****', text)

('Price of ****: $19.99, movie: $12.50', 1)

In [16]:
# re.split: cut the string at every match of the pattern

text = "Price of book: $19.99, movie: $12.50"
# Raw string literal: "\s" in a plain string is an invalid escape sequence.
pattern = r"\s+"

re.split(pattern, text)

['Price', 'of', 'book:', '$19.99,', 'movie:', '$12.50']

In [17]:
# Limit the number of splits; the remainder of the string stays in the last item.
# maxsplit is passed by keyword because passing it positionally is deprecated
# in recent Python versions.
re.split(pattern, text, maxsplit=3)

['Price', 'of', 'book:', '$19.99, movie: $12.50']

In [20]:
# re.escape: treat regex metacharacters in a string as literal text

# The text contains a literal backslash followed by "b".
text = r"Price of \book: $19.99, movie: $12.50"
# Unescaped, r"\b" is the zero-width word-boundary assertion.
pattern = r"\b"

re.search(pattern, string=text)

<re.Match object; span=(0, 0), match=''>

In [21]:
# After re.escape the pattern matches the literal two characters backslash + "b"
# instead of acting as a word-boundary assertion.
re.search(re.escape(pattern), text)

<re.Match object; span=(9, 11), match='\\b'>

In [23]:
# re.compile: build a reusable pattern object once, then call its methods

text = "Price of book: $19.99, movie: $12.50"
# Raw string literal: "\d" in a plain string is an invalid escape sequence.
pattern = r"\d+"

regex = re.compile(pattern)
regex.findall(text)

['19', '99', '12', '50']

# MATCH OBJECT

In [29]:
# Group 1 captures the area code, group 2 the remaining seven digits.
# Raw string literal: "\d" in a plain string is an invalid escape sequence.
pattern = r"(\d{3})-(\d{3}-\d{4})"
text = "My phone number is 222-333-4444"

match = re.search(pattern, text)

In [32]:
# Show what a Match object exposes: the full match, each capture group,
# and the position of the match inside the searched text.
if match:
    print(
        f'Matched phone number: {match.group(0)}',
        f'Area code: {match.group(1)}',
        f'Phone number: {match.group(2)}',
        f'Start position: {match.start()}',
        f'End position: {match.end()}',
        f'Span: {match.span()}',
        sep='\n',
    )

Matched phone number: 222-333-4444
Area code: 222
Phone number: 333-4444
Start position: 19
End position: 31
Span: (19, 31)


# FLAGS

In [34]:
# re.IGNORECASE: match letters regardless of case

pattern = 'hello'
text = 'Hello world!'

# First without any flag (case-sensitive), then with IGNORECASE.
for flags in (0, re.IGNORECASE):
    print(re.search(pattern, text, flags))

None
<re.Match object; span=(0, 5), match='Hello'>


In [35]:
# re.DOTALL: let "." match newline characters as well

pattern = 'hello.*world'
text = 'hello\nworld!'

# First without any flag ("." stops at the newline), then with DOTALL.
for flags in (0, re.DOTALL):
    print(re.search(pattern, text, flags))

None
<re.Match object; span=(0, 11), match='hello\nworld'>


In [37]:
# Print the matched text itself so the embedded newline is rendered as a line break.
print(re.search(pattern, text, re.DOTALL).group())

hello
world


In [40]:
# re.MULTILINE: "^" and "$" also match at the start/end of every line

# Raw string literal: "\d" in a plain string is an invalid escape sequence.
pattern = r'^\d+'
text = 'Line 1\nLine 2\nLine 3'

# No line of the text begins with a digit, so the result is an empty list.
re.findall(pattern, text, re.MULTILINE)

[]

In [41]:
# With MULTILINE, "$" matches at the end of each line, so the trailing digits
# of every line are found.
# Raw string literal: "\d" in a plain string is an invalid escape sequence.
pattern = r'\d+$'
text = 'Line 1\nLine 2\nLine 3'

re.findall(pattern, text, re.MULTILINE)

['1', '2', '3']

# SEARCH AND MATCHING PATTERNS IN FILES

In [4]:
# Report the first line of the file that contains the word "sample"

with open("sample1.txt", "r") as f:
    for line in f:
        match = re.search("sample", line)
        if not match:
            continue  # keep scanning until a line matches
        print("Found a match in line: ", line)
        break  # only the first matching line is reported

Found a match in line:  This is a sample.



In [5]:
# Extract file names with one of the listed extensions from a text file

file_extensions = ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx']

with open("sample2.txt", "r") as f:
    text = f.read()
    # <word characters>.<one of the extensions>, delimited by word boundaries
    extension_alternation = '|'.join(file_extensions)
    pattern = r'\b\w+\.(?:' + extension_alternation + r')\b'
    matches = re.compile(pattern).findall(text)

print(matches)

['f1.doc', 'f2.docx', 'f3.pdf']


In [4]:
# Searching and matching patterns in files

# Patterns for the kinds of data looked up in the file below.
phone_pattern = r'\d{3}-\d{3}-\d{4}'  # e.g. 123-456-7890
# The local part uses A-Za-z (the range A-z would also admit [ \ ] ^ _ and `),
# and the top-level domain is letters only (a "|" inside a character class is a
# literal pipe, not alternation).
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
zip_pattern = r'\b\d{5}(?:-\d{4})?\b'  # five digits, optional -dddd suffix
order_pattern = r'\b[A-Z]{2}-\d{4}-\d{4}\b'  # e.g. AB-1234-5678
hex_pattern = r'#[A-Fa-f0-9]{6}'  # e.g. #FF5733

# Read the whole file once, then look for the first occurrence of each pattern.
with open('sample3.txt', 'r') as f:
    text = f.read()

phone_match = re.search(phone_pattern, text)
email_match = re.search(email_pattern, text)
zip_match = re.search(zip_pattern, text)
order_match = re.search(order_pattern, text)
hex_match = re.search(hex_pattern, text)

# Report only the kinds of data that were actually found, in a fixed order.
for label, found in (
    ('Phone number', phone_match),
    ('Email address', email_match),
    ('Zip code', zip_match),
    ('Order number', order_match),
    ('Hex color code', hex_match),
):
    if found:
        print(f'{label} found: {found.group()}')

Phone number found: 123-456-7890
Email address found: john.doe@example.com
Zip code found: 54321
Order number found: AB-1234-5678
Hex color code found: #FF5733


In [6]:
# Collect every product code that appears in a single file

# Two accepted shapes: LLdd-LLdd (uppercase letters + digits) or ddd-ddd.
pattern = r'\b[A-Z]{2}\d{2}-[A-Z]{2}\d{2}\b|\b\d{3}-\d{3}\b'

with open("sample4.txt", "r") as f:
    text = f.read()

matches = re.compile(pattern).findall(text)

print("All product codes:", matches)

All product codes: ['AB12-CD34', '123-456', 'EF56-GH78']


In [15]:
# Find custom product codes in multiple files

import os

def extract_product_codes(file_path, output_file):
    """Copy every product code found in one file into another file.

    A product code is either two uppercase letters and two digits, a hyphen,
    then two uppercase letters and two digits (``AB12-CD34``), or three
    digits, a hyphen and three digits (``123-456``).

    Args:
        file_path: Path of the text file to scan.
        output_file: Path of the file to write; one code per line, in the
            order found. An existing file is overwritten.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    pattern = r'\b[A-Z]{2}\d{2}-[A-Z]{2}\d{2}\b|\b\d{3}-\d{3}\b'

    with open(file_path, 'r') as f:
        matches = re.findall(pattern, f.read())

    # Create the destination directory on demand so a missing folder does not
    # abort the run. A bare file name has no directory part to create.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w') as output_f:
        output_f.writelines(f'{match}\n' for match in matches)

# (input file name, output file name) pairs; inputs are read from the "input"
# folder and results are written to the "output" folder.
file_pairs = [('file1.txt', 'code1.txt'), ('file2.txt', 'code2.txt')]

for input_file, output_file in file_pairs:
    extract_product_codes(
        os.path.join('input', input_file),
        os.path.join('output', output_file),
    )

# VALIDATING USER INPUT AND DATA

In [20]:
# Check if a username meets the requirement

def validate_username(username):
    """Tell whether a username has the accepted shape.

    The whole string must consist of one or more ASCII letters followed by
    any number of word characters (``\\w``).

    Returns:
        True when the username is acceptable, False otherwise.
    """
    return re.fullmatch(r"^[a-zA-Z]+[\w]*$", username) is not None

user_input = input("Enter a username: ")

# Print the verdict for the name that was typed in.
print("Valid username!" if validate_username(user_input) else "Invalid username.")

Valid username!


In [22]:
# Checking numeric data

def validate_strictly_numeric(input_string):
    """Tell whether the string consists solely of digits.

    Negative numbers, decimals and any other notation are rejected.

    Args:
        input_string: The text to check.

    Returns:
        True when the whole string is one or more digits, False otherwise.
    """
    # fullmatch instead of search with "$": "$" also matches just before a
    # trailing newline, which would let "123\n" pass as numeric.
    return re.fullmatch(r"\d+", input_string) is not None

user_input = input("Enter a number: ")

# Print the verdict of the digits-only check.
print("Valid number!" if validate_strictly_numeric(user_input) else "Invalid number.")

Invalid number.


In [23]:
def validate_numeric(input_string):
    """Tell whether the string is an integer or a decimal number.

    Accepted shape: an optional leading minus sign, one or more digits, and
    optionally a decimal point followed by one or more digits.

    Args:
        input_string: The text to check.

    Returns:
        True when the whole string has that shape, False otherwise.
    """
    # fullmatch instead of search with "$": "$" also matches just before a
    # trailing newline, which would let "42\n" pass as numeric.
    return re.fullmatch(r"-?\d+(\.\d+)?", input_string) is not None

user_input = input("Enter a number: ")

# Print the verdict of the integer/decimal check.
print("Valid number!" if validate_numeric(user_input) else "Invalid number.")

Valid number!


In [3]:
# Validating email addresses

# Compiled once because it is applied to every line of the list.
email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')

with open('email_list.txt', 'r') as input_file:
    email_addresses = input_file.readlines()

# Strip surrounding whitespace (including the line break) before checking,
# and keep only the entries the pattern rejects.
stripped_addresses = (line.strip() for line in email_addresses)
invalid_emails = [
    address for address in stripped_addresses
    if email_pattern.match(address) is None
]

# Write the rejected entries out, one per line.
with open('invalid_emails.txt', 'w') as output_file:
    for invalid_email in invalid_emails:
        output_file.write(invalid_email + "\n")

In [17]:
# Validate input and prevent code injection

def sanitize_input(input_string):
    """Return the string with every semicolon, single quote and double quote removed."""
    return re.sub(r"[;'\"]", "", input_string)

def validate_string_no_code_injection(input_string):
    """Tell whether the string is free of semicolons, single quotes and double quotes.

    Returns:
        True when none of those characters occur, False otherwise.
    """
    return re.search(r"^[^;'\"]*$", input_string) is not None

user_input = input("Enter a string to check for code injection: ")

# Validate the raw input. The sanitized copy has had exactly the rejected
# characters removed, so checking it would always succeed and the rejection
# branch below could never run.
if validate_string_no_code_injection(user_input):
    print("Valid input!")
else:
    print("Invalid input. Detected potentially harmful characters.")

# Cleaned copy of the input with the rejected characters stripped out.
sanitized_input = sanitize_input(user_input)

Valid input!
