In [1]:
import re

# FUNCTIONS

In [11]:
# search

text = "This is a sample text."
pattern = "sample"

match = re.search(pattern, text)

if match:
    print("Found a match!")
else:
    print("No match found.")

Found a match!


In [8]:
# match

text = "This is a sample text."
pattern = "This"

re.match(pattern, text)

<re.Match object; span=(0, 4), match='This'>

In [9]:
# fullmatch

date_string = "2023-06-09"
pattern = "\d{4}-\d{2}-\d{2}"

re.fullmatch(pattern, date_string)

<re.Match object; span=(0, 10), match='2023-06-09'>

In [12]:
# findall

text = "Price of book: $19.99, movie: $12.50"
pattern = "\d+"

re.findall(pattern, text)

['19', '99', '12', '50']

In [14]:
# sub/subn

text = "Price of book: $19.99, movie: $12.50"
pattern = r"\b\w{4}\b"

re.sub(pattern, '****', text)

'Price of ****: $19.99, movie: $12.50'

In [15]:
re.subn(pattern, '****', text)

('Price of ****: $19.99, movie: $12.50', 1)

In [16]:
# split

text = "Price of book: $19.99, movie: $12.50"
pattern = "\s+"

re.split(pattern, text)

['Price', 'of', 'book:', '$19.99,', 'movie:', '$12.50']

In [17]:
re.split(pattern, text, 3)

['Price', 'of', 'book:', '$19.99, movie: $12.50']

In [20]:
# escape

text = r"Price of \book: $19.99, movie: $12.50"
pattern = r"\b"

re.search(pattern, text)

<re.Match object; span=(0, 0), match=''>

In [21]:
re.search(re.escape(pattern), text)

<re.Match object; span=(9, 11), match='\\b'>

In [23]:
# compile

text = "Price of book: $19.99, movie: $12.50"
pattern = "\d+"

regex = re.compile(pattern)
regex.findall(text)

['19', '99', '12', '50']

# MATCH OBJECT

In [29]:
pattern = "(\d{3})-(\d{3}-\d{4})"
text = "My phone number is 222-333-4444"

match = re.search(pattern, text)

In [32]:
if match:
    print(f'Matched phone number: {match.group()}')
    print(f'Area code: {match.group(1)}')
    print(f'Phone number: {match.group(2)}')
    print(f'Start position: {match.start()}')
    print(f'End position: {match.end()}')
    print(f'Span: {match.span()}')

Matched phone number: 222-333-4444
Area code: 222
Phone number: 333-4444
Start position: 19
End position: 31
Span: (19, 31)


# FLAGS

In [34]:
# re.IGNORECASE

pattern = 'hello'
text = 'Hello world!'

print(re.search(pattern, text))
print(re.search(pattern, text, re.IGNORECASE))

None
<re.Match object; span=(0, 5), match='Hello'>


In [35]:
# re.DOTALL

pattern = 'hello.*world'
text = 'hello\nworld!'

print(re.search(pattern, text))
print(re.search(pattern, text, re.DOTALL))

None
<re.Match object; span=(0, 11), match='hello\nworld'>


In [37]:
print(re.search(pattern, text, re.DOTALL).group())

hello
world


In [40]:
# re.MULTILINE

pattern = '^\d+'
text = 'Line 1\nLine 2\nLine 3'

re.findall(pattern, text, re.MULTILINE)

[]

In [41]:
pattern = '\d+$'
text = 'Line 1\nLine 2\nLine 3'

re.findall(pattern, text, re.MULTILINE)

['1', '2', '3']

# SEARCH AND MATCHING PATTERNS IN FILES

In [4]:
# Match Word

with open("sample1.txt", "r") as f:
    for line in f:
        match = re.search("sample", line)
        if match:
            print("Found a match in line: ", line)
            break

Found a match in line:  This is a sample.



In [5]:
# Search a file for file names

file_extensions = ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx']

with open("sample2.txt", "r") as f:
    text = f.read()
    pattern = r'\b\w+\.(?:' + '|'.join(file_extensions) + r')\b'
    matches = re.findall(pattern, text)

print(matches)

['f1.doc', 'f2.docx', 'f3.pdf']


In [4]:
# Searching and matching patterns in files

phone_pattern = r'\d{3}-\d{3}-\d{4}'
email_pattern = r'\b[A-za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
zip_pattern = r'\b\d{5}(?:-\d{4})?\b'
order_pattern = r'\b[A-Z]{2}-\d{4}-\d{4}\b'
hex_pattern = r'#[A-Fa-f0-9]{6}'

with open('sample3.txt', 'r') as f:
    text = f.read()
    phone_match = re.search(phone_pattern, text)
    email_match = re.search(email_pattern, text)
    zip_match = re.search(zip_pattern, text)
    order_match = re.search(order_pattern, text)
    hex_match = re.search(hex_pattern, text)

if phone_match:
    print(f'Phone number found: {phone_match.group()}')
if email_match:
    print(f'Email address found: {email_match.group()}')
if zip_match:
    print(f'Zip code found: {zip_match.group()}')
if order_match:
    print(f'Order number found: {order_match.group()}')
if hex_match:
    print(f'Hex color code found: {hex_match.group()}')

Phone number found: 123-456-7890
Email address found: john.doe@example.com
Zip code found: 54321
Order number found: AB-1234-5678
Hex color code found: #FF5733


In [6]:
# Find custom product codes in a file

with open("sample4.txt", "r") as f:
    text = f.read()

pattern = r'\b[A-Z]{2}\d{2}-[A-Z]{2}\d{2}\b|\b\d{3}-\d{3}\b'
matches = re.findall(pattern, text)

print ("All product codes:", matches)

All product codes: ['AB12-CD34', '123-456', 'EF56-GH78']


In [15]:
# Find custom product codes in multiple files

import os

def extract_product_codes(file_path, output_file):
    with open(file_path, 'r') as f:
        text = f.read()
        pattern = r'\b[A-Z]{2}\d{2}-[A-Z]{2}\d{2}\b|\b\d{3}-\d{3}\b'
        matches = re.findall(pattern, text)

    with open(output_file, 'w') as output_f:
        for match in matches:
            output_f.write(f'{match}\n')

file_pairs = [('file1.txt', 'code1.txt'), ('file2.txt', 'code2.txt')]

for input_file, output_file in file_pairs:
    input_path = os.path.join('input', input_file)
    output_path = os.path.join('output', output_file)
    extract_product_codes(input_path, output_path)

# VALIDATING USER INPUT AND DATA

In [20]:
# Check if a username meets the requirement

def validate_username (username):
    pattern = r"^[a-zA-Z]+[\w]*$"

    match = re.fullmatch(pattern, username)
    return match is not None

user_input = input("Enter a username: ")

if validate_username(user_input):
    print("Valid username!")
else:
    print("Invalid username.")

Valid username!


In [22]:
# Checking numeric data

def validate_strictly_numeric(input_string):
    pattern = r"^\d+$" # không nhận số âm, số thập phân,...
    match = re.search(pattern, input_string)
    return match is not None

user_input = input("Enter a number: ")

if validate_strictly_numeric(user_input):
    print("Valid number!")
else:
    print("Invalid number.")

Invalid number.


In [23]:
def validate_numeric(input_string):
    pattern = r"^-?\d+(\.\d+)?$" # không nhận số âm, số thập phân,...
    match = re.search(pattern, input_string)
    return match is not None

user_input = input("Enter a number: ")

if validate_numeric(user_input):
    print("Valid number!")
else:
    print("Invalid number.")

Valid number!


In [3]:
# Validating email addresses

email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')

with open('email_list.txt', 'r') as input_file:
    email_addresses = input_file.readlines()

invalid_emails = [email.strip() for email in email_addresses if not email_pattern.match(email.strip())]

with open('invalid_emails.txt', 'w') as output_file:
    for invalid_email in invalid_emails:
        output_file.write(f"{invalid_email}\n")

In [17]:
# Validate input and prevent code in injection

def sanitize_input(input_string):
    sanitized_string = re.sub(r"[;'\"]", "", input_string)
    return sanitized_string

def validate_string_no_code_injection(input_string):
    pattern = r"^[^;'\"]*$"
    match = re.search(pattern, input_string)
    return match is not None

user_input = input("Enter a string to check for code injection: ")
sanitized_input = sanitize_input(user_input)

if validate_string_no_code_injection(sanitized_input):
    print("Valid input!")
else:
    print("Invalid input. Detected potentially harmful characters.")

Valid input!


In [5]:
# Get all the IP addresses from our log file

def extract_ip_addresses(log_line):
    pattern = r"(\d{1,3}\.){3}\d{1,3}"

    ip_addresses = re.findall(pattern, log_line)

    for ip in ip_addresses:
        print(f"IP Address: {ip}")

with open("sample1.log", "r") as f:
    for line in f:
        extract_ip_addresses (line)

IP Address: 1.
IP Address: 1.
IP Address: 0.
IP Address: 0.
IP Address: 113.


In [6]:
def extract_ip_addresses(log_line):
    pattern = r"(?:\d{1,3}\.){3}\d{1,3}"

    ip_addresses = re.findall(pattern, log_line)

    for ip in ip_addresses:
        print(f"IP Address: {ip}")

with open("sample1.log", "r") as f:
    for line in f:
        extract_ip_addresses (line)

IP Address: 192.168.1.100
IP Address: 192.168.1.101
IP Address: 10.0.0.2
IP Address: 172.16.0.5
IP Address: 203.0.113.25


In [8]:
# Find different types of log entries

def identify_log_entry_type(log_line):
    pattern = r"(INFO|WARNING|ERROR)"

    match = re.search(pattern, log_line)

    if match:
        log_type = match.group(1)
        print(f"Log Entry Type: {log_type}")

with open("sample1.log", "r") as f:
    for line in f:
        identify_log_entry_type(line)

Log Entry Type: INFO
Log Entry Type: INFO
Log Entry Type: ERROR
Log Entry Type: INFO


In [9]:
# Extract error codes from log files

def search_for_error_code(log_line, error_code):
    pattern = r"ERROR (\d+)"

    match = re.search(pattern, log_line)
    if match and match.group(1) == error_code:
        print(f"{error_code} error found:", log_line)

with open("sample1.log", "r") as f:
    for line in f:
        search_for_error_code(line, "404")


404 error found: 2022-04-08 10:34:27 [ERROR 404] Failed connection from 10.0.0.2



In [10]:
# Filter a live log stream for error codes

import time
import random

# Simulate getting live logging data
def get_live_logging_data():
    log_levels = ["INFO", "WARNING", "ERROR"]
    log_level = random.choice(log_levels)
    return f"[{log_level}] This is a sample log message."

# Define a function to filter for errors in live logging data
def filter_errors(log_line):
    pattern = r"\[(ERROR)\]"

    match = re.search(pattern, log_line)
    if match:
        print("Error found:", log_line)

# Simulate live logging system
while True:
    line = get_live_logging_data()
    filter_errors(line)
    time.sleep(1) # Pause for 1 second before fetching the next log entry

Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a sample log message.
Error found: [ERROR] This is a 

KeyboardInterrupt: 

In [11]:
# Split entries of a log file using the split function

def split_log_entry(log_line):
    pattern = r"\s+"

    parts = re.split(pattern, log_line)

    print("Date:", parts[0])
    print("Type:", parts[1])
    print("Message:", " ".join(parts[2:]))

with open("sample1.log", "r") as f:
    for line in f:
        split_log_entry(line)

Date: 2022-04-08
Type: 10:34:22
Message: [INFO] Request from 192.168.1.100: GET /index.html 
Date: 2022-04-08
Type: 10:34:24
Message: [INFO] Request from 192.168.1.101: POST /api/data 
Date: 2022-04-08
Type: 10:34:27
Message: [ERROR 404] Failed connection from 10.0.0.2 
Date: 2022-04-08
Type: 10:34:29
Message: [ERROR 401] Request from 172.16.0.5: PUT /api/update 
Date: 2022-04-08
Type: 10:34:32


# REPLACING, TRANSFORMING, AND CLEANING DATA

In [15]:
# Standardize phone numbers based on user input

def clean_phone_number(phone_number):
    digits_only = re.sub(r"\D", "", phone_number)
    if len(digits_only) != 10:
        return None

    formatted_number = f"({digits_only[:3]}) {digits_only[3:6]}-{digits_only[6:]}"
    return formatted_number

user_input = input("Enter your phone number: ")

cleaned_number = clean_phone_number(user_input)

if cleaned_number:
    print("Cleaned phone number:", cleaned_number)
else:
    print("InvaLid phone number.")

Cleaned phone number: (111) 333-5678


In [16]:
# Replace misspelled words in a document

def replace_misspellings(text, corrections):
    for misspelled, correct in corrections.items():
        pattern = re.compile(rf"\b{misspelled}\b", re.IGNORECASE)
        text = pattern.sub(correct, text)
    return text

corrections = {
"recieve": "receive",
"adress": "address",
"acomodate": "accommodate"
}

with open("document.txt", "r") as f:
    content = f.read()

corrected_content = replace_misspellings (content, corrections)

with open("corrected_document.txt", "w") as f:
    f.write(corrected_content)
print("Misspellings corrected!")

Misspellings corrected!


In [17]:
# Find all the uppercase letters preceded by a lowercase letter adn followed by a digit

text = "aB3cD4eF5gH6"
pattern = "(?<=[a-z])[A-Z](?=\d)"

re.findall(pattern, text)

['B', 'D', 'F', 'H']

In [18]:
# Replace prices and date with the required format

def reformat_dates(text):
    pattern = r"(?<=Date: )\d{2}/\d{2}/\d{4}"
    return re.sub(pattern, lambda m: m.group(0)[6:] + '-' + m.group(0)[:5], text)

def reformat_prices(text):
    pattern = r"(?<=Price: )\$\d{1,3}(?:,\d{3})*"
    return re.sub(pattern, lambda m: m.group(0).replace(',', ''), text)

with open("invoice.txt", "r") as f:
    content = f.read()

content = reformat_dates(content)
content = reformat_prices(content)

with open("reformatted_invoice.txt", "w") as f:
    f.write(content)

print("Dates and prices reformatted!")

Dates and prices reformatted!


In [19]:
re.sub(r"\b\w",lambda m: m[0].upper(),"i am your")

'I Am Your'

In [22]:
re.sub(r"(?<=Date: )\d{2}/\d{2}/\d{4}",lambda m: m[6:] + '-' + m[:5],"Date: 04/18/2023")

IndexError: no such group

In [30]:
m = r"(?<=Date: )\d{2}/\d{2}/\d{4}"

m2 = lambda x: x.group(0)

m2(m)

AttributeError: 'str' object has no attribute 'group'

In [33]:
# Tokenize words of a text

text = "The quick, brown fox jumps over the lazy dog!"

no_punctuation = re.sub(r"[^\w\s]", "", text)
print("Text without punctuation:", no_punctuation)

lowercase_text = no_punctuation.lower()
print("Lowercase text:", lowercase_text)

words = re.split(r"\s+", lowercase_text)
print("Tokenized words:", words)

Text without punctuation: The quick brown fox jumps over the lazy dog
Lowercase text: the quick brown fox jumps over the lazy dog
Tokenized words: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']


In [34]:
# Prepare a data set with product data for processing

import csv

def clean_data(row):
    date_pattern = r"(\d{2})/(\d{2})/(\d{4})"
    row[0] = re.sub(date_pattern, r"\3-\1-\2", row[0])

    price_pattern = r"[^\d.]"
    row[1] = re.sub(price_pattern, "", row[1])
    return row

input_file = "input.csv"
output_file = "output.csv"

with open(input_file, "r") as infile, open(output_file, "w", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    header = next (reader)
    writer.writerow(header)

    for row in reader:
        cleaned_row = clean_data(row)
        writer.writerow(cleaned_row)

In [38]:
price_pattern = r"[^\d.]"

re.findall(price_pattern, "$1200.00")

['$']

In [39]:
price_pattern = r"[^\d]"

re.findall(price_pattern, "$1200.00")

['$', '.']

In [44]:
# Handle Unicode characters

text = "I love using emojis 😋 and mathematical symbols like π (pi)!"

emoji_pattern = r"[\U0001F600-\U0001F64F]"
emojis = re.findall(emoji_pattern, text)
print("Emojis found:", emojis)

math_symbol_pattern = r"[\u2200-\u22FF\u0370-\u03FF]"
math_symbols = re.findall(math_symbol_pattern, text)
print("Mathematical symbols found:", math_symbols)

clean_text = re.sub(r"[^\w\s]", "", text, flags=re.UNICODE)
print("Cleaned text:", clean_text)

Emojis found: ['😋']
Mathematical symbols found: ['π']
Cleaned text: I love using emojis  and mathematical symbols like π pi


In [46]:
# Redacting sensitive data in text

text = """
Name: John Doe
Email: john.doe@example.com
Phone: +1 (123) 456-7890
Credit Card: 1234 5678 9012 3456
"""

email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
phone_pattern = r"\+?\d{1,4}[-.() \d]*\d"
credit_card_pattern = r"\b(?:\d[ -]*?){13,16}\b"

redacted_text = re.sub(email_pattern, "[REDACTED_EMAIL]", text)
redacted_text = re.sub(phone_pattern, "[REDACTED_PHONE]", redacted_text)
redacted_text = re.sub(credit_card_pattern, "[REDACTED_CC]", redacted_text)

print("Redacted text:")
print(redacted_text)

Redacted text:

Name: John Doe
Email: [REDACTED_EMAIL]
Phone: [REDACTED_PHONE]
Credit Card: [REDACTED_PHONE]



In [53]:
phone_pattern = r"[-.() \d]"
phone = "+1 (123) 456-7890"

re.sub(phone_pattern, "", phone)

'+'

In [55]:
credit_card_pattern = r"\b(?:\d[ -]*?){13,16}\b"
credit_card = "1234 5678 9012 3456"

re.sub(credit_card_pattern, "", credit_card)

''