# Exercise: Regular Expressions

In [1]:
import re

In [2]:
#1 Question: Match all email addresses (SOLVED)

inputs = ["My email is john@email.com and mary@otherplace.net", 
          "Visit us at support@ourcompany.co.uk for help",
          "No emails here"]

pattern = r"(\w+)@(\w+)\.(\w+)"

matches = []
for input in inputs:
    matches += re.findall(pattern, input)

for m in matches:
    username = m[0]
    mailserver = m[1]
    domain = m[2]
    print(f"Username: {username}")
    print(f"Mailserver: {mailserver}")
    print(f"Domain: {domain}")
    print('')

Username: john
Mailserver: email
Domain: com

Username: mary
Mailserver: otherplace
Domain: net

Username: support
Mailserver: ourcompany
Domain: co



In [3]:
#2 Question: Extract domain from email

inputs = ["john@email.com", 
          "mary+newsletter@gmail.com",
          "support@ourcompany.co.uk"]

pattern = r"\.(\w+)"

matches = []

for input in inputs:
    matches += re.findall(pattern, input)
    
matches

['com', 'com', 'co', 'uk']

In [4]:
#3 Question: Validate phone number  

inputs = ["555-123-4567",  
          "1 (234) 567-8910",
          "notaphonenumber"]

pattern = r'(1 \(\d{3}\) \d{3}-\d{4}|\d{3}-\d{3}-\d{4})'

for phone_number in inputs:
    matches = re.findall(pattern, phone_number)
    for match in matches:
        print(match)

555-123-4567
1 (234) 567-8910


In [5]:
#4 Question: Extract area code    

inputs = ["(555) 123-4567",
          "1 (234) 567-8910",
          "5551234567"]

pattern = r"\(\d*?\)"
matches = []
for input in inputs:
        matches += re.findall(pattern, input)
matches

['(555)', '(234)']

In [6]:
#5 Question: Match URLs and extract host

inputs = ["Visit https://www.example.com for more info",
          "Our website is example.com",
          "No URLs here"]
          
url_pattern = r'https?://(?:www\.)?|([\w.-]+\.\w{2,3})'

for text in inputs:
    matches = re.findall(url_pattern, text)
    for match in matches:
        print(match)



example.com
example.com


In [7]:
#6 Question: Remove non-alphabetic characters

inputs = ["Hello world!",
          "123 Main St.", 
          "greetings&more"]

# your code here ...:
for text in inputs:
    cleaned_text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    print(cleaned_text)

Hello world 
    Main St 
greetings more


In [8]:
#7 Question: Find words containing "tion" 

inputs = ["This is a test sentence with the word station in it.",
          "No words containing tion here",
          "motion activation vacation"]


pattern = r'\b\w*tion\w*\b'

for text in inputs:
    matches = re.findall(pattern, text)
    print(matches)

['station']
['tion']
['motion', 'activation', 'vacation']


In [9]:
#8 Question: Replace all occurrences of "hello" with "goodbye"

inputs = ["hello world", 
          "hello there",
          "no match"]

for text in inputs:
    replaced_text = text.replace("hello", "goodbye")
    print(replaced_text)

goodbye world
goodbye there
no match


In [10]:
#9 Question: Extract date strings in ISO8601 format

inputs = ["Log from 2023-01-15",
          "Meeting on 2023-02-01T13:00:00Z",  
          "No dates"]

pattern = r'\d{2,4}[-\/]\d{2}[-\/]\d{2,4}'
matches = []
for date in inputs:
    matches = re.findall(pattern, date)
    for match in matches:
        print(match)

2023-01-15
2023-02-01


In [11]:
#10 Question: Validate correctly formatted date

inputs = ["2023-01-15",
          "02/01/2023",
          "invalid date"]

pattern = r'\b\d{2,4}[-\/]\d{2}[-\/]\d{2,4}\b'

for date in inputs:
    if re.match(pattern, date):
        print(f"{date} is a valid date.")
    else:
        print(f"{date} is not a valid date.")

2023-01-15 is a valid date.
02/01/2023 is a valid date.
invalid date is not a valid date.


In [12]:
#11 Question: Remove punctuation except hyphens 

inputs = ["Hello! World?",
          "123-Main_St.",
          "Hi there."]

    
pattern = r'[^a-zA-Z1-9\s\-]'

for text in inputs:
    cleaned_text = re.sub(pattern, '', text)
    print(cleaned_text)

Hello World
123-MainSt
Hi there


In [13]:
#12 Question: Count occurrences of a word

inputs = ["Hello world. Hello!", 
          "Hello hello world",
          "no match"]

word_to_count = "Hello"
word_count = 0

for text in inputs:
    matches = re.findall(rf'\b{word_to_count}\b', text)
    word_count += len(matches)

print(f"The word '{word_to_count}' appears {word_count} times.")

The word 'Hello' appears 3 times.


In [14]:
#13 Question: Extract IP addresses from log 

inputs = ["127.0.0.1 - GET /",
          "User logged in from 192.168.1.1",
          "No IPs"]

ip_pattern = r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b'

for log_entry in inputs:
    ip_address = re.findall(ip_pattern, log_entry)
    for i in ip_address:    
        print(i)

127.0.0.1
192.168.1.1


In [15]:
#14 Question: Redact credit card and SSN numbers

inputs = ["Visa: 4111-1111-1111-1111",
          "My SSN is 111-11-1111", 
          "No numbers"]

for text in inputs:
    text = re.sub(r'\b(?:\d{4}-?){4}\b', 'XXXX-XXXX-XXXX-XXXX', text)
    text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', 'XXX-XX-XXXX', text)
    print(text)

Visa: XXXX-XXXX-XXXX-XXXX
My SSN is XXX-XX-XXXX
No numbers
