# Regular Expression (Regex)

## Ex: Ask user to input their email address

### Check the common way (plain string methods)

In [None]:
email = input("What's your email? ").strip()

# Deliberately loose check: it only requires that both characters appear
# somewhere in the input, so nonsense such as '@.' is still accepted.
looks_valid = "@" in email and "." in email
print("Valid" if looks_valid else "Invalid")

In [None]:
email = input("What's your email? ").strip()

# Split the address into username and domain around the "@" sign.
# partition() always returns three parts, so input without an "@" no longer
# crashes with ValueError the way unpacking email.split("@") did; it simply
# leaves the domain empty and the address is reported as invalid.
username, _, domain = email.partition("@")

# A second "@" left in the domain means the input had more than one "@",
# which is rejected as well instead of raising.
if username and "@" not in domain and domain.endswith(".edu"):
    print("Valid")
else:
    print("Invalid")

### Check with regular expression

**re.search(pattern, string, flags=0)**
-       . any character except a newline
-       * 0 or more repetitions
-       + 1 or more repetitions
-       ? 0 or 1 repetition
-       {m} m repetitions
-       {m,n} m-n repetitions (no space after the comma)

In [None]:
import re

email = input("What's your email? ").strip()

# ".*" means zero or more of ANY character (not just a-z) on each side of
# the "@", so a bare "@" already passes.
print("Valid" if re.search(".*@.*", email) else "Invalid")

# ".+" means one or more of any character on each side of the "@".
print("Valid" if re.search(".+@.+", email) else "Invalid")

# "..*" is one character followed by zero or more: another way to spell ".+".
print("Valid" if re.search("..*@..*", email) else "Invalid")

# Using the escape character: "\." matches a literal dot.
# A raw string is written by prefixing the literal with r or R. In a raw
# string the backslash gets no special treatment: it is an ordinary character
# rather than the start of an escape sequence.
# Still accepted: "malan@@@harvard.edu" and
# "my mail address is malan@harvard.edu", because re.search only requires
# that the input CONTAINS a substring matching the pattern.
found = re.search(r".+@.+\.edu", email)  # raw string
print("Valid" if found else "Invalid")


-       ^ matches the start of the string
-       $ matches the end of the string or just before the newline at the end of the string


In [None]:
import re

email = input("What's your email? ").strip()

# ^ and $ force the pattern to cover the input from start to end, so extra
# text around the address is rejected.
# Still accepted: malan@@@harvard.edu, because "." also matches "@".
anchored = re.search(r"^.+@.+\.edu$", email)
print("Valid" if anchored else "Invalid")

-       [] set of characters
-       [^] complementing the set

In [None]:
import re

email = input("What's your email? ").strip()

# [^@] matches any character except "@", which rules out a second "@" on
# either side of the separator.
# Still accepted: .edu@harvard.edu -- the set is too general.
no_extra_at = re.search(r"^[^@]+@[^@]+\.edu$", email)
print("Valid" if no_extra_at else "Invalid")

# More specific: list the allowed characters explicitly
# (letters, digits, underscore and space).
explicit_set = re.search(r"^[a-zA-Z0-9_ ]+@[a-zA-Z0-9_ ]+\.edu$", email)
print("Valid" if explicit_set else "Invalid")

-       \d decimal digit
-       \D not a decimal digit
-       \s whitespace characters
-       \S not a whitespace character
-       \w word character ... as well as numbers and the underscore
-       \W not a word character

In [None]:
import re

email = input("What's your email? ").strip()

# \w is the shorthand for a word character, a more concise way to write
# the explicit character set.
word_only = re.search(r"^\w+@\w+\.edu$", email)
print("Valid" if word_only else "Invalid")

-       A|B either A or B
-       (...) a group
-       (?:...) non-capturing version

In [None]:
import re

email = input("What's your email? ").strip()

# (\w|\s) accepts either a word character or a whitespace character in the
# username part.
as_typed = re.search(r"^(\w|\s)+@\w+\.edu$", email)
print("Valid" if as_typed else "Invalid")

# Deal with MALAN@HARVARD.EDU by forcing the input to lower case before
# matching, so the lowercase ".edu" in the pattern still applies.
lowered = email.lower()
print("Valid" if re.search(r"^(\w|\s)+@\w+\.edu$", lowered) else "Invalid")

-       re.IGNORECASE
-       re.MULTILINE
-       re.DOTALL

In [None]:
import re

email = input("What's your email? ").strip()

# Deal with MALAN@HARVARD.EDU: re.IGNORECASE tells re.search to ignore the
# difference between upper and lower case.
plain = re.search(r"^(\w|\s)+@\w+\.edu$", email, re.IGNORECASE)
print("Valid" if plain else "Invalid")

# Deal with malan@cs50.harvard.edu -- one optional subdomain.
# "?" means zero or one repetition of the preceding group.
one_subdomain = re.search(r"^(\w|\s)+@(\w+\.)?\w+\.edu$", email, re.IGNORECASE)
print("Valid" if one_subdomain else "Invalid")

# Deal with malan@cs50.fall2022.harvard.edu -- any number of subdomains.
# "*" means zero or more repetitions of the preceding group; the username
# part additionally allows dots.
any_subdomains = re.search(r"^(\w|\s|\.)+@(\w+\.)*\w+\.edu$", email, re.IGNORECASE)
print("Valid" if any_subdomains else "Invalid")

**re.match(pattern, string, flags=0)**
- similar to re.search, but it only matches at the start of the string, so the leading ^ is not needed

**re.fullmatch(pattern, string, flags=0)**
- similar to re.search, but the whole string must match the pattern, so neither ^ nor $ is needed

## Ex: Format name and output

### Check with split func

In [None]:
name = input("What's your name?").strip()
if "," in name:
    # str.split() takes a literal separator, not a regular expression, so
    # split(", ?") looked for the text ", ?" and the two-name unpacking
    # failed with ValueError on ordinary "Last, First" input.
    # Split on the first comma only and strip each part instead, which makes
    # the space after the comma optional.
    last, first = (part.strip() for part in name.split(",", 1))
    name = f"{first} {last}"
    print(f"hello, {name}")

### Check with group func

In [None]:
import re

name = input("What's your name?").strip()

# A common way of using groups: search first, then test the result.
matches = re.search(r"^(.+), (.+)$", name)
if matches:
    # group() with several indices returns the captured texts as a tuple;
    # group numbering starts at 1.
    last, first = matches.group(1, 2)
    name = f"{first} {last}"
    print(f"hello, {name}")

# ':=' (the walrus operator) assigns the right-hand side to the name on the
# left and yields that value, so the search and the test fit on one line.
if matches := re.search(r"^(.+), (.+)$", name):
    name = f"{matches.group(2)} {matches.group(1)}"
    print(f"hello, {name}")

## Ex: Extract a username from a url

### Do it the common way (plain string methods)

In [None]:
url = input("URL: ").strip()
print(url)

twitter_prefix = "https://twitter.com/"

# str.replace removes the prefix text wherever it occurs in the URL.
username = url.replace(twitter_prefix, "")
print(f"Username: {username}")

# str.removeprefix only strips the text when the URL starts with it.
username = url.removeprefix(twitter_prefix)
print(f"Username: {username}")

### Do with regex

re.sub(pattern, repl, string, count = 0, flags = 0)

In [None]:
import re

url = input("URL: ").strip()

# Strip an optional scheme, an optional "www." and the host from the front
# of the URL; whatever remains is the username.
username = re.sub(r"^(https?://)?(www\.)?twitter\.com/", "", url)
print(f"Username: {username}")

# Group version: every (...) captures, and numbering starts at 1 rather than
# 0. Here group 1 is the scheme, group 2 is "www." and group 3 is the
# username, so the username must be read from group 3 (group 2 would print
# "www." or None).
matches = re.search(r"^(https?://)?(www\.)?twitter\.com/(.+)$", url, re.IGNORECASE)
if matches:
    print(f"Username: {matches.group(3)}")

# Non-capturing version: (?:...) groups without capturing, so "www." no
# longer takes a group number and the username moves down to group 2.
matches = re.search(r"^(https?://)?(?:www\.)?twitter\.com/(.+)$", url, re.IGNORECASE)
if matches:
    print(f"Username: {matches.group(2)}")

### Other re functions are also available
**re.split(pattern, string, maxsplit = 0, flags = 0)**

**re.findall(pattern, string, flags = 0)**