# Regular Expressions

Regular Expressions can be hard. Luckily, there are some great online tools to help you build and test them:

https://regex101.com/

---

In [1]:
import re

In [2]:
# Sample phone number laid out as ###-###-####.
phone1 = "314-334-5632"

In [3]:
# Capture the three dash-separated digit runs (3, 3 and 4 digits long).
matches = re.match(
    r"^(\d{3})-(\d{3})-(\d{4})",
    phone1,
)

In [4]:
# Group 0 is the entire text matched by the pattern.
matches.group(0)

'314-334-5632'

In [5]:
# Group 1 is the first parenthesised part of the pattern: the leading three digits.
matches.group(1)

'314'

In [6]:
# Group 2 is the second parenthesised part of the pattern: the middle three digits.
matches.group(2)

'334'

In [7]:
# Group 3 is the third parenthesised part of the pattern: the final four digits.
matches.group(3)

'5632'

In [15]:
# Three differently formatted phone numbers to run through a single pattern.
phones = [
    "324-567-2232",     # dashes only
    "(445) 333-1354",   # parenthesised first three digits, then a space
    "+1 333-453-5454",  # leading "+<digit> " prefix
]

In [19]:
# Accepted shapes: an optional "+<digit> " prefix, a three-digit area code
# written either bare or fully parenthesised, a "-" or " " separator, then
# three digits, a dash and four digits.
#   (\(\d{3}\)|\d{3})  the alternation keeps the parentheses balanced; writing
#                      \(?\d{3}\)? would also accept "(445-333-1354".
#   $                  rejects trailing junk such as "324-567-22329".
phone_pattern = re.compile(r"^(\+\d )?(\(\d{3}\)|\d{3})[- ](\d{3})-(\d{4})$")

for phone in phones:
    matches = phone_pattern.match(phone)
    if matches is None:
        # The pattern did not match at all: report the offending input.
        print(f"doesn't match: {phone}")
    else:
        # Group 0 is the whole matched phone number.
        print(matches.group(0))
        

324-567-2232
(445) 333-1354
+1 333-453-5454


# Name Parsing

In [2]:
# Sample input in "First MI. Last" form: first name, middle initial
# followed by a period, then last name.
name = "Paul E. Boal"

In [3]:
# Three capture groups: first name, a single capital middle initial
# (the period after it is matched but not captured), and last name.
matches = re.match(
    r"^([\w]+) ([A-Z])\. ([\w]+)",
    name,
)

In [10]:
# Group 0 is the whole match: first name, middle initial and last name together.
matches.group(0)

'Paul E. Boal'

In [11]:
# The first subgroup: the first name.
matches.group(1)

'Paul'

In [12]:
# The second subgroup: the middle initial (the period sits outside the group).
matches.group(2)

'E'

In [13]:
# The third subgroup: the last name.
matches.group(3)

'Boal'

# Add on Credentials!

In [16]:
# Sample inputs: "First MI. Last" followed by one or more
# comma-separated credentials.
names = [
    "Steven G. Sanders, MD",      # one credential
    "Alice D. Edwards, PhD",      # one credential
    "Joseph B. Lester, MD, PhD",  # two credentials
]

In [17]:
# Pattern groups: (1) first name, (2) middle initial, (3) last name,
# (4) everything after the ", " that follows the last name, i.e. the credentials.
for name in names:
    matches = re.match(r"^([\w]+) ([A-Z])\. ([\w]+), ([\w\.\, ]+)", name)
    if matches is None:
        # re.match returns None when the pattern fails; report the input
        # instead of letting .group() raise AttributeError.
        print(f"doesn't match: {name}")
        continue
    first = matches.group(1)
    middle = matches.group(2)
    last = matches.group(3)
    # Multiple credentials are separated by ", ", so split them into a list.
    creds = matches.group(4).split(', ')
    print([first, middle, last, creds])

['Steven', 'G', 'Sanders', ['MD']]
['Alice', 'D', 'Edwards', ['PhD']]
['Joseph', 'B', 'Lester', ['MD', 'PhD']]


# Date of Birth

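Birthdays are formatted as `Month ##, ####`: a month name, a one- or two-digit day, a comma, and a four-digit year.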

In [18]:
# Sample date: month name, day, comma, four-digit year.
birthday = "March 25, 2011"

In [19]:
# Quantifier and character-class reference:
#   +      one or more
#   *      zero or more
#   ?      zero or one
#   {1,2}  one to two occurrences
#   [0-9]  any ASCII digit (\d is the usual shorthand for a digit)
matches = re.match(
    r"([a-zA-Z]+) ([0-9]{1,2}), ([0-9]{4})",
    birthday,
)

In [20]:
# Group 0 is the whole matched date string.
matches.group(0)

'March 25, 2011'

In [21]:
# Group 1 is the ([a-zA-Z]+) part of the pattern: the month name.
matches.group(1)

'March'

In [22]:
# Group 2 is the ([0-9]{1,2}) part of the pattern: the one- or two-digit day.
matches.group(2)

'25'

In [23]:
# Group 3 is the ([0-9]{4}) part of the pattern: the four-digit year.
matches.group(3)

'2011'