<h5>Regular expressions (regex) are patterns used to match character combinations in strings</h5>


<table>
  <tr>
    <th style="width:10%">Regex</th>
    <th>Explain</th>
    <th style="width:20%">Example</th>
  </tr>
  <tr>
    <td>Hello</td>
    <td>
      <ul>
        <li>Matches these 5 letters in this exact order, anywhere in the string</li>
      </ul>
   </td>
    <td>Neo_<mark>Hello</mark>_:D</td>
  </tr>
  <tr>
    <td>^Hello</td>
    <td>
      <ul>
        <li>Matches "Hello", which must be at the start of the string</li>
      </ul>
    </td>
    <td><mark>Hello</mark>_world!</td>
  </tr>
  <tr>
    <td>time$</td>
    <td>
      <ul>
        <li>Matches "time", which must be at the end of the string</li>
      </ul>
    </td>
    <td>Good_<mark>time</mark></td>
  </tr>
  <tr>
    <td>\d</td>
    <td>
      <ul>
        <li>Matches a numerical digit 0-9</li>       
      </ul>
    </td>
    <td>Nice_<mark>2</mark>_<mark>0</mark>_Over</td>
  </tr>
  <tr>
    <td>\d\d</td>
    <td>
      <ul>
        <li>Matches 2 consecutive numerical digits</li>
      </ul>
    </td>
    <td>Nice_<mark>20</mark>_Over</td>
  </tr>
  <tr>
    <td>\s</td>
    <td>
      <ul>
        <li>Matches a single whitespace character</li>
      </ul>
    </td>
    <td>Good<mark> </mark>night.</td>
  </tr>
  <tr>
    <td>\w</td>
    <td>
      <ul>
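        <li>Matches any word character (alphanumeric &amp; underscore). Equivalent to [A-Za-z0-9_] for ASCII text; in Python 3 string patterns, \w also matches Unicode letters and digits (such as é) unless the re.ASCII flag is set</li>
      </ul>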
    </td>
    <td>$<mark>W</mark>$</td>
  </tr>
  <tr>
    <td>^\w\w\w\w\s</td>
    <td>
      <ul>
        <li>Matches 4 word characters followed by a whitespace character, which must be at the start of the string</li>
      </ul>
    </td>
    <td><mark>Four </mark>people</td>
  </tr>
</table>


<table>
  <tr>
    <th>Regex Quantifiers</th>
    <th>meaning</th>    
    <th>Using</th>
    <th>Example</th>
  </tr>
  <tr>
    <td>*</td>
    <td>0+</td>
    <td>^[aeiou]*$</td>
    <td><mark>iou</mark></td>    
  </tr>
  <tr>
    <td>?</td>
    <td>0 or 1</td>
    <td>^[abcd]?</td>  
    <td><mark>a</mark>pple</td>  
  </tr>
  <tr>
    <td>+</td>
    <td>1+ (positive number of occurrences)</td>    
    <td>o+</td>
    <td>f<mark>oo</mark>d</td>
  </tr>
  <tr>
    <td>{n}</td>
    <td>exactly n occurrences</td>   
    <td>[aeiou]{2}</td>
    <td>B<mark>oo</mark>k</td> 
  </tr>
</table>


<table>
  <tr>
    <th style="width:10%">Regex</th>
    <th>Explain</th>
    <th style="width:20%">Example</th>
  </tr>
  <tr>
    <td>^S.+a</td>
    <td>
      <ul>
        <li>Matches text that begins with the character S at the start of the string and runs through the last character a that follows it</li>
        <li>The ^ at the beginning of the expression indicates that the match must occur at the start of the string</li>
        <li>The S matches the first character of the string if it is an S</li>
        <li>The .+ matches one or more characters of any type (except for a newline)</li>
        <li>The a at the end of the expression matches a literal a; because .+ is greedy, the match extends to the last a on the line, which does not have to be the last character of the string</li>
      </ul>
   </td>
    <td><mark>Superman is a</mark> hero</td>
  </tr>
  <tr>
    <td>S[a-t]*$</td>
    <td>
      <ul>
        <li>Matches a substring that starts with the character S, continues with zero or more characters in the range a to t, and ends at the end of the string</li>
        <li>The $ at the end of the expression indicates that the match must occur at the end of the string</li>
        <li>The S matches a literal S as the first character of the matched substring</li>
        <li>The [a-t]* matches zero or more characters in the range a to t</li>
      </ul>
    </td>
    <td>Please <mark>Starting</mark></td>
  </tr>
  <tr>
    <td>\w{9}</td>
    <td>
      <ul>
        <li>Matches a run of exactly 9 consecutive word characters (the string itself may contain more)</li>
        <li>A word character is any letter, digit, or underscore character (equivalent to the [a-zA-Z0-9_] character set for ASCII text)</li>
        <li>The {9} is a quantifier that specifies that the preceding character or character set must be matched exactly 9 times</li>
      </ul>
    </td>
    <td><mark>Hello_wor</mark>ld!</td>
  </tr>
</table>


In [82]:
import re

names = [
    "Beyoncé",
    "LeBron James",
    "Kim Kardashian West",
    "Elon Musk",
    "M!ke",
]

# Find people with a first name and a last name only: two runs of word
# characters separated by whitespace, anchored to the whole string.
# The pattern is a raw string (r"...") so that backslash sequences such as
# \w and \s reach the regex engine untouched instead of being treated as
# (invalid) Python string escapes.
regex = r"^\w+\s+\w+$"
for name in names:
    result = re.search(regex, name)
    if result:
        # Printing the match object shows both the span and the matched text.
        print(result)

# Search each name for a sequence of word characters starting with 'B'.
# re.search scans the whole string, so the 'B' may appear mid-name.
regex = r"B\w*"
for name in names:
    match = re.search(regex, name)
    if match:
        print(name)
        # span() is the (start, end) index pair; start()/end() give each alone.
        print(match.span())
        print(match.group())

<re.Match object; span=(0, 12), match='LeBron James'>
<re.Match object; span=(0, 9), match='Elon Musk'>
Beyoncé
(0, 7)
Beyoncé
LeBron James
(2, 6)
Bron


In [83]:
names = [
    "Beyoncé",
    "LeBron James",
    "Kim Kardashian West",
    "Elon Musk",
    "M!ke",
]


# Test for first name and last name.
# Named groups, written (?P<name>...), label parts of the pattern so each
# part can be read back by name from the match object.
# The pattern is a raw string so \w and \s are passed to the regex engine
# as written rather than being parsed as Python string escapes.
regex = r"^(?P<fn>\w+)\s+(?P<ln>\w+)$"
for name in names:
    match = re.search(regex, name)
    if match:
        print(
            f"Full name: {match.group()}, First name: {match.group('fn')}, Last name: {match.group('ln')}"
        )

# Detect names made up only of ASCII letters and '!'.
# Note: the character class allows '!' but does not require it, so a plain
# ASCII single-word name would match as well.
regex = "^[a-zA-Z!]+$"
for name in names:
    if re.search(regex, name):
        print(name)

# Scan for blocks of lower-case ASCII letters
regex = "[a-z]+"
for name in names:
    # re.finditer yields one match object per block (with its span);
    # re.findall(regex, name) would return only the matched strings.
    matches = re.finditer(regex, name)
    print("-" * 10)
    print(name)
    for match in matches:
        print(match)
    print("-" * 10)

Full name: LeBron James, First name: LeBron, Last name: James
Full name: Elon Musk, First name: Elon, Last name: Musk
M!ke
----------
Beyoncé
<re.Match object; span=(1, 6), match='eyonc'>
----------
----------
LeBron James
<re.Match object; span=(1, 2), match='e'>
<re.Match object; span=(3, 6), match='ron'>
<re.Match object; span=(8, 12), match='ames'>
----------
----------
Kim Kardashian West
<re.Match object; span=(1, 3), match='im'>
<re.Match object; span=(5, 14), match='ardashian'>
<re.Match object; span=(16, 19), match='est'>
----------
----------
Elon Musk
<re.Match object; span=(1, 4), match='lon'>
<re.Match object; span=(6, 9), match='usk'>
----------
----------
M!ke
<re.Match object; span=(2, 4), match='ke'>
----------


In [84]:
values = [
    "https://www.baidu.com",
    "http://www.baidu.org",
    "file://test.this.path",
    "com.baidu.www_https://",
]

# Test if string starts with http or https
# regex = "https?"
# for value in values:
#     if re.match(regex, value):
#         print(value)

# Match a whole URL of the form http(s)://www.<name>.org or .com
# The dots are escaped (\.) so they match a literal '.'; an unescaped '.'
# matches any character and would accept strings like "https://wwwXbaiduYcom".
# The pattern is a raw string so the backslashes reach the regex engine intact.
regex = r"https?://w{3}\.\w+\.(org|com)"
for value in values:
    # fullmatch succeeds only if the entire string matches the pattern.
    if re.fullmatch(regex, value):
        print(value)

https://www.baidu.com
http://www.baidu.org


In [85]:
# https://docs.python.org/3/library/re.html