In [None]:
#################################################################
#
# pass_through.ipynb
#
# Demonstrates how to pass specific blocks (code and markdown)
# through without processing.
#
# (For purpose of regression testing, this example also verifies
# that strings with double quotes are properly escaped when building
# the test code.)
#
# !!! IMPORTANT !!! This is also a regression test !!! 
#################################################################

import re

# "Magic" code to place `plnq` into the Jupyter environment's namespace 
# without overwriting the plnq object injected by the plnq executable.
# (You can safely remove these two lines; but, then your editor will
# probably complain that plnq is not defined.)
if 'plnq' not in globals():
    # No plnq object was injected, so build a minimal stand-in whose
    # add_function() simply echoes back the arguments it receives.
    plnq = __import__('types').SimpleNamespace(
        add_function=lambda *args, **kwargs: (args, kwargs),
    )

# Data for info.json
plnq.info = {"title": "Regular Expression capture",
             "topic": "regex",
             "tags": ["regex", "ic"]}

# InClass: Regular Expression Capture

You may find these resources helpful:
  * Chapter 12 from the online textbook [Python for Everybody](https://runestone.academy/ns/books/published/py4e-int/regex/combiningsearchingandextracting.html).
  * The [Python re documentation](https://docs.python.org/3/library/re.html).

When applying a regular expression pattern to a string, we can "capture" part of that pattern.

Recall that part of a regular expression pattern can be in parentheses. Originally, we used parentheses to group a portion of the regular expression to be repeated, or to group alternatives. We can extract the portion of a string that matches these parentheses.

The function `re.search()` returns a `re.Match` object when a string matches a pattern. We can query this object to find out exactly what matches.

Notice that `search()` only finds the _first_ match.

!!!PLNQ.PassThrough!!!

In [None]:
import re
# Words to test against the pattern 'b.t': a 'b', any one character, then a 't'.
words = ['bat', 'cat', 'bit', 'bite', 'abate', 'robot', 'baste', 'rabid', 'debt', 'about', 'bitbot']
for word in words:
    # search() looks anywhere in the word but reports only the first match.
    match = re.search(r'b.t', word)
    if match is None:  # search() returns None when nothing matches
        print(f"'{word}' does not match.")
    else:
        # group() is the matched text; span() is its (start, end) position.
        print(f"characters '{match.group()}' at positions {match.span()} match")

# !!!PLNQ.PassThrough!!!                   

We can also use the `Match` object to access any portion of the pattern marked with parentheses:

!!!PLNQ.PassThrough!!!

In [None]:
import re

films2 = ['Frontier(s) (2007)', '(K)nox: The Rob Knox Story (2021)', 'I (Almost) Got Away With It (2010)', 'Star Wars', 'Superman (19887)']
for film in films2:
    # Match a parenthesized four-digit number at the very end of the title and
    # capture just the digits (group 1).
    match = re.search(r'\((\d{4})\)$', film)
    if match is None:  # search() returns None when nothing matches
        print(f"'{film}' does not match.")
    else:
        print(f"Entire matching portion: {match[0]}") # Notice that match[0] is the entire match
        print(f"Group 1: {match[1]}")

# !!!PLNQ.PassThrough!!!   

# Task 1

Write a function !!!`extract_phone(str)`!!! that extracts the digits from a phone number in the form `616-231-5544`, or returns `None` if the string is not a properly formatted phone number. (Match the entire string.)

In [None]:
import re
def extract_phone(str):
    """Extract the ten digits from a phone number written as ``ddd-ddd-dddd``.

    Args:
        str: The candidate phone-number string.  (The name shadows the built-in
            ``str``; it is kept because it is the signature given in the task.)

    Returns:
        The digits joined into a single string (for example ``'6162315544'``),
        or ``None`` when the entire string is not in the expected format.
    """
    # fullmatch() requires the pattern to cover the whole string.  With
    # search() and '^...$' a trailing newline would slip through, because '$'
    # also matches just before a newline at the end of the string.
    m = re.fullmatch(r'(\d{3})-(\d{3})-(\d{4})', str)
    if m is None:
        return None
    return ''.join(m.groups())


# Register extract_phone with plnq.  Each example / test case is a two-element
# list: the argument string followed by the value extract_phone returns for it.
plnq.add_function('extract_phone',
  desc="A function that extracts the digits from a formatted phone number",
  displayed_examples=[
    ['616-231-4455', '6162314455'],
    ['(616)-231-4455', None],
    ['616-231-445', None]
  ],
  test_cases=[
    ['517-432-8841', '5174328841'],
    ['313-735-2132', '3137352132'],
    ['1-313-735-2132', None],
    ['313-75-2132', None],
  ]
)

# Task 2

Consider the format string used by `matplotlib`'s `plot` function (see https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.plot.html).

Write a function !!!`parse_format(str)`!!! that uses a regular expression to parse and extract the marker, line, and color from a format string.
* Assume the components are in this order: marker, line, color.
* Return the extracted values as a tuple: `(marker, line, color)`
* Each component is optional. If a component is not present, place `None` in the tuple. 
* If the string is not a valid format string, return `None`

Hint: The following letters are used for colors: `b`, `g`, `r`, `c`, `m`, `y`, `k`, and `w`. None of these letters are valid markers.


In [None]:
import re
def parse_format(str):
    """Split a matplotlib-style format string into marker, line style, and color.

    The components must appear in the order marker, line, color, and each one
    is optional.

    Args:
        str: The format string to parse, e.g. ``'D-g'``.  (The name shadows the
            built-in ``str``; it is kept because it is the signature given in
            the task.)

    Returns:
        A tuple ``(marker, line, color)`` in which any missing component is
        ``None``, or ``None`` if the string is not a valid format string.
    """
    # Group 1 (marker): any one character that is neither a color letter nor a
    #   character used in a line style.
    # Group 2 (line): '-', '--', '-.' or ':'.  The dot in '-.' is escaped so it
    #   matches a literal '.', not an arbitrary character.
    # Group 3 (color): one of the letters b g r c m y k w.
    # fullmatch() makes the pattern account for the entire string.
    m = re.fullmatch(r'([^bgrcmykw\-\.:])?(-|--|-\.|:)?([bgrcmykw])?', str)
    if not m:
        return None
    return (m[1], m[2], m[3])

# Register parse_format with plnq.  Each entry in the two lists below is a
# two-element list: the format string passed in, followed by the value
# parse_format returns for it (a (marker, line, color) tuple, or None for an
# invalid string).
# NOTE(review): presumably displayed_examples are shown to students while
# test_cases are used only for grading -- confirm against the plnq docs.
plnq.add_function('parse_format',
  desc="A function to parse matplotlib's format string",         
  displayed_examples=[
    ['D-g', ('D', '-', 'g')],
    ['k', (None, None, 'k')],
    [':y', (None, ':', 'y')]
  ],
  test_cases=[
    ['>--m', ('>', '--', 'm')],
    ['Dm--z', None],  # invalid: the components are out of order and 'z' is not a color
    ['X-.w', ('X', '-.', 'w')],
    ['X-.', ('X', '-.', None)],
    ['-.w', (None, '-.', 'w')],
    ['-.', (None, '-.', None)],
    ['Xw', ('X', None, 'w')],
    ['X', ('X', None, None)],
    ['w', (None, None, 'w')],
    ['d:c', ('d', ':', 'c')],
  ]
)

# Task 3

Write a function !!!`get_first_quote(str)`!!! that returns the contents of the _first_ quote in a string. (Be careful: the string may contain more than one quote.) Return `None` if the string does not contain a complete quote.

In [None]:
import re
def get_first_quote(str):
    """Return the text inside the first pair of double quotes in ``str``.

    Args:
        str: The string to examine.

    Returns:
        The characters between the first opening double quote and the next
        double quote (possibly an empty string), or ``None`` when the string
        contains no complete quoted section.
    """
    first = re.search(r'"([^"]*)"', str)
    if first is None:
        return None
    return first.group(1)

# Register get_first_quote with plnq.  Each entry is a two-element list: the
# input string followed by the value get_first_quote returns for it.
# The inputs deliberately contain double quotes: per the header of this
# notebook, this example also serves as a regression check that such strings
# are escaped properly when the test code is built.
plnq.add_function('get_first_quote',
  desc="A function returning the first quote in a string",         
  displayed_examples=[
    ['Here comes "Pickles" and "Wheezer".', "Pickles"],
    ['There are no quotes here', None]
  ],
  test_cases=[
    ['More nicknames: "Stretch", "Gravity", "Rainmaker"', "Stretch"],
    ['This is a "Partial quotation', None],  # an opening quote with no closing quote
    ['End with a "quote"', "quote"]
  ]
)

Pickles\


(('get_first_quote',),
 {'desc': 'A function returning the first quote in a string',
  'displayed_examples': [['Here comes \\"Pickles\\" and \\"Wheezer\\".',
    'Pickles'],
   ['There are no quotes here', None]],
  'test_cases': [['More nicknames: \\"Stretch\\", \\"Gravity\\", \\"Rainmaker\\"',
    'Stretch'],
   ['This is a \\"Partial quotation', None],
   ['End with a \\"quote\\"', 'quote']]})

# Find All

Notice that `search` only finds the _first_ substring that matches the pattern. (See Task 3 above.)  If you want to find _all_ occurrences of a pattern, use `findall`. Notice that `findall` returns a `list`.

!!!PLNQ.PassThrough!!!

In [None]:
import re

# Named `text` rather than `input` so that the built-in input() function is
# not overwritten for the rest of the session.
text = "There are 17 giraffes, 14 lions, and 35 monkeys"
# findall() returns a list of every non-overlapping match, in order.
m = re.findall(r'\d+', text)
print(m)

# !!!PLNQ.PassThrough!!!  

# Backreferences

If you are looking for consistency/repetition in a pattern, you can refer to previously captured values. The code below shows how to match phone numbers with either `-` or `.`, but insist that the separator is consistent. Note the use of `\1` between the 2nd and 3rd group of digits.

!!!PLNQ.PassThrough!!!

In [None]:
import re

numbers = ['616-443-2818', '616.443.2818', '6164432818', '616-443.2818', '616.443-2818', '616443-2818', '616.4432818']
for number in numbers:
    # Group 1 captures the first separator ('-', '.', or nothing at all); \1
    # then requires exactly the same separator before the last four digits.
    # The trailing '$' rejects strings with extra characters after the number.
    if re.search(r'^\d\d\d(-|\.|)\d\d\d(\1)\d\d\d\d$', number):
        print(f"{number}: Match")
    else:
        print(f"{number}: Does not match")

# !!!PLNQ.PassThrough!!!  

# Non-Capturing Groups

Consider the case of capturing the domain and _optional_ port from a URL. 
* `http://example.com/index.html`
* `http://example.com:443/index.html`

A first attempt might look like this: `://([^:/]+)(:(\d+))?/`

Notice the nested parentheses: `(:(\d+))?`
* We want to match either both the colon and the number, or neither. Hence, the outer parentheses for the `?`.
* However, we only want to capture the number, hence the inner parentheses. 

!!!PLNQ.PassThrough!!!

In [None]:
import re

for url in ['http://example.noport.com/index.html','http://example.withport.com:443/index.html']:
    print(f"\nParsing '{url}'")
    # Capture groups: 1 = host, 2 = ':port' (colon included), 3 = port digits,
    # 4 = everything after the first '/' that follows the host/port.
    m = re.search(r'://([^:/]+)(:(\d+))?/(.*)$', url)
    if not m:
        print("No match")
        continue
    # m.groups() begins with capture group 1 (group 0 is the entire match), so
    # number the printout from 1 to line up with m[1], m[2], ...
    for index, c in enumerate(m.groups(), start=1):
        print(f"Group {index}: {c}")

# !!!PLNQ.PassThrough!!!  

Notice what happens if we add `?:` to the beginning of the group with the colon:

!!!PLNQ.PassThrough!!!

In [None]:
import re

for url in ['http://example.noport.com/index.html','http://example.withport.com:443/index.html']:
    print(f"\nParsing '{url}'")
    # (?: ... ) groups the colon with the port without capturing it, so the
    # capture groups are now: 1 = host, 2 = port digits, 3 = the rest of the URL.
    m = re.search(r'://([^:/]+)(?::(\d+))?/(.*)$', url)
    if not m:
        print("No match")
        continue
    # m.groups() begins with capture group 1 (group 0 is the entire match), so
    # number the printout from 1 to line up with m[1], m[2], ...
    for index, c in enumerate(m.groups(), start=1):
        print(f"Group {index}: {c}")

# !!!PLNQ.PassThrough!!!  