In [1]:
import re
from numpy.testing import assert_equal

#### Note : Reviewer is still incomplete. Adding some notes on advanced RegEx applications (e.g. Look Aheads, if-else, groups)

# Recommended External Guides and Tools for RegEx

1. Official Documentation of RegEx in Python 3
    * https://docs.python.org/3/howto/regex.html
2. Regex 101
    * Highly recommended site for testing regex since it lists down all valid Python3 RegExs.
    * https://regex101.com
3. RegEx Cheatsheet
    * https://cheatography.com/davechild/cheat-sheets/regular-expressions/
    

**DISCLAIMER** : This reviewer mostly contains use-cases and explanations of the regex exercises. For a more comprehensive guide on syntax and the list of available expressions, see the resources listed above.

# Python RegEx Commands

## Methods and Attributes

### re.compile()
Create a pattern object for searching or matching RegEx. This is useful if RegEx will be used multiple times.

In [3]:
# Sample sentence and compiled pattern shared by the match/search/findall/
# finditer demonstrations that follow.
text = "Fishes are not dog's dishes"
# Raw string: \w must reach the regex engine untouched. In a normal string
# literal it is an invalid escape sequence that newer Pythons warn about.
p = re.compile(r'(\w+ish\w+)')


re.compile(r'(\w+ish\w+)', re.UNICODE)

#### re.match()
Determine if the RegEx matches at the beginning of the string.

In [None]:
print(p.match(text))

<re.Match object; span=(0, 6), match='Fishes'>


#### re.search()
Scan through a string, looking for any location where this RegEx matches.

In [None]:
print(p.search(text))

<re.Match object; span=(0, 6), match='Fishes'>


#### re.findall()
Find all substrings where the RegEx matches, and returns them as a list.

In [None]:
print(p.findall(text))

['Fishes', 'dishes']


#### re.finditer()
Find all substrings where the RegEx matches, and return them as an iterator of match objects.

In [None]:
print(p.finditer(text))
print([x for x in p.finditer(text)])

<callable_iterator object at 0x7f36d0972520>
[<re.Match object; span=(0, 6), match='Fishes'>, <re.Match object; span=(21, 27), match='dishes'>]


#### RegEx Object Attributes

In [4]:
m = p.match(text)

**group()**

Return the string matched by the RegEx

In [5]:
m.group()

'Fishes'

**start() and end()**

Return the starting/ending position of the match

In [None]:
m.start(),m.end()

(0, 6)

**span()**

Return a tuple containing the (start, end) positions of the match

In [None]:
m.span()

(0, 6)

### String Manipulation

In [None]:
text = 'words are not words but maybe words shouldnt be words'

#### re.split()
Split the string into a list, splitting it wherever the RegEx matches

In [None]:
re.split('words',text)

['', ' are not ', ' but maybe ', ' shouldnt be ', '']

In [None]:
re.split('words',text, maxsplit = 2)

['', ' are not ', ' but maybe words shouldnt be words']

#### re.sub()
Find all substrings where the RegEx matches, and replace them with a different string

In [None]:
re.sub('words','humans', text)

'humans are not humans but maybe humans shouldnt be humans'

In [None]:
re.sub('words','humans', text, count = 1)

'humans are not words but maybe words shouldnt be words'

#### re.subn()
Does the same thing as sub(), but returns the new string and the number of replacements

In [None]:
re.subn('words','humans', text)

('humans are not humans but maybe humans shouldnt be humans', 4)

In [None]:
re.subn('words','humans', text, count = 1)

('humans are not words but maybe words shouldnt be words', 1)

# Exercises and Assignment1 Explanations

## RegEx Exercise

### Is Date?
Create a function `is_date` that uses python regular expressions to check whether the input string follows any of the following date format:

* yyyy-mm-dd
* yyyy/mm/dd
* mm-dd-yyyy
* mm/dd/yyyy
* dd-mm-yyyy
* dd/mm/yyyy

It should return `True` if it does and `False` otherwise. No need to check whether the input is a valid date.

**SUMMARY**
1. Define a variable for each component of a date to avoid unnecessary repetition when writing the pattern.
```python
yyyy = '([1-9][0-9]{3})' # Matches Year
dd = '(0[1-9]|[12][0-9]|3[01])' # Matches Day
mm = '(0[1-9]|1[012])' # Matches Month
```
2. Create the RegEx statements (for this item, I'll discuss only one portion since the other portions are just variations).

    * **Skeleton**

        * `^(()|()|())$`

            * `^` 
                * Anchor to start of line

            * `$` 
                * Anchor to end of line

            * `(...)|(...)|(...)` 
                * Or Statements to Capture Variations 
    * **For Matching Dates (What goes inside `(...)`)**

        * `fr'({yyyy}(?P<del>[-/]){mm}(?P=del){dd})'`

            * `{yyyy}`, `{mm}`, and `{dd}` are python format strings flagged by `f` in the beginning of the string.
            * `(?P<del>[-/])` 
                * is a named capturing group where `del` is the name of the group and it captures
            if the delimiter is either a `-` or a `/`.
            * `(?P=del)` 
                * matches the same text matched by the previously named capture group `del`.                

In [None]:
def is_date(test_str):
    '''Check if input test_str matches one of the following date formats:

    * yyyy-mm-dd
    * yyyy/mm/dd
    * mm-dd-yyyy
    * mm/dd/yyyy
    * dd-mm-yyyy
    * dd/mm/yyyy

    The same delimiter must be used in both positions. Only the shape of
    the string is checked: the year is four digits not starting with 0,
    the month is 01-12 and the day is 01-31. Calendar validity (e.g. a
    31st of February) is not checked.

    Parameters
    -------
    test_str: str
        Input string to check for date format.

    Returns
    -------
    is_valid_date : bool
        True if the whole string follows one of the date formats,
        False otherwise.
    '''
    yyyy = '([1-9][0-9]{3})'  # Matches Year
    dd = '(0[1-9]|[12][0-9]|3[01])'  # Matches Day
    mm = '(0[1-9]|1[012])'  # Matches Month
    # Adjacent literals inside parentheses are concatenated implicitly. A
    # backslash continuation cannot be followed by a comment (SyntaxError).
    pattern = (
        fr'({yyyy}(?P<del>[-/]){mm}(?P=del){dd})|'    # yyyy-mm-dd, yyyy/mm/dd
        fr'({mm}(?P<del2>[-/]){dd}(?P=del2){yyyy})|'  # mm-dd-yyyy, mm/dd/yyyy
        fr'({dd}(?P<del3>[-/]){mm}(?P=del3){yyyy})'   # dd-mm-yyyy, dd/mm/yyyy
    )
    # fullmatch anchors both ends of the string; unlike '$' it does not
    # accept a trailing newline after the date.
    return re.fullmatch(pattern, test_str) is not None

In [None]:
assert_equal(is_date('2020-05-18'), True)
assert_equal(is_date('2020/05/18'), True)
assert_equal(is_date('05-18-2020'), True)
assert_equal(is_date('05/18/2020'), True)
assert_equal(is_date('18-05-2020'), True)
assert_equal(is_date('18/05/2020'), True)
assert_equal(is_date('5-18-2020'), False)
assert_equal(is_date('5/18/2020'), False)
assert_equal(is_date('18-5-2020'), False)
assert_equal(is_date('18/5/2020'), False)
assert_equal(is_date('2020-054-18'), False)
assert_equal(is_date('2020-05-181'), False)
assert_equal(is_date('202-05-18'), False)

### Find Data
Create a function `find_data` that returns all occurrences of `data set` or `dataset` in `text` as a list of strings in the same order as they appear in `text`.

**SUMMARY**

* **RegEx Skeleton**

    * `(\bdata\s?set\b)`

        * `\bdata`
            * using `\b` as a word boundary, we prevent matches where `data` is just the tail-end of a word

        * `\s?`
            * Specifies the possibility of encountering a whitespace either once or not at all.

        * `set\b`
            * matches the word `set`. The optional whitespace before it is already handled by `\s?`, and the trailing `\b` requires a word boundary right after `set`, so longer words such as `sets` or `seta` are rejected.


In [None]:
def find_data(text):
    '''Collect every 'dataset' / 'data set' occurrence in the input string.

    'data' must start at a word boundary and 'set' must end at one; at
    most one whitespace character may separate the two.

    Parameters
    -------
    text: str
        Input string to parse

    Returns
    -------
    occurrences : list of strings
        The matched substrings, in order of appearance.
    '''
    matcher = re.compile(r'(\bdata\s?set\b)')
    return [hit.group(1) for hit in matcher.finditer(text)]

In [None]:
text = """
My dataset is bigger than your data set. Your data set is so tiny, ten of your
data sets can fit inside my dataset. Ten of your data sets inside one of my
data set! Your data set doesn't stand a chance.
"""
assert_equal(
    find_data(text), 
    ['dataset', 'data set', 'data set', 'dataset', 'data set', 'data set'])

### Find Lamb
Create a function `find_lamb` that returns all phrases in `text` that begin with the word `little` and end with the word `lamb` as a list of strings in the same order as they appear in `text`.

**SUMMARY**

* **RegEx Skeleton**

    * `(\blittle\b.*?\blamb\b)`

        * `\blittle\b`
            * using `\b` as a word boundary, we prevent matches where `little` is either just the front-end or tail-end of a word

        * `.*?`
            * is a non-greedy pattern to capture words and characters between `little` and `lamb`.

        * `\blamb\b`
            * using `\b` as a word boundary, we prevent matches where `lamb` is either just the front-end or tail-end of a word (e.g. `lambchops`, `kulambo`)


In [None]:
def find_lamb(text):
    '''Collect the phrases that start with the word 'little' and end
    with the word 'lamb'.

    Each phrase is the shortest stretch from a whole-word 'little' to the
    next whole-word 'lamb' on the same line ('.' does not match newlines).

    Parameters
    -------
    text: str
        Input string to parse

    Returns
    -------
    occurrences : list of strings
        The matched phrases, in order of appearance.
    '''
    phrase_re = re.compile(r'(\blittle\b.*?\blamb\b)')
    return [found.group(1) for found in phrase_re.finditer(text)]

In [None]:
text = """
Mary had a little lamb, little lamb, little lamb
Mary had a little brown lamb, little brown lamb, little brown lamb
Whose wool is as brown as wood
"""
assert_equal(
    find_lamb(text), 
    ['little lamb',
     'little lamb',
     'little lamb',
     'little brown lamb',
     'little brown lamb',
     'little brown lamb'])

### Repeat Alternate (Replacement/Substitution Problem)
Create a function `repeat_alternate` that returns a string where every other word of `text` is repeated.

**SUMMARY**

* **RegEx Skeleton (Matches)**

    * `\s*?(\b\S+?\b)\s(\b\S+?\b\s)`

        * `\s*?`
            * Captures any leading whitespaces and accounts for spaces every two words.

        * `(\b\S+?\b)` 
            * First word capture.

        * `\s`
            * Whitespace between first word capture and second word capture
            
        * `(\b\S+?\b\s)`
            * Second word capture. Notice that we made it non-greedy so that the match does not extend beyond one-word. 
            
* **RegEx Skeleton (Replacement)**

    * `\1 \1 \2`

        * `\1` is the first word capture
        * `\1 \1` is two `first word captures` in sequence
        * `\1 \1 \2` adds the second word capture to the two `first word captures`.

In [None]:
def repeat_alternate(text):
    '''Return an altered text input where every other word (starting
    from the first word) is repeated.

    A "word" is any run of non-whitespace characters, so words carrying
    punctuation (e.g. "Where's" or '"raw"') are handled like any other.
    The repeated word is joined to its copy with a single space; all
    whitespace already present in the input is kept as is.

    Parameters
    -------
    text: str
        Input string to alter

    Returns
    -------
    modified_text : str
        Altered string
    '''
    # Group 1 is the word to repeat. Group 2 is the optional following word
    # together with the whitespace in front of it, so the final word is
    # still repeated when the text has an odd number of words or no
    # trailing whitespace.
    pattern = r'(\S+)(\s+\S+)?'
    # An unmatched group 2 is substituted with the empty string.
    replacement = r'\1 \1\2'
    return re.sub(pattern, replacement, text)

In [None]:
text = ("Peter Piper picked a peck of pickled peppers "
        "A peck of pickled peppers Peter Piper picked "
        "If Peter Piper picked a peck of pickled peppers "
        "Where's the peck of pickled peppers Peter Piper picked ")
assert_equal(
    repeat_alternate(text),
    ("Peter Peter Piper picked picked a peck peck of pickled pickled peppers "
     "A A peck of of pickled peppers peppers Peter Piper Piper picked If If "
     "Peter Piper Piper picked a a peck of of pickled peppers peppers "
     "Where's the the peck of of pickled peppers peppers Peter Piper Piper "
     "picked ")
)

### Whats on the Bus
Create a function `whats_on_the_bus` that will return the unique items that are on the bus according to the `text`.

**SUMMARY**

* **RegEx Skeleton**

    * `The ([A-Za-z]+?) on the bus`

        * Since we know that the format for specifying whats on the bus, we simply create a capturing
        group `([A-Za-z]+?)` for the item on the bus.

In [None]:
def whats_on_the_bus(text):
    '''Collect the distinct things the text says are on the bus.

    An item is the run of ASCII letters found in a phrase of the form
    'The <item> on the bus'.

    Parameters
    -------
    text: str
        Input string to parse.

    Returns
    -------
    objects : set
        Unique items on the bus.
    '''
    bus_item = re.compile(r'The ([A-Za-z]+?) on the bus')
    return {mention.group(1) for mention in bus_item.finditer(text)}

In [None]:
text = """
The wheels on the bus go round and round
Round and round, round and round
The wheels on the bus go round and round
All day long
The wipers on the bus go swish, swish, swish
Swish, swish, swish, swish, swish, swish
The wipers on the bus go swish, swish, swish
All day long
The horn on the bus goes beep, beep, beep
Beep, beep, beep, beep, beep, beep
The horn on the bus goes beep, beep, beep
All day long
The babies on the bus go wah, wah, wah
Wah, wah, wah, wah, wah, wah
The babies on the bus go wah, wah, wah
All day long
The wheels on the bus go round and round
Round and round, round and round
The wheels on the bus go round and round
All day long
"""
items = whats_on_the_bus(text)
assert_equal(len(items), 4)
assert_equal(set(items), set(['babies', 'horn', 'wheels', 'wipers']))

### String to List
Create a function `to_list` that returns the list of items in `text` which were delimited by `,`, `+` or `and`.

**SUMMARY**

* We can use `re.split()` for this particular problem! Recall that `re.split` delimits our string based on the patterns we specify.

* I think this problem is pretty clear so no further explanation needed :)

In [None]:
def to_list(text):
    '''Split input text into a list with respect to the following delimiters:

    * ,
    * +
    * and

    White-space is NOT a delimiter: items such as 'foo bar' are kept whole.
    The literal 'and' is matched anywhere, including inside a longer word.

    Parameters
    -------
    text: str
        Input string to parse.

    Returns
    -------
    list of text : list
        Pieces of the input string found between the delimiters, in order.
    '''
    # Raw string avoids invalid string escapes; '+' and ',' need no escaping
    # inside a character class.
    return re.split(r'[+,]|and', text)

In [None]:
text = "a,b,candfoo bar+bazandd e+fee fi fo"
assert_equal(
    to_list(text), 
    ['a', 'b', 'c', 'foo bar', 'baz', 'd e', 'fee fi fo'])

### Product of Matches
Create a function `march_product` that returns the product of each `m by n` pair in `text`.

**SUMMARY**

* Since the RegEx for this problem is straightforward, the main concern is how to use the capturing groups to get the product of each pair. Luckily, `re.findall` gives us a list of tuples of the capturing groups, so we can use a list comprehension to perform our multiplication.

* **RegEx Skeleton**

    * `(\d+) by (\d+)`

        * This RegEx problem only requires that we extract the integers that are separated by the word `by` (based on input string). Explanation is left to the reader :)


In [None]:
def march_product(text):
    '''Multiply the two numbers of every 'm by n' pair found in the text
    (the number of ants in each marching group).

    Parameters
    -------
    text: str
        Input string to parse.

    Returns
    -------
    list of products : list of integers
        One product per 'm by n' pair, in order of appearance.
    '''
    products = []
    for rows, columns in re.findall(r'(\d+) by (\d+)', text):
        # Both captures are digit strings; convert before multiplying.
        products.append(int(rows) * int(columns))
    return products

In [None]:
text = """
The ants go marching 1 by 1, hurrah, hurrah
The ants go marching 2 by 13, hurrah, hurrah
The ants go marching 42 by 8,
The little one stops to suck his thumb
And they all go marching down to the ground
To get out of the rain, BOOM! BOOM! BOOM!

The ants go marching 9 by 16, hurrah, hurrah
The ants go marching 54 by 7, hurrah, hurrah
The ants go marching 8 by 42,
The little one stops to tie his shoe
And they all go marching down to the ground
To get out of the rain, BOOM! BOOM! BOOM!
"""
assert_equal(march_product(text), [1, 26, 336, 144, 378, 336])

### Get BIG
Create a function `get_big` that will take in `items` and return the list of `ITEM`s that begin with `Big` but with an SKU that is not all numbers.

**SUMMARY**

* **RegEx Skeleton**

    * `^(?:[A-Z0-9]*(?:[A-Z]\d|\d[A-Z]|[A-Z])\S*?) (Big\b.*)$`

        * `^` 
            * Anchor to start of line

        * `(?:[A-Z0-9]*(?:[A-Z]\d|\d[A-Z]|[A-Z])\S*?)`
            * Nested Non-Capturing Groups
            * `[A-Z0-9]*` 
                * Accounts for all leading character before a valid pair of character is found.
            * `(?:[A-Z]\d|\d[A-Z]|[A-Z])`
                * Searches for valid pairs
                    * a number succeeding a letter
                    * a letter succeeding a number
                    * a letter (for instances that SKU is just a single character)
            * `\S*?`
                * To account for trailing non-whitespace characters.
        * `(Big\b.*)$`
            * Matches phrases that start with the word `Big`.

        * `$` 
            * Anchor to end of line

In [None]:
def get_big(items):
    '''Identify items with an SKU that is a combination of numbers and
    letters with an item name starting with "Big".
    
    Parameters
    -------
    items: str
        Input string to parse.

    Returns
    -------
    big_items: list of strings
        List of items that match the pattern desired.
    '''
    pattern = r'^(?:[A-Z0-9]*(?:[A-Z]\d|\d[A-Z]|[A-Z])\S*?) (Big\b.*)$'
    return re.findall(pattern,items,re.M)

In [None]:
items = """SKU ITEM
1A Big Red Box
A0 Big Bad Wolf
A1 Bigrams and Trigrams
02 Big Big World
BC Ain't Big Shoes
3C Bigger not Big"""

assert_equal(get_big(items), ['Big Red Box', 'Big Bad Wolf'])

In [None]:
get_big(items)

['Big Red Box', 'Big Bad Wolf', 'Big Big World']

### Find Chris
Create a function `find_chris` that returns, as a list of strings, the first names in `test_str` that contain `Chris` (case-insensitive) and belong to a person whose last name does not start with `B` or `M`.

**SUMMARY**

* Pass the `re.I` flag so that the RegEx is case-insensitive

* **RegEx Skeleton**

    * `(\w*Chris\w*) [^BM].*`

        * `(\w*Chris\w*)`
            * matches all words that contains the word `Chris`

        * `[^BM].*`
            * Gets all Last Names that doesnt start with `B` or `M`

In [None]:
def find_chris(test_str):
    '''Collect the first names that contain 'Chris' (in any letter case)
    and are followed by a last name not starting with a B or an M.

    Because the whole pattern is case-insensitive, last names starting
    with a lowercase b or m are excluded as well.

    Parameters
    -------
    test_str: str
        Input string to parse.

    Returns
    -------
    chris_names: list of strings
        First names that followed the desired pattern, in order of
        appearance (duplicates are kept).
    '''
    name_re = re.compile(r'(\w*Chris\w*) [^BM].*', re.I)
    return [person.group(1) for person in name_re.finditer(test_str)]

In [None]:
names = '''ID FIRST_NAME LAST_NAME
1 Christian Alis
A Chris Monterola
A1 Chris Hemsworth
02 Christian Bale
BC Christopher Nolan
3C Christopher de Leon
4F Marichris Chu'''

res = find_chris(names)
assert_equal(type(res), list)
assert_equal(set(res), 
             set(['Christian', 'Chris', 'Christopher', 'Christopher', 
                  'Marichris']))

### Parse Logs
Create a function `get_client` that accepts `server_log` and returns a list of client IP , date/time of server access, and status code tuples from `log`. The value of `server_log` is shown below with relevant information highlighted in red.
<pre>
<font color="red">66.249.65.159</font> - - [<font color="red">06/May/2019:19:10:38 +0800</font>] "GET /news/53f8d72920ba2744fe873ebc.html HTTP/1.1" <font color="red">404</font> 177 "-" "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
<font color="red">66.249.65.3</font> - - [<font color="red">06/May/2019:19:11:24 +0800</font>] "GET /?q=%E0%A6%AB%E0%A6%BE%E0%A7%9F%E0%A6%BE%E0%A6%B0 HTTP/1.1" <font color="red">200</font> 4223 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
<font color="red">127.0.0.1</font> - - [<font color="red">06/May/2019:19:12:14 +0800</font>] "GET /?q=%E0%A6%A6%E0%A7%8B%E0%A7%9F%E0%A6%BE HTTP/1.1" <font color="red">200</font> 4356 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
</pre>

**SUMMARY**

* **RegEx Skeleton**

    * `(?:(\d{1,3}\.\d{1,3}.\d{1,3}.\d{1,3}) - - \[(.*)\] \".*?\" (\d{3}))`

        * `(?:(\d{1,3}\.\d{1,3}.\d{1,3}.\d{1,3})`
            * Matches the IP-address of log entries. Note that this requires a bit of domain knowledge since it was not mentioned that each value of an IP address may contain up to 3 digits.

        * `\[(.*)\]`
            * Captures the timestamp that is contained within square brackets.
        
        * `\".*?\"`
            * Accounts for the HTTP methods that are contained inside quotation marks
            
        * `(\d{3})`
            * Matches the first group of three digits after the HTTP methods (which in this case is error code)


In [None]:
def get_client(server_log):
    '''Extract relevant information from server log. The following
    information is retrieved for every matching entry:
    * IPv4 address of the client (four dot-separated groups of 1-3 digits)
    * Date/time of access (the text between the square brackets)
    * Status code (the three digits right after the quoted request)

    Parameters
    -------
    server_log: str
        String of server logs.

    Returns
    -------
    info: list of tuples
        One (ip, timestamp, status_code) tuple of strings per entry,
        in order of appearance.
    '''
    pattern = (
        # Every dot is escaped so only a literal '.' separates the octets.
        r'(\d{1,3}(?:\.\d{1,3}){3})'
        # [^\]]* stops the timestamp at the first closing bracket instead of
        # greedily running to the last one on the line.
        r' - - \[([^\]]*)\] ".*?" (\d{3})'
    )
    return re.findall(pattern, server_log)

In [None]:
server_log = '''66.249.65.159 - - [06/May/2019:19:10:38 +0800] "GET /news/53f8d72920ba2744fe873ebc.html HTTP/1.1" 404 177 "-" "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
66.249.65.3 - - [06/May/2019:19:11:24 +0800] "GET /?q=%E0%A6%AB%E0%A6%BE%E0%A7%9F%E0%A6%BE%E0%A6%B0 HTTP/1.1" 200 4223 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
127.0.0.1 - - [06/May/2019:19:12:14 +0800] "GET /?q=%E0%A6%A6%E0%A7%8B%E0%A7%9F%E0%A6%BE HTTP/1.1" 200 4356 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"'''

assert_equal(get_client(server_log),
             [('66.249.65.159', '06/May/2019:19:10:38 +0800', '404'),
              ('66.249.65.3', '06/May/2019:19:11:24 +0800', '200'),
              ('127.0.0.1', '06/May/2019:19:12:14 +0800', '200')])

## Assignment 1

### Get Previous Word

**SUMMARY**

* Make use of `re.escape` for literal search terms.

* **RegEx Skeleton**

    * `fr'(\w+?)(?=\b(?:[^.]\W*?)\b{re.escape(search_term)}\b)'`

        * `(\w+?)`
            * The Previous Word that we are looking for.

        * `(?=\b(?:[^.]\W*?)`
            * A look ahead function that checks if the `Previous Word` is found at the end of a sentence (rejects the match if it is) and also accounts for non-word characters (special characters and whitespace). 
        
        * `\b{re.escape(search_term)}\b`
            * Search Word


In [None]:
def get_prev_word(inp, search_term):
    """Find the words that come immediately before `search_term` in `inp`.

    The search term is taken literally (it is passed through `re.escape`)
    and must appear as a whole word. A word that is directly followed by a
    period is never reported, so a search term at the start of a sentence
    has no previous word.

    Parameters
    ----------
    inp: string
        String to parse
    search_term : string
        Word whose preceding words are wanted

    Returns
    -------
    matches : list
        Words located immediately before each occurrence of search_term,
        in order of appearance.
    """
    literal_term = re.escape(search_term)  # e.g. '.*' becomes '\.\*'
    # Look-ahead: one non-period character, optional further non-word
    # characters, then the search term as a whole word.
    followed_by_term = fr'(?=\b(?:[^.][\W]*?)\b{literal_term}\b)'
    finder = re.compile(r'(\w+?)' + followed_by_term)
    return finder.findall(inp)

In [None]:
test_inp = ('Data wrangling, sometimes referred to as data munging, is the '
            'process of transforming and mapping data from one "raw" data '
            'form into another format with the intent of making it more '
            'appropriate and valuable for a variety of downstream purposes '
            'such as analytics. A data wrangler is a person who performs '
            'these transformation operations.\n'
            'This may include further munging, data visualization, data '
            'aggregation, training a statistical model, as well as many '
            'other potential uses. Data munging as a process typically '
            'follows a set of general steps which begin with extracting the '
            'data in a raw form from the data source, "munging" the raw data '
            'using algorithms or parsing the data into predefined data '
            'structures, and finally depositing the resulting content into a '
            'data sink for storage and future use.')

assert isinstance(get_prev_word('', ''), list)
assert_equal(get_prev_word(test_inp, '.*'), [])
assert_equal(get_prev_word(test_inp, 'Data'), [])
assert_equal(get_prev_word(test_inp, 'This'), [])
assert_equal(get_prev_word(test_inp, 'wrangling'), ['Data'])
assert_equal(get_prev_word(test_inp, 'the'),
             ['is', 'with', 'extracting', 'from', 'munging', 
              'parsing', 'depositing'])
assert_equal(get_prev_word(test_inp, 'analytics'), ['as'])

### Get Next Word

**SUMMARY**

* Make use of `re.escape` for literal search terms.

* Note that this is a similar implementation with 3.2.1 and is only different in order. Check explanation of this RegEx from 3.2.1


In [None]:
def get_next_word(inp, search_term):
    """Find the words that come immediately after `search_term` in `inp`.

    The search term is taken literally (it is passed through `re.escape`)
    and must appear as a whole word. Nothing is reported when the search
    term is directly followed by a period, so a search term at the end of
    a sentence has no next word.

    Parameters
    ----------
    inp: string
        String to parse
    search_term : string
        Word whose following words are wanted

    Returns
    -------
    matches : list
        Words located immediately after each occurrence of search_term,
        in order of appearance.
    """
    literal_term = re.escape(search_term)
    # Look-behind: the search term, as a whole word, ends right here.
    preceded_by_term = fr'(?<=\b{literal_term}\b)'
    # Then one non-period character, optional non-word characters, and the
    # captured next word.
    finder = re.compile(preceded_by_term + r'(?:[^.][\W]*?)(\b\w+\b)')
    return finder.findall(inp)

In [None]:
# Checks for get_next_word (the first assertion previously exercised
# get_prev_word by mistake).
assert isinstance(get_next_word('', ''), list)
assert_equal(get_next_word(test_inp, '.*'), [])
assert_equal(get_next_word(test_inp, 'Data'), ['wrangling', 'munging'])
assert_equal(get_next_word(test_inp, 'data'), 
             ['munging', 'from', 'form', 'wrangler', 'visualization',
              'aggregation', 'in', 'source', 'using', 'into', 'structures',
              'sink'])
assert_equal(get_next_word(test_inp, 'analytics'), [])
assert_equal(get_next_word(test_inp, 'operations'), [])
assert_equal(get_next_word(test_inp, 'visualization'), ['data'])
assert_equal(get_next_word(test_inp, 'aggregation'), ['training'])
assert_equal(get_next_word(test_inp, 'source'), ['munging'])

### Parse Access Logs

**SUMMARY**

* **RegEx Skeleton**

    * `fr'(.*?) - \((.*?)\)\[(.*?)\](?:{info_pattern}|{error_pattern})'`

        * `(.*?) - \((.*?)\)\[(.*?)\]`
            * Three capturing groups that captures `date`, `logger`, and `severity`.

        * `(?:{info_pattern}|{error_pattern})`
            * A non-capturing group that checks for two different patterns.
            * Info-Pattern
                * `(?:(?<!\[ERROR\])\[(.*?)\]: (.*?) (.*?)  (.*?) (\d+))`
                    * `(?<!\[ERROR\])`
                        * Look behind pattern to check that `severity` is not equal to `[ERROR]`.
                    * `\[(.*?)\]: (.*?) (.*?)  (.*?) (\d+))`
                        * Five capturing groups that captures `ip`, `method`, `url`, `response_code`, `bytes`
            * Error-Pattern
                * `(?:(?<=\[ERROR\]): ((?:.+\n?){4}))`
                    * `(?<=\[ERROR\])`
                        * Look behind pattern to check that `severity` is equal to `[ERROR]`.
                    * `((?:.+\n?){4}))`
                        * Capture the next four lines of text which should be equal to `error`.

In [None]:
def parse_access_log(log):
    """Return a list of dictionaries based on the logs of a webserver.
    Dictionary keys are as follows : `date`, `logger`, `severity`, `ip`,
    `method`, `url`, `response_code`, `bytes`, `error`

    Every entry starts with ``<date> - (<logger>)[<severity>]``. Entries
    whose severity is not ERROR continue with
    ``[<ip>]: <method> <url>  <response_code> <bytes>`` (two spaces after
    the url). ERROR entries continue with ``: `` followed by the error
    text, captured as four repetitions of "rest of line" (four lines for
    the traceback format used in the examples). Fields that do not apply
    to an entry are set to None.

    Parameters
    ----------
    log: string
        String of webserver logs to parse

    Returns
    -------
    dictionaries : list of dictionaries
        List of dictionaries containing labeled information about
        web server logs, one per entry and in order of appearance.
    """
    # Key order mirrors the order of the capturing groups in `pattern`.
    dictionary_keys = ['date', 'logger', 'severity', 'ip',
                       'method', 'url', 'response_code', 'bytes',
                       'error']  # err_msg in example
    info_pattern = r'(?:(?<!\[ERROR\])\[(.*?)\]: (.*?) (.*?)  (.*?) (\d+))'
    error_pattern = r'(?:(?<=\[ERROR\]): ((?:.+\n?){4}))'
    pattern = fr'(.*?) - \((.*?)\)\[(.*?)\](?:{info_pattern}|{error_pattern})'
    # Parse the `log` argument itself, not a variable from the notebook scope.
    matches = re.findall(pattern, log, re.M)
    # findall reports groups of the unused alternative as ''; store None.
    return [
        {key: (value if value != '' else None)
         for key, value in zip(dictionary_keys, groups)}
        for groups in matches
    ]

In [None]:
access_logs = '''2018-07-12 10:53:32 - (network)[INFO][127.0.0.1:57534]: GET http://127.0.0.1:8081/v1/get/landuse?lng=121.02787971496582&lat=14.606591351724205&d=500  200 1883
2018-07-12 10:54:14 - (network)[INFO][127.0.0.1:57534]: GET http://127.0.0.1:8081/v1/get/landuse?lng=120.54113388061523&lat=14.67892177840028&d=500  200 1884
2018-07-12 10:54:17 - (network)[INFO][127.0.0.1:57534]: GET http://127.0.0.1:8081/v1/get/landuse?lng=120.27711868286133&lat=14.82400863570895&d=500  200 719
2018-07-12 10:54:20 - (network)[INFO][127.0.0.1:57534]: GET http://127.0.0.1:8081/v1/get/landuse?lng=120.91775894165039&lat=14.096787557861887&d=500  200 345
2018-07-12 11:48:05 - (sanic)[ERROR]: Traceback (most recent call last):
  File "/anaconda3/envs/server/lib/python3.6/site-packages/sanic/router.py", line 356, in _get
    raise NotFound('Requested URL {} not found'.format(url))
sanic.exceptions.NotFound: Requested URL /v1/get/not_exist not found'''
parsed_log = parse_access_log(access_logs)
assert_equal(len(parsed_log), 5)
assert_equal(parsed_log[0],
             {'date': '2018-07-12 10:53:32',
              'logger': 'network',
              'severity': 'INFO',
              'ip': '127.0.0.1:57534',
              'method': 'GET',
              'url': 'http://127.0.0.1:8081/v1/get/landuse?lng=121.02787971496582&lat=14.606591351724205&d=500',
              'response_code': '200',
              'bytes': '1883',
              'error': None})
assert_equal(parsed_log[-1],
             {'date': '2018-07-12 11:48:05',
              'logger': 'sanic',
              'severity': 'ERROR',
              'ip': None,
              'method': None,
              'url': None,
              'response_code': None,
              'bytes': None,
              'error': '''Traceback (most recent call last):
  File "/anaconda3/envs/server/lib/python3.6/site-packages/sanic/router.py", line 356, in _get
    raise NotFound('Requested URL {} not found'.format(url))
sanic.exceptions.NotFound: Requested URL /v1/get/not_exist not found'''})

### Repeating Words

**SUMMARY**

* Pass the `re.I` flag so that the RegEx is case-insensitive

* **RegEx Skeleton**

    * `((\b\w+\b)(\W+?\2){1,}\b)`
    
        * `((...)(...))`
            * The outermost parentheses act as the first capturing group.

            * `(\b\w+\b)`
                * Second capturing group.

            * `(\W+?\2){1,}\b`
                * Checks if the succeeding words (`\2`) are equal to the `second capturing group`.
                * `{1,}` is the same as `+`
                * This group returns only one of the repeating words, not every single instance, because a quantified capturing group only keeps its last match.
* **QUESTION** : Can it be possible to not use capturing groups?
    * Yes! You can use look aheads which are non-capturing groups.

In [None]:
def repeating_words(text):
    """Find every phrase in which one word is repeated back-to-back.

    Matching is case-insensitive, and the repeated words may be separated
    by any run of non-word characters (spaces, commas, newlines, ...).

    Parameters
    ----------
    text : string
        Text to scan for repeated words.

    Returns
    -------
    phrases : list of strings
        Each repeated-word phrase exactly as it appears in `text`, in
        order of appearance.
    """
    pattern = r'((\b\w+\b)(\W+?\2){1,}\b)'
    phrases = []
    for found in re.finditer(pattern, text, re.I):
        # Group 1 wraps the entire pattern, so it holds the whole phrase.
        phrases.append(found.group(1))
    return phrases

In [None]:
# Autograder-style check: full marks when the output matches exactly,
# partial credit otherwise.
text = """
Row, row, row your boat
Gently down the stream
Merrily merrily, merrily, merrily
Life is but a dream
"""
score = 1
try:
    assert_equal(
        repeating_words(text),
        ['Row, row, row', 'Merrily merrily, merrily, merrily']
    )
except Exception:
    # Catch Exception instead of using a bare `except:` so that
    # KeyboardInterrupt / SystemExit are not swallowed by the grader.
    score = 0.9
score

**NOTE** : I did not include #10 red_stuff since instructions were confusing and no definite rule was given

# Common RegEx Applications

## File Extension Manipulation

In [None]:
# Sample file name carrying two extensions.
text = "ASIAN.html.csv"

In [None]:
# Drop the middle extension, e.g. 'ASIAN.html.csv' -> 'ASIAN.csv'.
# The separators are escaped (`\.`) so they match literal dots only; a bare
# `.` matches any character and would also rewrite dot-less names such as
# 'abcde' into 'a.e'.
re.sub(r'^([^.]+)\.([^.]+)\.([^.]+)$', r'\1.\3', text)

'ASIAN.csv'

## Search for a Text (e.g. 10 character words containing audi)

In [None]:
# Find a word of exactly ten characters that contains "audi".
text = 'Fraudicius cars are not sellable but are attractive to people'
# The lookahead pins the word length without consuming anything; the rest of
# the pattern then locates "audi" inside that word.
re.compile(r'(?=\b\w{10}\b)\w*?audi\w*').search(text)

<re.Match object; span=(0, 10), match='Fraudicius'>

## Check if two words co-occur in a sentence

In [None]:
# The lookahead requires the phrase "good leader" somewhere in the line and
# the main pattern requires the word "Duterte". The name is absent here, so
# the search finds nothing.
text = "The Philippines has a good leader and that is a fact - but this is in Leni"
pattern = r"^(?=.*\bgood leader\b).*\bDuterte\b.*$"
re.compile(pattern).search(text)

NoneType

In [None]:
# Both required terms are present, so the whole sentence is matched.
text = "The Philippines has a good leader and that is a fact - but this not in Duterte"
pattern = r"^(?=.*\bgood leader\b).*\bDuterte\b.*$"
re.compile(pattern).search(text)

<re.Match object; span=(0, 78), match='The Philippines has a good leader and that is a f>

In [None]:
# The order of the two terms does not matter: the lookahead scans the whole
# line for the phrase before the main pattern looks for the name.
text = "Duterte is not a good leader"
pattern = r"^(?=.*\bgood leader\b).*\bDuterte\b.*$"
re.compile(pattern).search(text)

<re.Match object; span=(0, 28), match='Duterte is not a good leader'>

## Check if valid email address

In [None]:
emails = ['lt2.edu@aim.ph',
          'food4thought@yahoo,com',
          'aim@aim.aim']

# Raw string: `\.` inside a normal string literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
# Compiled once, outside the loop, since the same pattern is reused.
email_regex = re.compile(r'[A-Z0-9._%+-]+@(?:[A-Z0-9-]+\.)+[A-Z]{2,6}', re.I)

for email in emails:
    # fullmatch() anchors both ends of the string. Unlike a trailing `$`,
    # it does not accept an address followed by a newline.
    match = email_regex.fullmatch(email)
    if match is not None:
        print('Valid email', email)

Valid email lt2.edu@aim.ph
Valid email aim@aim.aim


# Advanced RegEx Techniques

## Groups
Groups and back-references have barely been touched in our exercises. However, groups and back-references are powerful tools that can simplify your RegEx!

### Types
* Capturing Group `(...)`.
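    * Captures the characters that match the pattern inside the capturing group
    * Referred to by their order of appearance (counting opening parentheses from left to right) when used for back-referencing and for substitutions
        * Written as `\1`, `\2`, and so on. Python's `re` supports numbered back-references up to `\99`; in replacement strings, use `\g<10>` when the reference would otherwise run into a following digit. Once you have more than a handful of groups, I recommend using named groups to keep track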
* Named Groups  `(?P<name>...)`.
    * Same as capturing groups but with a given name
    * For cases of back-referencing you may call it using the following syntax `(?P=name)`
    * For replacement strings, you may call the groups via the following syntax `\g<name>`

#### Capturing Groups

In [None]:
# Split an address of the form <First><Last>.<PROGRAM><batch>@<domain>
# into five numbered capturing groups.
text = 'JohnDoe.MSDS2022@aim.edu'
pattern = r'^([A-Z][a-z]*)([A-Z][a-z]*)\.([A-Z]+)(\d+)@(.*?)$'
match = re.compile(pattern).match(text)
# groups() returns the captured substrings in group order.
match.groups()

('John', 'Doe', 'MSDS', '2022', 'aim.edu')

In [None]:
# Rebuild the address from groups 1, 2 and 5 of the match.
re.compile(pattern).sub(r'\1\2@\5', text)

'JohnDoe@aim.edu'

#### Named-Capturing Groups

In [None]:
# Same address breakdown as with numbered groups, but each group is named.
text = 'JohnDoe.MSDS2022@aim.edu'
# Adjacent raw literals are concatenated into one pattern string; one named
# group per line keeps the long pattern readable.
named_pattern = (
    r'(?P<firstname>[A-Z][a-z]*)'
    r'(?P<lastname>[A-Z][a-z]*)'
    r'\.(?P<program>[A-Z]+)'
    r'(?P<batch>\d+)'
    r'@(?P<domain>.*)'
)
match = re.compile(named_pattern).match(text)
match.groups()

('John', 'Doe', 'MSDS', '2022', 'aim.edu')

In [None]:
# Rebuild the address from the named groups via \g<name> references.
re.compile(named_pattern).sub(r'\g<firstname>\g<lastname>@\g<domain>', text)

'JohnDoe@aim.edu'

## Lookarounds

Lookarounds are non-capturing groups that look at either the preceding or the succeeding characters in our text without consuming them.
### Types
* Lookahead `(?=...)`.
    * Asserts that what immediately follows the current position in the string matches our pattern

* Lookbehind  `(?<=...)`.
    * Asserts that what immediately precedes the current position in the string matches our pattern
    
* Negated Lookarounds
    * Just replace the `=` sign in the lookahead and lookbehind with a `!`, giving `(?!...)` and `(?<!...)`

### Examples

Password Validation where we have the following requirements:

* The password must have between six and ten word characters
* It must include at least one lowercase character 
* It must include at least three uppercase characters 
* It must include at least one digit

In [None]:
# Each rule is a lookahead anchored at the start of the string, so it
# inspects the password without consuming any characters.

# The password must have between six and ten word characters.
# The end anchor inside the lookahead is what enforces the upper bound and
# rejects non-word characters; without it, any string that merely starts
# with six word characters would pass. `\Z` is used rather than `$` so that
# a trailing newline is not accepted.
rule1 = r'^(?=\w{6,10}\Z).*$'

# It must include at least one lowercase character
rule2 = r'^(?=[^a-z]*[a-z]).*$'

# It must include at least three uppercase characters.
# The non-capturing group (?:...) is repeated three times, so three
# uppercase letters must be found; they need not be adjacent or distinct.
rule3 = r'^(?=(?:[^A-Z]*[A-Z]){3}).*$'

# It must include at least one digit
rule4 = r'^(?=\D*\d).*$'

# To enforce all the rules simultaneously, chain the four lookaheads in a
# single RegEx and keep only one copy of the shared parts (^, .*, $).
password_regex = r'^(?=\w{6,10}\Z)(?=[^a-z]*[a-z])(?=(?:[^A-Z]*[A-Z]){3})(?=\D*\d).*$'

In [None]:
# Sample passwords to run through the combined validation RegEx.
password_trials = [
    'AAAa0abc',
    'AaAa0abc',
    'Lor3MIpsuM',
]

In [None]:
# Report whether each trial password satisfies the combined RegEx.
# The loop variable is named `password` rather than `p` so it does not
# overwrite the compiled pattern `p` defined earlier in the notebook.
for password in password_trials:
    if re.match(password_regex, password) is not None:
        print('VALID!', password)
    else:
        print('INVALID!', password)

VALID! AAAa0abc
INVALID! AaAa0abc
VALID! Lor3MIpsuM


# I'll add more tomorrow. Im sleepy :)

# BONUS : Styling your RegEx
This is a simple summary of how I approach regex problems.