# Regular expressions for detecting word patterns 
- https://docs.python.org/3/howto/regex.html

In [1]:
import nltk
# add a machine-specific folder to NLTK's data search path;
# change it to the location of your own nltk_data directory
nltk.data.path.append("/data/3/zwang/nltk_data")

# use matplotlib's default style sheet for plots
from matplotlib import pyplot as plt
plt.style.use('default')

- Many text processing tasks involve pattern matching:
    - check if a word starts with a capitalized letter
    - find words ending with "ed"
    - test if "wood" is a substring of "woodchuck"
    - check if the letters in a word are alphabetic
    - extract year from the date 2021.02.05
    
- **Regular expressions** 
    - a special sequence of characters that describes **word patterns**
    - used to match or find other strings
    - **Does this string match the pattern?**

<br>
    - "woodchuck" <br>
    - "woodchucks" <br>
    - "Woodchuck" <br>
    - "Woodchucks" <br>
<img src="./woodchuck.png" width="300" align="center">


<img src="./outline.png" width="400" align="left">

In [2]:
import re
# vocabulary: the distinct Treebank tokens in sorted order
vocab = sorted({token for token in nltk.corpus.treebank.words()})
(len(vocab), type(vocab))

(12408, list)

# Disjunction 
- square bracket: [  ]
- negation: [^  ]
- pipe: |

## square bracket
- match one of the letters in the group

<img src="./sq_bracket_1.png" width="500" align="left">

- r'...': raw string literal; Python leaves the backslashes untouched, so the re library receives the pattern exactly as written
- re.compile: compile a regular expression pattern into a regular expression object

In [66]:
# compile the pattern once, then call the pattern object's own findall method
p = re.compile(r'[wW]oodchuck')
p.findall('woodchuck is the nickname of the author who write the book <<Woodchuck>>')

['woodchuck', 'Woodchuck']

In [67]:
# re.findall(): find all substrings where the RE matches, and returns them as a list
re.findall(r'[wW]oodchuck', 'woodchuck is the nickname of the author who write the book <<Woodchuck>>')

['woodchuck', 'Woodchuck']

In [68]:
re.findall(r'[1234567890]', 'my student ID is 20352321')

['2', '0', '3', '5', '2', '3', '2', '1']

In [69]:
re.findall(r'[1234567890]+', 'my student ID is 20352321')

['20352321']

- **square bracket**: ranges

<img src="./sq_bracket_2.png" width="600" align="left">

In [70]:
re.findall(r'[A-Z]', 'Drenched Blossoms')

['D', 'B']

In [80]:
# re.match(pattern, string): determine if the RE matches at the beginning of the string
re.match(r'[A-Z]', 'Drenched Blossoms') #.group(0)

<re.Match object; span=(0, 1), match='D'>

In [75]:
# re.search(pattern, string): find the first location where the RE produces a match
re.search(r'[a-z]', 'Drenched Blossoms')

<re.Match object; span=(1, 2), match='r'>

In [78]:
print(re.findall(r'[a-z]+', 'my beans were impatient'))

['my', 'beans', 'were', 'impatient']


In [79]:
re.findall(r'[0-9]', 'Chapter 1: Down the Rabbit Hole')

['1']

## negation in disjunction
- **^**: caret, means negation only when it is the *first* character inside the square brackets [^ ]

<img src="./negation.png" width="600" align="left">

In [81]:
# all strings that match the RE pattern
re.findall(r'[^A-Z]', 'Oyfn pripetchik')

['y', 'f', 'n', ' ', 'p', 'r', 'i', 'p', 'e', 't', 'c', 'h', 'i', 'k']

In [82]:
# find the first location where the RE produces a match
re.search(r'[^A-Z]', 'Oyfn pripetchik')

<re.Match object; span=(1, 2), match='y'>

In [83]:
re.search(r'[^Ss]', 'I have no exquisite reason')

<re.Match object; span=(0, 1), match='I'>

In [106]:
# only the leading ^ negates; the second ^ is a literal member of the set,
# so this finds the first character that is neither 'e' nor '^'
re.search(r'[^e^]', '^eLook here')

<re.Match object; span=(2, 3), match='L'>

In [109]:
re.__version__

'2.2.1'

In [111]:
re.search(r'[\w._%+-]', 'Look here')

<re.Match object; span=(0, 1), match='L'>

In [112]:
# same pattern without the r prefix: it still works because '\w' is not a Python
# escape sequence and the backslash is kept, but newer Python versions warn about
# such invalid escapes -- prefer the raw-string form r'[\w._%+-]'
re.search('[\w._%+-]', 'Look here')

<re.Match object; span=(0, 1), match='L'>

- Summary of the use cases of ^

<img src="./caret.png" width="400" align="left">

In [157]:
re.search(r'[^\d]', '2021.02.11')

<re.Match object; span=(4, 5), match='.'>

In [172]:
# no match: outside [] an unescaped ^ anchors the start of the string,
# which can never come right after an 'a'
re.search(r'a^b', 'Look up a^b now')

In [153]:
re.search(r'a\^b', 'Look up a^b now') # escape

<re.Match object; span=(8, 11), match='a^b'>

In [166]:
re.search(r'^b', 'beam, look up the here now') # change the location of beam

<re.Match object; span=(0, 1), match='b'>

In [103]:
re.search(r'[a^b]+', 'Look up a^ba^b now') 

<re.Match object; span=(8, 14), match='a^ba^b'>

In [171]:
re.search(r'(a\^b)+', 'Look up a^ba^b now') 

<re.Match object; span=(8, 14), match='a^ba^b'>

## pipe: |
- a choice between the pattern on its left or right
- e.g., woodchuck is another name for grounddog
    

<img src="./pipe.png" width="600" align="left">

In [21]:
re.findall(r'grounddog|woodchuck', 'woodchuck is the official name of grounddog')

['woodchuck', 'grounddog']

In [173]:
re.findall(r'grounddog|woodchuck', 'woodchuck is the official name of Grounddog', re.I) # mode

['woodchuck', 'Grounddog']

In [24]:
# strings end with ed or ing
[w for w in vocab if re.search('(ed|ing)$', w)][:10]

['62%-owned',
 'Absorbed',
 'According',
 'Adopting',
 'Advanced',
 'Advancing',
 'Alfred',
 'Allied',
 'Annualized',
 'Anything']

In [25]:
# strings that contain a substring wit, wet, wait, woot
[w for w in vocab if re.search('w(i|e|ai|oo)t', w)][:10]

['Hymowitz',
 'Switzerland',
 'awaits',
 'bellwether',
 'notwithstanding',
 'switch',
 'switched',
 'wait',
 'waited',
 'waiting']

# meta-characters

<img src="./meta.png" width="600" align="left">

In [174]:
re.findall(r'colou?r', 'colour: the color is green')

['colour', 'color']

In [177]:
re.search(r'o*h!', 'ooooh! my god!') # h! ooh!

<re.Match object; span=(0, 6), match='ooooh!'>

In [28]:
re.search(r'o+h!', 'h! my god!') # ooh!

In [178]:
re.search(r'beg.n', 'begin')

<re.Match object; span=(0, 5), match='begin'>

## Anchors: ^ \$
- ^: match start of the string
- $: match end of the string

<img src="./anchors.png" width="600" align="left">

In [30]:
# check whether a string start with a capital letter
re.search(r'^[A-Z]','Palo Alto') # P->p

<re.Match object; span=(0, 1), match='P'>

In [181]:
# no match: ^ outside the brackets anchors the start, ^ inside negates the class,
# and 'Palo Alto' starts with a capital letter (change P -> p to get a match)
re.search(r'^[^A-Z]','Palo Alto')

In [188]:
# without flags, $ matches only at the end of the string (or just before a
# final newline), so the search skips 'foo1' and returns 'foo2'
re.search(r'foo.+$','foo1\nfoo2\n')

<re.Match object; span=(5, 9), match='foo2'>

In [189]:
re.search(r'foo.+$','foo1\nfoo2\n', re.M) # matches the end of each line

<re.Match object; span=(0, 4), match='foo1'>

## backslash: \ 

- Pre-defined character set

<img src="./char_set.png" width="500" align="left">

In [33]:
re.search(r'\s','Natural Language Processing: lec-06')

<re.Match object; span=(7, 8), match=' '>

In [34]:
re.search(r'\S','Natural Language Processing: lec-06')

<re.Match object; span=(0, 1), match='N'>

In [35]:
re.search(r'\d','Natural Language Processing: lec-06')
# re.search(r'[0-9]','Natural Language Processing: lec-06')

<re.Match object; span=(33, 34), match='0'>

In [36]:
re.search(r'\D','Natural Language Processing: lec-06')
# re.search(r'[^0-9]','Natural Language Processing: lec-06')

<re.Match object; span=(0, 1), match='N'>

In [37]:
re.search(r'\w','Natural Language Processing: lec-06')
# for ASCII text this equals [0-9a-zA-Z_] -- \w also matches the underscore:
# re.search(r'[0-9a-zA-Z_]','Natural Language Processing: lec-06')

<re.Match object; span=(0, 1), match='N'>

In [38]:
re.search(r'\W','Natural Language Processing: lec-06')
# re.search(r'[^\w]','Natural Language Processing: lec-06')

<re.Match object; span=(7, 8), match=' '>

In [196]:
re.search(r'[\w]*','Natural Language Processing: lec-06')

<re.Match object; span=(0, 7), match='Natural'>

In [200]:
re.findall(r'[\w]+','Natural Language Processing: lec-06') # + / *

['Natural', 'Language', 'Processing', 'lec', '06']

In [203]:
re.findall(r'[\w\S]*','Natural Language Processing: lec-06') # \S -> \s

['Natural', '', 'Language', '', 'Processing:', '', 'lec-06', '']

### backslash as escape:
- removes the special meaning of the character that follows
- literally match a specific character 

- e.g.:
    - . matches any character
    - \\. matches a period

In [214]:
re.search(r'\.', 'The end.') # escape

<re.Match object; span=(7, 8), match='.'>

- e.g. match a literal backslash

<img src="./backslash.png" width="400" align="left">

In [215]:
# naive attempt: the pattern reaches re as \section, where \s means "any
# whitespace", so it looks for whitespace + "ection" and finds no match
# (the subject is a raw string so that its backslash is an explicit literal)
re.search('\section', r'\section{Natural Language Processing}')

In [216]:
# still no match: '\\section' is the 8-character string \section, so re again
# reads \s as "any whitespace" instead of a literal backslash followed by s
# (the subject is a raw string so that its backslash is an explicit literal)
re.search('\\section', r'\section{Natural Language Processing}')

In [217]:
# each backslash must be expressed as \\ inside a regular Python string literal,
# and the regex needs two backslashes to mean one literal backslash -> four in total
# (the subject is a raw string so that its backslash is an explicit literal)
re.search('\\\\section', r'\section{Natural Language Processing}')

<re.Match object; span=(0, 8), match='\\section'>

### r: raw string notation for regular expression patterns in Python
- avoid confusion for characters that have special meaning
    - "\n": one-character string, a newline
    - r"\n": two-character string, \ and n
        - prefixed with 'r': backslashes are not handled in any special way

In [219]:
# a Python "raw" string does not handle backslashes in any special way, so the
# pattern needs only the two backslashes that the regex itself requires;
# the subject is written as a raw string too, keeping its backslash literal
re.search(r'\\section', r'\section{Natural Language Processing}')

<re.Match object; span=(0, 8), match='\\section'>

<img src="./raw_str.png" width="300" align="left">

## parentheses: ( )
- group
    - capture and group the letters that matched the pattern
    - backreference: use the matched part inside a pattern
    
- the scope of an operator 
    - can be used together with the pipe (or disjunction) symbol 
    - r'w(i|e|ai|oo)t', matching wit, wet, wait, and woot. 

<img src="./back_refer.png" width="600" align="left">

In [42]:
# parenthesis with pipe
re.findall(r'(grounddog|woodchuck)', 'woodchuck is the official name of grounddog')

['woodchuck', 'grounddog']

In [246]:
re.search(r'(\d)[a-z]\1','zsdfg1a1z213') # r'(\d)[a-z](\d)'

<re.Match object; span=(5, 8), match='1a1'>

In [247]:
# search through a compiled pattern; \1 must repeat the digit captured by group 1
s_match = re.compile(r'(\d)[a-z]\1').search('zsdfg1a1z213')
s_match[1]

'1'

In [252]:
re.search(r'^(\d)(\d).*\2\1$','13awdfgasdf31') # .group(1)

<re.Match object; span=(0, 13), match='13awdfgasdf31'>

In [256]:
re.search(r'^(\d)(\d).*\2\1$','2021.12.02')# .group(1)

<re.Match object; span=(0, 10), match='2021.12.02'>

In [261]:
re.search(r'^([\d]*).*\1$','2021.1202').group(1)

'202'

## braced expressions: {m,n}
- e.g.,[ ]{3,5}
- specify the number of repeats of the previous pattern

In [262]:
# numbers with four digits {4}
[w for w in vocab if re.search('^[0-9]{4}$', w)][:10]

['1614',
 '1637',
 '1787',
 '1901',
 '1903',
 '1917',
 '1925',
 '1929',
 '1933',
 '1934']

In [263]:
# strings start with several digits, dash, 3 to 5 letters
[w for w in vocab if re.search('^[0-9]+-[a-z]{3,5}$', w)][:10]

['10-day',
 '10-lap',
 '10-year',
 '100-share',
 '12-point',
 '12-year',
 '14-hour',
 '15-day',
 '150-point',
 '190-point']

In [264]:
# strings made of at least 5 letters, dash, 2 to 3 letters, dash, at most 6 letters
# ({5,} has no upper bound; {,6} has an implicit lower bound of 0)
[w for w in vocab if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]

['black-and-white',
 'bread-and-butter',
 'father-in-law',
 'machine-gun-toting',
 'savings-and-loan']

# Methods and attributes for pattern objects

<img src="./re_methods.png" width="800" align="left">

In [265]:
# first matches
re.search(r'\d+', 'Natural Language Processing: lec-06')

<re.Match object; span=(33, 35), match='06'>

In [268]:
# whether match at the beginning of the string
re.match(r'\d+', 'Natural Language Processing: lec-06') # start with 06

In [269]:
# whether the whole string matches the regular expression pattern
re.fullmatch(r'[wW]oodchuck', 'woodchuck')

<re.Match object; span=(0, 9), match='woodchuck'>

In [274]:
# all substrings
re.findall(r'\w+', 'Natural Language Processing')

['Natural', 'Language', 'Processing']

In [280]:
# replace every non-overlapping match, scanning left to right, with the replacement string
re.sub(r'red', r'green','green socks and red shoes')

'green socks and green shoes'

In [279]:
# same as sub(), but returns the new string and the number of replacements
re.subn(r'red', r'green','red socks and red shoes')

('green socks and green shoes', 2)

- **Split**
    - Split the string into a list, splitting it wherever the RE matches

In [281]:
re.split(r'\s', 'Natural Language Processing')

['Natural', 'Language', 'Processing']

In [282]:
re.split(r'\.', '2021.12.02')

['2021', '12', '02']

<img src="./re_attributes.png" width="600" align="left">

In [287]:
re.search(r'\w+', 'Natural Language Processing')

<re.Match object; span=(0, 7), match='Natural'>

In [289]:
# keep the match object in a variable so that its methods can be called on it
re_object = re.compile(r'\w+').search('Natural Language Processing')
re_object.group(0)

'Natural'

In [290]:
re_object.span()

(0, 7)

In [291]:
re_object.start()

0

In [292]:
re_object.end()

7

In [65]:
# plain slicing with the bounds 0 and 7 yields the first word of the string
s = 'Natural Language Processing'
s[:7]

'Natural'