Regular Expressions
=============

Literal Search
--------

In [26]:
import re

text_to_search = '''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
coreyms.com
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T

cat
bat
mat
pat
'''

sentence = 'Start a sentence and then bring it to an end'

**raw string**  
  
just a string prefixed with an 'r' and that tells python  
not to handle back slashes in any special way  

this is important because we want our regular expressions to   
interpret the strings we are passing in and not have python  
doing anything to them first  

In [3]:
print('\tTab')
print(r'\tTab')

	Tab
\tTab


**compile method**  
  
compile method will allow us to seperate out our patterns into  
a variable and also we'll make it easier to reuse that variable  
to perform multiple searches  

In [5]:
pattern = re.compile(r'abc')

matches = pattern.finditer(text_to_search)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(1, 4), match='abc'>


span은 발견하고자 하는 문자열의 시작 위치와 끝 위치를 의미한다  

In [6]:
print(text_to_search[1:4]) # span을 통해 원하는 값 찾기가능

abc


In [9]:
# 문자열 Escape 하는 방법 : \ 붙이기
# meta character에는 escape 해야함!
pattern = re.compile(r'\.')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(111, 112), match='.'>
<_sre.SRE_Match object; span=(146, 147), match='.'>
<_sre.SRE_Match object; span=(167, 168), match='.'>
<_sre.SRE_Match object; span=(171, 172), match='.'>
<_sre.SRE_Match object; span=(218, 219), match='.'>
<_sre.SRE_Match object; span=(249, 250), match='.'>
<_sre.SRE_Match object; span=(262, 263), match='.'>


Meta Characters
---------

`.`       - Any Character Except New Line  
`\d`      - Digit (0-9)  
`\D`      - Not a Digit (0-9)  
`\w`      - Word Character (a-z, A-Z, 0-9, _)  
`\W`      - Not a Word Character  
`\s`      - Whitespace (space, tab, newline)  
`\S`      - Not Whitespace (space, tab, newline)  
  
`\b`      - Word Boundary  
`\B`      - Not a Word Boundary  
`^`       - Beginning of a String  
`$`       - End of a String  
  
`[]`      - Matches Characters in brackets  
`[^ ]`    - Matches Characters NOT in brackets  
`|`       - Either Or  
`( )`     - Group  
  
Quantifiers:  
`*`       - 0 or More  
`+`       - 1 or More  
`?`       - 0 or One  
`{3}`     - Exact Number  
`{3,4}`   - Range of Numbers (Minimum, Maximum)  

In [12]:
pattern = re.compile(r'\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(55, 56), match='1'>
<_sre.SRE_Match object; span=(56, 57), match='2'>
<_sre.SRE_Match object; span=(57, 58), match='3'>
<_sre.SRE_Match object; span=(58, 59), match='4'>
<_sre.SRE_Match object; span=(59, 60), match='5'>
<_sre.SRE_Match object; span=(60, 61), match='6'>
<_sre.SRE_Match object; span=(61, 62), match='7'>
<_sre.SRE_Match object; span=(62, 63), match='8'>
<_sre.SRE_Match object; span=(63, 64), match='9'>
<_sre.SRE_Match object; span=(64, 65), match='0'>
<_sre.SRE_Match object; span=(151, 152), match='3'>
<_sre.SRE_Match object; span=(152, 153), match='2'>
<_sre.SRE_Match object; span=(153, 154), match='1'>
<_sre.SRE_Match object; span=(155, 156), match='5'>
<_sre.SRE_Match object; span=(156, 157), match='5'>
<_sre.SRE_Match object; span=(157, 158), match='5'>
<_sre.SRE_Match object; span=(159, 160), match='4'>
<_sre.SRE_Match object; span=(160, 161), match='3'>
<_sre.SRE_Match object; span=(161, 162), match='2'>
<_sre.SRE_Match object; span=(16

In [13]:
pattern = re.compile(r'\bHa') # 단어 앞에 공백이 있는 Ha를 찾아라
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(66, 68), match='Ha'>
<_sre.SRE_Match object; span=(69, 71), match='Ha'>


In [14]:
pattern = re.compile(r'\BHa') # 단어 앞에 공백이 없는 Ha를 찾아라
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(71, 73), match='Ha'>


In [16]:
# ^는 이후에 나오는 문자열로 시작하는 문자열을 검색함
pattern = re.compile(r'^Start')
matches = pattern.finditer(sentence)
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(0, 5), match='Start'>


In [19]:
# 전화번호 찾기
with open('data.txt', 'r') as f:
    contents = f.read()
    
    pattern = re.compile(r'\d\d\d[-.]\d\d\d[-.]\d\d\d') # -이나 .만 매치!
    matches = pattern.finditer(contents)
    
    for match in matches:
        print(match)

<_sre.SRE_Match object; span=(12, 23), match='615-555-716'>
<_sre.SRE_Match object; span=(102, 113), match='800-555-566'>
<_sre.SRE_Match object; span=(191, 202), match='560-555-515'>
<_sre.SRE_Match object; span=(281, 292), match='900-555-934'>
<_sre.SRE_Match object; span=(378, 389), match='714-555-740'>
<_sre.SRE_Match object; span=(467, 478), match='800-555-677'>
<_sre.SRE_Match object; span=(557, 568), match='783-555-479'>
<_sre.SRE_Match object; span=(647, 658), match='516-555-461'>
<_sre.SRE_Match object; span=(740, 751), match='127-555-186'>
<_sre.SRE_Match object; span=(829, 840), match='608-555-493'>
<_sre.SRE_Match object; span=(915, 926), match='568-555-605'>
<_sre.SRE_Match object; span=(1003, 1014), match='292-555-187'>
<_sre.SRE_Match object; span=(1091, 1102), match='900-555-320'>
<_sre.SRE_Match object; span=(1180, 1191), match='614-555-116'>
<_sre.SRE_Match object; span=(1269, 1280), match='530-555-267'>
<_sre.SRE_Match object; span=(1355, 1366), match='470-555-275'>


In [20]:
# 전화번호 찾기
# 800이나 900으로 시작하는 번호만 찾기
with open('data.txt', 'r') as f:
    contents = f.read()
    
    pattern = re.compile(r'[89]00[-.]\d\d\d[-.]\d\d\d') # -이나 .만 매치!
    matches = pattern.finditer(contents)
    
    for match in matches:
        print(match)

<_sre.SRE_Match object; span=(102, 113), match='800-555-566'>
<_sre.SRE_Match object; span=(281, 292), match='900-555-934'>
<_sre.SRE_Match object; span=(467, 478), match='800-555-677'>
<_sre.SRE_Match object; span=(1091, 1102), match='900-555-320'>
<_sre.SRE_Match object; span=(1439, 1450), match='800-555-608'>
<_sre.SRE_Match object; span=(1790, 1801), match='800-555-710'>
<_sre.SRE_Match object; span=(2051, 2062), match='900-555-511'>
<_sre.SRE_Match object; span=(2826, 2837), match='900-555-542'>
<_sre.SRE_Match object; span=(3284, 3295), match='800-555-881'>


In [23]:
with open('data.txt', 'r') as f:
    contents = f.read()
    
    pattern = re.compile(r'[1-3]') # - 으로 범위 지정하기
    matches = pattern.finditer(text_to_search)
    
    for match in matches:
        print(match)

<_sre.SRE_Match object; span=(55, 56), match='1'>
<_sre.SRE_Match object; span=(56, 57), match='2'>
<_sre.SRE_Match object; span=(57, 58), match='3'>
<_sre.SRE_Match object; span=(151, 152), match='3'>
<_sre.SRE_Match object; span=(152, 153), match='2'>
<_sre.SRE_Match object; span=(153, 154), match='1'>
<_sre.SRE_Match object; span=(160, 161), match='3'>
<_sre.SRE_Match object; span=(161, 162), match='2'>
<_sre.SRE_Match object; span=(162, 163), match='1'>
<_sre.SRE_Match object; span=(164, 165), match='1'>
<_sre.SRE_Match object; span=(165, 166), match='2'>
<_sre.SRE_Match object; span=(166, 167), match='3'>
<_sre.SRE_Match object; span=(172, 173), match='1'>
<_sre.SRE_Match object; span=(173, 174), match='2'>
<_sre.SRE_Match object; span=(174, 175), match='3'>
<_sre.SRE_Match object; span=(177, 178), match='1'>
<_sre.SRE_Match object; span=(178, 179), match='2'>
<_sre.SRE_Match object; span=(179, 180), match='3'>
<_sre.SRE_Match object; span=(185, 186), match='1'>
<_sre.SRE_Match ob

In [27]:
# ^이 []안으로 들어오게 되면 []안에 있는 것이 아닌 것만 검색하는 기능을 가짐
pattern = re.compile(r'[^p]at') # p로 시작하는 문자열 제외
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(267, 270), match='cat'>
<_sre.SRE_Match object; span=(271, 274), match='bat'>
<_sre.SRE_Match object; span=(275, 278), match='mat'>


Quantifier
----------

In [28]:
# Quantifier를 이용하여 전화번호 찾기
pattern = re.compile(r'\d{3}.\d{3}.\d{4}') 
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(151, 163), match='321-555-4321'>
<_sre.SRE_Match object; span=(164, 176), match='123.555.1234'>
<_sre.SRE_Match object; span=(177, 189), match='123*555*1234'>
<_sre.SRE_Match object; span=(190, 202), match='800-555-1234'>
<_sre.SRE_Match object; span=(203, 215), match='900-555-1234'>


In [31]:
# 일정하지 않은 이름 형식에서 이름 불러오기

# group을 이용하여 Mr, MS, Mrs 셋 중 하나 매치
# .이 0이거나 그 이상 있어야 하므로 ?
# 이후 공백 문자열 \s
# 대문자 문자열 [A-Z]
# 이후 알파벳 문자열이 있거나 없으므로 \w*

pattern = re.compile(r'(Mr|Ms|Mrs).?\s[A-Z]\w*')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<_sre.SRE_Match object; span=(216, 227), match='Mr. Schafer'>
<_sre.SRE_Match object; span=(228, 236), match='Mr Smith'>
<_sre.SRE_Match object; span=(237, 245), match='Ms Davis'>
<_sre.SRE_Match object; span=(246, 259), match='Mrs. Robinson'>
<_sre.SRE_Match object; span=(260, 265), match='Mr. T'>


Example : Email Matching
    ---------------

In [32]:
# email에 해당하는 문자열 모두 매칭하기

emails = '''
CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
'''

pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')

matches = pattern.finditer(emails)

for match in matches:
    print(match)

<_sre.SRE_Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<_sre.SRE_Match object; span=(25, 53), match='corey.schafer@university.edu'>
<_sre.SRE_Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


Groups
-------

In [35]:
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')

matches = pattern.finditer(urls)

# 위의 패턴에는 3개의 그룹이 있고, group함수에 인자로 원하는 그룹을 출력할 수 있다
for match in matches:
    print(match.group(0))

https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov


In [36]:
subbed_urls = pattern.sub(r'\2\3', urls) # 패턴에서 sub 함수를 통해 특정 그룹을 불러올 수 있다 
print(subbed_urls)


google.com
coreyms.com
youtube.com
nasa.gov



Other Methods
------------

In [37]:
# findall will just  return the matches as a list of strings
# only return group
pattern = re.compile(r'(Mr|Ms|Mrs).?\s[A-Z]\w*')
matches = pattern.findall(text_to_search)
for match in matches:
    print(match)

Mr
Mr
Ms
Mrs
Mr


In [38]:
# match 메소드는 주어진 문자열이 제일 처음 나오는 문자열을 매칭함
pattern = re.compile(r'^Start')
matches = pattern.match(sentence)
print(matches)

<_sre.SRE_Match object; span=(0, 5), match='Start'>


In [39]:
# search 메서드는 주어진 문자열이 처음 나오는 모든 문자열을 매칭함
pattern = re.compile(r'^Start')
matches = pattern.search(sentence)
print(matches)

<_sre.SRE_Match object; span=(0, 5), match='Start'>


In [40]:
# flag를 통해 대문자 소문자 모두 매치
pattern = re.compile(r'^start', re.IGNORECASE)
matches = pattern.match(sentence)
print(matches)

<_sre.SRE_Match object; span=(0, 5), match='Start'>


Reference
---------

https://www.youtube.com/watch?v=K8L6KVGG-7o&index=30&list=PL-osiE80TeTt2d9bfVyTiXJA-UTHn6WwU