In [1]:
from datetime import datetime

# Take one snapshot of the current local time; a later cell formats it again.
today = datetime.now()

# str.format passes the text after ':' to datetime.__format__, so strftime
# directives can be used directly inside the replacement field.
date_line = "Today's date is: {:%Y년%m월%d일}".format(today)
print(date_line)

Today's date is: 2021년07월18일


In [2]:
name = "python"

# Plain replacement field: the value is inserted as-is.
print(f"Python is called {name} due to a comedy series")
# The !r conversion inserts repr(name) instead, so the quotes are shown.
print(f"Python is called {name!r} due to a comedy series")

Python is called python due to a comedy series
Python is called 'python' due to a comedy series


In [3]:
number = 9001.1549864234864
num = 10224

print(f'result: {number:10.2f} ') # fixed-point: 2 decimals, right-aligned in a field of width 10
print(f'result: {number:.2e} ') # scientific notation with 2 decimals
print(f'result: {num:2d} ') # decimal integer, minimum width 2 (wider values are not truncated)

result:    9001.15 
result: 9.00e+03 
result: 10224 


In [4]:
print(f'result: {num:08d}') # zero-padded to a total width of 8

result: 00010224


In [5]:
print(f'result: {num:8d}') # space-padded (right-aligned) to a width of 8

result:    10224


In [6]:
print(f'Today: {today:%Y-%m-%d}') # strftime directives also work as an f-string format spec for datetime

Today: 2021-07-18


In [7]:
from string import Template # slower than f-strings and does not allow format specifiers
# Can be useful when the template is something like a user-provided string.
# Otherwise not of much use.

In [8]:
# ${identifier} is a named placeholder that is filled in at substitution time.
my_string = Template('Data science has been called ${identifier}')
my_string.substitute({"identifier": "sexiest job"})

'Data science has been called sexiest job'

In [9]:
import re

In [10]:
# Literal pattern: matches the exact text "#ufc" wherever it occurs.
pattern = re.compile(r'#ufc')

In [11]:
# The second hashtag is "#ufc176"; only its "#ufc" prefix is matched.
s = "Love #ufc! Let's try another #ufc176"
pattern.findall(s)

['#ufc', '#ufc']

In [12]:
# Raw string: "\d" inside a normal string literal is an invalid escape
# sequence (DeprecationWarning, SyntaxWarning on newer Pythons), so regex
# source is written with the r-prefix. The pattern text itself is unchanged.
pattern = r"---\d+월 \d+일 대화내역---"
s = """
A: 그래서 어쨌다고?
B: 응 안녕

---1월 3일 대화내역---

C: 그래
D: 잘가~
"""
# Split the chat log at every "---<n>월 <n>일 대화내역---" separator line.
re.split(pattern, s)

['\nA: 그래서 어쨌다고?\nB: 응 안녕\n\n', '\n\nC: 그래\nD: 잘가~\n']

In [13]:
s = "김형식씨는 앞으로 나와주시기 바랍니다."
# One or more characters in the range ㄱ-힣, followed by the honorific "씨".
# NOTE(review): the code-point range ㄱ-힣 is wider than Hangul alone - confirm
# that is acceptable for the intended input.
pattern = re.compile(r'[ㄱ-힣]+씨')

# Replace the name in front of "씨" with a mask.
pattern.sub("***씨", s)

'***씨는 앞으로 나와주시기 바랍니다.'

In [14]:
s = 'user_1, user_A, user_a, user__'
# \w matches one word character: alphanumeric or '_' (underscore).
pattern = re.compile(r'user_\w')

pattern.findall(s)

['user_1', 'user_A', 'user_a', 'user__']

In [15]:
# \W is the complement of \w: a non-word character must follow "user_".
pattern = re.compile(r'user_\W')
pattern.findall(s)

[]

Quantifiers

In [16]:
s1 = "The color of this image"
s2 = "the colour of the image"
# '?' makes the preceding 'u' optional (zero or one occurrence).
pattern = re.compile(r'colou?r')

# Both spellings are found by the same pattern.
for sentence in (s1, s2):
    print(pattern.findall(sentence))

['color']
['colour']


In [17]:
s = "010-4679-58785, 010-467911-58345, 010-467-58, 010-461-58345456"

# Exactly 3 digits, then 1 to 4 digits, then at least 4 digits, joined by '-'.
pattern = re.compile(r'\d{3}-\d{1,4}-\d{4,}')

pattern.findall(s)

['010-4679-58785', '010-461-58345456']

metacharacters

In [18]:
# The four digits sit at the very beginning of the string.
s = "4506 people attend the snow"
pattern = re.compile(r'\d{4}') # exactly four consecutive digits

둘다 됨. 

In [19]:
# search looks for the first match anywhere in the string.
pattern.search(s)

<re.Match object; span=(0, 4), match='4506'>

In [20]:
# match only tries at the beginning of the string.
pattern.match(s)

<re.Match object; span=(0, 4), match='4506'>

하지만 match는 맨 앞이 아니면 안됨. 

In [21]:
# Same pattern, but the digits no longer sit at the start of the string.
s = "Wow, 4506 people attend the snow"
pattern = re.compile(r'\d{4}')

In [22]:
# search still finds the digits because it scans past the leading text.
pattern.search(s)

<re.Match object; span=(5, 9), match='4506'>

In [23]:
# match returns None here: the string does not begin with four digits.
pattern.match(s)

greedy vs non-greedy(lazy) matching

- greedy
    - `*`, `+`, `?`, `{m,n}` (쉼표 뒤에 공백을 넣으면 quantifier로 인식되지 않음)
    - 최대한 많이 match시킴. return the longest match
    - 하지만 너무 많이 match되면 뒤에서부터 하나씩 gives up 한다. (backtrack)
        - 아래의 예를 보자. `.*hello`라는 regex는 왼쪽부터 해석되는데, 
            - 맨 처음엔 `.*` 즉 아무 문자나 가지고 쭉 오른쪽으로 채운다. 그렇게 끝까지 간 다음
            - 다음으로 `h`가 나와야 하니까 맨 뒤에서부터 한 걸음씩 뒷걸음질 치며 `h`를 찾는다. 
            - 그렇게 `h`를 찾고나서 나머지 `ello`도 매칭되는지 체크한다. 
 

In [24]:
# s1 contains "hello" once, s2 contains it twice.
s1 = 'xhelloxxxxxxxxxx'
s2 = 'xhelloxxxxxxxxxxxxhelloxxxxxxxx'
# Greedy '.*' consumes as much as possible before "hello".
pattern = re.compile(r'.*hello')

In [25]:
# Only one "hello" exists, so the match ends right after it.
pattern.match(s1)

<re.Match object; span=(0, 6), match='xhello'>

In [26]:
# Greedy matching extends to the last "hello" in the string.
pattern.match(s2)

<re.Match object; span=(0, 23), match='xhelloxxxxxxxxxxxxhello'>

- non-greedy
    - lazy: match as few characters as needed
    - return the shortest match
    - greedy quantifier 마지막에 `?`를 붙이면 non-greedy된다. 
    - greedy와는 반대로 너무 적게 match되면 앞에서부터 하나씩 추가한다. 
        - 아래의 예를 보자. `.*?hello`는 왼쪽부터 해석되는데, 
            - 맨 처음엔 `.*?` 즉 아무 문자나 있는지 없는지 확인한다. (아예 없는 것부터 시작)
            - 아무문자도 없는 것도 `.*?`에 해당되므로 바로 다음에 그럼 `h`가 있는지 확인한다. 
            - 없으므로 맨 앞에서 한 걸음씩 나아가며 조건을 하나씩 충족시키고 그 다음 `h`가 나오는지 찾는다. 
            - 그렇게 `h`를 찾고나서 나머지 `ello`도 매칭되는지 체크한다. 

In [27]:
s = "152315642341abcd"

pattern1 = re.compile(r'\d+')   # greedy: as many digits as possible
pattern2 = re.compile(r'\d+?')  # lazy: as few digits as possible (at least one)

In [28]:
# Greedy '+' takes the whole leading run of digits.
pattern1.match(s)

<re.Match object; span=(0, 12), match='152315642341'>

In [29]:
# Lazy '+?' stops as soon as one digit has been matched.
pattern2.match(s)

<re.Match object; span=(0, 1), match='1'>

In [30]:
s = 'xxhelloxxxxxxx'
# Lazy '.*?' consumes as little as possible before "hello".
pattern = re.compile(r'.*?hello')

In [31]:
# The lazy prefix grows one character at a time until "hello" fits.
pattern.match(s)

<re.Match object; span=(0, 7), match='xxhello'>

In [32]:
s = 'Make America (United States) Great Again (For Trump)'
# Greedy '.+' runs to the last ')', so both parenthesised parts come back
# as one single match.
p = re.compile(r'\(.+\)')

p.findall(s)

['(United States) Great Again (For Trump)']

In [33]:
s = 'Make America (United States) Great Again (For Trump)'
# Lazy '.+?' stops at the first ')', giving one match per parenthesised part.
p = re.compile(r'\(.+?\)')

p.findall(s)

['(United States)', '(For Trump)']

capturing groups
- match a specific subpattern in a pattern
- use it for further processing
- 전체가 group 0, 그 하위 subpattern이 group 1, group 2 이런 식이다. 
    - 튜플로 반환됨.
- capture a repeated group `(\d+)` vs repeat a capturing group `(\d)+`

In [34]:
s = 'My id: g8fish pw: gogoteam. your id: philips pw: good.'
# Two capture groups: the text after "id:" and the text after "pw:" up to
# and including the next '.'.
p = re.compile(r'id:\s(.*?)\spw:\s(.*?\.)')

# With more than one group, findall yields one tuple per match.
p.findall(s)

[('g8fish', 'gogoteam.'), ('philips', 'good.')]

In [55]:
s = "My lucky numbers are 5778 and 1123."
# Repeating a capturing group: (\d) is overwritten on every repetition, so
# only the last digit of each number is kept.
p = re.compile(r'(\d)+')

p.findall(s)

['8', '3']

In [36]:
s = "My lucky numbers are 5788 and 777."
# Capturing a repeated token: the quantifier is inside the group, so the
# whole run of digits is captured.
p = re.compile(r'(\d+)')

p.findall(s)

['5788', '777']

alternation and non-capturing groups
- use groups to choose between optional patterns

non-capturing groups
- match but not capture a group
    - when group is not backreferenced
    - add `?:` like `(?:regex)`
    
alternation (or, `|`)
- `A|B`는 A 또는 B 중 하나와 매칭된다. non-capturing group 안에 넣으면 (예: `(\d+)(?:st|nd|rd|th)`) 뒤따르는 부분은 매칭만 하고 앞의 값만 얻어낼 수 있다. 

In [37]:
s = 'Ingredients: 3 apples, 2 bananas, 4 cheese'
# Group 1 is the number; group 2 uses '|' to accept any one of the listed words.
p = re.compile(r'(\d+)\s(apples|bananas|cheese)')

p.findall(s)

[('3', 'apples'), ('2', 'bananas'), ('4', 'cheese')]

In [38]:
s = 'John Doe: 34-32-34-064-982, Rebecca Smith: 10-78-20-469-598'
# The leading (?:...){3} group is non-capturing, so it must match but is left
# out of the result; only the trailing capture group is returned.
p = re.compile(r'(?:\d{2}-){3}(\d{3}-\d{3})')

p.findall(s)

['064-982', '469-598']

In [39]:
s = "Today is 23rd May 2019, Tomrrow is 24th May 19, Yesterday was 22nd May 2019"
# The ordinal suffix must be present (alternation inside a non-capturing
# group) but only the digits in front of it are captured.
p = re.compile(r'(\d+)(?:st|nd|rd|th)')

p.findall(s)

['23', '24', '22']

backreference

numbered groups
- `re.search`로 찾고, `.group`로 몇 번째 그룹 값 가져올지 선택 가능. 

In [40]:
s = 'Python 3.0 was released on 12-03-2008'
# Three numbered capture groups separated by hyphens.
p = re.compile(r'(\d{1,2})-(\d{2})-(\d{4})')

# Keep the Match object so the following cells can query its groups.
info = p.search(s)
info

<re.Match object; span=(27, 37), match='12-03-2008'>

In [41]:
# groups() returns every captured subgroup as a tuple.
info.groups()

('12', '03', '2008')

In [42]:
# group(0) is the entire match.
info.group(0)

'12-03-2008'

In [43]:
# group(1) is the first capture group.
info.group(1)

'12'

In [44]:
# group(2) is the second capture group.
info.group(2)

'03'

In [45]:
# group(3) is the third capture group.
info.group(3)

'2008'

named groups
- 각 그룹에 name을 부여
- `(?P<name>regex)` 형식

In [46]:
s = 'Austin, 78701 Seoul, 07984'
# Only the first group is named; naming just some of the groups is allowed.
p = re.compile(r'(?P<city>\w+),\s(\d+)')

# search returns the first match only.
cities = p.search(s)
cities

<re.Match object; span=(0, 13), match='Austin, 78701'>

In [47]:
# groups() lists every captured group in order, whether named or not.
cities.groups()

('Austin', '78701')

In [48]:
# A named group can be looked up by its name.
cities.group('city')

'Austin'

In [49]:
# findall returns one tuple of captured groups per match rather than a
# single Match object.
cities = p.findall(s)
cities

[('Austin', '78701'), ('Seoul', '07984')]

backreference

그닥 쓸모는 없는듯... 

In [50]:
s = 'I wish you a happy happy birthday!'
# \1 must repeat exactly what group 1 captured, so this finds a doubled word.
p = re.compile(r'(\w+)\s\1')

p.findall(s)

['happy']

look around
- allow us to confirm that subpattern is ahead or behind main pattern

look-ahead
- non-capturing group
- checks that the first part of the expression is followed or not by the lookahead expression.
- return only the first part of the expression
    - positive: `(?=regex)`
    - negative: `(?!regex)`
    
look-behind
- look-ahead와 거의 같음. 
    - positive: `(?<=regex)`
    - negative: `(?<!regex)`
- 그냥 위치만 뒤에 놓던걸 앞에 놓으면 됨. (왼쪽)

In [51]:
s = "tweets.txt transferred, mypass.txt transferred, keyword.txt error"
# Positive look-ahead: match a file name only when " transferred" follows;
# the look-ahead text is checked but not included in the result.
# The dot is escaped so it matches a literal '.', not any character.
p = re.compile(r'\w+\.txt(?=\stransferred)')

p.findall(s)

['tweets.txt', 'mypass.txt']

In [52]:
s = "tweets.txt transferred, mypass.txt transferred, keyword.txt error"
# Negative look-ahead: match a file name only when " transferred" does NOT
# follow. The dot is escaped so it matches a literal '.', not any character.
p = re.compile(r'\w+\.txt(?!\stransferred)')

p.findall(s)

['keyword.txt']

In [53]:
s = "tweets.txt transferred, mypass.txt transferred, keyword.txt error"
# Putting the look-ahead in front does not work: it is checked at the same
# position where \s has to match, and \w+ can never match at a whitespace
# character, so nothing is found.
# The dot is escaped so it matches a literal '.', not any character.
p = re.compile(r'(?=\w+\.txt)\stransferred')

p.findall(s)

[]

In [54]:
s = "tweets.txt transferred, mypass.txt transferred, keyword.txt error"
# A look-behind is needed to test the text to the LEFT of the match.
# It must have a fixed width, hence \w{6} rather than \w+.
# The dot is escaped so it matches a literal '.', not any character.
p = re.compile(r'(?<=\w{6}\.txt)\stransferred')

p.findall(s)

[' transferred', ' transferred']

이 때, look-behind는 fixed-width pattern을 써줘야 함. 

`(?<=\w{이부분}.txt)\stransferred` 

저 부분에 `\w+` 또는 `\w{1,6}` 같은 가변 길이 패턴은 못 옴. (Python `re`는 `look-behind requires fixed-width pattern` 에러를 발생시킴)