In [1]:
# Split a string on multiple delimiters
import re
line = 'asdf fjdk; afed, fjek,asdf, foo'
# Delimiter: one of ';', ',' or a whitespace character, followed by any run of whitespace
re.split(r'[;,\s]\s*',line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [2]:
re.split(r'(;|,|\s)\s*',line)

['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']

In [3]:
# (?:...) is a non-capturing group: the delimiters are matched but not returned in the result
re.split(r'(?:;|,|\s)\s*',line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [4]:
# Matching text at the start or end of a string
# str.startswith() str.endswith()
# The argument may be a single string or a tuple of strings
filename = 'foo.c'
filename.endswith(('.c','.h'))

True

In [5]:
# Pattern matching with the re module
import re
text1 = '11/27/2012'
# NOTE(review): `test2` looks like a typo for `text2`; it is not used in this cell
test2 = 'Nov 27,2012'
# Precompiled pattern: digits/digits/digits
datepat = re.compile(r'\d+/\d+/\d+')
datepat.match(text1)

<re.Match object; span=(0, 10), match='11/27/2012'>

In [8]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
datepat.findall(text)

['11/27/2012', '3/13/2013']

In [10]:
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datepat.match('11/27/2012')
m

<re.Match object; span=(0, 10), match='11/27/2012'>

In [12]:
m.group(0),m.group(1),m.group(2),m.group(3),m.groups()

('11/27/2012', '11', '27', '2012', ('11', '27', '2012'))

In [13]:
datepat.findall(text)

[('11', '27', '2012'), ('3', '13', '2013')]

In [15]:
# finditer() returns an iterator that yields one match object per match
for m in datepat.finditer(text):
    print(m.groups())

('11', '27', '2012')
('3', '13', '2013')


In [16]:
# Searching and replacing text
# str.replace()
# re.sub()
text

'Today is 11/27/2012. PyCon starts 3/13/2013.'

`re.sub()` 的第一个参数是要匹配的模式，第二个参数是替换模式。替换模式中的反斜杠加数字（例如 `\3`）引用前面模式中对应编号的捕获组。

In [17]:
re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text)

'Today is 2012-11-27. PyCon starts 2013-3-13.'

`\g<group_name>`指向命名分组

In [18]:
re.sub(r'(?P<month>\d+)/(?P<day>\d+)/(?P<year>\d+)', r'\g<year>-\g<month>-\g<day>', text)

'Today is 2012-11-27. PyCon starts 2013-3-13.'

In [19]:
newtext,n = re.subn(r'(?P<month>\d+)/(?P<day>\d+)/(?P<year>\d+)', r'\g<year>-\g<month>-\g<day>', text)
newtext,n

('Today is 2012-11-27. PyCon starts 2013-3-13.', 2)

+ 替换回调函数

In [24]:
from calendar import month_abbr
def change_date(m):
    """Replacement callback: rewrite a 'month/day/year' match as 'day Mon year'.

    m is a match object whose groups 1, 2 and 3 hold the month, day and
    year. The numeric month is mapped to its abbreviated name through
    calendar.month_abbr.
    """
    abbr = month_abbr[int(m.group(1))]
    day, year = m.group(2), m.group(3)
    return f'{day} {abbr} {year}'
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
datepat.subn(change_date,text)

('Today is 27 Nov 2012. PyCon starts 13 Mar 2013.', 2)

+ 忽略大小写，`re.IGNORECASE`

In [25]:
text = 'UPPER PYTHON, lower python, Mixed Python'
re.findall('python', text, flags=re.IGNORECASE)

['PYTHON', 'python', 'Python']

In [26]:
re.sub('python', 'snake', text, flags=re.IGNORECASE)

'UPPER snake, lower snake, Mixed snake'

+ 最短匹配模式  
在`*`后面添加`?`使得匹配变成非贪婪模式

In [28]:
str_pat = re.compile(r'"(.*)"')
text1 = 'Computer says "no."'
text2 = 'Computer says "no." Phone says "yes."'
str_pat.findall(text1),str_pat.findall(text2)

(['no.'], ['no." Phone says "yes.'])

In [29]:
str_pat = re.compile(r'"(.*?)"')
str_pat.findall(text2)

['no.', 'yes.']

+ 多行匹配   
`.` 默认不能匹配换行符   
`(?:.|\n)` 是一个非捕获组，用它代替 `.` 可以同时匹配换行符  
`re.DOTALL` 让正则表达式中的`.`匹配包括换行符在内的任意字符

In [30]:
text2 = '''/* this is a
 multiline comment */
'''
comment = re.compile(r'/\*((?:.|\n)*?)\*/')
comment.findall(text2)

[' this is a\n multiline comment ']

In [31]:
comment = re.compile(r'/\*(.*?)\*/', re.DOTALL)
comment.findall(text2)

[' this is a\n multiline comment ']

+ str.translate()

In [38]:
import sys
import unicodedata
# Translation table mapping the code point of every combining character to None,
# so that str.translate() deletes those characters
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode) if unicodedata.combining(chr(c)))
s = 'pýtĥöñ\fis\tawesome\r\n'
# Whitespace clean-up table: tab and form feed become a space, carriage return is removed
remap = {
 ord('\t') : ' ',
ord('\f') : ' ',
ord('\r') : None # Deleted
}
a = s.translate(remap)
a

'pýtĥöñ is awesome\n'

In [40]:
b = unicodedata.normalize('NFD', a)
b

'pýtĥöñ is awesome\n'

In [41]:
b.translate(cmb_chrs)

'python is awesome\n'

+ 字符串对齐
> `ljust(20,'+')`,`rjust(20,'-')`,`center(20,'*')`  
> `format(x,'=>20')`  
> `'%-20s' % text`  

In [42]:
text = 'Hello World'
text.ljust(20),text.rjust(20),text.center(20)

('Hello World         ', '         Hello World', '    Hello World     ')

In [43]:
text.rjust(20,'='),text.center(20,'*')



In [44]:
format(text,'>20'),format(text,'<20'),format(text,'^20')

('         Hello World', 'Hello World         ', '    Hello World     ')

In [47]:
format(text,'=>20s'),format(text,'*^20s')



In [48]:
'{:>10s} {:>10s}'.format('Hello','World')

'     Hello      World'

In [52]:
x = 1.2345
format(x,'>10'),format(x,'^10.2f'),format(x,'0^10.2f')

('    1.2345', '   1.23   ', '0001.23000')

In [53]:
'{a:>10s} {b:>10s}'.format(a='Hello',b='World')

'     Hello      World'

+ 合并拼接字符串  
`join`  
`+`  
`print`

In [54]:
a,b,c = (1,2,3)
print(a,b,c,sep=':')

1:2:3


+ 字符串中加入变量

In [64]:
class safesub(dict):
    """dict that hands back a missing key wrapped in braces instead of raising KeyError.

    Used with str.format_map() so that placeholders without a value stay
    in the output as '{key}'.
    """
    def __missing__(self, key):
        # Rebuild the placeholder text for the unknown field name
        left, right = '{', '}'
        return left + key + right
'{name} {age}'.format_map(safesub(name=1))

'1 {age}'

+ 以指定列宽格式化字符串  
`textwrap`

In [66]:
s = "Look into my eyes, look into my eyes, the eyes, the eyes, \
the eyes, not around the eyes, don't look around the eyes, \
look into my eyes, you're under."
s

"Look into my eyes, look into my eyes, the eyes, the eyes, the eyes, not around the eyes, don't look around the eyes, look into my eyes, you're under."

In [72]:
import textwrap
print(textwrap.fill(s,70))

Look into my eyes, look into my eyes, the eyes, the eyes, the eyes,
not around the eyes, don't look around the eyes, look into my eyes,
you're under.


In [73]:
print(textwrap.fill(s,40,initial_indent='    '))

    Look into my eyes, look into my
eyes, the eyes, the eyes, the eyes, not
around the eyes, don't look around the
eyes, look into my eyes, you're under.


In [74]:
print(textwrap.fill(s,40,subsequent_indent='    '))

Look into my eyes, look into my eyes,
    the eyes, the eyes, the eyes, not
    around the eyes, don't look around
    the eyes, look into my eyes, you're
    under.


+ 字符串令牌解析

In [87]:
text = 'foo = 23 + 42 * 10'
import re
# One named group per token type; the group name is later read back via lastgroup
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'

# Join all token patterns into a single alternation
master_pat = re.compile('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))
scanner = master_pat.scanner('foo = 42')
# Call match() six times: 'foo = 42' holds five tokens, so the last call prints None
for i in range(6):
    _ = scanner.match()
    print(_)
    if _:
        # lastgroup is the name of the group that matched, i.e. the token type
        print(_.lastgroup,_.group())

<re.Match object; span=(0, 3), match='foo'>
NAME foo
<re.Match object; span=(3, 4), match=' '>
WS  
<re.Match object; span=(4, 5), match='='>
EQ =
<re.Match object; span=(5, 6), match=' '>
WS  
<re.Match object; span=(6, 8), match='42'>
NUM 42
None


In [91]:
from collections import namedtuple
def generate_tokens(pat, text):
    """Yield a Token(type, value) for each successive match of pat in text.

    Args:
        pat: compiled regular expression whose alternatives are named
            groups; the name of the group that matched becomes the token type.
        text: string to tokenize.

    Yields:
        Token namedtuples with fields ``type`` and ``value``.

    Raises:
        SyntaxError: if scanning stops before the end of text, i.e. the
            input contains a character that no token pattern matches.
            Previously the remaining input was silently dropped.
    """
    Token = namedtuple('Token', ['type', 'value'])
    scanner = pat.scanner(text)
    pos = 0  # end offset of the last token matched so far
    # scanner.match() returns None as soon as nothing matches at the current position
    for m in iter(scanner.match, None):
        pos = m.end()
        yield Token(m.lastgroup, m.group())
    if pos != len(text):
        raise SyntaxError(f'Unexpected character {text[pos]!r} at position {pos}')
for tok in generate_tokens(master_pat, 'foo = 42'):
    print(tok)

Token(type='NAME', value='foo')
Token(type='WS', value=' ')
Token(type='EQ', value='=')
Token(type='WS', value=' ')
Token(type='NUM', value='42')


In [92]:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
Topic: recursive descent parser
Desc :
"""
import re
import collections

# Token specification: one named group per token type
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
MINUS = r'(?P<MINUS>-)'
TIMES = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS = r'(?P<WS>\s+)'

# Single pattern that tries every token type in the order listed
master_pat = re.compile('|'.join([NUM, PLUS, MINUS, TIMES,
                                  DIVIDE, LPAREN, RPAREN, WS]))
# Tokenizer
Token = collections.namedtuple('Token', ['type', 'value'])


def generate_tokens(text):
    """Yield the non-whitespace tokens of text, scanned with master_pat.

    Args:
        text: expression string to tokenize.

    Yields:
        Token(type, value) namedtuples; tokens of type 'WS' are skipped.

    Raises:
        SyntaxError: if scanning stops before the end of text, i.e. the
            input contains a character that no token pattern matches.
            Previously the rest of the input was silently dropped, so an
            invalid expression could be evaluated as if it were shorter.
    """
    scanner = master_pat.scanner(text)
    pos = 0  # end offset of the last token matched so far
    # scanner.match() returns None as soon as nothing matches at the current position
    for m in iter(scanner.match, None):
        pos = m.end()
        tok = Token(m.lastgroup, m.group())
        if tok.type != 'WS':
            yield tok
    if pos != len(text):
        raise SyntaxError(f'Unexpected character {text[pos]!r} at position {pos}')


# Parser
class ExpressionEvaluator:
    '''
    Implementation of a recursive descent parser. Each method
    implements a single grammar rule. Use the ._accept() method
    to test and accept the current lookahead token. Use the ._expect()
    method to exactly match and discard the next token on the input
    (or raise a SyntaxError if it doesn't match).
    '''

    def parse(self, text):
        '''Tokenize text and return the value of the expression it contains.

        Raises SyntaxError if text is not exactly one well-formed expression.
        '''
        return self._parse_tokens(generate_tokens(text))

    def _parse_tokens(self, tokens):
        '''Evaluate an iterator of tokens that must form one complete expression.

        Each token needs ``type`` and ``value`` attributes. Raises SyntaxError
        when tokens remain after the expression; without this check, input
        such as '2 3' or '(2))' evaluated to 2 and the leftover tokens were
        silently ignored.
        '''
        self.tokens = tokens
        self.tok = None  # Last symbol consumed
        self.nexttok = None  # Next symbol tokenized
        self._advance()  # Load first lookahead token
        result = self.expr()
        if self.nexttok is not None:
            raise SyntaxError('Unexpected token ' + repr(self.nexttok.value))
        return result

    def _advance(self):
        'Advance one token ahead'
        # nexttok becomes None once the token stream is exhausted
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)

    def _accept(self, toktype):
        'Test and consume the next token if it matches toktype'
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        else:
            return False

    def _expect(self, toktype):
        'Consume next token if it matches toktype or raise SyntaxError'
        if not self._accept(toktype):
            raise SyntaxError('Expected ' + toktype)

    # Grammar rules follow
    def expr(self):
        "expression ::= term { ('+'|'-') term }*"
        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            # _accept() moved the operator into self.tok
            op = self.tok.type
            right = self.term()
            if op == 'PLUS':
                exprval += right
            elif op == 'MINUS':
                exprval -= right
        return exprval

    def term(self):
        "term ::= factor { ('*'|'/') factor }*"
        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval *= right
            elif op == 'DIVIDE':
                # True division: the result is a float
                termval /= right
        return termval

    def factor(self):
        "factor ::= NUM | ( expr )"
        if self._accept('NUM'):
            return int(self.tok.value)
        elif self._accept('LPAREN'):
            exprval = self.expr()
            self._expect('RPAREN')
            return exprval
        else:
            raise SyntaxError('Expected NUMBER or LPAREN')


def descent_parser():
    """Evaluate a few sample expressions with ExpressionEvaluator and print each result."""
    evaluator = ExpressionEvaluator()
    samples = ('2', '2 + 3', '2 + 3 * 4', '2 + (3 + 4) * 5')
    for source in samples:
        print(evaluator.parse(source))
    # A malformed expression such as '2 + (3 + * 4)' is rejected: the call
    #     evaluator.parse('2 + (3 + * 4)')
    # propagates out of factor() with
    #     SyntaxError: Expected NUMBER or LPAREN


if __name__ == '__main__':
    descent_parser()

2
5
14
37
