### 分割字符串

In [4]:
import re

# Sample text whose fields are separated by an inconsistent mix of ';', ','
# and whitespace.
line = 'asdf fjdk; afed, fjek,asdf, foo'

# Split on one delimiter character followed by any amount of whitespace.
re.split(r'[;,\s]\s*', line)

# With a capturing group (...) re.split also returns the matched delimiters.
re.split(r'(;|,|\s)\s*', line)

# (?:...) is a non-capturing group; here it matches the same delimiters as the
# character class [;,\s], so only the fields are returned.
re.split(r'(?:,|;|\s)\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

### 开头或结尾匹配

In [None]:
# Check the start or end of a string without slicing or regular expressions.
filename = 'spam.txt'

filename.endswith('.txt') # to test several suffixes at once, pass them as a tuple (a list is not accepted)
filename.startswith('file:')

### Shell通配符匹配字符串

In [5]:
from fnmatch import fnmatch, fnmatchcase

# Shell-style wildcards: '*' matches any run of characters, '?' exactly one
# character, and '[0-9]' one character from the given range.
fnmatch('foo.txt', '*.txt')
fnmatch('foo.txt', '?oo.txt')
fnmatch('Dat45.csv', 'Dat[0-9]*')

# fnmatchcase compares case-sensitively, so '.txt' does not match '*.TXT'.
fnmatchcase('foo.txt', '*.TXT')

False

### 字符串匹配和搜索

In [6]:
text = 'yeah, but no, but yeah, but no, but yeah'

# str.find returns the index of the first occurrence of the substring
# (10 for this text).
text.find('no')

10

In [13]:
import re

text1 = '11/27/2012'

# Simple matching: \d+ means one or more digits. re.match only tries to match
# at the beginning of the string.
re.match(r'\d+/\d+/\d+', text1)

# Compile the pattern once when it will be used several times.
datepat = re.compile(r'\d+/\d+/\d+')
datepat.match(text1)

# findall returns every non-overlapping match in the string as a list.
datepat.findall(text1)

['11/27/2012']

In [16]:
# Capture groups: every parenthesised sub-pattern can be read back on its own.
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datepat.match('11/27/2012')
# group(0) is the whole match, groups 1-3 are the three numeric fields;
# asking for several groups at once returns them as a tuple.
print(*m.group(0, 1, 2, 3), sep='\n')
# groups() returns only the captured sub-matches.
print(m.groups())

11/27/2012
11
27
2012
('11', '27', '2012')


In [18]:
# finditer returns an iterator that yields one match object per match.
# Uses `datepat` compiled in an earlier cell; note that the loop rebinds the
# module-level name `m`.
for m in datepat.finditer('11/27/2012'):
    print(m.groups())

('11', '27', '2012')


### 字符串搜索和替换

在 `re.sub()` 等函数中传入 `flags=re.IGNORECASE` 可忽略大小写。

In [19]:
# For a fixed substring, str.replace is enough; every occurrence is replaced.
text = 'yeah, but no, but yeah, but no, but yeah'
text.replace('yeah', 'yep')

'yep, but no, but yep, but no, but yep'

In [None]:
import re

# Rewrite each a/b/c date as c-a-b: \1, \2 and \3 in the replacement refer
# to the capture groups of the pattern.
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text)

# Same substitution with a pre-compiled pattern.
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
datepat.sub(r'\3-\1-\2', text)

In [24]:
# Named groups: (?P<name>...) names a group in the pattern and \g<name>
# refers to it in the replacement string.
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
re.sub(r'(?P<month>\d+)/(?P<day>\d+)/(?P<year>\d+)', r'\g<year>-\g<month>-\g<day>', text)

# Callback form: sub() calls the function with each match object and inserts
# the string it returns.
from calendar import month_abbr
def change_date(m):
    """Return '<group 2> <month abbreviation> <group 3>' for a date match.

    Group 1 of the match is converted to an integer and used as an index
    into calendar.month_abbr.
    """
    abbr = month_abbr[int(m.group(1))]
    day, year = m.group(2, 3)
    return '%s %s %s' % (day, abbr, year)

# Uses `datepat` compiled in an earlier cell together with change_date and
# text defined above in this cell.
datepat.sub(change_date, text)

# subn returns a (new_string, number_of_substitutions) pair.
newtext, n = datepat.subn(r'\3-\1-\2', text)

### 最短匹配模式

In [26]:
text2 = 'Computer says "no." Phone says "yes."'

# Greedy: .* stretches to the last closing quote, so both quotations come
# back as one long match.
print(re.compile(r'"(.*)"').findall(text2))

# Non-greedy: .*? stops at the nearest closing quote, giving one match per
# quotation.
str_pat = re.compile(r'"(.*?)"')
print(str_pat.findall(text2))

['no." Phone says "yes.']
['no.', 'yes.']


### 多行匹配模式

默认情况下，点(.)不能匹配换行符；可以使用 `re.DOTALL` 标志让它匹配包括换行符在内的任意字符。

In [32]:
# A C-style comment that spans two lines.
text2 = '''/* this is a
multiline comment */
'''

comment = re.compile(r'/\*((?:.|\n)*?)\*/') # (?:.|\n) is a non-capturing group: any character or a newline
comment.findall(text2)

# Alternative: with re.DOTALL the dot itself also matches newlines.
comment = re.compile(r'/\*(.*?)\*/', re.DOTALL)
comment.findall(text2)

[' this is a\nmultiline comment ']

### 将Unicode文本标准化

NFC表示字符应该是整体组成(比如可能的话就使用单一编码)

而NFD表示字符应该分解为多个组合字符表示

In [34]:
import unicodedata

# The same word spelled two ways: s1 contains the single code point U+00F1,
# s2 contains a plain 'n' followed by U+0303.
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'

# Normalise s1 to the composed form (NFC) and s2 to the decomposed form (NFD).
t1, t2 = unicodedata.normalize('NFC', s1), unicodedata.normalize('NFD', s2)

# combining() returns 0 for characters that are not combining marks; keeping
# only those characters strips the accent from the decomposed text.
''.join(ch for ch in t2 if unicodedata.combining(ch) == 0)

'Spicy Jalapeno'

### 删除字符串中不需要的字符

In [41]:
# strip()/lstrip()/rstrip() remove the given characters from the ends of a
# string only.
t = '-----hello====='
t.lstrip('-')
t.strip('-=')

# Whitespace inside the text needs replace() or a regular expression.
s = ' hello     world \n'
s.replace(' ', '')
# Raw string: '\s' in an ordinary literal is an invalid escape sequence and
# triggers a warning on current Python versions. Every run of whitespace is
# collapsed to a single space.
re.sub(r'\s+', ' ', s)

' hello world '

### 审查清理

In [43]:
s = 'pýtĥöñ\fis\tawesome\r\n'
# Translation table keyed by code point: tab and form feed become a space ...
remap = dict.fromkeys((ord('\t'), ord('\f')), ' ')
# ... and a value of None deletes the character (carriage return here).
remap[ord('\r')] = None
s.translate(remap)

'pýtĥöñ is awesome\n'

### 字符串对齐

In [46]:
# Pad a string to a width of 20 characters: left-, right- and centre-aligned.
text = 'Hello World'
text.ljust(20)
text.rjust(20)
text.center(20)

# format() takes an alignment character ('>' right, '<' left, '^' centre)
# followed by the width.
format(text, '>20')
format(text, '<20')
format(text, '^20')

'    Hello World     '

In [47]:
# Same alignment with a custom fill character; `text` comes from the previous
# cell.
text.rjust(20,'=')
text.center(20,'*')

# In a format spec the fill character is written before the alignment
# character.
format(text, '=>20s')
format(text, '*^20s')

'****Hello World*****'

In [48]:
# '>10s' right-aligns each string argument in its own 10-character field.
'{:>10s} {:>10s}'.format('Hello', 'World')

'     Hello      World'

In [49]:
# The alignment codes of format() also work for numbers, not only strings.
x = 1.2345
format(x, '>10')

'    1.2345'

### 字符串中插入变量

In [53]:
# Fill the named fields of a template from keyword arguments or from a
# mapping.
s = '{name} has {n} messages.'
s.format(name='Guido', n=37)
s.format_map({'name':'Guido', 'n':37})

'Guido has 37 messages.'

In [59]:
class safesub(dict):
    """dict that turns a missing key back into its '{key}' placeholder.

    Used with str.format_map so that fields without a value stay in the
    output instead of raising KeyError.
    """
    def __missing__(self, key):
        # Called by dict lookup only when `key` is absent.
        return ''.join(('{', key, '}'))

# `s` is the template from the previous cell; the field n has no value here
# and is left in the result as the literal '{n}'.
s.format_map(safesub({'name':'Guido'}))

'Guido has {n} messages.'

### 列宽格式化

In [65]:
import textwrap

# Adjacent literals inside parentheses are joined into one long string with
# no line breaks in it.
s = ("Look into my eyes, look into my eyes, the eyes, the eyes, "
     "the eyes, not around the eyes, don't look around the eyes, "
     "look into my eyes, you're under.")

# Re-wrap to 70 columns, then to 40 columns with the first line indented,
# then to 40 columns with every line but the first indented.
print(textwrap.fill(s, width=70))
print(textwrap.fill(s, width=40, initial_indent='    '))
print(textwrap.fill(s, width=40, subsequent_indent='    '))

Look into my eyes, look into my eyes, the eyes, the eyes, the eyes,
not around the eyes, don't look around the eyes, look into my eyes,
you're under.
    Look into my eyes, look into my
eyes, the eyes, the eyes, the eyes, not
around the eyes, don't look around the
eyes, look into my eyes, you're under.
Look into my eyes, look into my eyes,
    the eyes, the eyes, the eyes, not
    around the eyes, don't look around
    the eyes, look into my eyes, you're
    under.


### 字符串令牌解析

https://python3-cookbook.readthedocs.io/zh_CN/latest/c02/p18_tokenizing_text.html

### 四舍五入

In [6]:
# A positive ndigits rounds to that many decimal places; a negative ndigits
# rounds to tens, hundreds, and so on.
print(round(1.23456, 3), round(1627731, -1), sep='\n')

1.235
1627730


In [4]:
# format() with '0.3f' returns a string with three decimal places ('1.235');
# use it when the goal is display rather than changing the value.
format(1.23456, '0.3f')

'1.235'

### 精确的浮点数

In [7]:
from decimal import Decimal
# Build Decimal values from strings so that they hold the exact decimal
# digits; their sum is then exactly Decimal('6.3').
a = Decimal('4.2')
b = Decimal('2.1')
a + b

Decimal('6.3')

### 二进制、八进制、十六进制整数

In [8]:
x = 1234
# bin()/oct()/hex() return text carrying the 0b / 0o / 0x prefix.
print(bin(x), oct(x), hex(x), sep='\n')
# format() with the 'b', 'o' and 'x' codes returns the digits without a prefix.
print(format(x, 'b'), format(x, 'o'), format(x, 'x'), sep='\n')

0b10011010010
0o2322
0x4d2
10011010010
2322
4d2


### 字节到大整数

In [9]:
# Interpret a 16-byte string as one large integer; the second argument is
# the byte order.
data = b'\x00\x124V\x00x\x90\xab\x00\xcd\xef\x01\x00#\x004'
int.from_bytes(data, 'little')
int.from_bytes(data, 'big')

# The reverse direction: to_bytes needs the number of bytes and the byte
# order. This x is the big-endian value of `data` above.
x = 94522842520747284487117727783387188
x.to_bytes(16, 'big')
x.to_bytes(16, 'little')

b'4\x00#\x00\x01\xef\xcd\x00\xab\x90x\x00V4\x12\x00'

### 分数运算

In [11]:
from fractions import Fraction

# Exact rational arithmetic: 5/4 * 7/16 = 35/64.
a = Fraction(5, 4)
b = Fraction(7, 16)
c = a * b
# limit_denominator(8) gives the closest fraction whose denominator is at
# most 8 (4/7 here).
print(c.limit_denominator(8))

4/7


### 随机选择

In [14]:
import random

# NOTE(review): no seed is set, so the printed values differ on every run.
values = [1, 2, 3, 4, 5, 6]
print(random.choice(values))      # one random element
print(random.sample(values, 2))   # two distinct elements, as a new list
random.shuffle(values)            # shuffles the list in place
print(random.randint(0,10))       # random integer, both endpoints included
print(random.random())            # random float in [0.0, 1.0)

3
[1, 2]
7
0.3006420775016281


### 日期与时间

In [17]:
from datetime import timedelta
# Durations add like numbers: 2 days 6 h plus 4.5 h.
a = timedelta(days=2, hours=6)
b = timedelta(hours=4.5)
c = a + b
# .days and .seconds are the separate components of the sum; .seconds holds
# only the part that is left after the whole days.
print(c.days, c.seconds, c.seconds / 3600, sep='\n')
# total_seconds() covers the entire duration, days included.
print(c.total_seconds() / 3600)

2
37800
10.5
58.5


In [19]:
from datetime import datetime

# Adding a timedelta to a datetime gives a new datetime; `timedelta` was
# imported in an earlier cell.
a = datetime(2012, 9, 23)
print(a + timedelta(days=10))

2012-10-03 00:00:00


In [23]:
from datetime import datetime

# Convert between date strings and datetime objects.
text = '2012-09-20'
# strptime parses a string according to the given format codes.
datetime.strptime(text, '%Y-%m-%d')

# strftime goes the other way and renders a datetime as text. It is called
# on the instance, which is the usual spelling of datetime.strftime(z, ...).
z = datetime.now()
z.strftime('%A %B %d, %Y')

'Thursday October 29, 2020'