2.1. Splitting Strings on Any of Multiple Delimiters

In [1]:
import re

line = 'asdf fjdk; afed, fjek,asdf, foo'
# A delimiter is one semicolon, comma, or whitespace character, followed by
# any amount of additional whitespace.
delimiter = re.compile(r'[;,\s]\s*')
line_split = delimiter.split(line)
print(line_split)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']


2.2. Matching Text at the Start or End of a String

In [2]:
filename = 'spam.txt'
# Include the dot so that only a real ".txt" extension matches; a bare
# 'txt' suffix would also accept a name such as 'footxt'.
print(filename.endswith('.txt'))

True


In [5]:
import os
# Snapshot the current directory, then keep only the notebook files.
filenames = os.listdir('.')
print(filenames)
notebooks = [entry for entry in filenames if entry.endswith('.ipynb')]
print(notebooks)

['.DS_Store', '.gitignore', '01-Data-Structures-and-Algorithms.ipynb', '02-String-and-Text.ipynb', '.ipynb_checkpoints']
['01-Data-Structures-and-Algorithms.ipynb', '02-String-and-Text.ipynb']


In [6]:
choices = ['http:', 'ftp:']
url = 'http://www.python.org'
# str.startswith() accepts a str or a tuple of str, never a list.  The
# TypeError is the point of this cell, so catch it and display it instead
# of letting it abort a "Run All".
try:
    url.startswith(choices)
except TypeError as exc:
    print('{}: {}'.format(type(exc).__name__, exc))

TypeError: startswith first arg must be str or a tuple of str, not list

In [7]:
# Converting the list to a tuple makes it an acceptable startswith() argument.
prefixes = tuple(choices)
print(url.startswith(prefixes))

True


2.3. Matching Strings Using Shell Wildcard Patterns

In [8]:
from fnmatch import fnmatch, fnmatchcase
# (candidate, pattern) pairs, checked in order with fnmatch().
wildcard_checks = [
    ('foo.txt', '*.txt'),
    ('foo.txt', '?oo.txt'),
    ('Dat45.csv', 'Dat[0-9]*'),
]
for candidate, wildcard in wildcard_checks:
    print(fnmatch(candidate, wildcard))

True
True
True


In [10]:
addresses = [
    '5412 N CLARK ST',
    '1060 W ADDISON ST',
    '1039 W GRANVILLE AVE',
    '2122 N CLARK ST',
    '4802 N BROADWAY',
]
# These are data strings, not filenames, so match with fnmatchcase():
# fnmatch() first normalizes case according to the host operating system,
# which makes the result platform-dependent.
st_addresses = [addr for addr in addresses if fnmatchcase(addr, '* ST')]
clark_54xx = [addr for addr in addresses
              if fnmatchcase(addr, '54[0-9][0-9] *CLARK*')]
print(st_addresses)
print(clark_54xx)

['5412 N CLARK ST', '1060 W ADDISON ST', '2122 N CLARK ST']
['5412 N CLARK ST']


2.4. Matching and Searching for Text Patterns

In [11]:
# Column ruler for the literal below (last digit of each character index):
#       0123456789012345678901234567890123456789
text = 'yeah, but no, but yeah, but no, but yeah'
# find() gives the index of the first occurrence of the substring.
needle = 'no'
text.find(needle)

10

In [18]:
text1 = '11/27/2012'
# re.match() only tries to match at the beginning of the string and
# returns None when it fails.
looks_like_date = re.match(r'\d+/\d+/\d+', text1) is not None
if looks_like_date:
    print('yes')

yes


In [28]:
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datepat.match('11/27/2012')
# group(0) is the whole match; groups 1 to 3 are the three captured fields.
for group_index in range(4):
    print(m.group(group_index))
print(m.groups())

# findall() scans the entire string and returns one tuple of groups per match.
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
all_dates = datepat.findall(text)
print(all_dates)

11/27/2012
11
27
2012
('11', '27', '2012')
[('11', '27', '2012'), ('3', '13', '2013')]


In [22]:
# match() anchors only at the start, so the trailing 'abcde' does not
# prevent a match; group() with no argument is the whole matched text.
m1 = datepat.match('11/27/2012abcde')
print(m1.group())

11/27/2012


2.5. Searching and Replacing Text

In [26]:
text = 'yeah, but no, but yeah, but no, but yeah'
# str.replace() substitutes every occurrence of a literal substring.
replaced = text.replace('yeah', 'yep')
print(replaced)

yep, but no, but yep, but no, but yep


In [33]:
import re

text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
# \3, \1 and \2 in the replacement refer to the third, first and second
# captured groups of the pattern.
date_regex = r'(\d+)/(\d+)/(\d+)'
print(re.sub(date_regex, r'\3-\1-\2', text))

Today is 2012-11-27. PyCon starts 2013-3-13.


In [34]:
from calendar import month_abbr
def change_date(m):
    """Substitution callback that rewrites a matched date.

    ``m`` is a match object with three groups.  Group 1 is converted to an
    int and used to index ``calendar.month_abbr``; the result is
    '<group 2> <month abbreviation> <group 3>'.
    """
    month_label = month_abbr[int(m.group(1))]
    pieces = (m.group(2), month_label, m.group(3))
    return '{} {} {}'.format(*pieces)
# Passing a callable to sub() lets each match compute its own replacement.
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
reformatted = datepat.sub(change_date, text)
print(reformatted)

Today is 27 Nov 2012. PyCon starts 13 Mar 2013.


In [35]:
# subn() returns a (new_text, number_of_substitutions) tuple.
subn_result = datepat.subn(change_date, text)
print(subn_result)

('Today is 27 Nov 2012. PyCon starts 13 Mar 2013.', 2)


2.6. Searching and Replacing Case-Insensitive Text

In [38]:
text = 'UPPER PYTHON, lower python, Mixed Python'
# Compile once with IGNORECASE and reuse the pattern for both operations.
any_case_python = re.compile('python', flags=re.IGNORECASE)
print(any_case_python.findall(text))
print(any_case_python.sub('snake', text))

['PYTHON', 'python', 'Python']
UPPER snake, lower snake, Mixed snake


In [39]:
def matchcase(word):
    """Return a ``re.sub`` callback that substitutes ``word`` with its case
    adapted to the text being replaced.

    The callback returns ``word.upper()`` for an all-uppercase match,
    ``word.lower()`` for an all-lowercase match, ``word.capitalize()`` when
    the match starts with an uppercase character, and ``word`` unchanged
    otherwise (including for an empty match).
    """
    def replace(m):
        matched = m.group()
        if matched.isupper():
            return word.upper()
        if matched.islower():
            return word.lower()
        # Slice instead of indexing: a pattern that can match the empty
        # string would make matched[0] raise IndexError.
        if matched[:1].isupper():
            return word.capitalize()
        return word
    return replace

# The callback built by matchcase() chooses the replacement's case per match.
snake_like_match = matchcase('snake')
print(re.sub('python', snake_like_match, text, flags=re.IGNORECASE))

UPPER SNAKE, lower snake, Mixed Snake


2.7. Specifying a Regular Expression for the Shortest Match

In [40]:
# Capture whatever lies between a pair of double quotes (greedy .*).
str_pat = re.compile(r'\"(.*)"')
# Sample sentence; spelling fixed ('Coputer' -> 'Computer') so that it agrees
# with the text2 sample used in the following cells.
text1 = 'Computer says "no."'
print(str_pat.findall(text1))

['no.']


In [41]:
text2 = 'Computer says "no." Phone says "yes."'
# With two quoted phrases the greedy .* stretches from the first quote to
# the last one, producing a single long match.
greedy_hits = str_pat.findall(text2)
print(greedy_hits)

['no." Phone says "yes.']


In [42]:
# The ? after * makes the repetition non-greedy, so each quoted phrase is
# matched on its own.
str_pat = re.compile(r'\"(.*?)\"')
shortest_hits = str_pat.findall(text2)
print(shortest_hits)

['no.', 'yes.']


2.8. Writing a Regular Expression for Multiline Patterns

In [43]:
# NOTE: the first line of the literal ends with a space before the newline.
text = '''/* this is a 
multiline comment */
'''
# re.DOTALL lets . match newline characters, so the body may span lines.
comment = re.compile(r'/\*(.*?)\*/', flags=re.DOTALL)
comment_bodies = comment.findall(text)
print(comment_bodies)

[' this is a \nmultiline comment ']


2.9. Normalizing Unicode Text to a Standard Representation

In [1]:
# The same visible text spelled two ways: s1 uses the single code point
# U+00F1, s2 uses a plain "n" followed by the combining character U+0303.
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'
for spelling in (s1, s2):
    print(spelling, len(spelling))

Spicy Jalapeño 14
Spicy Jalapeño 15


In [4]:
import unicodedata
# NFC composes characters where possible, so both spellings become equal.
t1, t2 = [unicodedata.normalize('NFC', spelling) for spelling in (s1, s2)]
print(t1 == t2)
for composed in (t1, t2):
    print(ascii(composed))

True
'Spicy Jalape\xf1o'
'Spicy Jalape\xf1o'


In [5]:
# NFD decomposes characters, so both spellings become "n" + combining U+0303.
t3, t4 = [unicodedata.normalize('NFD', spelling) for spelling in (s1, s2)]
print(t3 == t4)
for decomposed in (t3, t4):
    print(ascii(decomposed))

True
'Spicy Jalapen\u0303o'
'Spicy Jalapen\u0303o'


2.10. Working with Unicode Characters in Regular Expressions

In [6]:
import re
# Use a raw string for the pattern: '\d' inside an ordinary string literal is
# an invalid escape sequence that Python warns about at compile time.
num = re.compile(r'\d+')
num.match('123')

<_sre.SRE_Match object; span=(0, 3), match='123'>

In [8]:
# For str patterns \d matches any Unicode decimal digit, not only 0-9;
# U+0661 to U+0663 are the Arabic-Indic digits one to three.
arabic_digits = '\u0661\u0662\u0663'
if num.match(arabic_digits) is not None:
    print('match')

match


2.11. Stripping Unwanted Characters from Strings

In [9]:
s = ' hello world \n'
# Strip both ends, then only the left end, then only the right end.
for strip_method in (str.strip, str.lstrip, str.rstrip):
    print(strip_method(s))

hello world
hello world 

 hello world


In [14]:
t = '-----hello====='
# The argument is a set of characters to remove, not a prefix or suffix.
without_dashes = t.lstrip('-')
without_equals = t.rstrip('=')
bare_word = t.strip('=-')
print(without_dashes)
print(without_equals)
print(bare_word)

hello=====
-----hello
hello


2.12. Sanitizing and Cleaning Up Text

In [19]:
s = 'pýtĥöñ\fis\tawesome\r\n'
# translate() works on code points: tab and form feed become a space,
# and mapping carriage return to None deletes it.
remap = {ord(blank): ' ' for blank in '\t\f'}
remap[ord('\r')] = None
a = s.translate(remap)
print(a)

pýtĥöñ is awesome



In [22]:
import unicodedata
import sys
# Map every combining code point to None so that translate() deletes it.
# range() excludes its stop value, hence the +1 to cover sys.maxunicode itself.
cmb_chrs = dict.fromkeys(
    c for c in range(sys.maxunicode + 1) if unicodedata.combining(chr(c))
)
# NFD splits each accented letter into a base letter plus combining marks;
# translate() then removes the marks because cmb_chrs maps them to None.
b = unicodedata.normalize('NFD', a)
without_marks = b.translate(cmb_chrs)
print(without_marks)

python is awesome



2.13. Aligning Text Strings

In [24]:
text = 'Hello World'
field_width = 20
# Left-justify, right-justify, then center within the field (space padded).
for pad in (text.ljust, text.rjust, text.center):
    print(pad(field_width))

Hello World         
         Hello World
    Hello World     


In [25]:
# Same three alignments, padded with '=' instead of spaces.
for pad in (text.ljust, text.rjust, text.center):
    print(pad(20, '='))

Hello World=========
=========Hello World
====Hello World=====


In [26]:
# format() alignment codes: > right-aligns, < left-aligns, ^ centers.
for align in ('>', '<', '^'):
    print(format(text, align + '20'))

         Hello World
Hello World         
    Hello World     


In [29]:
# Each value is right-aligned in its own 10-character column.
words = ('Hello', 'World')
print('{:>10s} {:>10s}'.format(*words))

     Hello      World


2.14. Combining and Concatenating Strings

In [33]:
parts = ['Is', 'Chicago', 'Not', 'Chicago?']
# join() is called on the separator string and given the pieces to combine.
for separator in (' ', ', ', ''):
    print(separator.join(parts))

Is Chicago Not Chicago?
Is, Chicago, Not, Chicago?
IsChicagoNotChicago?


In [34]:
a, b = 'Is Chicago', 'Not Chicago'
# Combine two strings through a format template.
phrase = '{} {}'.format(a, b)
print(phrase)

Is Chicago Not Chicago


In [36]:
# Two ways to get the same line: join the pieces first, or let print()
# insert the separator between its arguments.
joined = ':'.join([a, b])
print(joined)
print(a, b, sep=':')

Is Chicago:Not Chicago
Is Chicago:Not Chicago


2.14. Combining and Concatenating Strings (continued)

In [37]:
def sample():
    """Generator that yields four string fragments in a fixed order."""
    yield from ('Is', 'Chicago', 'Not', 'Chicago?')


# join() consumes the generator directly; no intermediate list is needed.
fragments = sample()
print(''.join(fragments))

IsChicagoNotChicago?


2.15. Interpolating Variables in Strings

In [39]:
s = '{name} has {n} messages.'
# Named fields are filled from keyword arguments to format().
fields = {'name': 'Guido', 'n': '37'}
print(s.format(**fields))

Guido has 37 messages.


In [41]:
name, n = 'Guido', 37
# vars() with no argument returns the current namespace as a dict, so
# {name} and {n} are looked up among the variables defined so far.
namespace = vars()
print(s.format_map(namespace))

Guido has 37 messages.


In [42]:
class Info:
    """Plain record with two attributes, ``name`` and ``n``."""

    def __init__(self, name, n):
        # Stored as ordinary instance attributes, so vars(instance) exposes them.
        self.name, self.n = name, n

# vars(obj) returns the instance's attribute dict, which format_map() can
# use directly as the source of field values.
a = Info('Guido', 44)
attributes = vars(a)
print(s.format_map(attributes))

Guido has 44 messages.


In [44]:
class safesub(dict):
    """dict subclass that turns a missing key back into its '{key}' placeholder."""

    def __missing__(self, key):
        # dict lookups call __missing__ instead of raising KeyError when
        # the key is absent.
        return ''.join(('{', key, '}'))

s = '{name} has {m} messages.'
# Expose only the variables that are meant to be substituted.  vars() would
# hand over the whole notebook namespace, and the match object `m` created
# by the earlier regex cells would then fill in {m} on a fresh "Run All",
# hiding the missing-key behaviour this cell demonstrates.
s.format_map(safesub(name=name, n=n))

'Guido has {m} messages.'

In [45]:
import sys
def sub(text):
    """Fill the fields of ``text`` from the caller's local variables.

    Fields with no matching variable are left in place as '{field}',
    because the lookup goes through ``safesub``.
    """
    # Frame 1 is the frame of whoever called sub().  sys._getframe() is a
    # private helper, so this is a demonstration rather than a portable idiom.
    caller_locals = sys._getframe(1).f_locals
    return text.format_map(safesub(caller_locals))

In [46]:
name, n = 'David', 55
# {color} has no matching variable, so sub() leaves that field untouched.
templates = (
    'Hello {name}',
    'You have {n} messages.',
    'Your favorite coor is {color}',
)
for template in templates:
    print(sub(template))

Hello David
You have 55 messages.
Your favorite coor is {color}


2.16. Reformatting Text to a Fixed Number of Columns

In [51]:
import textwrap

# Adjacent string literals inside parentheses are joined into one string.
s = ("Look into my eyes, look into my eyes, the eyes, the eyes, "
     "the eyes, not around the eyes, don't look around the eyes, "
     "look into my eyes, you're under.")
print(textwrap.fill(s, width=70))

Look into my eyes, look into my eyes, the eyes, the eyes, the eyes,
not around the eyes, don't look around the eyes, look into my eyes,
you're under.


In [52]:
# Same text wrapped to a narrower 40-column width.
wrapped_40 = textwrap.fill(s, width=40)
print(wrapped_40)

Look into my eyes, look into my eyes,
the eyes, the eyes, the eyes, not around
the eyes, don't look around the eyes,
look into my eyes, you're under.


In [54]:
# initial_indent is prepended to the first output line only.
first_line_indented = textwrap.fill(s, width=40, initial_indent='    ')
print(first_line_indented)

    Look into my eyes, look into my
eyes, the eyes, the eyes, the eyes, not
around the eyes, don't look around the
eyes, look into my eyes, you're under.


In [55]:
# subsequent_indent is prepended to every output line except the first.
hanging_indent = textwrap.fill(s, width=40, subsequent_indent='    ')
print(hanging_indent)

Look into my eyes, look into my eyes,
    the eyes, the eyes, the eyes, not
    around the eyes, don't look around
    the eyes, look into my eyes, you're
    under.


2.17. Handling HTML and XML Entities in Text

In [56]:
import html
s = 'Elements are written as "<tag>text</tag>".'
# quote=False leaves the double quotes as they are; the angle brackets
# are still replaced by entities.
escaped = html.escape(s, quote=False)
print(escaped)

Elements are written as "&lt;tag&gt;text&lt;/tag&gt;".


2.18. Tokenizing Text

In [88]:
import re

text = 'foo = 23 + 42 * 10'
# Target token stream for `text` (illustration only).  Every value is the
# matched substring, so the final number is the string '10', not the int 10.
tokens = [('NAME', 'foo'), ('EQ', '='), ('NUM', '23'), ('PLUS', '+'),
          ('NUM', '42'), ('TIMES', '*'), ('NUM', '10')]

# One named group per token type; a match's lastgroup attribute then
# reports which alternative matched.
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))
# A scanner object walks the given string one match at a time.
scanner = master_pat.scanner('foo = 42')
scanner.match()

<_sre.SRE_Match object; span=(0, 3), match='foo'>

In [89]:
# `_` is IPython's "last displayed result", here the match object shown by
# the previous cell.  This only works when the cells are run in this exact
# order; binding the match to a name would be more robust.
_.lastgroup, _.group()

('NAME', 'foo')

In [90]:
# Ask the same scanner for its next match; it picks up from where the
# previous match() call stopped.
scanner.match()

<_sre.SRE_Match object; span=(3, 4), match=' '>

In [91]:
# `_` again refers to the result displayed by the previous cell (IPython
# feature), so this line depends on the cells being run in order.
_.lastgroup, _.group()

('WS', ' ')

In [92]:
from collections import namedtuple
# Lightweight (type, value) record produced for every token.
Token = namedtuple('Token', ['type', 'value'])


def generate_tokens(pat, text):
    """Yield a Token for each consecutive match of ``pat`` in ``text``.

    ``type`` is the name of the group that matched (``lastgroup``) and
    ``value`` is the matched text.  Iteration ends, without raising, at the
    first position where ``pat`` does not match.
    """
    scan = pat.scanner(text)
    while True:
        hit = scan.match()
        if hit is None:
            return
        yield Token(hit.lastgroup, hit.group())
    
# Print one token per line.
print(*generate_tokens(master_pat, 'foo = 42'), sep='\n')

Token(type='NAME', value='foo')
Token(type='WS', value=' ')
Token(type='EQ', value='=')
Token(type='WS', value=' ')
Token(type='NUM', value='42')


In [93]:
# Lazily drop the whitespace tokens, then print what is left.
tokens = filter(lambda candidate: candidate.type != 'WS',
                generate_tokens(master_pat, text))
for token in tokens:
    print(token)

Token(type='NAME', value='foo')
Token(type='EQ', value='=')
Token(type='NUM', value='23')
Token(type='PLUS', value='+')
Token(type='NUM', value='42')
Token(type='TIMES', value='*')
Token(type='NUM', value='10')


2.19. Writing a Simple Recursive Descent Parser

2.20. Performing Text Operations on Byte Strings

In [94]:
data = b'Hello World'
# Slicing and startswith() work on bytes too; the prefix must also be bytes.
first_five = data[:5]
print(first_five)
print(data.startswith(b'Hello'))

b'Hello'
True


In [95]:
s = b'Hello World'
# Printing bytes shows the b'...' form; decode first to print the text itself.
print(s)
decoded = s.decode('ascii')
print(decoded)

b'Hello World'
Hello World
