In [49]:
# 2.1. Splitting Strings on Any of Multiple Delimiters
import re
line = 'asdf fjdk; afed, fjek,asdf, foo'

fields = re.split(r'[;,\s]\s*', line)
print(fields)
print(re.split(r'(?:,|;|\s)\s*', line))
fields = re.split(r'(;|,|\s)\s*', line)
print(fields)

print('-'*20)
values = fields[::2]
print(values)
delimiters = fields[1::2] + ['']
print(delimiters)

# Reform the line using the same delimiters
print('-'*20)
print(''.join(v+d for v,d in zip(values, delimiters)))


['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']
--------------------
['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
[' ', ';', ',', ',', ',', '']
--------------------
asdf fjdk;afed,fjek,asdf,foo


In [33]:
# 2.2. Matching Text at the Start or End of a String
filename = 'spam.txt'
print(filename.endswith('.txt'))
url = 'http://www.python.org'
print(url.startswith('http:'))
print('-'*20)

import os
filenames = os.listdir('.')
print(filenames)
file_ipynb = [name for name in filenames if name.endswith(('.ipynb', '.h')) ]
print(file_ipynb)
print(any(name.endswith('.py') for name in filenames))

True
True
--------------------
['src', 'PC-2 Strings and Text.ipynb', 'files', 'csv.DictReader.ipynb', 'PC-1 Data Structures and Algorithms.ipynb', '.ipynb_checkpoints']
['PC-2 Strings and Text.ipynb', 'csv.DictReader.ipynb', 'PC-1 Data Structures and Algorithms.ipynb']
False


In [38]:
# 2.2. Matching Text at the Start or End of a String
from urllib.request import urlopen
def read_data(name):
    if name.startswith(('http:', 'https:', 'ftp:')):
        return urlopen(name).read()
    else:
        with open(name) as f:
            return f.read()
readed = read_data('https://mojebudowy.pl/mbudowy-wprowadzenie')
print(readed)


b'<!doctype html>\n<html lang="pl">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><meta name="robots" content="index, follow" /><title>mBudowy - opis nowoczesnego narz\xc4\x99dzia informatycznego</title><!--[if lt IE 9]><script src="/js/html5shiv.js?4"></script><![endif]--><link rel="stylesheet" href="/css/compiled/common.css?4" type="text/css" media="all" /><script type="text/javascript">\nvar cookieUrl = "";\nvar __st_loadLate=true; //if __st_loadLate is defined then the widget will not load on domcontent ready\n\n</script><link rel="Shortcut icon" href="/favicon.ico?4" /></head><body class="pp twocol"><div id="outer"><div class="service-wrapper"><div class="service"><strong>Nasze serwisy</strong><ul><li class="first last level0 even "><a href="/page/o-projekcie" >\n  mBudowy &raquo;\n</a></li></ul><div class="right"><div class="social"><span>lubi\xc4\x99 to</span><div class="cl"><span class="st_facebook_custom" displayText="Facebook" st_url="https://m

In [39]:
# 2.2 ... regular expressions 
import re
url = 'http://www.python.org'
re.match('http:|https:|ftp:', url)

<_sre.SRE_Match object; span=(0, 5), match='http:'>

In [45]:
# 2.3. Matching Strings Using Shell Wildcard Patterns
from fnmatch import fnmatch, fnmatchcase
print(fnmatch('foo.txt', '*.txt'))
print(fnmatch('foo.txt', '?oo.txt'))
print(fnmatch('Dat45.csv', 'Dat[0-9][0-9]*'))

print('-'*20)
names = ['Dat1.csv', 'Dat2.csv', 'config.ini', 'foo.py']
filered=[name for name in names if fnmatch(name, 'Dat*.csv')]
print(filered)

print('-'*20)
print(fnmatchcase('foo.txt', '*.TXT')) # case sensitive
print(fnmatch('foo.txt', '*.TXT')) # case sensitive or not - depended on system

True
True
True
--------------------
['Dat1.csv', 'Dat2.csv']
--------------------
False
False


In [55]:
# 2.4. Matching and Searching for Text Patterns
text1 = '11/27/2012'
text2 = 'Nov 27, 2012'

import re
# Simple matching: \d+ means match one or more digits
def date_match(text):
    if re.match(r'\d+/\d+/\d+', text):
        print('yes')
    else:
        print('no')
date_match(text1)
date_match(text2)

# Recompile matching:
print('-'*20)
datepat = re.compile(r'\d+/\d+/\d+')
def date_match_re(text):
    if datepat.match(text):
        print('yes')
    else:
        print('no')
date_match(text1)

# findall
print('-'*20)
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
print(datepat.match(text))
print(datepat.findall(text))


yes
no
--------------------
yes
--------------------
None
['11/27/2012', '3/13/2013']


In [67]:
# 2.4. 
datepat = re.compile('(\\d+)/(\\d+)/(\\d+)')
m = datepat.match('11/27/2012')
print(m)
print(m.group(0))
print(m.group(1))
print(m.group(3))
print(m.groups())

<_sre.SRE_Match object; span=(0, 10), match='11/27/2012'>
11/27/2012
11
2012
('11', '27', '2012')


In [68]:
# 2.4. 
print(datepat.findall(text))

for month, day, year in datepat.findall(text):
    print('{}-{}-{}'.format(year, month, day))

[('11', '27', '2012'), ('3', '13', '2013')]
2012-11-27
2013-3-13


In [71]:
# 2.4
datepat = re.compile(r'(\d+)/(\d+)/(\d+)$')
print(datepat.match('11/27/2012abcdef'))
print(datepat.match('11/27/2012'))

None
<_sre.SRE_Match object; span=(0, 10), match='11/27/2012'>


In [74]:
# 2.5. Searching and Replacing Text
text = 'yeah, but no, but yeah, but no, but yeah'
print(text.replace('yeah', 'yep'))

text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
import re
print(re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text))

datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
print(datepat.sub(r'\3-\1-\2', text))

yep, but no, but yep, but no, but yep
Today is 2012-11-27. PyCon starts 2013-3-13.
Today is 2012-11-27. PyCon starts 2013-3-13.


In [76]:
# 2.5.
from calendar import month_abbr
def change_date(m):
    mon_name = month_abbr[int(m.group(1))]
    return '{} {} {}'.format(m.group(2), mon_name, m.group(3))

print(datepat.sub(change_date, text))

newtext, n = datepat.subn(r'\3-\1-\2', text)
print(newtext)
print(n)

Today is 27 Nov 2012. PyCon starts 13 Mar 2013.
Today is 2012-11-27. PyCon starts 2013-3-13.
2


In [26]:
# 2.6. Searching and Replacing Case-Insensitive Text
import re
text = 'UPPER PYTHON, lower python, Mixed Python'
print(re.findall('python', text, flags=re.IGNORECASE))
print(re.sub('python', 'snake', text, flags=re.IGNORECASE))

def matchcase(word):
    def replace(m):
        print('m: ', m)
        print('word: ',word)
        text = m.group()
        if text.isupper():
            return word.upper()
        elif text.islower():
            return word.lower()
        elif text[0].isupper():
            return word.capitalize()
        else:
            return word
    return replace

print('-'*20)
print(re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE))

print('+'*20)
x = matchcase('supper')
print( x(re.match('UPPER', text)))


['PYTHON', 'python', 'Python']
UPPER snake, lower snake, Mixed snake
--------------------
m:  <_sre.SRE_Match object; span=(6, 12), match='PYTHON'>
word:  snake
m:  <_sre.SRE_Match object; span=(20, 26), match='python'>
word:  snake
m:  <_sre.SRE_Match object; span=(34, 40), match='Python'>
word:  snake
UPPER SNAKE, lower snake, Mixed Snake
++++++++++++++++++++
m:  <_sre.SRE_Match object; span=(0, 5), match='UPPER'>
word:  supper
SUPPER


In [97]:
# 2.7. Specifying a Regular Expression
str_pat = re.compile(r'\"(.*)\"')
text1 = 'Computer says "no."'
print(str_pat.findall(text1))

text2 = 'Computer says "no." Phone says "yes."'
print(str_pat.findall(text2))

str_pat = re.compile(r'\"(.*?)\"')
print(str_pat.findall(text2))

['no.']
['no." Phone says "yes.']
['no.', 'yes.']


In [55]:
# 2.8 Writing a Regular Expression for Multiline Patterns
comment = re.compile(r'/\*(.*?)\*/')
text1 = '/* this is a comment */'
text2 = '''/* this is a
           multiline comment */
'''
print(comment.findall(text1))
print(comment.findall(text2))
# support for newlines
comment = re.compile(r'/\*((?:.|\n)*?)\*/')
print(comment.findall(text2))

print('-'*20)
comment = re.compile(r'/\*(.*)\*/', re.DOTALL)
print(comment.findall(text2))

[' this is a comment ']
[]
[' this is a\n           multiline comment ']
--------------------
[' this is a\n           multiline comment ']


In [66]:
# 2.9. Normalizing Unicode Text to a Standard
s1 = 'Spicy Jalape\u00f1o' # fully composed “ñ” character (U+00F1)
s2 = 'Spicy Jalapen\u0303o' #“n” followed by a “~” combining character (U+0303)
print(s1)
print(s2)
print(s1 == s2)
print(len(s1))
print(len(s2))
print('-'*20)

import unicodedata
t1 = unicodedata.normalize('NFC', s1) # NFC - fully composed
t2 = unicodedata.normalize('NFC', s2)
print(t1==t2)
print(ascii(t1))
print('-'*20)

t3 = unicodedata.normalize('NFD', s1) # NFD-fully decomposed
t4 = unicodedata.normalize('NFD', s2)
print(t3==t4)
print(ascii(t3))

Spicy Jalapeño
Spicy Jalapeño
False
14
15
--------------------
True
'Spicy Jalape\xf1o'
--------------------
True
'Spicy Jalapen\u0303o'


In [72]:
# 2.9.
s = '\ufb01' # A single character
print(s)
print(unicodedata.normalize('NFD', s))
# Notice how the combined letters are broken apart here
print(unicodedata.normalize('NFKD', s))
print(unicodedata.normalize('NFKC', s))
print('-'*20)

t1 = unicodedata.normalize('NFD', s1)
norm = ''.join(c for c in t1 if not unicodedata.combining(c))
print(norm)

ﬁ
ﬁ
fi
fi
--------------------
̃


In [89]:
#2.10. Working with Unicode Characters in Regular Expressions
import re
num = re.compile('\d+')
print(num.match('123'))
print(num.match('\u0661\u0662\u0663'))
arabic = re.compile('[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff]+')

print('-'*20)
pat = re.compile('stra\u00dfe', re.IGNORECASE)
s = 'straße'
print(pat.match(s)) # Matches
print(s.upper()) # Case folds
print(pat.match(s.upper())) # Doesn't match

<_sre.SRE_Match object; span=(0, 3), match='123'>
<_sre.SRE_Match object; span=(0, 3), match='١٢٣'>
--------------------
<_sre.SRE_Match object; span=(0, 6), match='straße'>
STRASSE
None


In [96]:
# 2.11. Stripping Unwanted Characters from Strings
# Whitespace stripping
s = ' hello world \n'
print( s.strip())
print(s.lstrip())
print(s.rstrip())

print('-'*20)
# Character stripping
t = '-----hello====='
print(t.lstrip('-'))
print(t.strip('-='))

print('+'*20)
print(s.replace(' ', ''))
import re
print(re.sub('\s+', ' ', s))

hello world
hello world 

 hello world
--------------------
hello=====
hello
++++++++++++++++++++
helloworld

 hello world 


In [None]:
# 2.11. 
with open(filename) as f:
    lines = (line.strip() for line in f)
    for line in lines:

In [100]:
# 2.12. Sanitizing and Cleaning Up Text
s = 'pýtĥöñ\fis\tawesome\r\n'
remap = {
ord('\t') : ' ',
ord('\f') : ' ',
ord('\r') : None # Deleted
}
a = s.translate(remap)
print(a)

pýtĥöñ is awesome



In [104]:
# 2.12. 
import unicodedata
import sys
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode) if unicodedata.combining(chr(c)))

b = unicodedata.normalize('NFD', a)
print(b)
print(b.translate(cmb_chrs))

pýtĥöñ is awesome

python is awesome



In [109]:
#2.12.
digitmap = { c: ord('0') + unicodedata.digit(chr(c)) 
for c in range(sys.maxunicode) if unicodedata.category(chr(c)) == 'Nd' }

print(len(digitmap))

x = '\u0661\u0662\u0663' # Arabic digits
print(x)
print(x.translate(digitmap))

550
١٢٣
123


In [117]:
# 2.12.
print(a)
b = unicodedata.normalize('NFD', a)
print(b)
b=b.encode('ascii', 'ignore')
print(b)
b.decode('ascii')

pýtĥöñ is awesome

pýtĥöñ is awesome

b'python is awesome\n'


'python is awesome\n'

In [138]:
# 2.13. Aligning Text Strings
text = 'Hello World'
print("'",format(text, '>20'),"'")
print("'",format(text, '<20'),"'")
print("'",format(text, '^20'),"'")

print('-'*20)
print("'",format(text, '=>20s'),"'")
print("'",format(text, '*^20s'),"'")
print('+'*20)
print("'",'{:>10s} {:>10s}'.format('Hello', 'World'),"'")


'          Hello World '
' Hello World          '
'     Hello World      '
--------------------
' ****Hello World***** '
++++++++++++++++++++
'      Hello      World '


In [135]:
# 2.13
x = 1.2345
print("'",format(x, '>10'),"'")
print("'",format(x, '^10.2f'),"'")
print('"%-20s"' % text)
print('"%20s"' % text)


'     1.2345 '
'    1.23    '
"Hello World         "
"         Hello World"


In [145]:
# 2.14. Combining and Concatenating Strings
parts = ['Is', 'Chicago', 'Not', 'Chicago?']
print(' '.join(parts))

a = 'Is Chicago'
b = 'Not Chicago?'
print(a + ' ' + b)
print('{} {}'.format(a,b))

print('-'*20)
c = 'Hello' 'World'
print(c)

print('~'*20)
data = ['ACME', 50, 91.1]
print(','.join(str(d) for d in data))

Is Chicago Not Chicago?
Is Chicago Not Chicago?
Is Chicago Not Chicago?
--------------------
HelloWorld
~~~~~~~~~~~~~~~~~~~~
ACME,50,91.1
++++++++++++++++++++
Is Chicago:Not Chicago?:HelloWorld
Is Chicago:Not Chicago?:HelloWorld
Is Chicago:Not Chicago?:HelloWorld


In [146]:
# 2.14
print('+'*20)
print(a + ':' + b + ':' + c) # Ugly
print(':'.join([a, b, c])) # Still ugly
print(a, b, c, sep=':') # Better

++++++++++++++++++++
Is Chicago:Not Chicago?:HelloWorld
Is Chicago:Not Chicago?:HelloWorld
Is Chicago:Not Chicago?:HelloWorld


In [151]:
# 2.14
def sample():
    yield 'Is'
    yield 'Chicago'
    yield 'Not'
    yield 'Chicago?'
text = ' '.join(sample())
print(text)

Is Chicago Not Chicago?


In [155]:
# 2.14
def combine(source, maxsize):
    parts = []
    size = 0
    for part in source:
        parts.append(part)
        size += len(part)
        if size > maxsize:
            yield ''.join(parts)
            parts = []
            size = 0
    yield ''.join(parts)
for part in combine(text, 32768):
    print(part)

<generator object combine at 0x7f91b4231c50>
Is Chicago Not Chicago?


In [165]:
# 2.15. Interpolating Variables in Strings
s = '{name} has {n} messages.'
print(s.format(name='Guido', n=37))

name = 'Guido'
n = 37
print(s.format_map(vars()))

class Info:
    def __init__(self, name, n):
        self.name = name
        self.n = n
a = Info('Guido',37)
print(s.format_map(vars(a)))

Guido has 37 messages.
Guido has 37 messages.
Guido has 37 messages.


In [166]:
# 2.15.
class safesub(dict):
    def __missing__(self, key):
        return '{' + key + '}'
del n # Make sure n is undefined
print(s.format_map(safesub(vars())))

Guido has {n} messages.


In [168]:
# 2.15.
import sys
def sub(text):
    return text.format_map(safesub(sys._getframe(1).f_locals))
name = 'Guido'
n = 37
print(sub('Hello {name}'))
print(sub('You have {n} messages.'))
print(sub('Your favorite color is {color}'))

Hello Guido
You have 37 messages.
Your favorite color is {color}


In [179]:
# 2.15.
name = 'Guido'
n = 37
'%(name) has %(n) messages.' % vars()

ValueError: unsupported format character 'm' (0x6d) at index 17

In [180]:
# 2.15.
import string
s = string.Template('$name has $n messages.')
s.substitute(vars())

'Guido has 37 messages.'

In [5]:
# 2.16. Reformatting Text to a Fixed Number of Columns
s = "Look into my eyes, look into my eyes, the eyes, the eyes, \
the eyes, not around the eyes, don't look around the eyes, \
look into my eyes, you're under."

import textwrap
print(textwrap.fill(s, 40))
print(' ')
print(textwrap.fill(s, 40, initial_indent=' '))
print(' ')
print(textwrap.fill(s, 40, subsequent_indent=' '))
print('-'*20)

import os
os.get_terminal_size().columns

Look into my eyes, look into my eyes,
the eyes, the eyes, the eyes, not around
the eyes, don't look around the eyes,
look into my eyes, you're under.
 
 Look into my eyes, look into my eyes,
the eyes, the eyes, the eyes, not around
the eyes, don't look around the eyes,
look into my eyes, you're under.
 
Look into my eyes, look into my eyes,
 the eyes, the eyes, the eyes, not
 around the eyes, don't look around the
 eyes, look into my eyes, you're under.
--------------------


54

In [10]:
# 2.17. Handling HTML and XML Entities in Text
s = 'Elements are written as "<tag>text</tag>".'
import html
print(s)
print(html.escape(s))
# Disable escaping of quotes
print(html.escape(s, quote=False))

print('-'*20)
s = 'Spicy Jalapeño'
s.encode('ascii', errors='xmlcharrefreplace')

Elements are written as "<tag>text</tag>".
Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.
Elements are written as "&lt;tag&gt;text&lt;/tag&gt;".
--------------------


b'Spicy Jalape&#241;o'

In [15]:
# 2.17.
s = 'Spicy &quot;Jalape&#241;o&quot.'
from html.parser import HTMLParser
p = HTMLParser()
print(p.unescape(s))

t = 'The prompt is &gt;&gt;&gt;'
from xml.sax.saxutils import unescape
print(unescape(t))



Spicy "Jalapeño".
The prompt is >>>


In [80]:
# 2.18. Tokenizing Text
text = 'foo = 23 + 42 * 10'
import re
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'

print('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))
master_pat = re.compile('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))
scanner = master_pat.scanner('foo = 42')
m = scanner.match()
print( m.lastgroup, m.group())
m = scanner.match()
print( m.lastgroup, m.group())
m = scanner.match()
print( m.lastgroup, m.group())
m = scanner.match()
print( m.lastgroup, m.group())
m = scanner.match()
print( m.lastgroup, m.group())

(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)|(?P<NUM>\d+)|(?P<PLUS>\+)|(?P<TIMES>\*)|(?P<EQ>=)|(?P<WS>\s+)
NAME foo
WS  
EQ =
WS  
NUM 42


In [81]:
# 2.18.
from collections import namedtuple
Token = namedtuple('Token', ['type','value'])
def generate_tokens(pat, text):
    scanner = pat.scanner(text)
    for m in iter(scanner.match, None):
        yield Token(m.lastgroup, m.group())
# Example use
for tok in generate_tokens(master_pat, 'foo = 42'):
    print(tok)
print('-'*20)
    
tokens = (tok for tok in generate_tokens(master_pat, text) if tok.type != 'WS')
for tok in tokens:
    print(tok)

Token(type='NAME', value='foo')
Token(type='WS', value=' ')
Token(type='EQ', value='=')
Token(type='WS', value=' ')
Token(type='NUM', value='42')
--------------------
Token(type='NAME', value='foo')
Token(type='EQ', value='=')
Token(type='NUM', value='23')
Token(type='PLUS', value='+')
Token(type='NUM', value='42')
Token(type='TIMES', value='*')
Token(type='NUM', value='10')


In [84]:
# 2.18.
LT = r'(?P<LT><)'
LE = r'(?P<LE><=)'
EQ = r'(?P<EQ>=)'
print('|'.join([LE, LT, EQ]))
master_pat = re.compile('|'.join([LE, LT, EQ])) # Correct
# master_pat = re.compile('|'.join([LT, LE, EQ])) # Incorrect

(?P<LE><=)|(?P<LT><)|(?P<EQ>=)


In [78]:
# 2.18.
PRINT = r'(P<PRINT>print)'
NAME = r'(P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
print('|'.join([PRINT, NAME]))
master_pat = re.compile('|'.join([PRINT, NAME]))
for tok in generate_tokens(master_pat, 'printer'):
    print(tok)

(P<PRINT>print)|(P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)


In [91]:
#2.19. Writing a Simple Recursive Descent Parser
import re
import collections
# Token specification
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
MINUS = r'(?P<MINUS>-)'
TIMES = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS = r'(?P<WS>\s+)'
master_pat = re.compile('|'.join([NUM, PLUS, MINUS, TIMES, DIVIDE, LPAREN, RPAREN, WS]))
# Tokenizer
Token = collections.namedtuple('Token', ['type','value'])
def generate_tokens(text):
    scanner = master_pat.scanner(text)
    for m in iter(scanner.match, None):
        tok = Token(m.lastgroup, m.group())
        if tok.type != 'WS':
            yield tok
# Parser
class ExpressionEvaluator:
    '''
    Implementation of a recursive descent parser. Each method
    implements a single grammar rule. Use the ._accept() method
    to test and accept the current lookahead token. Use the ._expect()
    method to exactly match and discard the next token on on the input
    (or raise a SyntaxError if it doesn't match).
    '''
    def parse(self,text):
        self.tokens = generate_tokens(text)
        self.tok = None # Last symbol consumed
        self.nexttok = None # Next symbol tokenized
        self._advance() # Load first lookahead token
        return self.expr()

    def _advance(self):
        'Advance one token ahead'
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)

    def _accept(self,toktype):
        'Test and consume the next token if it matches toktype'
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        else:
            return False

    def _expect(self,toktype):
        'Consume next token if it matches toktype or raise SyntaxError'
        if not self._accept(toktype):
            raise SyntaxError('Expected ' + toktype)

    # Grammar rules follow
    def expr(self):
        "expression ::= term { ('+'|'-') term }*"
        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.tok.type
            right = self.term()
            if op == 'PLUS':
                exprval += right
            elif op == 'MINUS':
                exprval -= right
        return exprval
    def term(self):
        "term ::= factor { ('*'|'/') factor }*"
        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval *= right
            elif op == 'DIVIDE':
                termval /= right
        return termval
    def factor(self):
        "factor ::= NUM | ( expr )"
        if self._accept('NUM'):
            return int(self.tok.value)
        elif self._accept('LPAREN'):
            exprval = self.expr()
            self._expect('RPAREN')
            return exprval
        else:
            raise SyntaxError('Expected NUMBER or LPAREN')

e = ExpressionEvaluator()
print(e.parse('2'))
print(e.parse('2 + 3'))

2
5


In [92]:
class ExpressionTreeBuilder(ExpressionEvaluator):
    def expr(self):
        "expression ::= term { ('+'|'-') term }"
        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.tok.type
            right = self.term()
            if op == 'PLUS':
                exprval = ('+', exprval, right)
            elif op == 'MINUS':
                exprval = ('-', exprval, right)
        return exprval
    def term(self):
        "term ::= factor { ('*'|'/') factor }"
        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval = ('*', termval, right)
            elif op == 'DIVIDE':
                termval = ('/', termval, right)
        return termval
    def factor(self):
        'factor ::= NUM | ( expr )'
        if self._accept('NUM'):
            return int(self.tok.value)
        elif self._accept('LPAREN'):
            exprval = self.expr()
            self._expect('RPAREN')
            return exprval
        else:
            raise SyntaxError('Expected NUMBER or LPAREN')
e = ExpressionTreeBuilder()
print(e.parse('2 + 3'))

('+', 2, 3)


In [93]:
# 2.19.
from ply.lex import lex
from ply.yacc import yacc

# Token list
tokens = [ 'NUM', 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'LPAREN', 'RPAREN' ]

# Ignored characters
t_ignore = ' \t\n'

# Token specifications (as regexs)
t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMES = r'\*'
t_DIVIDE = r'/'
t_LPAREN = r'\('
t_RPAREN = r'\)'

# Token processing functions
def t_NUM(t):
    r'\d+'
    t.value = int(t.value)
    return t

# Error handler
def t_error(t):
    print('Bad character: {!r}'.format(t.value[0]))
    t.skip(1)

# Build the lexer
lexer = lex()

# Grammar rules and handler functions
def p_expr(p):
    '''
    expr : expr PLUS term
    | expr MINUS term
    '''
    if p[2] == '+':
        p[0] = p[1] + p[3]
    elif p[2] == '-':
        p[0] = p[1] - p[3]

def p_expr_term(p):
    '''
    expr : term
    '''
    p[0] = p[1]
def p_term(p):
    '''
    term : term TIMES factor
    | term DIVIDE factor
    '''
    if p[2] == '*':
        p[0] = p[1] * p[3]
    elif p[2] == '/':
        p[0] = p[1] / p[3]
def p_term_factor(p):
    '''
    term : factor
    '''
    p[0] = p[1]
def p_factor(p):
    '''
    factor : NUM
    '''
    p[0] = p[1]
    
def p_factor_group(p):
    '''
    factor : LPAREN expr RPAREN
    '''
    p[0] = p[2]
def p_error(p):
    print('Syntax error')
    
parser = yacc()
print(parser.parse('2+3'))

ImportError: No module named 'ply'

In [96]:
# 2.20. Performing Text Operations on Byte Strings
data = b'Hello World'
print(data[0:5])
print(data.startswith(b'Hello'))
print(data.split())
print(data.replace(b'Hello', b'Hello Cruel'))
print('-'*20)

data = bytearray(b'Hello World')
print(data[0:5])
print(data.startswith(b'Hello'))
print(data.split())
print(data.replace(b'Hello', b'Hello Cruel'))
print('-'*20)

data = b'FOO:BAR,SPAM'
import re
print(re.split(b'[:,]',data)) # Notice: pattern as bytes

b'Hello'
True
[b'Hello', b'World']
b'Hello Cruel World'
--------------------
bytearray(b'Hello')
True
[bytearray(b'Hello'), bytearray(b'World')]
bytearray(b'Hello Cruel World')
--------------------
[b'FOO', b'BAR', b'SPAM']


In [97]:
# 2.20.
a = 'Hello World' # Text string
print(a[0])
print(a[1])
print('-'*20)

b = b'Hello World' # Byte string
print(b[0])
print(b[1])
print('-'*20)


H
e
--------------------
72
101
--------------------


In [101]:
# 2.20.
s = b'Hello World'
print(s)
print(s.decode('ascii'))
'{:10s} {:10d} {:10.2f}'.format('ACME', 100, 490.1).encode('ascii')

b'Hello World'
Hello World


b'ACME              100     490.10'

In [104]:
# 2.20.
# Get a directory listing
import os
print(os.listdir('.'))   # Text string (names are decoded)
print(os.listdir(b'.')) # Byte string (names left as bytes)

['src', 'PC-2 Strings and Text.ipynb', 'files', 'csv.DictReader.ipynb', 'PC-1 Data Structures and Algorithms.ipynb', '.ipynb_checkpoints']
[b'src', b'PC-2 Strings and Text.ipynb', b'files', b'csv.DictReader.ipynb', b'PC-1 Data Structures and Algorithms.ipynb', b'.ipynb_checkpoints']
