In [1]:
import string
from functools import partial

In [2]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [3]:
s = ''.join(chr(c) for c in range(0, 1<<7) if chr(c).isprintable())
len(s), s

(95,
 ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~')

In [4]:
s = ''.join(c for c in string.printable if ord(' ') <= ord(c) <= ord('~'))
len(s), s

(95,
 '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ ')

string.printable has non-printable characters that I don't want,
such as '\x0b', so I make my own set of good characters.

In [5]:
good_characters = {chr(c) for c in range(ord(' '), ord('~')+1)} | {'\t'}

In [6]:
len(good_characters), good_characters

(96,
 {'\t',
  ' ',
  '!',
  '"',
  '#',
  '$',
  '%',
  '&',
  "'",
  '(',
  ')',
  '*',
  '+',
  ',',
  '-',
  '.',
  '/',
  '0',
  '1',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  ':',
  ';',
  '<',
  '=',
  '>',
  '?',
  '@',
  'A',
  'B',
  'C',
  'D',
  'E',
  'F',
  'G',
  'H',
  'I',
  'J',
  'K',
  'L',
  'M',
  'N',
  'O',
  'P',
  'Q',
  'R',
  'S',
  'T',
  'U',
  'V',
  'W',
  'X',
  'Y',
  'Z',
  '[',
  '\\',
  ']',
  '^',
  '_',
  '`',
  'a',
  'b',
  'c',
  'd',
  'e',
  'f',
  'g',
  'h',
  'i',
  'j',
  'k',
  'l',
  'm',
  'n',
  'o',
  'p',
  'q',
  'r',
  's',
  't',
  'u',
  'v',
  'w',
  'x',
  'y',
  'z',
  '{',
  '|',
  '}',
  '~'})

In [7]:
filename = '20150223-cohpy-memoization.ipynb'

In [8]:
def pass_good_characters(lines):
    for line in lines:
        yield ''.join(
            c for c in line
            if 31 < ord(c) < 127 or c == '\t')

In [9]:
%timeit list(pass_good_characters(open(filename)))

100 loops, best of 3: 8.84 ms per loop


In [10]:
def pass_good_characters(lines):
    for line in lines:
        yield ''.join(
            c for c in line
            if ord(' ')-1 < ord(c) < ord('~')+1 or c == '\t')

In [11]:
%timeit list(pass_good_characters(open(filename)))

100 loops, best of 3: 14 ms per loop


In [12]:
def pass_good_characters(lines):
    for line in lines:
        yield ''.join(
            c for c in line
            if ord(' ') <= ord(c) <= ord('~') or c == '\t')

In [13]:
%timeit list(pass_good_characters(open(filename)))

100 loops, best of 3: 13.2 ms per loop


In [14]:
def pass_good_characters(lines):
    for line in lines:
        yield ''.join(
            c for c in line
            if ' ' <= c <= '~' or c == '\t')

In [15]:
%timeit list(pass_good_characters(open(filename)))

100 loops, best of 3: 7.52 ms per loop


In [16]:
def pass_good_characters(lines):
    for line in lines:
        yield ''.join(
            c for c in line
            if c.isprintable() or c == '\t')

In [17]:
%timeit list(pass_good_characters(open(filename)))

100 loops, best of 3: 7.47 ms per loop


In [18]:
def pass_good_characters(lines):
    for line in lines:
        yield ''.join(filter(lambda c: c.isprintable() or c == '\t', line))

In [19]:
%timeit list(pass_good_characters(open(filename)))

100 loops, best of 3: 8.7 ms per loop


In [20]:
def pass_good_characters(lines):
    good_characters = [chr(c) for c in range(ord(' '), ord('~')+1)] + ['\t']
    for line in lines:
        yield ''.join(c for c in line if c in good_characters)

In [21]:
%timeit list(pass_good_characters(open(filename)))

10 loops, best of 3: 37.1 ms per loop


In [22]:
def pass_good_characters(lines):
    good_characters = ''.join([chr(c) for c in range(ord(' '), ord('~')+1)] + ['\t'])
    for line in lines:
        yield ''.join(c for c in line if c in good_characters)

In [23]:
%timeit list(pass_good_characters(open(filename)))

100 loops, best of 3: 7.01 ms per loop


In [24]:
def pass_good_characters(lines):
    good_characters = {chr(c) for c in range(ord(' '), ord('~')+1)} | {'\t'}
    for line in lines:
        yield ''.join(c for c in line if c in good_characters)

In [25]:
%timeit list(pass_good_characters(open(filename)))

100 loops, best of 3: 6.15 ms per loop


In [26]:
def pass_good_characters(lines):
    good_characters = {chr(c) for c in range(ord(' '), ord('~')+1)} | {'\t'}
    yield from (
        ''.join(c for c in line if c in good_characters)
        for line in lines)

In [27]:
%timeit list(pass_good_characters(open(filename)))

100 loops, best of 3: 6.31 ms per loop


In [28]:
def pass_good_characters(lines):
    good_characters = {chr(c) for c in range(ord(' '), ord('~')+1)} | {'\t'}
    for line in lines:
        yield ''.join(filter(lambda c: c in good_characters, line))

In [29]:
%timeit list(pass_good_characters(open(filename)))

100 loops, best of 3: 7.6 ms per loop


In [30]:
class MyStringIO():
    def __init__(self, s=''):
        self.s = s
        self.i = 0
        
    def __iter__(self):
        return self
    
    def __next__(self):
        s = []
        for c in iter(partial(self.read, 1), ''):
            s.append(c)
            if c == '\n':
                break
        if not s:
            raise StopIteration
        return ''.join(s)

    def read(self, n):
        s = self.s[self.i:self.i+n]
        self.i += n
        self.i = min(self.i, len(self.s))
        return s

    def write(self, s):
        self.s += s

In [31]:
s = 'hello\nwo\1\200\trld\n'

In [32]:
f = MyStringIO(s)
f.write('peas\n')
f

<__main__.MyStringIO at 0xb28f570c>

In [33]:
for i, line in enumerate(f):
    print(i, repr(line))

0 'hello\n'
1 'wo\x01\x80\trld\n'
2 'peas\n'


In [34]:
f = MyStringIO(s)
f.write('peas\n')
f

<__main__.MyStringIO at 0xb208d0cc>

In [35]:
for i, line in enumerate(pass_good_characters(f)):
    print(i, repr(line))

0 'hello'
1 'wo\trld'
2 'peas'
