Find encoding for Python files #1526

Merged (8 commits, Apr 14, 2012)
29 changes: 7 additions & 22 deletions IPython/core/magic.py
@@ -55,6 +55,7 @@
from IPython.core.pylabtools import mpl_runner
from IPython.testing.skipdoctest import skip_doctest
from IPython.utils import py3compat
from IPython.utils import openpy
from IPython.utils.io import file_read, nlprint
from IPython.utils.module_paths import find_mod
from IPython.utils.path import get_py_filename, unquote_filename
@@ -98,9 +99,6 @@ def needs_local_scope(func):
# Used for exception handling in magic_edit
class MacroToEdit(ValueError): pass

# Taken from PEP 263, this is the official encoding regexp.
_encoding_declaration_re = re.compile(r"^#.*coding[:=]\s*([-\w.]+)")

#***************************************************************************
# Main class implementing Magic functionality

@@ -2261,28 +2259,15 @@ def magic_loadpy(self, arg_s):
# Local files must be .py; for remote URLs it's possible that the
# fetch URL doesn't have a .py in it (many servers have an opaque
# URL, such as scipy-central.org).
raise ValueError('%%load only works with .py files: %s' % arg_s)
raise ValueError('%%loadpy only works with .py files: %s' % arg_s)

# openpy takes care of finding the source encoding (per PEP 263)
if remote_url:
import urllib2
fileobj = urllib2.urlopen(arg_s)
# While responses have a .info().getencoding() way of asking for
# their encoding, in *many* cases the return value is bogus. In
# the wild, servers serving utf-8 but declaring latin-1 are
# extremely common, as the old HTTP standards specify latin-1 as
# the default but many modern filesystems use utf-8. So we can NOT
# rely on the headers. Short of building complex encoding-guessing
# logic, going with utf-8 is a simple solution likely to be right
# in most real-world cases.
linesource = fileobj.read().decode('utf-8', 'replace').splitlines()
fileobj.close()
contents = openpy.read_py_url(arg_s, skip_encoding_cookie=True)
else:
with open(arg_s) as fileobj:
linesource = fileobj.read().splitlines()

# Strip out encoding declarations
lines = [l for l in linesource if not _encoding_declaration_re.match(l)]
contents = openpy.read_py_file(arg_s, skip_encoding_cookie=True)

self.set_next_input(os.linesep.join(lines))
self.set_next_input(contents)

def _find_edit_target(self, args, opts, last_call):
"""Utility method used by magic_edit to find what to edit."""
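For context: the skip_encoding_cookie=True arguments in the hunk above are not cosmetic. On Python 2, compiling already-decoded (unicode) source that still contains a PEP 263 declaration raises a SyntaxError, so the cookie has to be stripped before the text is handed to set_next_input. A minimal standalone illustration of that failure mode (Python 2 semantics; not part of the diff):

# A coding cookie inside a unicode string is rejected by Python 2's compile()
src = u"# -*- coding: utf-8 -*-\nx = 1\n"
try:
    compile(src, '<cell>', 'exec')
except SyntaxError as e:
    print(e)  # "encoding declaration in Unicode string"
# With the cookie stripped, the same source compiles fine
compile(u"x = 1\n", '<cell>', 'exec')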
5 changes: 5 additions & 0 deletions IPython/core/tests/nonascii.py
@@ -0,0 +1,5 @@
# encoding: iso-8859-5
# (Unlikely to be the default encoding for most testers.)
# ������������������� <- Cyrillic characters
from __future__ import unicode_literals
u = 'Ўт№Ф'
Member Author comment:
Note that GitHub displays this file using a default encoding (probably latin-1 or cp1252), so these characters don't look like Cyrillic characters. They are compared against a literal in the UTF-8-encoded test file.
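To see the effect described above, the same bytes decode to different text under different codecs. A standalone sketch, reusing the u'Ўт№Ф' literal from the test below (run in an interactive session):

# The test file's bytes, viewed through two codecs
data = u'Ўт№Ф'.encode('iso-8859-5')  # the bytes as stored on disk
print(data.decode('iso-8859-5'))      # the intended Cyrillic characters
print(data.decode('latin-1'))         # mojibake, as in GitHub's rendering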

8 changes: 8 additions & 0 deletions IPython/core/tests/test_run.py
@@ -1,3 +1,4 @@
# encoding: utf-8
"""Tests for code execution (%run and related), which is particularly tricky.

Because of how %run manages namespaces, and the fact that we are trying here to
@@ -240,3 +241,10 @@ def test_run_i_after_reset(self):
_ip.run_cell("zz = 23")
_ip.magic('run -i %s' % self.fname)
tt.assert_equals(_ip.user_ns['yy'], 23)

def test_unicode(self):
"""Check that files in odd encodings are accepted."""
mydir = os.path.dirname(__file__)
na = os.path.join(mydir, 'nonascii.py')
_ip.magic('run %s' % na)
tt.assert_equals(_ip.user_ns['u'], u'Ўт№Ф')
192 changes: 192 additions & 0 deletions IPython/utils/openpy.py
@@ -0,0 +1,192 @@
"""
Tools to open .py files as Unicode, using the encoding specified within the file,
as per PEP 263.

Much of the code is taken from the tokenize module in Python 3.2.
"""
from __future__ import absolute_import

import __builtin__
import io
from io import TextIOWrapper
import re
import urllib

cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)
cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)

try:
# Available in Python 3
from tokenize import detect_encoding
except ImportError:
from codecs import lookup, BOM_UTF8

# Copied from Python 3.2 tokenize
def _get_normal_name(orig_enc):
"""Imitates get_normal_name in tokenizer.c."""
# Only care about the first 12 characters.
enc = orig_enc[:12].lower().replace("_", "-")
if enc == "utf-8" or enc.startswith("utf-8-"):
return "utf-8"
if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
return "iso-8859-1"
return orig_enc

# Copied from Python 3.2 tokenize
def detect_encoding(readline):
"""
The detect_encoding() function is used to detect the encoding that should
be used to decode a Python source file. It requires one argument, readline,
in the same way as the tokenize() generator.

It will call readline a maximum of twice, and return the encoding used
(as a string) and a list of any lines (left as bytes) it has read in.

It detects the encoding from the presence of a utf-8 bom or an encoding
cookie as specified in pep-0263. If both a bom and a cookie are present,
but disagree, a SyntaxError will be raised. If the encoding cookie is an
invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
'utf-8-sig' is returned.

If no encoding is specified, then the default of 'utf-8' will be returned.
"""
bom_found = False
encoding = None
default = 'utf-8'
def read_or_stop():
try:
return readline()
except StopIteration:
return b''

def find_cookie(line):
try:
line_string = line.decode('ascii')
except UnicodeDecodeError:
return None

matches = cookie_re.findall(line_string)
if not matches:
return None
encoding = _get_normal_name(matches[0])
try:
codec = lookup(encoding)
except LookupError:
# This behaviour mimics the Python interpreter
raise SyntaxError("unknown encoding: " + encoding)

if bom_found:
if codec.name != 'utf-8':
# This behaviour mimics the Python interpreter
raise SyntaxError('encoding problem: utf-8')
encoding += '-sig'
return encoding

first = read_or_stop()
if first.startswith(BOM_UTF8):
bom_found = True
first = first[3:]
default = 'utf-8-sig'
if not first:
return default, []

encoding = find_cookie(first)
if encoding:
return encoding, [first]

second = read_or_stop()
if not second:
return default, [first]

encoding = find_cookie(second)
if encoding:
return encoding, [first, second]

return default, [first, second]

try:
# Available in Python 3.2 and above.
from tokenize import open
except ImportError:
# Copied from Python 3.2 tokenize
def open(filename):
"""Open a file in read only mode using the encoding detected by
detect_encoding().
"""
buffer = io.open(filename, 'rb') # Tweaked to use io.open for Python 2
encoding, lines = detect_encoding(buffer.readline)
buffer.seek(0)
text = TextIOWrapper(buffer, encoding, line_buffering=True)
text.mode = 'r'
return text

def strip_encoding_cookie(filelike):
"""Generator to pull lines from a text-mode file, skipping the encoding
cookie if it is found in the first two lines.
"""
it = iter(filelike)
try:
first = next(it)
if not cookie_comment_re.match(first):
yield first
second = next(it)
if not cookie_comment_re.match(second):
yield second
except StopIteration:
return

for line in it:
yield line

def read_py_file(filename, skip_encoding_cookie=True):
"""Read a Python file, using the encoding declared inside the file.

Parameters
----------
filename : str
The path to the file to read.
skip_encoding_cookie : bool
If True (the default) and an encoding declaration is found in the first
two lines, that line is excluded from the output; compiling a unicode
string with an encoding declaration is a SyntaxError in Python 2.

Returns
-------
A unicode string containing the contents of the file.
"""
with open(filename) as f: # the open function defined in this module.
Member comment:
For this function and the next, let's add at least proper docstrings (i.e. with full Parameters and Returns descriptions), as they are likely to be useful for others in general.

if skip_encoding_cookie:
return "".join(strip_encoding_cookie(f))
else:
return f.read()

def read_py_url(url, errors='replace', skip_encoding_cookie=True):
"""Read a Python file from a URL, using the encoding declared inside the file.

Parameters
----------
url : str
The URL from which to fetch the file.
errors : str
How to handle decoding errors in the file. Options are the same as for
bytes.decode(), but here 'replace' is the default.
skip_encoding_cookie : bool
If True (the default) and an encoding declaration is found in the first
two lines, that line is excluded from the output; compiling a unicode
string with an encoding declaration is a SyntaxError in Python 2.

Returns
-------
A unicode string containing the contents of the file.
"""
response = urllib.urlopen(url)
buffer = io.BytesIO(response.read())
encoding, lines = detect_encoding(buffer.readline)
buffer.seek(0)
text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
text.mode = 'r'
if skip_encoding_cookie:
return "".join(strip_encoding_cookie(text))
else:
return text.read()
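Taken together, the new module can be exercised like this (a sketch mirroring the tests below; the path assumes a source checkout):

import io
from IPython.utils import openpy

path = 'IPython/core/tests/nonascii.py'

# Ask only for the declared encoding; detect_encoding reads at most two lines
with io.open(path, 'rb') as f:
    enc, lines = openpy.detect_encoding(f.readline)
print(enc)  # 'iso-8859-5'

# Read the whole file decoded per its cookie, dropping the cookie line
# so the result can be compiled on Python 2
text = openpy.read_py_file(path, skip_encoding_cookie=True)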
2 changes: 1 addition & 1 deletion IPython/utils/py3compat.py
@@ -70,7 +70,7 @@ def isidentifier(s, dotted=False):

def execfile(fname, glob, loc=None):
loc = loc if (loc is not None) else glob
exec compile(open(fname).read(), fname, 'exec') in glob, loc
exec compile(open(fname, 'rb').read(), fname, 'exec') in glob, loc

# Refactor print statements in doctests.
_print_statement_re = re.compile(r"\bprint (?P<expr>.*)$", re.MULTILINE)
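The switch to 'rb' is load-bearing: given bytes, compile() performs its own PEP 263 detection, whereas text opened without an explicit encoding would be decoded with the locale default before compile() ever saw it. A sketch of the pattern (fname is a placeholder):

# Feed compile() raw bytes so the interpreter honours any coding cookie itself
source = open(fname, 'rb').read()
code = compile(source, fname, 'exec')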
23 changes: 23 additions & 0 deletions IPython/utils/tests/test_openpy.py
@@ -0,0 +1,23 @@
import io
import os.path
import nose.tools as nt

from IPython.utils import openpy

mydir = os.path.dirname(__file__)
nonascii_path = os.path.join(mydir, '../../core/tests/nonascii.py')

def test_detect_encoding():
f = open(nonascii_path, 'rb')
enc, lines = openpy.detect_encoding(f.readline)
nt.assert_equal(enc, 'iso-8859-5')

def test_read_file():
read_specified_enc = io.open(nonascii_path, encoding='iso-8859-5').read()
read_detected_enc = openpy.read_py_file(nonascii_path, skip_encoding_cookie=False)
nt.assert_equal(read_detected_enc, read_specified_enc)
assert u'encoding: iso-8859-5' in read_detected_enc

read_strip_enc_cookie = openpy.read_py_file(nonascii_path, skip_encoding_cookie=True)
assert u'encoding: iso-8859-5' not in read_strip_enc_cookie
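Since the test module uses nose.tools, it can presumably be run directly with nose, e.g. nosetests IPython/utils/tests/test_openpy.py from a source checkout.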