Find encoding for Python files #1526

Merged (8 commits, Apr 14, 2012)
29 changes: 7 additions & 22 deletions IPython/core/magic.py
@@ -55,6 +55,7 @@
from IPython.core.pylabtools import mpl_runner
from IPython.testing.skipdoctest import skip_doctest
from IPython.utils import py3compat
from IPython.utils import openpy
from IPython.utils.io import file_read, nlprint
from IPython.utils.module_paths import find_mod
from IPython.utils.path import get_py_filename, unquote_filename
@@ -98,9 +99,6 @@ def needs_local_scope(func):
# Used for exception handling in magic_edit
class MacroToEdit(ValueError): pass

# Taken from PEP 263, this is the official encoding regexp.
_encoding_declaration_re = re.compile(r"^#.*coding[:=]\s*([-\w.]+)")

#***************************************************************************
# Main class implementing Magic functionality

@@ -2261,28 +2259,15 @@ def magic_loadpy(self, arg_s):
# Local files must be .py; for remote URLs it's possible that the
# fetch URL doesn't have a .py in it (many servers have an opaque
# URL, such as scipy-central.org).
raise ValueError('%%load only works with .py files: %s' % arg_s)
raise ValueError('%%loadpy only works with .py files: %s' % arg_s)

# openpy takes care of finding the source encoding (per PEP 263)
if remote_url:
import urllib2
fileobj = urllib2.urlopen(arg_s)
# While responses have a .info().getencoding() way of asking for
# their encoding, in *many* cases the return value is bogus. In
# the wild, servers serving utf-8 but declaring latin-1 are
# extremely common, as the old HTTP standards specify latin-1 as
# the default but many modern filesystems use utf-8. So we can NOT
# rely on the headers. Short of building complex encoding-guessing
# logic, going with utf-8 is a simple solution likely to be right
# in most real-world cases.
linesource = fileobj.read().decode('utf-8', 'replace').splitlines()
fileobj.close()
contents = openpy.read_py_url(arg_s, skip_encoding_cookie=True)
else:
with open(arg_s) as fileobj:
linesource = fileobj.read().splitlines()

# Strip out encoding declarations
lines = [l for l in linesource if not _encoding_declaration_re.match(l)]
contents = openpy.read_py_file(arg_s, skip_encoding_cookie=True)

self.set_next_input(os.linesep.join(lines))
self.set_next_input(contents)

def _find_edit_target(self, args, opts, last_call):
"""Utility method used by magic_edit to find what to edit."""
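For context: the skip_encoding_cookie=True arguments in the hunk above are not cosmetic. On Python 2, compiling already-decoded (unicode) source that still contains a PEP 263 declaration raises a SyntaxError, so the cookie has to be stripped before the text is handed to set_next_input. A minimal standalone illustration of that failure mode (Python 2 semantics; not part of the diff):

# A coding cookie inside a unicode string is rejected by Python 2's compile()
src = u"# -*- coding: utf-8 -*-\nx = 1\n"
try:
    compile(src, '<cell>', 'exec')
except SyntaxError as e:
    print(e)  # "encoding declaration in Unicode string"
# With the cookie stripped, the same source compiles fine
compile(u"x = 1\n", '<cell>', 'exec')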
5 changes: 5 additions & 0 deletions IPython/core/tests/nonascii.py
@@ -0,0 +1,5 @@
# encoding: iso-8859-5
# (Unlikely to be the default encoding for most testers.)
# ������������������� <- Cyrillic characters
from __future__ import unicode_literals
u = 'Ўт№Ф'
Member Author comment:
Note that GitHub displays this file using a default encoding (probably latin-1 or cp1252), so these characters don't look like Cyrillic characters. They are compared against a literal in the UTF-8-encoded test file.
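To see the effect described above, the same bytes decode to different text under different codecs. A standalone sketch, reusing the u'Ўт№Ф' literal from the test below (run in an interactive session):

# The test file's bytes, viewed through two codecs
data = u'Ўт№Ф'.encode('iso-8859-5')  # the bytes as stored on disk
print(data.decode('iso-8859-5'))      # the intended Cyrillic characters
print(data.decode('latin-1'))         # mojibake, as in GitHub's rendering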

8 changes: 8 additions & 0 deletions IPython/core/tests/test_run.py
@@ -1,3 +1,4 @@
# encoding: utf-8
"""Tests for code execution (%run and related), which is particularly tricky.

Because of how %run manages namespaces, and the fact that we are trying here to
@@ -240,3 +241,10 @@ def test_run_i_after_reset(self):
_ip.run_cell("zz = 23")
_ip.magic('run -i %s' % self.fname)
tt.assert_equals(_ip.user_ns['yy'], 23)

def test_unicode(self):
"""Check that files in odd encodings are accepted."""
mydir = os.path.dirname(__file__)
na = os.path.join(mydir, 'nonascii.py')
_ip.magic('run %s' % na)
tt.assert_equals(_ip.user_ns['u'], u'Ўт№Ф')
192 changes: 192 additions & 0 deletions IPython/utils/openpy.py
@@ -0,0 +1,192 @@
"""
Tools to open .py files as Unicode, using the encoding specified within the file,
as per PEP 263.

Much of the code is taken from the tokenize module in Python 3.2.
"""
from __future__ import absolute_import

import __builtin__
import io
from io import TextIOWrapper
import re
import urllib

cookie_re = re.compile(ur"coding[:=]\s*([-\w.]+)", re.UNICODE)
cookie_comment_re = re.compile(ur"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE)

try:
# Available in Python 3
from tokenize import detect_encoding
except ImportError:
from codecs import lookup, BOM_UTF8

# Copied from Python 3.2 tokenize
def _get_normal_name(orig_enc):
"""Imitates get_normal_name in tokenizer.c."""
# Only care about the first 12 characters.
enc = orig_enc[:12].lower().replace("_", "-")
if enc == "utf-8" or enc.startswith("utf-8-"):
return "utf-8"
if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
return "iso-8859-1"
return orig_enc

# Copied from Python 3.2 tokenize
def detect_encoding(readline):
"""
The detect_encoding() function is used to detect the encoding that should
be used to decode a Python source file. It requires one argument, readline,
in the same way as the tokenize() generator.

It will call readline a maximum of twice, and return the encoding used
(as a string) and a list of any lines (left as bytes) it has read in.

It detects the encoding from the presence of a utf-8 bom or an encoding
cookie as specified in pep-0263. If both a bom and a cookie are present,
but disagree, a SyntaxError will be raised. If the encoding cookie is an
invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
'utf-8-sig' is returned.

If no encoding is specified, then the default of 'utf-8' will be returned.
"""
bom_found = False
encoding = None
default = 'utf-8'
def read_or_stop():
try:
return readline()
except StopIteration:
return b''

def find_cookie(line):
try:
line_string = line.decode('ascii')
except UnicodeDecodeError:
return None

matches = cookie_re.findall(line_string)
if not matches:
return None
encoding = _get_normal_name(matches[0])
try:
codec = lookup(encoding)
except LookupError:
# This behaviour mimics the Python interpreter
raise SyntaxError("unknown encoding: " + encoding)

if bom_found:
if codec.name != 'utf-8':
# This behaviour mimics the Python interpreter
raise SyntaxError('encoding problem: utf-8')
encoding += '-sig'
return encoding

first = read_or_stop()
if first.startswith(BOM_UTF8):
bom_found = True
first = first[3:]
default = 'utf-8-sig'
if not first:
return default, []

encoding = find_cookie(first)
if encoding:
return encoding, [first]

second = read_or_stop()
if not second:
return default, [first]

encoding = find_cookie(second)
if encoding:
return encoding, [first, second]

return default, [first, second]

try:
# Available in Python 3.2 and above.
from tokenize import open
except ImportError:
# Copied from Python 3.2 tokenize
def open(filename):
"""Open a file in read only mode using the encoding detected by
detect_encoding().
"""
buffer = io.open(filename, 'rb') # Tweaked to use io.open for Python 2
encoding, lines = detect_encoding(buffer.readline)
buffer.seek(0)
text = TextIOWrapper(buffer, encoding, line_buffering=True)
text.mode = 'r'
return text

def strip_encoding_cookie(filelike):
"""Generator to pull lines from a text-mode file, skipping the encoding
cookie if it is found in the first two lines.
"""
it = iter(filelike)
try:
first = next(it)
if not cookie_comment_re.match(first):
yield first
second = next(it)
if not cookie_comment_re.match(second):
yield second
except StopIteration:
return

for line in it:
yield line

def read_py_file(filename, skip_encoding_cookie=True):
"""Read a Python file, using the encoding declared inside the file.

Parameters
----------
filename : str
The path to the file to read.
skip_encoding_cookie : bool
If True (the default) and an encoding declaration is found in the first
two lines, that line is excluded from the output; compiling a unicode
string with an encoding declaration is a SyntaxError in Python 2.

Returns
-------
A unicode string containing the contents of the file.
"""
with open(filename) as f: # the open function defined in this module.
Member comment:
For this function and the next, let's add at least proper docstrings (i.e. with full Parameters and Returns descriptions), as they are likely to be useful for others in general.

if skip_encoding_cookie:
return "".join(strip_encoding_cookie(f))
else:
return f.read()

def read_py_url(url, errors='replace', skip_encoding_cookie=True):
"""Read a Python file from a URL, using the encoding declared inside the file.

Parameters
----------
url : str
The URL from which to fetch the file.
errors : str
How to handle decoding errors in the file. Options are the same as for
bytes.decode(), but here 'replace' is the default.
skip_encoding_cookie : bool
If True (the default) and an encoding declaration is found in the first
two lines, that line is excluded from the output; compiling a unicode
string with an encoding declaration is a SyntaxError in Python 2.

Returns
-------
A unicode string containing the contents of the file.
"""
response = urllib.urlopen(url)
buffer = io.BytesIO(response.read())
encoding, lines = detect_encoding(buffer.readline)
buffer.seek(0)
text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
text.mode = 'r'
if skip_encoding_cookie:
return "".join(strip_encoding_cookie(text))
else:
return text.read()
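Taken together, the new module can be exercised like this (a sketch mirroring the tests below; the path assumes a source checkout):

import io
from IPython.utils import openpy

path = 'IPython/core/tests/nonascii.py'

# Ask only for the declared encoding; detect_encoding reads at most two lines
with io.open(path, 'rb') as f:
    enc, lines = openpy.detect_encoding(f.readline)
print(enc)  # 'iso-8859-5'

# Read the whole file decoded per its cookie, dropping the cookie line
# so the result can be compiled on Python 2
text = openpy.read_py_file(path, skip_encoding_cookie=True)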
2 changes: 1 addition & 1 deletion IPython/utils/py3compat.py
@@ -70,7 +70,7 @@ def isidentifier(s, dotted=False):

def execfile(fname, glob, loc=None):
loc = loc if (loc is not None) else glob
exec compile(open(fname).read(), fname, 'exec') in glob, loc
exec compile(open(fname, 'rb').read(), fname, 'exec') in glob, loc

# Refactor print statements in doctests.
_print_statement_re = re.compile(r"\bprint (?P<expr>.*)$", re.MULTILINE)
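The switch to 'rb' is load-bearing: given bytes, compile() performs its own PEP 263 detection, whereas text opened without an explicit encoding would be decoded with the locale default before compile() ever saw it. A sketch of the pattern (fname is a placeholder):

# Feed compile() raw bytes so the interpreter honours any coding cookie itself
source = open(fname, 'rb').read()
code = compile(source, fname, 'exec')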
23 changes: 23 additions & 0 deletions IPython/utils/tests/test_openpy.py
@@ -0,0 +1,23 @@
import io
import os.path
import nose.tools as nt

from IPython.utils import openpy

mydir = os.path.dirname(__file__)
nonascii_path = os.path.join(mydir, '../../core/tests/nonascii.py')

def test_detect_encoding():
f = open(nonascii_path, 'rb')
enc, lines = openpy.detect_encoding(f.readline)
nt.assert_equal(enc, 'iso-8859-5')

def test_read_file():
read_specified_enc = io.open(nonascii_path, encoding='iso-8859-5').read()
read_detected_enc = openpy.read_py_file(nonascii_path, skip_encoding_cookie=False)
nt.assert_equal(read_detected_enc, read_specified_enc)
assert u'encoding: iso-8859-5' in read_detected_enc

read_strip_enc_cookie = openpy.read_py_file(nonascii_path, skip_encoding_cookie=True)
assert u'encoding: iso-8859-5' not in read_strip_enc_cookie
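Since the test module uses nose.tools, it can presumably be run directly with nose, e.g. nosetests IPython/utils/tests/test_openpy.py from a source checkout.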