Initial commit.

jdunck · Nov 11, 2010 · fd22704 · fd22704
commit fd22704
Show file tree

Hide file tree

Showing 3 changed files with 142 additions and 0 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,25 @@
+Copyright 2010 Jeremy Dunck. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are
+permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice, this list of
+      conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice, this list
+      of conditions and the following disclaimer in the documentation and/or other materials
+      provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY JEREMY DUNCK ``AS IS'' AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JEREMY DUNCK OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation are those of the
+authors and should not be interpreted as representing official policies, either expressed
+or implied, of Jeremy Dunck.
diff --git a/README b/README
@@ -0,0 +1,18 @@
+TL;DR: unicodecsv is a drop-in replacement for Python 2's csv module that supports unicode strings without a hassle.
+
+More fully:
+
+Python 2's csv module doesn't easily deal with unicode strings, leading to the dreaded "'ascii' codec can't encode characters in position ..." exception.
+
+You can work around it by encoding everything just before calling write (or just after read), but why not add support to the serializer?
+
+>>> import unicodecsv
+>>> from cStringIO import StringIO
+>>> f = StringIO()
+>>> w = unicodecsv.writer(f, encoding='utf-8')
+>>> w.writerow((u'é', u'ñ'))
+>>> f.seek(0)
+>>> r = unicodecsv.reader(f, encoding='utf-8')
+>>> row = r.next()
+>>> print row[0], row[1]
+é ñ
diff --git a/unicodecsv.py b/unicodecsv.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+import csv
+from csv import *
+
+def _stringify(s, encoding):
+    if type(s)==unicode:
+        return s.encode(encoding)
+    elif type(s) != str:
+        s=str(s)
+    return s
+
+def _stringify_list(l, encoding):
+    return [_stringify(s, encoding) for s in l]
+
+class UnicodeWriter(object):
+    """
+    >>> import unicodecsv
+    >>> from cStringIO import StringIO
+    >>> f = StringIO()
+    >>> w = unicodecsv.writer(f, encoding='utf-8')
+    >>> w.writerow((u'é', u'ñ'))
+    >>> f.seek(0)
+    >>> r = unicodecsv.reader(f, encoding='utf-8')
+    >>> row = r.next()
+    >>> print row[0], row[1]
+    é ñ
+    """
+    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
+        self.writer = csv.writer(f)
+        self.dialect = dialect
+        self.encoding = encoding
+        self.writer = csv.writer(f, dialect=dialect, **kwds)
+
+    def writerow(self, row):
+        self.writer.writerow(_stringify_list(row, self.encoding))
+
+    def writerows(self, rows):
+        for row in rows:
+          self.writerow(row)
+writer = UnicodeWriter
+
+class UnicodeReader(object):
+    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
+        self.reader = csv.reader(f, dialect=dialect, **kwds)
+        self.encoding = encoding
+
+    def next(self):
+        row = self.reader.next()
+        return [unicode(s, self.encoding) for s in row]
+
+    def __iter__(self):
+        return self
+reader = UnicodeReader
+
+class DictWriter(csv.DictWriter):
+    """
+    >>> from cStringIO import StringIO
+    >>> f = StringIO()
+    >>> w = DictWriter(f, ['a', 'b'], restval=u'î')
+    >>> w.writerow({'a':'1'})
+    >>> w.writerow({'a':'1', 'b':u'ø'})
+    >>> w.writerow({'a':u'é'})
+    >>> f.seek(0)
+    >>> r = DictReader(f, fieldnames=['a'], restkey='r')
+    >>> r.next() == {'a':u'1', 'r':[u"î"]}
+    True
+    >>> r.next() == {'a':u'1', 'r':[u"ø"]}
+    True
+    >>> r.next() == {'a':u'é', 'r':[u"î"]}
+    """
+    def __init__(self, csvfile, fieldnames, restval='', extrasaction='raise', dialect='excel', encoding='utf-8', *args, **kwds):
+        self.fieldnames = fieldnames
+        self.encoding = encoding
+        self.restval = restval
+        self.writer = csv.DictWriter(csvfile, fieldnames, restval, extrasaction, dialect, *args, **kwds)
+    def writerow(self, d):
+        for fieldname in self.fieldnames:
+            if fieldname in d:
+                d[fieldname] = _stringify(d[fieldname], self.encoding)
+            else:
+                d[fieldname] = _stringify(self.restval, self.encoding)
+        self.writer.writerow(d)
+
+class DictReader(csv.DictReader):
+    def __init__(self, csvfile, fieldnames=None, restkey=None, restval=None, dialect='excel', encoding='utf-8', *args, **kwds):
+        self.restkey = restkey
+        self.encoding = encoding
+        self.reader = csv.DictReader(csvfile, fieldnames, restkey, restval, dialect, *args, **kwds)
+    def next(self):
+        d = self.reader.next()
+        for k, v in d.items():
+            if k == self.restkey:
+                rest = v
+                if rest:
+                    d[self.restkey] = [unicode(v, self.encoding) for v in rest]
+            else:
+                if v is not None:
+                    d[k] = unicode(v, self.encoding)
+        return d