Permalink
Browse files

Initial commit.

  • Loading branch information...
0 parents commit fd227046740f85cfdc9a332fb96ffc23f0108faa @jdunck committed Nov 11, 2010
Showing with 142 additions and 0 deletions.
  1. +25 −0 LICENSE
  2. +18 −0 README
  3. +99 −0 unicodecsv.py
25 LICENSE
@@ -0,0 +1,25 @@
+Copyright 2010 Jeremy Dunck. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are
+permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this list of
+ conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list
+ of conditions and the following disclaimer in the documentation and/or other materials
+ provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY JEREMY DUNCK ``AS IS'' AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JEREMY DUNCK OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation are those of the
+authors and should not be interpreted as representing official policies, either expressed
+or implied, of Jeremy Dunck.
18 README
@@ -0,0 +1,18 @@
+TL;DR: unicodecsv is a drop-in replacement for Python 2's csv module that supports unicode strings without hassle.
+
+More fully:
+
+Python 2's csv module doesn't easily deal with unicode strings, leading to the dreaded "'ascii' codec can't encode characters in position ..." exception.
+
+You can work around it by encoding everything just before calling write (or just after read), but why not add support to the serializer?
+
+>>> import unicodecsv
+>>> from cStringIO import StringIO
+>>> f = StringIO()
+>>> w = unicodecsv.writer(f, encoding='utf-8')
+>>> w.writerow((u'é', u'ñ'))
+>>> f.seek(0)
+>>> r = unicodecsv.reader(f, encoding='utf-8')
+>>> row = r.next()
+>>> print row[0], row[1]
+é ñ
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+import csv
+from csv import *
+
def _stringify(s, encoding):
    """
    Return ``s`` in a form the Python 2 ``csv`` writer can emit safely.

    - ``None`` becomes the empty string, which is how the standard ``csv``
      writer itself renders ``None``.
    - ``unicode`` objects, including instances of subclasses, are encoded
      with ``encoding`` and returned as a byte string.
    - ``str`` objects are returned unchanged.
    - Anything else is converted with ``str()``.
    """
    if s is None:
        return ''
    # isinstance() instead of an exact type comparison: a unicode subclass
    # must be encoded here, otherwise it would fall through to str(), which
    # uses the default codec and fails on non-ASCII text.
    if isinstance(s, unicode):
        return s.encode(encoding)
    if not isinstance(s, str):
        s = str(s)
    return s
+
def _stringify_list(l, encoding):
    """
    Run every item of ``l`` through ``_stringify`` with ``encoding`` and
    return the results, in order, as a new list.
    """
    converted = []
    for item in l:
        converted.append(_stringify(item, encoding))
    return converted
+
class UnicodeWriter(object):
    """
    CSV writer that accepts unicode values.

    Wraps ``csv.writer``: each row is passed through ``_stringify_list``
    with the configured encoding before it reaches the wrapped writer.

    Constructor arguments:

    f -- object the records are written to; handed to ``csv.writer``.
    dialect -- csv dialect, forwarded to ``csv.writer``.
    encoding -- codec used to encode unicode values (default "utf-8").
    kwds -- extra formatting parameters, forwarded to ``csv.writer``.

    >>> import unicodecsv
    >>> from cStringIO import StringIO
    >>> f = StringIO()
    >>> w = unicodecsv.writer(f, encoding='utf-8')
    >>> w.writerow((u'é', u'ñ'))
    >>> f.seek(0)
    >>> r = unicodecsv.reader(f, encoding='utf-8')
    >>> row = r.next()
    >>> print row[0], row[1]
    é ñ
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        self.dialect = dialect
        self.encoding = encoding
        # The wrapped writer is built exactly once, with the requested
        # dialect and formatting parameters.
        self.writer = csv.writer(f, dialect=dialect, **kwds)

    def writerow(self, row):
        """Encode the values of ``row`` and write them as one record."""
        self.writer.writerow(_stringify_list(row, self.encoding))

    def writerows(self, rows):
        """Write every row of ``rows``, in order, through ``writerow``."""
        for row in rows:
            self.writerow(row)


writer = UnicodeWriter
+
class UnicodeReader(object):
    """
    CSV reader that returns the fields of each row as unicode strings.

    Wraps ``csv.reader``: byte-string fields are decoded with the
    configured encoding; any other value is returned untouched.

    Constructor arguments:

    f -- object the records are read from; handed to ``csv.reader``.
    dialect -- csv dialect, forwarded to ``csv.reader``.
    encoding -- codec used to decode each field (default "utf-8").
    kwds -- extra formatting parameters, forwarded to ``csv.reader``.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        self.reader = csv.reader(f, dialect=dialect, **kwds)
        self.encoding = encoding

    def next(self):
        """
        Return the next row as a list with byte-string fields decoded.

        StopIteration from the wrapped reader propagates unchanged.
        """
        row = next(self.reader)
        return [self._decode(field) for field in row]

    def _decode(self, field):
        """Decode ``field`` if it is a byte string, else return it as is."""
        # With quoting=csv.QUOTE_NONNUMERIC the csv reader yields floats for
        # unquoted fields; passing those to unicode(value, encoding) raises
        # TypeError.  ``bytes`` is the Python 2.6+ alias of ``str``.
        if isinstance(field, bytes):
            return unicode(field, self.encoding)
        return field

    def __iter__(self):
        return self

    @property
    def dialect(self):
        """The ``dialect`` attribute of the wrapped csv reader."""
        return self.reader.dialect

    @property
    def line_num(self):
        """The ``line_num`` attribute of the wrapped csv reader."""
        return self.reader.line_num


reader = UnicodeReader
+
class DictWriter(csv.DictWriter):
    """
    Dict-based CSV writer that accepts unicode values.

    All writing is delegated to a wrapped ``csv.DictWriter`` held in
    ``self.writer``.  Before a row is handed over, the value of every
    field named in ``fieldnames`` is passed through ``_stringify``;
    fields missing from the row are filled with ``restval``.

    Constructor arguments are those of ``csv.DictWriter`` plus:

    encoding -- codec used to encode unicode values (default 'utf-8').

    >>> from cStringIO import StringIO
    >>> f = StringIO()
    >>> w = DictWriter(f, ['a', 'b'], restval=u'î')
    >>> w.writerow({'a':'1'})
    >>> w.writerow({'a':'1', 'b':u'ø'})
    >>> w.writerow({'a':u'é'})
    >>> f.seek(0)
    >>> r = DictReader(f, fieldnames=['a'], restkey='r')
    >>> r.next() == {'a':u'1', 'r':[u"î"]}
    True
    >>> r.next() == {'a':u'1', 'r':[u"ø"]}
    True
    >>> r.next() == {'a':u'é', 'r':[u"î"]}
    True
    """
    def __init__(self, csvfile, fieldnames, restval='', extrasaction='raise',
                 dialect='excel', encoding='utf-8', *args, **kwds):
        self.fieldnames = fieldnames
        self.encoding = encoding
        self.restval = restval
        # csv.DictWriter.__init__ is not called on this instance; the
        # wrapped DictWriter owns the real csv writer and applies
        # ``extrasaction``.
        self.writer = csv.DictWriter(csvfile, fieldnames, restval,
                                     extrasaction, dialect, *args, **kwds)

    def writerow(self, d):
        """
        Encode the values of ``d`` and write them as one record.

        ``d`` itself is left unmodified.  Keys that are not in
        ``fieldnames`` are passed through as they are, so the wrapped
        writer can still apply its ``extrasaction`` policy to them.
        """
        # Work on a copy so the caller's dict is not rewritten with encoded
        # values or padded with ``restval`` entries.
        encoded = dict(d)
        for fieldname in self.fieldnames:
            value = d.get(fieldname, self.restval)
            encoded[fieldname] = _stringify(value, self.encoding)
        self.writer.writerow(encoded)

    def writerows(self, rowdicts):
        """Write every dict of ``rowdicts``, in order, through ``writerow``."""
        # Overridden because the inherited implementation expects state
        # set up by csv.DictWriter.__init__, which this class never calls.
        for rowdict in rowdicts:
            self.writerow(rowdict)
+
class DictReader(csv.DictReader):
    """
    Dict-based CSV reader that returns values as unicode strings.

    All reading is delegated to a wrapped ``csv.DictReader`` held in
    ``self.reader``.  Byte-string values in each row, including the items
    of the list stored under ``restkey``, are decoded with the configured
    encoding.  Values that are not byte strings are returned untouched.
    Dictionary keys are not decoded.

    Constructor arguments are those of ``csv.DictReader`` plus:

    encoding -- codec used to decode each value (default 'utf-8').
    """
    def __init__(self, csvfile, fieldnames=None, restkey=None, restval=None,
                 dialect='excel', encoding='utf-8', *args, **kwds):
        self.restkey = restkey
        self.encoding = encoding
        # csv.DictReader.__init__ is not called on this instance; the
        # wrapped DictReader does the actual parsing.
        self.reader = csv.DictReader(csvfile, fieldnames, restkey, restval,
                                     dialect, *args, **kwds)

    @property
    def fieldnames(self):
        """The field names as reported by the wrapped ``csv.DictReader``."""
        # Overrides the inherited property, which reads an attribute that
        # only csv.DictReader.__init__ would have set on this instance.
        return self.reader.fieldnames

    def next(self):
        """
        Return the next row as a dict with byte-string values decoded.

        StopIteration from the wrapped reader propagates unchanged.
        """
        d = next(self.reader)
        # Iterate over a snapshot, because values are replaced in ``d``.
        for key, value in list(d.items()):
            if key == self.restkey and isinstance(value, list):
                # Surplus fields are collected in a list under ``restkey``.
                d[key] = [self._decode(item) for item in value]
            else:
                d[key] = self._decode(value)
        return d

    def _decode(self, value):
        """Decode ``value`` if it is a byte string, else return it as is."""
        # Only byte strings can be decoded.  None, floats produced by
        # QUOTE_NONNUMERIC and a caller-supplied ``restval`` of another type
        # would make unicode(value, encoding) raise TypeError.  ``bytes`` is
        # the Python 2.6+ alias of ``str``.
        if isinstance(value, bytes):
            return unicode(value, self.encoding)
        return value

0 comments on commit fd22704

Please sign in to comment.