fix encoding in ruby >= 1.9

kr · Feb 21, 2012 · 9d32929 · 9d32929
1 parent 8363cb8
commit 9d32929
Show file tree

Hide file tree

Showing 5 changed files with 44 additions and 17 deletions.
diff --git a/Readme b/Readme
@@ -19,7 +19,7 @@ This library is intended to be "vendored".
 It is not a gem; instead, copy okjson.rb
 into your project and "require" it directly.
 This method helps you avoid an external
-dependency. It's only about 550 lines of
+dependency. It's only about 575 lines of
 source; about half of that is UTF-8 coding.
 If you bundle okjson with your library,
 please change the module's name to something

diff --git a/okjson.rb b/okjson.rb
@@ -1,4 +1,6 @@
-# Copyright 2011 Keith Rarick
+# encoding: UTF-8
+#
+# Copyright 2011, 2012 Keith Rarick
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -260,6 +262,12 @@ def abbrev(s)
   def unquote(q)
     q = q[1...-1]
     a = q.dup # allocate a big enough string
+    rubydoesenc = false
+    # In ruby >= 1.9, a[w] is a codepoint, not a byte.
+    if a.class.method_defined?(:force_encoding)
+      a.force_encoding('UTF-8')
+      rubydoesenc = true
+    end
     r, w = 0, 0
     while r < q.length
       c = q[r]
@@ -297,7 +305,12 @@ def unquote(q)
               end
             end
           end
-          w += ucharenc(a, w, uchar)
+          if rubydoesenc
+            a[w] = '' << uchar
+            w += 1
+          else
+            w += ucharenc(a, w, uchar)
+          end
         else
           raise Error, "invalid escape char #{q[r]} in \"#{q}\""
         end
@@ -307,6 +320,8 @@ def unquote(q)
         # Copy anything else byte-for-byte.
         # Valid UTF-8 will remain valid UTF-8.
         # Invalid UTF-8 will remain invalid UTF-8.
+        # In ruby >= 1.9, c is a codepoint, not a byte,
+        # in which case this is still what we want.
         a[w] = c
         r += 1
         w += 1
@@ -441,6 +456,10 @@ def strenc(s)
     t = StringIO.new
     t.putc(?")
     r = 0
+
+    # In ruby >= 1.9, s[r] is a codepoint, not a byte.
+    rubydoesenc = s.class.method_defined?(:encoding)
+
     while r < s.length
       case s[r]
       when ?"  then t.print('\\"')
@@ -455,21 +474,13 @@ def strenc(s)
         case true
         when Spc <= c && c <= ?~
           t.putc(c)
-        when true
+        when rubydoesenc
+          u = c.ord
+          surrenc(t, u)
+        else
           u, size = uchardec(s, r)
           r += size - 1 # we add one more at the bottom of the loop
-          if u < 0x10000
+          surrenc(t, u)
-            t.print('\\u')
-            hexenc4(t, u)
-          else
-            u1, u2 = unsubst(u)
-            t.print('\\u')
-            hexenc4(t, u1)
-            t.print('\\u')
-            hexenc4(t, u2)
-          end
-        else
-          # invalid byte; skip it
         end
       end
       r += 1
@@ -479,6 +490,20 @@ def strenc(s)
   end
 
 
+  def surrenc(t, u)
+    if u < 0x10000
+      t.print('\\u')
+      hexenc4(t, u)
+    else
+      u1, u2 = unsubst(u)
+      t.print('\\u')
+      hexenc4(t, u1)
+      t.print('\\u')
+      hexenc4(t, u2)
+    end
+  end
+
+
   def hexenc4(t, u)
     t.putc(Hex[(u>>12)&0xf])
     t.putc(Hex[(u>>8)&0xf])

diff --git a/t/encode-utf8.rb b/t/encode-utf8.rb
@@ -0,0 +1 @@
+["á$¢€園𝄞"]
diff --git a/t/encode-utf8.rb.exp b/t/encode-utf8.rb.exp
@@ -0,0 +1 @@
+["\u00e1$\u00a2\u20ac\u5712\ud834\udd1e"]
diff --git a/tested-on b/tested-on
@@ -1,4 +1,4 @@
-This commit was tested Mon Feb 20 16:08:17 PST 2012
+This commit was tested Mon Feb 20 18:21:05 PST 2012
 using the following ruby interpreters:
 ruby 1.8.7 (2010-01-10 patchlevel 249) [universal-darwin11.0]
 ruby 1.9.2p290 (2011-07-09 revision 32553) [x86_64-darwin11.2.0]