Permalink
Browse files

generate multi-byte UTF-8; closes #4

  • Loading branch information...
1 parent cbc0b98 commit d2ccd39964726a75a8141567cae6bae9f34c0711 @kr committed Mar 26, 2012
Showing with 69 additions and 70 deletions.
  1. +1 −0 .gitignore
  2. +51 −63 okjson.rb
  3. +1 −0 t/encode-badutf8.rb
  4. +1 −0 t/encode-badutf8.rb.exp
  5. +1 −1 t/encode-utf8.rb.exp
  6. +1 −1 t/valid-hex.json.exp
  7. +1 −1 t/valid-key.json.exp
  8. +12 −4 tested-on
View
@@ -1 +1,2 @@
*.rbc
+.rbx
View
@@ -374,15 +374,6 @@ def subst(u1, u2)
end
- def unsubst(u)
- if u < Usurrself || u > Umax || surrogate?(u)
- return Ucharerr, Ucharerr
- end
- u -= Usurrself
- [Usurr1 + ((u>>10)&0x3ff), Usurr2 + (u&0x3ff)]
- end
-
-
def surrogate?(u)
Usurr1 <= u && u < Usurr3
end
@@ -472,15 +463,18 @@ def strenc(s)
else
c = s[r]
case true
+ when rubydoesenc
+ begin
+ c.ord # will raise an error if c is invalid UTF-8
+ t.write(c)
+ rescue
+ t.write(Ustrerr)
+ end
when Spc <= c && c <= ?~
t.putc(c)
- when rubydoesenc
- u = c.ord
- surrenc(t, u)
else
- u, size = uchardec(s, r)
- r += size - 1 # we add one more at the bottom of the loop
- surrenc(t, u)
+ n = ucharcopy(t, s, r) # ensure valid UTF-8 output
+ r += n - 1 # r is incremented below
end
end
r += 1
@@ -490,28 +484,6 @@ def strenc(s)
end
- def surrenc(t, u)
- if u < 0x10000
- t.print('\\u')
- hexenc4(t, u)
- else
- u1, u2 = unsubst(u)
- t.print('\\u')
- hexenc4(t, u1)
- t.print('\\u')
- hexenc4(t, u2)
- end
- end
-
-
- def hexenc4(t, u)
- t.putc(Hex[(u>>12)&0xf])
- t.putc(Hex[(u>>8)&0xf])
- t.putc(Hex[(u>>4)&0xf])
- t.putc(Hex[u&0xf])
- end
-
-
def numenc(x)
if ((x.nan? || x.infinite?) rescue false)
raise Error, "Numeric cannot be represented: #{x}"
@@ -520,60 +492,77 @@ def numenc(x)
end
- # Decodes unicode character u from UTF-8
- # bytes in string s at position i.
- # Returns u and the number of bytes read.
- def uchardec(s, i)
+ # Copies the valid UTF-8 bytes of a single character
+ # from string s at position i to I/O object t, and
+ # returns the number of bytes copied.
+ # If no valid UTF-8 char exists at position i,
+ # ucharcopy writes Ustrerr and returns 1.
+ def ucharcopy(t, s, i)
n = s.length - i
- return [Ucharerr, 1] if n < 1
+ raise Utf8Error if n < 1
c0 = s[i].ord
# 1-byte, 7-bit sequence?
if c0 < Utagx
- return [c0, 1]
+ t.putc(c0)
+ return 1
end
- # unexpected continuation byte?
- return [Ucharerr, 1] if c0 < Utag2
+ raise Utf8Error if c0 < Utag2 # unexpected continuation byte?
- # need continuation byte
- return [Ucharerr, 1] if n < 2
+ raise Utf8Error if n < 2 # need continuation byte
c1 = s[i+1].ord
- return [Ucharerr, 1] if c1 < Utagx || Utag2 <= c1
+ raise Utf8Error if c1 < Utagx || Utag2 <= c1
# 2-byte, 11-bit sequence?
if c0 < Utag3
- u = (c0&Umask2)<<6 | (c1&Umaskx)
- return [Ucharerr, 1] if u <= Uchar1max
- return [u, 2]
+ raise Utf8Error if ((c0&Umask2)<<6 | (c1&Umaskx)) <= Uchar1max
+ t.putc(c0)
+ t.putc(c1)
+ return 2
end
# need second continuation byte
- return [Ucharerr, 1] if n < 3
+ raise Utf8Error if n < 3
+
c2 = s[i+2].ord
- return [Ucharerr, 1] if c2 < Utagx || Utag2 <= c2
+ raise Utf8Error if c2 < Utagx || Utag2 <= c2
# 3-byte, 16-bit sequence?
if c0 < Utag4
u = (c0&Umask3)<<12 | (c1&Umaskx)<<6 | (c2&Umaskx)
- return [Ucharerr, 1] if u <= Uchar2max
- return [u, 3]
+ raise Utf8Error if u <= Uchar2max
+ t.putc(c0)
+ t.putc(c1)
+ t.putc(c2)
+ return 3
end
# need third continuation byte
- return [Ucharerr, 1] if n < 4
+ raise Utf8Error if n < 4
c3 = s[i+3].ord
- return [Ucharerr, 1] if c3 < Utagx || Utag2 <= c3
+ raise Utf8Error if c3 < Utagx || Utag2 <= c3
# 4-byte, 21-bit sequence?
if c0 < Utag5
u = (c0&Umask4)<<18 | (c1&Umaskx)<<12 | (c2&Umaskx)<<6 | (c3&Umaskx)
- return [Ucharerr, 1] if u <= Uchar3max
- return [u, 4]
- end
+ raise Utf8Error if u <= Uchar3max
+ t.putc(c0)
+ t.putc(c1)
+ t.putc(c2)
+ t.putc(c3)
+ return 4
+ end
+
+ raise Utf8Error
+ rescue Utf8Error
+ t.write(Ustrerr)
+ return 1
+ end
+
- return [Ucharerr, 1]
+ class Utf8Error < ::StandardError
end
@@ -594,13 +583,12 @@ class Error < ::StandardError
Uchar2max = (1<<11) - 1
Uchar3max = (1<<16) - 1
Ucharerr = 0xFFFD # unicode "replacement char"
+ Ustrerr = "\xef\xbf\xbd" # unicode "replacement char"
Usurrself = 0x10000
Usurr1 = 0xd800
Usurr2 = 0xdc00
Usurr3 = 0xe000
- Umax = 0x10ffff
Spc = ' '[0]
Unesc = {?b=>?\b, ?f=>?\f, ?n=>?\n, ?r=>?\r, ?t=>?\t}
- Hex = '0123456789abcdef'
end
View
@@ -0,0 +1 @@
+["\x82\xAC\xEF"]
@@ -0,0 +1 @@
+["���"]
@@ -1 +1 @@
-["\u00e1$\u00a2\u20ac\u5712\ud834\udd1e"]
+["á$¢€園𝄞"]
@@ -1 +1 @@
-["$\u00a2\u20ac\u5712\ud834\udd1e"]
+["$¢€園𝄞"]
@@ -1 +1 @@
-{"/\\\"\ucafe\ubabe\uab98\ufcde\ubcda\uef4a\b\f\n\r\t`1~!@#$%^&*()_+-=[]{}|;:',./<>?":"A key can be any string"}
+{"/\\\"쫾몾ꮘﳞ볚\b\f\n\r\t`1~!@#$%^&*()_+-=[]{}|;:',./<>?":"A key can be any string"}
View
@@ -1,5 +1,13 @@
-This commit was tested Mon Feb 20 18:21:05 PST 2012
+This commit was tested Tue Mar 27 00:02:01 PDT 2012
using the following ruby interpreters:
-ruby 1.8.7 (2010-01-10 patchlevel 249) [universal-darwin11.0]
-ruby 1.9.2p290 (2011-07-09 revision 32553) [x86_64-darwin11.2.0]
-rubinius 1.2.4 (1.8.7 release 2011-07-05 JI) [x86_64-apple-darwin11.2.0]
+ruby 1.8.7 (2010-01-10 patchlevel 249) [i686-darwin11.3.0]
+ruby 1.8.7 (2011-06-30 patchlevel 352) [i686-darwin11.2.0]
+ruby 1.9.1p378 (2010-01-10 revision 26273) [i386-darwin11.3.0]
+ruby 1.9.2p290 (2011-07-09 revision 32553) [x86_64-darwin11.1.0]
+ruby 1.9.3p166 (2012-03-26 revision 35128) [x86_64-darwin11.3.0]
+ruby 1.9.3p125 (2012-02-16 revision 34643) [x86_64-darwin11.3.0]
+jruby 1.6.3 (ruby-1.8.7-p330) (2011-07-07 965162f) (Java HotSpot(TM) 64-Bit Server VM 1.6.0_29) [darwin-x86_64-java]
+jruby 1.6.7 (ruby-1.8.7-p357) (2012-02-22 3e82bc8) (Java HotSpot(TM) 64-Bit Server VM 1.6.0_29) [darwin-x86_64-java]
+rubinius 1.2.4 (1.8.7 release 2011-07-05 JI) [x86_64-apple-darwin11.3.0]
+rubinius 2.0.0dev (1.8.7 b4ab27cf yyyy-mm-dd JI) [x86_64-apple-darwin11.3.0]
+ruby 1.8.7 (2009-06-12 patchlevel 174) [i686-darwin11.3.0], MBARI 0x6770, Ruby Enterprise Edition 20090928

1 comment on commit d2ccd39

@plentz
plentz commented on d2ccd39 May 10, 2012

this rocks!

Please sign in to comment.