Skip to content
This repository
Browse code

string_decoder: added support for UTF-16LE

Fixes #3223.
  • Loading branch information...
commit 40c4beeb57adebb8495124ecd6c21ddef712c00e 1 parent 5871c81
Koichi Kobayashi koichik authored

Showing 2 changed files with 99 additions and 40 deletions. Show diff stats Hide diff stats

  1. +63 40 lib/string_decoder.js
  2. +36 0 test/simple/test-string-decoder.js
103 lib/string_decoder.js
@@ -21,22 +21,32 @@
21 21
22 22 var StringDecoder = exports.StringDecoder = function(encoding) {
23 23 this.encoding = (encoding || 'utf8').toLowerCase().replace(/[-_]/, '');
24   - if (this.encoding === 'utf8') {
25   - this.charBuffer = new Buffer(6);
26   - this.charReceived = 0;
27   - this.charLength = 0;
  24 + switch (this.encoding) {
  25 + case 'utf8':
  26 + // CESU-8 encodes each half of a surrogate pair in 3 bytes
  27 + this.surrogateSize = 3;
  28 + break;
  29 + case 'ucs2':
  30 + case 'utf16le':
  31 + // UTF-16 encodes each half of a surrogate pair in 2 bytes
  32 + this.surrogateSize = 2;
  33 + this.detectIncompleteChar = utf16DetectIncompleteChar;
  34 + break;
  35 + default:
  36 + this.write = passThroughWrite;
  37 + return;
28 38 }
  39 +
  40 + this.charBuffer = new Buffer(6);
  41 + this.charReceived = 0;
  42 + this.charLength = 0;
29 43 };
30 44
31 45
32 46 StringDecoder.prototype.write = function(buffer) {
33   - // If not utf8...
34   - if (this.encoding !== 'utf8') {
35   - return buffer.toString(this.encoding);
36   - }
37   -
38 47 var charStr = '';
39 48 var offset = 0;
  49 +
40 50 // if our last write ended with an incomplete multibyte character
41 51 while (this.charLength) {
42 52 // determine how many remaining bytes this buffer has to offer for this char
@@ -55,16 +65,14 @@ StringDecoder.prototype.write = function(buffer) {
55 65 }
56 66
57 67 // get the character that was split
58   - charStr = this.charBuffer.slice(0, this.charLength).toString();
  68 + charStr = this.charBuffer.slice(0, this.charLength).toString(this.encoding);
59 69
60 70 // a lead surrogate (D800-DBFF) is also an incomplete character
61   - if (this.charLength === 3) {
62   - var charCode = charStr.charCodeAt(0);
63   - if (charCode >= 0xD800 && charCode <= 0xDBFF) {
64   - charStr = '';
65   - this.charLength += 3; // size of trail surrogate (DC00-DFFF)
66   - continue;
67   - }
  71 + var charCode = charStr.charCodeAt(charStr.length - 1);
  72 + if (charCode >= 0xD800 && charCode <= 0xDBFF) {
  73 + this.charLength += this.surrogateSize;
  74 + charStr = '';
  75 + continue;
68 76 }
69 77 this.charReceived = this.charLength = 0;
70 78
@@ -76,7 +84,35 @@ StringDecoder.prototype.write = function(buffer) {
76 84 break;
77 85 }
78 86
  87 + var lenIncomplete = this.detectIncompleteChar(buffer);
  88 +
  89 + var end = buffer.length;
  90 + if (this.charLength) {
  91 + // buffer the incomplete character bytes we got
  92 + buffer.copy(this.charBuffer, 0, buffer.length - lenIncomplete, end);
  93 + this.charReceived = lenIncomplete;
  94 + end -= lenIncomplete;
  95 + }
  96 +
  97 + charStr += buffer.toString(this.encoding, 0, end);
  98 +
  99 + var end = charStr.length - 1;
  100 + var charCode = charStr.charCodeAt(end);
  101 + // a lead surrogate (D800-DBFF) is also an incomplete character
  102 + if (charCode >= 0xD800 && charCode <= 0xDBFF) {
  103 + var size = this.surrogateSize;
  104 + this.charLength += size;
  105 + this.charReceived += size;
  106 + this.charBuffer.copy(this.charBuffer, size, 0, size);
  107 + this.charBuffer.write(charStr.charAt(charStr.length - 1), this.encoding);
  108 + return charStr.substring(0, end);
  109 + }
  110 +
  111 + // or just emit the charStr
  112 + return charStr;
  113 +};
79 114
  115 +StringDecoder.prototype.detectIncompleteChar = function(buffer) {
80 116 // determine how many bytes we have to check at the end of this buffer
81 117 var i = (buffer.length >= 3) ? 3 : buffer.length;
82 118
@@ -106,28 +142,15 @@ StringDecoder.prototype.write = function(buffer) {
106 142 }
107 143 }
108 144
109   - var end = buffer.length;
110   - if (this.charLength) {
111   - // buffer the incomplete character bytes we got
112   - buffer.copy(this.charBuffer, 0, buffer.length - i, buffer.length);
113   - this.charReceived = i;
114   - end -= i;
115   - }
116   -
117   - charStr += buffer.toString('utf8', 0, end);
  145 + return i;
  146 +};
118 147
119   - // lead surrogate (D800-DBFF) is also the incomplete character
120   - end = charStr.length - 1;
121   - var charCode = charStr.charCodeAt(end);
122   - if (charCode >= 0xD800 && charCode <= 0xDBFF) {
123   - // CESU-8 represents each of Surrogate Pair by 3-bytes
124   - this.charLength += 3
125   - this.charReceived += 3
126   - this.charBuffer.copy(this.charBuffer, 3, 0, 3);
127   - this.charBuffer.write(charStr.charAt(end));
128   - return charStr.substring(0, end);
129   - }
  148 +function passThroughWrite(buffer) {
  149 + return buffer.toString(this.encoding);
  150 +}
130 151
131   - // or just emit the charStr
132   - return charStr;
133   -};
  152 +function utf16DetectIncompleteChar(buffer) {
  153 + var incomplete = this.charReceived = buffer.length % 2;
  154 + this.charLength = incomplete ? 2 : 0;
  155 + return incomplete;
  156 +}
36 test/simple/test-string-decoder.js
@@ -89,6 +89,42 @@ s += decoder.write(buffer.slice(0, 6));
89 89 assert.equal(s, '\uD83D\uDC4D'); // THUMBS UP SIGN (in UTF-16)
90 90
91 91
  92 +// UCS-2
  93 +decoder = new StringDecoder('ucs2');
  94 +buffer = new Buffer('ab', 'ucs2');
  95 +assert.equal(decoder.write(buffer), 'ab'); // 2 complete chars
  96 +buffer = new Buffer('abc', 'ucs2');
  97 +assert.equal(decoder.write(buffer.slice(0, 3)), 'a'); // 'a' and first of 'b'
  98 +assert.equal(decoder.write(buffer.slice(3, 6)), 'bc'); // second of 'b' and 'c'
  99 +
  100 +
  101 +// UTF-16LE
  102 +buffer = new Buffer('3DD84DDC', 'hex'); // THUMBS UP SIGN (in CESU-8)
  103 +var s = '';
  104 +s += decoder.write(buffer.slice(0, 1));
  105 +s += decoder.write(buffer.slice(1, 2)); // complete lead surrogate
  106 +assert.equal(s, '');
  107 +s += decoder.write(buffer.slice(2, 3));
  108 +s += decoder.write(buffer.slice(3, 4)); // complete trail surrogate
  109 +assert.equal(s, '\uD83D\uDC4D'); // THUMBS UP SIGN (in UTF-16)
  110 +
  111 +var s = '';
  112 +s += decoder.write(buffer.slice(0, 2)); // complete lead surrogate
  113 +assert.equal(s, '');
  114 +s += decoder.write(buffer.slice(2, 4)); // complete trail surrogate
  115 +assert.equal(s, '\uD83D\uDC4D'); // THUMBS UP SIGN (in UTF-16)
  116 +
  117 +var s = '';
  118 + s += decoder.write(buffer.slice(0, 3)); // complete lead surrogate + first byte of trail surrogate
  119 +assert.equal(s, '');
  120 +s += decoder.write(buffer.slice(3, 4)); // complete trail surrogate
  121 +assert.equal(s, '\uD83D\uDC4D'); // THUMBS UP SIGN (in UTF-16)
  122 +
  123 +var s = '';
  124 +s += decoder.write(buffer.slice(0, 4));
  125 +assert.equal(s, '\uD83D\uDC4D'); // THUMBS UP SIGN (in UTF-16)
  126 +
  127 +
92 128 // A mixed ascii and non-ascii string
93 129 // Test stolen from deps/v8/test/cctest/test-strings.cc
94 130 // U+02E4 -> CB A4

0 comments on commit 40c4bee

Please sign in to comment.
Something went wrong with that request. Please try again.