From 78c7f3fe25078a34e3438b47aaa8808149d4df29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=B8=D0=BB=D1=8F=D0=BD=20=D0=9F=D0=B0=D0=BB=D0=B0?= =?UTF-8?q?=D1=83=D0=B7=D0=BE=D0=B2?= Date: Sat, 25 Apr 2020 20:30:27 +0000 Subject: [PATCH] helpers.foldline: do not insert spaces between the high and low UTF-16 surrogates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ical.js used to fold in the middle of a UTF-16 code point sequence. To be precise, it inserted a space and a new line after the high UTF-16 surrogate and before the low UTF-16 surrogate. This created invalid stand-alone surrogates which the JavaScript engine converts to something valid. This is the problem, demonstrated with Node.js: import {writeFileSync} from 'fs'; const a = '\uD83D\uDCAA' //this is 💪'Flexed Biceps', UTF-8 encoding: 0xF0 0x9F // 0x92 0xAA, HTML Entity: &#128170; &#x1F4AA; UTF-16 encoding: 0xD83D 0xDCAA writeFileSync('a1', a, 'utf8') //file contains the bytes f0 9f 92 aa writeFileSync('a2', a.charAt(0) + a.charAt(1), 'utf8') //the same as above writeFileSync('b', a.charAt(0) + ' ' + a.charAt(1), 'utf8') //file contains the bytes 0xEF 0xBF 0xBD 0x20 0xEF 0xBF 0xBD = REPLACEMENT CHARACTER = � so the result in b is valid UTF-8, but it has nothing in common with the original text, and nothing in it suggests that any form of reconstructing a from b should be attempted. 
This patch takes a full UTF-16 character from the input, • calculates whether it takes one or two UTF-16 chars, and keeps in pos where the next full UTF-16 character starts, • calculates for the full UTF-16 code point whether it needs 1, 2, 3 or 4 bytes to be represented in UTF-8, keeping in line_length the number of UTF-8 bytes needed so far, • splits the line when the UTF-8 representation exceeds ICAL.foldLength bytes --- lib/ical/helpers.js | 19 +++++++++++++++---- test/stringify_test.js | 6 ++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/lib/ical/helpers.js b/lib/ical/helpers.js index a8a5c60d..071bf08d 100644 --- a/lib/ical/helpers.js +++ b/lib/ical/helpers.js @@ -287,11 +287,22 @@ ICAL.helpers = { */ foldline: function foldline(aLine) { var result = ""; - var line = aLine || ""; - + var line = aLine || "", pos = 0, line_length = 0; + //pos counts position in line for the UTF-16 presentation + //line_length counts the bytes for the UTF-8 presentation while (line.length) { - result += ICAL.newLineChar + " " + line.substr(0, ICAL.foldLength); - line = line.substr(ICAL.foldLength); + var cp = line.codePointAt(pos); + if (cp < 128) ++line_length; + else if (cp < 2048) line_length += 2;//needs 2 UTF-8 bytes + else if (cp < 65536) line_length += 3; + else line_length += 4; //cp is less than 1114112 + if (line_length < ICAL.foldLength + 1) + pos += cp > 65535 ? 
2 : 1; + else { + result += ICAL.newLineChar + " " + line.substring(0, pos); + line = line.substring(pos); + pos = line_length = 0; + } } return result.substr(ICAL.newLineChar.length + 1); }, diff --git a/test/stringify_test.js b/test/stringify_test.js index 987a90b6..e92e4cc1 100644 --- a/test/stringify_test.js +++ b/test/stringify_test.js @@ -113,6 +113,12 @@ suite('ICAL.stringify', function() { assert.equal(ICAL.stringify.property(subject.toJSON(), ICAL.design.icalendar, false), "DESCRIPTION:foo" + N + "bar"); assert.equal(ICAL.stringify.property(subject.toJSON(), ICAL.design.icalendar, true), "DESCRIPTION:foobar"); + var utf16_muscle = '\uD83D\uDCAA'; //in UTF-8 this is F0 9F 92 AA. If space/new line is inserted between the surrogates, then the JS Engine substitutes each stand-alone surrogate with REPLACEMENT CHARACTER 0xEF 0xBF 0xBD + subject.setValue(utf16_muscle); + assert.equal(ICAL.stringify.property(subject.toJSON(), ICAL.design.icalendar, false), "DESCRIPTION:" + N + utf16_muscle);//verify new line is after ':', as otherwise the whole line is longer than ICAL.foldLength + subject.setValue('aa' + utf16_muscle + utf16_muscle + 'a' + utf16_muscle + utf16_muscle); + assert.equal(ICAL.stringify.property(subject.toJSON(), ICAL.design.icalendar, false), "DESCRIPTION:aa" + N + utf16_muscle + utf16_muscle + 'a' + utf16_muscle + N + utf16_muscle);//verify that the utf16_muscle is moved as whole to a new line as it is 4 UTF-8 bytes + ICAL.foldLength = oldLength; }); });