diff --git a/README.md b/README.md index 5c7e468..0760193 100644 --- a/README.md +++ b/README.md @@ -16,18 +16,18 @@ console.assert(whatwgEncoding.isSupported("UTF-32") === false); // In the Encoding Standard, but this package can't decode it console.assert(whatwgEncoding.isSupported("x-mac-cyrillic") === false); -console.assert(whatwgEncoding.getBOMEncoding(new Buffer([0xFE, 0xFF])) === "UTF-16BE"); -console.assert(whatwgEncoding.getBOMEncoding(new Buffer([0x48, 0x69])) === null); +console.assert(whatwgEncoding.getBOMEncoding(new Uint8Array([0xFE, 0xFF])) === "UTF-16BE"); +console.assert(whatwgEncoding.getBOMEncoding(new Uint8Array([0x48, 0x69])) === null); -console.assert(whatwgEncoding.decode(new Buffer([0x48, 0x69]), "UTF-8") === "Hi"); +console.assert(whatwgEncoding.decode(new Uint8Array([0x48, 0x69]), "UTF-8") === "Hi"); ``` ## API -- `decode(buffer, fallbackEncodingName)`: performs the [decode](https://encoding.spec.whatwg.org/#decode) algorithm (in which any BOM will override the passed fallback encoding), and returns the resulting string +- `decode(uint8Array, fallbackEncodingName[, { errorMode }])`: performs the [decode](https://encoding.spec.whatwg.org/#decode) algorithm (in which any BOM will override the passed fallback encoding), and returns the resulting string; `errorMode` is either `"replacement"` (the default), which substitutes U+FFFD for invalid byte sequences, or `"fatal"`, which throws a `TypeError` on them instead - `labelToName(label)`: performs the [get an encoding](https://encoding.spec.whatwg.org/#concept-encoding-get) algorithm and returns the resulting encoding's name, or `null` for failure - `isSupported(name)`: returns whether the encoding is one of [the encodings](https://encoding.spec.whatwg.org/#names-and-labels) of the Encoding Standard, _and_ is an encoding that this package can decode (via iconv-lite) -- `getBOMEncoding(buffer)`: sniffs the first 2–3 bytes of the supplied `Buffer`, returning one of the encoding names `"UTF-8"`, `"UTF-16LE"`, or `"UTF-16BE"` if the appropriate BOM is present, or `null` if no BOM is present +- `getBOMEncoding(uint8Array)`: sniffs the first 2–3 bytes of the supplied 
`Uint8Array`, returning one of the encoding names `"UTF-8"`, `"UTF-16LE"`, or `"UTF-16BE"` if the appropriate BOM is present, or `null` if no BOM is present ## Unsupported encodings diff --git a/lib/whatwg-encoding.js b/lib/whatwg-encoding.js index d04eab5..ba18493 100644 --- a/lib/whatwg-encoding.js +++ b/lib/whatwg-encoding.js @@ -1,9 +1,8 @@ "use strict"; -const iconvLite = require("iconv-lite"); -const supportedNames = require("./supported-names.json"); +const names = require("./names.json"); const labelsToNames = require("./labels-to-names.json"); -const supportedNamesSet = new Set(supportedNames); +const namesSet = new Set(names); // https://encoding.spec.whatwg.org/#concept-encoding-get exports.labelToName = label => { @@ -13,29 +12,45 @@ exports.labelToName = label => { }; // https://encoding.spec.whatwg.org/#decode -exports.decode = (buffer, fallbackEncodingName) => { +exports.decode = (uint8Array, fallbackEncodingName, { errorMode = "replacement" } = {}) => { let encoding = fallbackEncodingName; if (!exports.isSupported(encoding)) { throw new RangeError(`"${encoding}" is not a supported encoding name`); } - const bomEncoding = exports.getBOMEncoding(buffer); + if (encoding === "replacement") { + // the TextDecoder constructor will early-error. We implement + // https://encoding.spec.whatwg.org/#replacement-decoder instead. + if (uint8Array.byteLength === 0) { + return ""; + } + + if (errorMode === "fatal") { + throw new TypeError("The replacement encoding always errors on any non-empty input"); + } else { + return "\uFFFD"; + } + } + + const bomEncoding = exports.getBOMEncoding(uint8Array); + let start = 0; if (bomEncoding !== null) { encoding = bomEncoding; + start = bomEncoding === "UTF-8" ? 
3 : 2; } - // iconv-lite will strip BOMs for us, so no need to do the stuff the spec does + const subarray = uint8Array.subarray(start, uint8Array.byteLength); - return iconvLite.decode(buffer, encoding); + return (new TextDecoder(encoding, { ignoreBOM: true, fatal: errorMode === "fatal" })).decode(subarray); }; // https://github.com/whatwg/html/issues/1910#issuecomment-254017369 -exports.getBOMEncoding = buffer => { - if (buffer[0] === 0xFE && buffer[1] === 0xFF) { +exports.getBOMEncoding = uint8Array => { + if (uint8Array[0] === 0xFE && uint8Array[1] === 0xFF) { return "UTF-16BE"; - } else if (buffer[0] === 0xFF && buffer[1] === 0xFE) { + } else if (uint8Array[0] === 0xFF && uint8Array[1] === 0xFE) { return "UTF-16LE"; - } else if (buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) { + } else if (uint8Array[0] === 0xEF && uint8Array[1] === 0xBB && uint8Array[2] === 0xBF) { return "UTF-8"; } @@ -43,5 +58,5 @@ exports.getBOMEncoding = buffer => { }; exports.isSupported = name => { - return supportedNamesSet.has(String(name)); + return namesSet.has(String(name)); }; diff --git a/scripts/update.js b/scripts/update.js index 651c293..4e6d25d 100644 --- a/scripts/update.js +++ b/scripts/update.js @@ -12,11 +12,9 @@ async function main() { const supportedNames = []; for (const entry of body) { for (const encoding of entry.encodings) { - if (iconvLite.encodingExists(encoding.name)) { - supportedNames.push(encoding.name); - for (const label of encoding.labels) { - labelsToNames[label] = encoding.name; - } + supportedNames.push(encoding.name); + for (const label of encoding.labels) { + labelsToNames[label] = encoding.name; } } } @@ -25,7 +23,7 @@ async function main() { fs.writeFileSync(path.resolve(__dirname, "../lib/labels-to-names.json"), labelsToNamesOutput); const supportedNamesOutput = JSON.stringify(supportedNames, undefined, 2); - fs.writeFileSync(path.resolve(__dirname, "../lib/supported-names.json"), supportedNamesOutput); + 
fs.writeFileSync(path.resolve(__dirname, "../lib/names.json"), supportedNamesOutput); } main().catch(e => { diff --git a/test/tests.js b/test/tests.js index 3b51689..0914571 100644 --- a/test/tests.js +++ b/test/tests.js @@ -4,52 +4,65 @@ const whatwgEncoding = require(".."); describe("decode", () => { it("should decode BOM-less windows-1252", () => { - const buffer = new Buffer([0x80, 0x95]); - const string = whatwgEncoding.decode(buffer, "windows-1252"); + const uint8Array = new Uint8Array([0x80, 0x95]); + const string = whatwgEncoding.decode(uint8Array, "windows-1252"); assert.strictEqual(string, "€•"); }); it("should override when it sees a UTF-8 BOM", () => { - const buffer = new Buffer([0xEF, 0xBB, 0xBF, 0xE2, 0x82, 0xAC, 0xE2, 0x80, 0xA2]); - const string = whatwgEncoding.decode(buffer, "windows-1252"); + const uint8Array = new Uint8Array([0xEF, 0xBB, 0xBF, 0xE2, 0x82, 0xAC, 0xE2, 0x80, 0xA2]); + const string = whatwgEncoding.decode(uint8Array, "windows-1252"); assert.strictEqual(string, "€•"); }); it("should override when it sees a UTF-16LE BOM", () => { - const buffer = new Buffer([0xFF, 0xFE, 0xAC, 0x20, 0x22, 0x20]); - const string = whatwgEncoding.decode(buffer, "windows-1252"); + const uint8Array = new Uint8Array([0xFF, 0xFE, 0xAC, 0x20, 0x22, 0x20]); + const string = whatwgEncoding.decode(uint8Array, "windows-1252"); assert.strictEqual(string, "€•"); }); it("should override when it sees a UTF-16BE BOM", () => { - const buffer = new Buffer([0xFE, 0xFF, 0x20, 0xAC, 0x20, 0x22]); - const string = whatwgEncoding.decode(buffer, "windows-1252"); + const uint8Array = new Uint8Array([0xFE, 0xFF, 0x20, 0xAC, 0x20, 0x22]); + const string = whatwgEncoding.decode(uint8Array, "windows-1252"); assert.strictEqual(string, "€•"); }); - it("should throw when given an invalid encoding name", () => { - assert.throws(() => whatwgEncoding.decode(new Buffer([]), "asdf"), RangeError); - assert.throws(() => whatwgEncoding.decode(new Buffer([]), "utf-8"), RangeError); - 
assert.throws(() => whatwgEncoding.decode(new Buffer([]), " UTF-8"), RangeError); - assert.throws(() => whatwgEncoding.decode(new Buffer([]), "UTF-32"), RangeError); + it("should throw a TypeError on invalid byte sequence inputs for a given encoding when mode is fatal", () => { + // Anything fails for replacement + assert.throws(() => whatwgEncoding.decode(new Uint8Array([0x20]), "replacement", { errorMode: "fatal" }), TypeError); + + assert.throws(() => whatwgEncoding.decode(new Uint8Array([0x83, 0x5C]), "Big5", { errorMode: "fatal" }), TypeError); + }); + + it("should output U+FFFD for a given encoding when mode is replacement (the default)", () => { + // Anything fails for replacement + assert.strictEqual(whatwgEncoding.decode(new Uint8Array([0x20]), "replacement", { errorMode: "replacement" }), "\uFFFD"); + assert.strictEqual(whatwgEncoding.decode(new Uint8Array([0x20]), "replacement"), "\uFFFD"); + + assert.strictEqual(whatwgEncoding.decode(new Uint8Array([0x83, 0x5C]), "Big5", { errorMode: "replacement" }), "\uFFFD\u005C"); + assert.strictEqual(whatwgEncoding.decode(new Uint8Array([0x83, 0x5C]), "Big5"), "\uFFFD\u005C"); + }); + + it("should return the empty string for empty input", () => { + assert.strictEqual(whatwgEncoding.decode(new Uint8Array([]), "replacement"), ""); + assert.strictEqual(whatwgEncoding.decode(new Uint8Array([]), "Big5"), ""); }); - it("should throw when given an unsupported encoding name", () => { - assert.throws(() => whatwgEncoding.decode(new Buffer([]), "ISO-2022-JP"), RangeError); - assert.throws(() => whatwgEncoding.decode(new Buffer([]), "ISO-8859-8-I"), RangeError); - assert.throws(() => whatwgEncoding.decode(new Buffer([]), "replacement"), RangeError); - assert.throws(() => whatwgEncoding.decode(new Buffer([]), "x-mac-cyrillic"), RangeError); - assert.throws(() => whatwgEncoding.decode(new Buffer([]), "x-user-defined"), RangeError); + it("should throw when given an invalid encoding name", () => { + assert.throws(() => 
whatwgEncoding.decode(new Uint8Array([]), "asdf"), RangeError); + assert.throws(() => whatwgEncoding.decode(new Uint8Array([]), "utf-8"), RangeError); + assert.throws(() => whatwgEncoding.decode(new Uint8Array([]), " UTF-8"), RangeError); + assert.throws(() => whatwgEncoding.decode(new Uint8Array([]), "UTF-32"), RangeError); }); it("should throw when given an encoding label that is not a name", () => { - assert.throws(() => whatwgEncoding.decode(new Buffer([]), "ascii"), RangeError); - assert.throws(() => whatwgEncoding.decode(new Buffer([]), "latin1"), RangeError); - assert.throws(() => whatwgEncoding.decode(new Buffer([]), "iso88591"), RangeError); + assert.throws(() => whatwgEncoding.decode(new Uint8Array([]), "ascii"), RangeError); + assert.throws(() => whatwgEncoding.decode(new Uint8Array([]), "latin1"), RangeError); + assert.throws(() => whatwgEncoding.decode(new Uint8Array([]), "iso88591"), RangeError); }); }); @@ -59,6 +72,7 @@ describe("labelToName", () => { assert.strictEqual(whatwgEncoding.labelToName("csibm866"), "IBM866"); assert.strictEqual(whatwgEncoding.labelToName("latin3"), "ISO-8859-3"); assert.strictEqual(whatwgEncoding.labelToName("tis-620"), "windows-874"); + assert.strictEqual(whatwgEncoding.labelToName("replacement"), "replacement"); }); it("should be case-insensitive", () => { @@ -85,30 +99,9 @@ describe("labelToName", () => { it("should return null for invalid encoding labels", () => { assert.strictEqual(whatwgEncoding.labelToName("AS\u0009CII"), null); assert.strictEqual(whatwgEncoding.labelToName("asdf"), null); - assert.strictEqual(whatwgEncoding.labelToName("replacement"), null); assert.strictEqual(whatwgEncoding.labelToName("UTF-32"), null); }); - it("should return null for unsupported encoding labels", () => { - assert.strictEqual(whatwgEncoding.labelToName("ISO-2022-JP"), null); - assert.strictEqual(whatwgEncoding.labelToName("csiso2022jp"), null); - - assert.strictEqual(whatwgEncoding.labelToName("ISO-8859-8-I"), null); - 
assert.strictEqual(whatwgEncoding.labelToName("csiso88598i"), null); - assert.strictEqual(whatwgEncoding.labelToName("logical"), null); - - assert.strictEqual(whatwgEncoding.labelToName("csiso2022kr"), null); - assert.strictEqual(whatwgEncoding.labelToName("hz-gb-2312"), null); - assert.strictEqual(whatwgEncoding.labelToName("iso-2022-cn"), null); - assert.strictEqual(whatwgEncoding.labelToName("iso-2022-cn-ext"), null); - assert.strictEqual(whatwgEncoding.labelToName("iso-2022-kr"), null); - - assert.strictEqual(whatwgEncoding.labelToName("x-mac-cyrillic"), null); - assert.strictEqual(whatwgEncoding.labelToName("x-mac-ukrainian"), null); - - assert.strictEqual(whatwgEncoding.labelToName("x-user-defined"), null); - }); - it("should return null for non-strings", () => { assert.strictEqual(whatwgEncoding.labelToName(), null); assert.strictEqual(whatwgEncoding.labelToName(5), null); @@ -117,7 +110,7 @@ describe("labelToName", () => { }); describe("isSupported", () => { - it("should return true for supported encodings", () => { + it("should return true for all supported encodings", () => { assert.strictEqual(whatwgEncoding.isSupported("UTF-8"), true); assert.strictEqual(whatwgEncoding.isSupported("IBM866"), true); assert.strictEqual(whatwgEncoding.isSupported("ISO-8859-2"), true); @@ -153,6 +146,12 @@ describe("isSupported", () => { assert.strictEqual(whatwgEncoding.isSupported("EUC-KR"), true); assert.strictEqual(whatwgEncoding.isSupported("UTF-16BE"), true); assert.strictEqual(whatwgEncoding.isSupported("UTF-16LE"), true); + assert.strictEqual(whatwgEncoding.isSupported("ISO-2022-JP"), true); + assert.strictEqual(whatwgEncoding.isSupported("ISO-8859-8-I"), true); + assert.strictEqual(whatwgEncoding.isSupported("replacement"), true); + assert.strictEqual(whatwgEncoding.isSupported("x-mac-cyrillic"), true); + assert.strictEqual(whatwgEncoding.isSupported("x-user-defined"), true); + assert.strictEqual(whatwgEncoding.isSupported("replacement"), true); }); it("should 
return false for miscapitalizations and non-name labels", () => { @@ -161,14 +160,6 @@ describe("isSupported", () => { assert.strictEqual(whatwgEncoding.isSupported("latin1"), false); }); - it("should return false for the unimplemented encodings", () => { - assert.strictEqual(whatwgEncoding.isSupported("ISO-2022-JP"), false); - assert.strictEqual(whatwgEncoding.isSupported("ISO-8859-8-I"), false); - assert.strictEqual(whatwgEncoding.isSupported("replacement"), false); - assert.strictEqual(whatwgEncoding.isSupported("x-mac-cyrillic"), false); - assert.strictEqual(whatwgEncoding.isSupported("x-user-defined"), false); - }); - it("should return false for invalid encoding names", () => { assert.strictEqual(whatwgEncoding.isSupported("asdf"), false); assert.strictEqual(whatwgEncoding.isSupported("UTF-32"), false); @@ -177,57 +168,57 @@ describe("isSupported", () => { describe("getBOMEncoding", () => { it("should return UTF-8 for a UTF-8 BOM", () => { - const buffer = new Buffer([0xEF, 0xBB, 0xBF, 0xE2, 0x82, 0xAC, 0xE2, 0x80, 0xA2]); - const encoding = whatwgEncoding.getBOMEncoding(buffer); + const uint8Array = new Uint8Array([0xEF, 0xBB, 0xBF, 0xE2, 0x82, 0xAC, 0xE2, 0x80, 0xA2]); + const encoding = whatwgEncoding.getBOMEncoding(uint8Array); assert.strictEqual(encoding, "UTF-8"); }); it("should return UTF-16LE for a UTF-16LE BOM", () => { - const buffer = new Buffer([0xFF, 0xFE, 0xAC, 0x20, 0x22, 0x20]); - const encoding = whatwgEncoding.getBOMEncoding(buffer); + const uint8Array = new Uint8Array([0xFF, 0xFE, 0xAC, 0x20, 0x22, 0x20]); + const encoding = whatwgEncoding.getBOMEncoding(uint8Array); assert.strictEqual(encoding, "UTF-16LE"); }); it("should return UTF-16BE for a UTF-16BE BOM", () => { - const buffer = new Buffer([0xFE, 0xFF, 0x20, 0xAC, 0x20, 0x22]); - const encoding = whatwgEncoding.getBOMEncoding(buffer); + const uint8Array = new Uint8Array([0xFE, 0xFF, 0x20, 0xAC, 0x20, 0x22]); + const encoding = whatwgEncoding.getBOMEncoding(uint8Array); 
assert.strictEqual(encoding, "UTF-16BE"); }); it("should return null for no BOM", () => { - const buffer = new Buffer([0x80, 0x95]); - const encoding = whatwgEncoding.getBOMEncoding(buffer); + const uint8Array = new Uint8Array([0x80, 0x95]); + const encoding = whatwgEncoding.getBOMEncoding(uint8Array); assert.strictEqual(encoding, null); }); it("should return UTF-16LE for a UTF-32LE BOM", () => { - const buffer = new Buffer([0xFF, 0xFE, 0x00, 0x00]); - const encoding = whatwgEncoding.getBOMEncoding(buffer); + const uint8Array = new Uint8Array([0xFF, 0xFE, 0x00, 0x00]); + const encoding = whatwgEncoding.getBOMEncoding(uint8Array); assert.strictEqual(encoding, "UTF-16LE"); }); it("should return null for a UTF-32BE BOM", () => { - const buffer = new Buffer([0x00, 0x00, 0xFF, 0xFE]); - const encoding = whatwgEncoding.getBOMEncoding(buffer); + const uint8Array = new Uint8Array([0x00, 0x00, 0xFF, 0xFE]); + const encoding = whatwgEncoding.getBOMEncoding(uint8Array); assert.strictEqual(encoding, null); }); - it("should return null for an empty buffer", () => { - const buffer = new Buffer([]); - const encoding = whatwgEncoding.getBOMEncoding(buffer); + it("should return null for an empty uint8Array", () => { + const uint8Array = new Uint8Array([]); + const encoding = whatwgEncoding.getBOMEncoding(uint8Array); assert.strictEqual(encoding, null); }); - it("should return null for a one-byte buffer", () => { - const buffer = new Buffer([0xFF]); - const encoding = whatwgEncoding.getBOMEncoding(buffer); + it("should return null for a one-byte uint8Array", () => { + const uint8Array = new Uint8Array([0xFF]); + const encoding = whatwgEncoding.getBOMEncoding(uint8Array); assert.strictEqual(encoding, null); });