Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Attempt to use the builtin and switch to Uint8Array #13

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,18 @@ console.assert(whatwgEncoding.isSupported("UTF-32") === false);
// In the Encoding Standard, but this package can't decode it
console.assert(whatwgEncoding.isSupported("x-mac-cyrillic") === false);

console.assert(whatwgEncoding.getBOMEncoding(new Buffer([0xFE, 0xFF])) === "UTF-16BE");
console.assert(whatwgEncoding.getBOMEncoding(new Buffer([0x48, 0x69])) === null);
console.assert(whatwgEncoding.getBOMEncoding(new Uint8Array([0xFE, 0xFF])) === "UTF-16BE");
console.assert(whatwgEncoding.getBOMEncoding(new Uint8Array([0x48, 0x69])) === null);

console.assert(whatwgEncoding.decode(new Buffer([0x48, 0x69]), "UTF-8") === "Hi");
console.assert(whatwgEncoding.decode(new Uint8Array([0x48, 0x69]), "UTF-8") === "Hi");
```

## API

- `decode(buffer, fallbackEncodingName)`: performs the [decode](https://encoding.spec.whatwg.org/#decode) algorithm (in which any BOM will override the passed fallback encoding), and returns the resulting string
- `decode(uint8Array, fallbackEncodingName)`: performs the [decode](https://encoding.spec.whatwg.org/#decode) algorithm (in which any BOM will override the passed fallback encoding), and returns the resulting string
- `labelToName(label)`: performs the [get an encoding](https://encoding.spec.whatwg.org/#concept-encoding-get) algorithm and returns the resulting encoding's name, or `null` for failure
- `isSupported(name)`: returns whether the encoding is one of [the encodings](https://encoding.spec.whatwg.org/#names-and-labels) of the Encoding Standard, _and_ is an encoding that this package can decode (via iconv-lite)
- `getBOMEncoding(buffer)`: sniffs the first 2–3 bytes of the supplied `Buffer`, returning one of the encoding names `"UTF-8"`, `"UTF-16LE"`, or `"UTF-16BE"` if the appropriate BOM is present, or `null` if no BOM is present
- `getBOMEncoding(uint8Array)`: sniffs the first 2–3 bytes of the supplied `Uint8Array`, returning one of the encoding names `"UTF-8"`, `"UTF-16LE"`, or `"UTF-16BE"` if the appropriate BOM is present, or `null` if no BOM is present

## Unsupported encodings

Expand Down
39 changes: 27 additions & 12 deletions lib/whatwg-encoding.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
"use strict";
const iconvLite = require("iconv-lite");
const supportedNames = require("./supported-names.json");
const names = require("./names.json");
const labelsToNames = require("./labels-to-names.json");

const supportedNamesSet = new Set(supportedNames);
const namesSet = new Set(names);

// https://encoding.spec.whatwg.org/#concept-encoding-get
exports.labelToName = label => {
Expand All @@ -13,35 +12,51 @@ exports.labelToName = label => {
};

// https://encoding.spec.whatwg.org/#decode
exports.decode = (buffer, fallbackEncodingName) => {
exports.decode = (uint8Array, fallbackEncodingName, { errorMode = "replacement" } = {}) => {
let encoding = fallbackEncodingName;
if (!exports.isSupported(encoding)) {
throw new RangeError(`"${encoding}" is not a supported encoding name`);
}

const bomEncoding = exports.getBOMEncoding(buffer);
if (encoding === "replacement") {
// the TextDecoder constructor will early-error. We implement
// https://encoding.spec.whatwg.org/#replacement-decoder instead.
if (uint8Array.byteLength === 0) {
return "";
}

if (errorMode === "fatal") {
throw new TypeError("The replacement encoding always errors on any non-empty input");
} else {
return "\uFFFD".repeat(uint8Array.byteLength);
}
}

const bomEncoding = exports.getBOMEncoding(uint8Array);
let start = 0;
if (bomEncoding !== null) {
encoding = bomEncoding;
start = bomEncoding === "UTF-8" ? 3 : 2;
}

// iconv-lite will strip BOMs for us, so no need to do the stuff the spec does
const subarray = uint8Array.subarray(start, uint8Array.byteLength);

return iconvLite.decode(buffer, encoding);
return (new TextDecoder(encoding, { ignoreBOM: true, fatal: errorMode === "fatal" })).decode(subarray);
};

// https://github.com/whatwg/html/issues/1910#issuecomment-254017369
exports.getBOMEncoding = buffer => {
if (buffer[0] === 0xFE && buffer[1] === 0xFF) {
exports.getBOMEncoding = uint8Array => {
if (uint8Array[0] === 0xFE && uint8Array[1] === 0xFF) {
return "UTF-16BE";
} else if (buffer[0] === 0xFF && buffer[1] === 0xFE) {
} else if (uint8Array[0] === 0xFF && uint8Array[1] === 0xFE) {
return "UTF-16LE";
} else if (buffer[0] === 0xEF && buffer[1] === 0xBB && buffer[2] === 0xBF) {
} else if (uint8Array[0] === 0xEF && uint8Array[1] === 0xBB && uint8Array[2] === 0xBF) {
return "UTF-8";
}

return null;
};

exports.isSupported = name => {
return supportedNamesSet.has(String(name));
return namesSet.has(String(name));
};
10 changes: 4 additions & 6 deletions scripts/update.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@ async function main() {
const supportedNames = [];
for (const entry of body) {
for (const encoding of entry.encodings) {
if (iconvLite.encodingExists(encoding.name)) {
supportedNames.push(encoding.name);
for (const label of encoding.labels) {
labelsToNames[label] = encoding.name;
}
supportedNames.push(encoding.name);
for (const label of encoding.labels) {
labelsToNames[label] = encoding.name;
}
}
}
Expand All @@ -25,7 +23,7 @@ async function main() {
fs.writeFileSync(path.resolve(__dirname, "../lib/labels-to-names.json"), labelsToNamesOutput);

const supportedNamesOutput = JSON.stringify(supportedNames, undefined, 2);
fs.writeFileSync(path.resolve(__dirname, "../lib/supported-names.json"), supportedNamesOutput);
fs.writeFileSync(path.resolve(__dirname, "../lib/names.json"), supportedNamesOutput);
}

main().catch(e => {
Expand Down
131 changes: 61 additions & 70 deletions test/tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,52 +4,65 @@ const whatwgEncoding = require("..");

describe("decode", () => {
it("should decode BOM-less windows-1252", () => {
const buffer = new Buffer([0x80, 0x95]);
const string = whatwgEncoding.decode(buffer, "windows-1252");
const uint8Array = new Uint8Array([0x80, 0x95]);
const string = whatwgEncoding.decode(uint8Array, "windows-1252");

assert.strictEqual(string, "€•");
});

it("should override when it sees a UTF-8 BOM", () => {
const buffer = new Buffer([0xEF, 0xBB, 0xBF, 0xE2, 0x82, 0xAC, 0xE2, 0x80, 0xA2]);
const string = whatwgEncoding.decode(buffer, "windows-1252");
const uint8Array = new Uint8Array([0xEF, 0xBB, 0xBF, 0xE2, 0x82, 0xAC, 0xE2, 0x80, 0xA2]);
const string = whatwgEncoding.decode(uint8Array, "windows-1252");

assert.strictEqual(string, "€•");
});

it("should override when it sees a UTF-16LE BOM", () => {
const buffer = new Buffer([0xFF, 0xFE, 0xAC, 0x20, 0x22, 0x20]);
const string = whatwgEncoding.decode(buffer, "windows-1252");
const uint8Array = new Uint8Array([0xFF, 0xFE, 0xAC, 0x20, 0x22, 0x20]);
const string = whatwgEncoding.decode(uint8Array, "windows-1252");

assert.strictEqual(string, "€•");
});

it("should override when it sees a UTF-16BE BOM", () => {
const buffer = new Buffer([0xFE, 0xFF, 0x20, 0xAC, 0x20, 0x22]);
const string = whatwgEncoding.decode(buffer, "windows-1252");
const uint8Array = new Uint8Array([0xFE, 0xFF, 0x20, 0xAC, 0x20, 0x22]);
const string = whatwgEncoding.decode(uint8Array, "windows-1252");

assert.strictEqual(string, "€•");
});

it("should throw when given an invalid encoding name", () => {
assert.throws(() => whatwgEncoding.decode(new Buffer([]), "asdf"), RangeError);
assert.throws(() => whatwgEncoding.decode(new Buffer([]), "utf-8"), RangeError);
assert.throws(() => whatwgEncoding.decode(new Buffer([]), " UTF-8"), RangeError);
assert.throws(() => whatwgEncoding.decode(new Buffer([]), "UTF-32"), RangeError);
it("should throw a TypeError on invalid byte sequence inputs for a given encoding when mode is fatal", () => {
// Anything fails for replacement
assert.throws(() => whatwgEncoding.decode(new Uint8Array([0x20]), "replacement", { errorMode: "fatal" }), TypeError);

assert.throws(() => whatwgEncoding.decode(new Uint8Array([0x83, 0x5C]), "Big5", { errorMode: "fatal" }), TypeError);
});

it("should output U+FFFD for a given encoding when mode is replacement (the default)", () => {
// Anything fails for replacement
assert.strictEqual(whatwgEncoding.decode(new Uint8Array([0x20]), "replacement", { errorMode: "replacement" }), "\uFFFD");
assert.strictEqual(whatwgEncoding.decode(new Uint8Array([0x20]), "replacement"), "\uFFFD");

assert.strictEqual(whatwgEncoding.decode(new Uint8Array([0x83, 0x5C]), "Big5", { errorMode: "replacement" }), "\uFFFD");
assert.strictEqual(whatwgEncoding.decode(new Uint8Array([0x83, 0x5C]), "Big5"), "\uFFFD\u005C");
});

it("should return the empty string for empty input", () => {
assert.strictEqual(whatwgEncoding.decode(new Uint8Array([]), "replacement"), "");
assert.strictEqual(whatwgEncoding.decode(new Uint8Array([]), "Big5"), "");
});

it("should throw when given an unsupported encoding name", () => {
assert.throws(() => whatwgEncoding.decode(new Buffer([]), "ISO-2022-JP"), RangeError);
assert.throws(() => whatwgEncoding.decode(new Buffer([]), "ISO-8859-8-I"), RangeError);
assert.throws(() => whatwgEncoding.decode(new Buffer([]), "replacement"), RangeError);
assert.throws(() => whatwgEncoding.decode(new Buffer([]), "x-mac-cyrillic"), RangeError);
assert.throws(() => whatwgEncoding.decode(new Buffer([]), "x-user-defined"), RangeError);
it("should throw when given an invalid encoding name", () => {
assert.throws(() => whatwgEncoding.decode(new Uint8Array([]), "asdf"), RangeError);
assert.throws(() => whatwgEncoding.decode(new Uint8Array([]), "utf-8"), RangeError);
assert.throws(() => whatwgEncoding.decode(new Uint8Array([]), " UTF-8"), RangeError);
assert.throws(() => whatwgEncoding.decode(new Uint8Array([]), "UTF-32"), RangeError);
});

it("should throw when given an encoding label that is not a name", () => {
assert.throws(() => whatwgEncoding.decode(new Buffer([]), "ascii"), RangeError);
assert.throws(() => whatwgEncoding.decode(new Buffer([]), "latin1"), RangeError);
assert.throws(() => whatwgEncoding.decode(new Buffer([]), "iso88591"), RangeError);
assert.throws(() => whatwgEncoding.decode(new Uint8Array([]), "ascii"), RangeError);
assert.throws(() => whatwgEncoding.decode(new Uint8Array([]), "latin1"), RangeError);
assert.throws(() => whatwgEncoding.decode(new Uint8Array([]), "iso88591"), RangeError);
});
});

Expand All @@ -59,6 +72,7 @@ describe("labelToName", () => {
assert.strictEqual(whatwgEncoding.labelToName("csibm866"), "IBM866");
assert.strictEqual(whatwgEncoding.labelToName("latin3"), "ISO-8859-3");
assert.strictEqual(whatwgEncoding.labelToName("tis-620"), "windows-874");
assert.strictEqual(whatwgEncoding.labelToName("replacement"), "replacement");
});

it("should be case-insensitive", () => {
Expand All @@ -85,30 +99,9 @@ describe("labelToName", () => {
it("should return null for invalid encoding labels", () => {
assert.strictEqual(whatwgEncoding.labelToName("AS\u0009CII"), null);
assert.strictEqual(whatwgEncoding.labelToName("asdf"), null);
assert.strictEqual(whatwgEncoding.labelToName("replacement"), null);
assert.strictEqual(whatwgEncoding.labelToName("UTF-32"), null);
});

it("should return null for unsupported encoding labels", () => {
assert.strictEqual(whatwgEncoding.labelToName("ISO-2022-JP"), null);
assert.strictEqual(whatwgEncoding.labelToName("csiso2022jp"), null);

assert.strictEqual(whatwgEncoding.labelToName("ISO-8859-8-I"), null);
assert.strictEqual(whatwgEncoding.labelToName("csiso88598i"), null);
assert.strictEqual(whatwgEncoding.labelToName("logical"), null);

assert.strictEqual(whatwgEncoding.labelToName("csiso2022kr"), null);
assert.strictEqual(whatwgEncoding.labelToName("hz-gb-2312"), null);
assert.strictEqual(whatwgEncoding.labelToName("iso-2022-cn"), null);
assert.strictEqual(whatwgEncoding.labelToName("iso-2022-cn-ext"), null);
assert.strictEqual(whatwgEncoding.labelToName("iso-2022-kr"), null);

assert.strictEqual(whatwgEncoding.labelToName("x-mac-cyrillic"), null);
assert.strictEqual(whatwgEncoding.labelToName("x-mac-ukrainian"), null);

assert.strictEqual(whatwgEncoding.labelToName("x-user-defined"), null);
});

it("should return null for non-strings", () => {
assert.strictEqual(whatwgEncoding.labelToName(), null);
assert.strictEqual(whatwgEncoding.labelToName(5), null);
Expand All @@ -117,7 +110,7 @@ describe("labelToName", () => {
});

describe("isSupported", () => {
it("should return true for supported encodings", () => {
it("should return true for all supported encodings", () => {
assert.strictEqual(whatwgEncoding.isSupported("UTF-8"), true);
assert.strictEqual(whatwgEncoding.isSupported("IBM866"), true);
assert.strictEqual(whatwgEncoding.isSupported("ISO-8859-2"), true);
Expand Down Expand Up @@ -153,6 +146,12 @@ describe("isSupported", () => {
assert.strictEqual(whatwgEncoding.isSupported("EUC-KR"), true);
assert.strictEqual(whatwgEncoding.isSupported("UTF-16BE"), true);
assert.strictEqual(whatwgEncoding.isSupported("UTF-16LE"), true);
assert.strictEqual(whatwgEncoding.isSupported("ISO-2022-JP"), true);
assert.strictEqual(whatwgEncoding.isSupported("ISO-8859-8-I"), true);
assert.strictEqual(whatwgEncoding.isSupported("replacement"), true);
assert.strictEqual(whatwgEncoding.isSupported("x-mac-cyrillic"), true);
assert.strictEqual(whatwgEncoding.isSupported("x-user-defined"), true);
assert.strictEqual(whatwgEncoding.isSupported("replacement"), true);
});

it("should return false for miscapitalizations and non-name labels", () => {
Expand All @@ -161,14 +160,6 @@ describe("isSupported", () => {
assert.strictEqual(whatwgEncoding.isSupported("latin1"), false);
});

it("should return false for the unimplemented encodings", () => {
assert.strictEqual(whatwgEncoding.isSupported("ISO-2022-JP"), false);
assert.strictEqual(whatwgEncoding.isSupported("ISO-8859-8-I"), false);
assert.strictEqual(whatwgEncoding.isSupported("replacement"), false);
assert.strictEqual(whatwgEncoding.isSupported("x-mac-cyrillic"), false);
assert.strictEqual(whatwgEncoding.isSupported("x-user-defined"), false);
});

it("should return false for invalid encoding names", () => {
assert.strictEqual(whatwgEncoding.isSupported("asdf"), false);
assert.strictEqual(whatwgEncoding.isSupported("UTF-32"), false);
Expand All @@ -177,57 +168,57 @@ describe("isSupported", () => {

describe("getBOMEncoding", () => {
it("should return UTF-8 for a UTF-8 BOM", () => {
const buffer = new Buffer([0xEF, 0xBB, 0xBF, 0xE2, 0x82, 0xAC, 0xE2, 0x80, 0xA2]);
const encoding = whatwgEncoding.getBOMEncoding(buffer);
const uint8Array = new Uint8Array([0xEF, 0xBB, 0xBF, 0xE2, 0x82, 0xAC, 0xE2, 0x80, 0xA2]);
const encoding = whatwgEncoding.getBOMEncoding(uint8Array);

assert.strictEqual(encoding, "UTF-8");
});

it("should return UTF-16LE for a UTF-16LE BOM", () => {
const buffer = new Buffer([0xFF, 0xFE, 0xAC, 0x20, 0x22, 0x20]);
const encoding = whatwgEncoding.getBOMEncoding(buffer);
const uint8Array = new Uint8Array([0xFF, 0xFE, 0xAC, 0x20, 0x22, 0x20]);
const encoding = whatwgEncoding.getBOMEncoding(uint8Array);

assert.strictEqual(encoding, "UTF-16LE");
});

it("should return UTF-16BE for a UTF-16BE BOM", () => {
const buffer = new Buffer([0xFE, 0xFF, 0x20, 0xAC, 0x20, 0x22]);
const encoding = whatwgEncoding.getBOMEncoding(buffer);
const uint8Array = new Uint8Array([0xFE, 0xFF, 0x20, 0xAC, 0x20, 0x22]);
const encoding = whatwgEncoding.getBOMEncoding(uint8Array);

assert.strictEqual(encoding, "UTF-16BE");
});

it("should return null for no BOM", () => {
const buffer = new Buffer([0x80, 0x95]);
const encoding = whatwgEncoding.getBOMEncoding(buffer);
const uint8Array = new Uint8Array([0x80, 0x95]);
const encoding = whatwgEncoding.getBOMEncoding(uint8Array);

assert.strictEqual(encoding, null);
});

it("should return UTF-16LE for a UTF-32LE BOM", () => {
const buffer = new Buffer([0xFF, 0xFE, 0x00, 0x00]);
const encoding = whatwgEncoding.getBOMEncoding(buffer);
const uint8Array = new Uint8Array([0xFF, 0xFE, 0x00, 0x00]);
const encoding = whatwgEncoding.getBOMEncoding(uint8Array);

assert.strictEqual(encoding, "UTF-16LE");
});

it("should return null for a UTF-32BE BOM", () => {
const buffer = new Buffer([0x00, 0x00, 0xFF, 0xFE]);
const encoding = whatwgEncoding.getBOMEncoding(buffer);
const uint8Array = new Uint8Array([0x00, 0x00, 0xFF, 0xFE]);
const encoding = whatwgEncoding.getBOMEncoding(uint8Array);

assert.strictEqual(encoding, null);
});

it("should return null for an empty buffer", () => {
const buffer = new Buffer([]);
const encoding = whatwgEncoding.getBOMEncoding(buffer);
it("should return null for an empty uint8Array", () => {
const uint8Array = new Uint8Array([]);
const encoding = whatwgEncoding.getBOMEncoding(uint8Array);

assert.strictEqual(encoding, null);
});

it("should return null for a one-byte buffer", () => {
const buffer = new Buffer([0xFF]);
const encoding = whatwgEncoding.getBOMEncoding(buffer);
it("should return null for a one-byte uint8Array", () => {
const uint8Array = new Uint8Array([0xFF]);
const encoding = whatwgEncoding.getBOMEncoding(uint8Array);

assert.strictEqual(encoding, null);
});
Expand Down