diff --git a/karma.conf.js b/karma.conf.js index 529a6a0a7b..2c3b7902af 100644 --- a/karma.conf.js +++ b/karma.conf.js @@ -160,10 +160,6 @@ module.exports = (config) => { 'node_modules/es6-promise-polyfill/promise.js', // Babel polyfill, required for async/await 'node_modules/@babel/polyfill/dist/polyfill.js', - // TextDecoder polyfill, required for TextDecoder/TextEncoder on IE and - // legacy Edge - // eslint-disable-next-line max-len - 'node_modules/fastestsmallesttextencoderdecoder/EncoderDecoderTogether.min.js', // muxjs module next 'node_modules/mux.js/dist/mux.min.js', diff --git a/lib/player.js b/lib/player.js index cf54c0cde0..94918e62f1 100644 --- a/lib/player.js +++ b/lib/player.js @@ -857,14 +857,9 @@ shaka.Player = class extends shaka.util.FakeEventTarget { if (!window.Promise) { shaka.log.alwaysWarn('A Promise implementation or polyfill is required'); } - if (!window.TextDecoder || !window.TextEncoder) { - shaka.log.alwaysWarn( - 'A TextDecoder/TextEncoder implementation or polyfill is required'); - } // Basic features needed for the library to be usable. const basicSupport = !!window.Promise && !!window.Uint8Array && - !!window.TextDecoder && !!window.TextEncoder && // eslint-disable-next-line no-restricted-syntax !!Array.prototype.forEach; if (!basicSupport) { diff --git a/lib/util/string_utils.js b/lib/util/string_utils.js index 1fe17e6d56..40753123fc 100644 --- a/lib/util/string_utils.js +++ b/lib/util/string_utils.js @@ -37,6 +37,7 @@ shaka.util.StringUtils = class { if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) { uint8 = uint8.subarray(3); } + if (window.TextDecoder && !shaka.util.Platform.isPS4()) { // Use the TextDecoder interface to decode the text. This has the // advantage compared to the previously-standard decodeUriComponent that @@ -51,23 +52,69 @@ shaka.util.StringUtils = class { } return decoded; } else { - // http://stackoverflow.com/a/13691499 - const utf8 = shaka.util.StringUtils.fromCharCode(uint8); - // This converts each character in the string to an escape sequence. If - // the character is in the ASCII range, it is not converted; otherwise it - // is converted to a URI escape sequence. - // Example: '\x67\x35\xe3\x82\xac' -> 'g#%E3%82%AC' - const escaped = escape(utf8); - // Decode the escaped sequence. This will interpret UTF-8 sequences into - // the correct character. - // Example: 'g#%E3%82%AC' -> 'g#€' - try { - return decodeURIComponent(escaped); - } catch (e) { - throw new shaka.util.Error( - shaka.util.Error.Severity.CRITICAL, shaka.util.Error.Category.TEXT, - shaka.util.Error.Code.BAD_ENCODING); + // Homebrewed UTF-8 decoder based on + // https://en.wikipedia.org/wiki/UTF-8#Encoding + // Unlike decodeURIComponent, won't throw on bad encoding. + // In this way, it is similar to TextDecoder. + + let decoded = ''; + for (let i = 0; i < uint8.length; ++i) { + // By default, the "replacement character" codepoint. + let codePoint = 0xFFFD; + + // Top bit is 0, 1-byte encoding. + if ((uint8[i] & 0x80) == 0) { + codePoint = uint8[i]; + + // Top 3 bits of byte 0 are 110, top 2 bits of byte 1 are 10, + // 2-byte encoding. + } else if (uint8.length >= i + 2 && + (uint8[i] & 0xe0) == 0xc0 && + (uint8[i + 1] & 0xc0) == 0x80) { + codePoint = ((uint8[i] & 0x1f) << 6) | + ((uint8[i + 1] & 0x3f)); + i += 1; // Consume one extra byte. + + // Top 4 bits of byte 0 are 1110, top 2 bits of byte 1 and 2 are 10, + // 3-byte encoding. + } else if (uint8.length >= i + 3 && + (uint8[i] & 0xf0) == 0xe0 && + (uint8[i + 1] & 0xc0) == 0x80 && + (uint8[i + 2] & 0xc0) == 0x80) { + codePoint = ((uint8[i] & 0x0f) << 12) | + ((uint8[i + 1] & 0x3f) << 6) | + ((uint8[i + 2] & 0x3f)); + i += 2; // Consume two extra bytes. + + // Top 5 bits of byte 0 are 11110, top 2 bits of byte 1, 2 and 3 are 10, + // 4-byte encoding. + } else if (uint8.length >= i + 4 && + (uint8[i] & 0xf1) == 0xf0 && + (uint8[i + 1] & 0xc0) == 0x80 && + (uint8[i + 2] & 0xc0) == 0x80 && + (uint8[i + 3] & 0xc0) == 0x80) { + codePoint = ((uint8[i] & 0x07) << 18) | + ((uint8[i + 1] & 0x3f) << 12) | + ((uint8[i + 2] & 0x3f) << 6) | + ((uint8[i + 3] & 0x3f)); + i += 3; // Consume three extra bytes. + } + + // JavaScript strings are a series of UTF-16 characters. + if (codePoint <= 0xffff) { + decoded += String.fromCharCode(codePoint); + } else { + // UTF-16 surrogate-pair encoding, based on + // https://en.wikipedia.org/wiki/UTF-16#Description + const baseCodePoint = codePoint - 0x10000; + const highPart = baseCodePoint >> 10; + const lowPart = baseCodePoint & 0x3ff; + decoded += String.fromCharCode(0xd800 + highPart); + decoded += String.fromCharCode(0xdc00 + lowPart); + } } + + return decoded; } } diff --git a/test/util/string_utils_unit.js b/test/util/string_utils_unit.js index 4a798fad17..03f13dd6b8 100644 --- a/test/util/string_utils_unit.js +++ b/test/util/string_utils_unit.js @@ -16,19 +16,34 @@ describe('StringUtils', () => { }); it('won\'t break if given cut-off UTF8 character', () => { - // This array contains the first half of a 2-byte UTF8 character, stranded - // at the very end of the string. - const arr1 = [0x53, 0x61, 0x6e, 0x20, 0x4a, 0x6f, 0x73, 0x81]; + const arr1 = [0x53, 0x61, 0x6e, 0x20, 0x4a, 0x6f, 0x73, 0xc3, 0xa9]; expect(StringUtils.fromUTF8(new Uint8Array(arr1))) + .toBe('San Jos\u00E9'); + + // This array contains the first half of a 2-byte UTF8 character + // (0xc3 0xa9 = é). The half-character is stranded at the very end of the + // string. + const arr = [0x53, 0x61, 0x6e, 0x20, 0x4a, 0x6f, 0x73, 0xc3]; + expect(StringUtils.fromUTF8(new Uint8Array(arr))) .toBe('San Jos\uFFFD'); + }); - // For reasons I don't know, it seems like 0xE9 cannot be the start of a - // UTF8 character. Perhaps it is a reserved number? - const arr2 = [0x4a, 0x6f, 0x73, 0xE9, 0x33, 0x33, 0x20, 0x53, 0x61, 0x6e]; - expect(StringUtils.fromUTF8(new Uint8Array(arr2))) + it('won\'t break if given an invalid UTF-8 sequence', () => { + // 0xe9 0x33 0x33 is an invalid UTF-8 sequence. + const arr = [0x4a, 0x6f, 0x73, 0xE9, 0x33, 0x33, 0x20, 0x53, 0x61, 0x6e]; + expect(StringUtils.fromUTF8(new Uint8Array(arr))) .toBe('Jos\uFFFD33 San'); }); + it('can handle an 8-byte character', () => { + // This is the UTF-8 encoding of the US flag emoji. + // It decodes into two Unicode codepoints, which becomes 4 JavaScript + // UTF-16 characters. + const arr = [0xf0, 0x9f, 0x87, 0xba, 0xf0, 0x9f, 0x87, 0xb8]; + expect(StringUtils.fromUTF8(new Uint8Array(arr))) + .toBe('\uD83C\uDDFA\uD83C\uDDF8'); + }); + it('strips the BOM in fromUTF8', () => { // This is 4 Unicode characters, the last will be split into a surrogate // pair.