Merge pull request #97 from curbengh/decode-url

feat: decodeURL()
hexojs · Sep 20, 2019 · e463a12 · e463a12
2 parents a4a0b37 + 8758e6b
commit e463a12
Show file tree

Hide file tree

Showing 5 changed files with 378 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -61,6 +61,21 @@ const sha1 = createSha1Hash();
   });
 ```
 
+### decodeURL(str)
+
+Decode a [percent-encoded](https://en.wikipedia.org/wiki/Percent-encoding) URL or path. An alternative to the native [`decodeURI()`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURI) function, with the added ability to decode a [punycoded](https://en.wikipedia.org/wiki/Punycode) domain.
+
+``` js
+decodeURL('http://foo.com/b%C3%A1r')
+// http://foo.com/bár
+
+decodeURL('http://xn--br-mia.com/baz')
+// http://bár.com/baz
+
+decodeURL('/foo/b%C3%A1r/')
+// /foo/bár/
+```
+
 ### encodeURL(str)
 
 Encode URL or path into a [safe format](https://en.wikipedia.org/wiki/Percent-encoding). Domain is encoded into [punycode](https://en.wikipedia.org/wiki/Punycode) when necessary.

diff --git a/lib/decode_url.js b/lib/decode_url.js
@@ -0,0 +1,38 @@
+'use strict';
+
+const { parse, format } = require('url');
+const { toUnicode } = require('./punycode');
+
+const safeDecodeURI = (str) => {
+  try {
+    return decodeURI(str);
+  } catch (err) {
+    return str;
+  }
+};
+
+const decodeURL = (str) => {
+  const parsed = parse(str);
+  if (parsed.protocol) {
+    const obj = Object.assign({}, {
+      auth: parsed.auth,
+      protocol: parsed.protocol,
+      host: toUnicode(parsed.host),
+      pathname: safeDecodeURI(parsed.pathname)
+    });
+
+    if (parsed.hash) {
+      Object.assign(obj, { hash: safeDecodeURI(parsed.hash) });
+    }
+
+    if (parsed.search) {
+      Object.assign(obj, { search: safeDecodeURI(parsed.search) });
+    }
+
+    return format(obj);
+  }
+
+  return safeDecodeURI(str);
+};
+
+module.exports = decodeURL;
diff --git a/lib/index.js b/lib/index.js
@@ -6,6 +6,7 @@ exports.CacheStream = require('./cache_stream');
 exports.camelCaseKeys = require('./camel_case_keys');
 exports.Color = require('./color');
 exports.createSha1Hash = hash.createSha1Hash;
+exports.decodeURL = require('./decode_url');
 exports.encodeURL = require('./encode_url');
 exports.escapeDiacritic = require('./escape_diacritic');
 exports.escapeHTML = require('./escape_html');

diff --git a/lib/punycode.js b/lib/punycode.js
@@ -0,0 +1,237 @@
+'use strict';
+
+/* !
+ * punycode 2.1.1
+ * Licensed MIT (c) 2014-2019 Mathias Bynens <https://mathiasbynens.be/>
+ * https://github.com/bestiejs/punycode.js
+ *
+ * Only punycode.toUnicode(input) is implemented
+ */
+
+/** Highest positive signed 32-bit integer value */
+const maxInt = 2147483647; // aka. 0x7FFFFFFF or 2^31-1
+
+/** Bootstring parameters (see RFC 3492 for their role in Punycode) */
+const base = 36;
+const tMin = 1;
+const tMax = 26;
+const skew = 38;
+const damp = 700;
+const initialBias = 72;
+const initialN = 128; // 0x80
+const delimiter = '-'; // '\x2D'
+
+/** Regular expressions */
+const regexPunycode = /^xn--/;
+const regexSeparators = /[\x2E\u3002\uFF0E\uFF61]/g; // RFC 3490 separators
+
+/** Error messages, keyed by the type passed to `error()` */
+const errors = {
+  'overflow': 'Overflow: input needs wider integers to process',
+  'not-basic': 'Illegal input >= 0x80 (not a basic code point)',
+  'invalid-input': 'Invalid input'
+};
+
+/** Convenience shortcuts */
+const { floor } = Math;
+const baseMinusTMin = base - tMin;
+
+/* --------------------------------------------------------------------------*/
+
+/**
+ * Raise a `RangeError` whose message is looked up in `errors`.
+ * @private
+ * @param {String} type Key of the message in `errors`.
+ * @throws {RangeError} Always; this function never returns.
+ */
+const error = (type) => {
+  const message = errors[type];
+  throw new RangeError(message);
+};
+
+/**
+ * A generic `Array#map` utility function. Unlike `Array#map`, `fn` is called
+ * with the item only, and items are visited from the last index to the first.
+ * @private
+ * @param {Array} array The array to iterate over.
+ * @param {Function} fn The function that gets called for every array
+ * item.
+ * @returns {Array} A new array of values returned by `fn`, stored at the
+ * same indexes as the items they were computed from.
+ */
+const map = (array, fn) => {
+  const result = [];
+  let length = array.length;
+  while (length--) { // after the decrement, `length` is the index being processed
+    result[length] = fn(array[length]);
+  }
+  return result;
+};
+
+/**
+ * A simple `Array#map`-like wrapper to work with domain name strings or email
+ * addresses.
+ * @private
+ * @param {String} domain The domain name or email address.
+ * @param {Function} callback The function that gets called for every
+ * character.
+ * @returns {Array} A new string of characters returned by the callback
+ * function.
+ */
+const mapDomain = (string, fn) => {
+  // Avoid `split(regex)` for IE8 compatibility. See https://github.com/bestiejs/punycode.js/issues/17.
+  string = string.replace(regexSeparators, '\x2E');
+  const labels = string.split('.');
+  const encoded = map(labels, fn).join('.');
+  return encoded;
+};
+
+/**
+ * Converts a basic code point into a digit/integer.
+ * @see `digitToBasic()`
+ * @private
+ * @param {Number} codePoint The basic numeric code point value.
+ * @returns {Number} The numeric value of a basic code point (for use in
+ * representing integers) in the range `0` to `base - 1`, or `base` if
+ * the code point does not represent a value.
+ */
+const basicToDigit = (codePoint) => {
+  if (codePoint - 0x30 < 0x0A) {
+    return codePoint - 0x16;
+  }
+  if (codePoint - 0x41 < 0x1A) {
+    return codePoint - 0x41;
+  }
+  if (codePoint - 0x61 < 0x1A) {
+    return codePoint - 0x61;
+  }
+  return base;
+};
+
+/**
+ * Bias adaptation function as per section 3.4 of RFC 3492.
+ * https://tools.ietf.org/html/rfc3492#section-3.4
+ * @private
+ */
+const adapt = (delta, numPoints, firstTime) => {
+  let k = 0;
+  delta = firstTime ? floor(delta / damp) : delta >> 1;
+  delta += floor(delta / numPoints);
+  for (/* no initialization */; delta > baseMinusTMin * tMax >> 1; k += base) {
+    delta = floor(delta / baseMinusTMin);
+  }
+  return floor(k + ((baseMinusTMin + 1) * delta / (delta + skew)));
+};
+
+/**
+ * Converts a Punycode string of ASCII-only symbols to a string of Unicode
+ * symbols.
+ * @memberOf punycode
+ * @param {String} input The Punycode string of ASCII-only symbols.
+ * @returns {String} The resulting string of Unicode symbols.
+ */
+const decode = (input) => {
+  // Don't use UCS-2.
+  const output = [];
+  const inputLength = input.length;
+  let i = 0;
+  let n = initialN;
+  let bias = initialBias;
+
+  // Handle the basic code points: let `basic` be the number of input code
+  // points before the last delimiter, or `0` if there is none, then copy
+  // the first basic code points to the output.
+
+  let basic = input.lastIndexOf(delimiter);
+  if (basic < 0) {
+    basic = 0;
+  }
+
+  for (let j = 0; j < basic; ++j) {
+    // if it's not a basic code point
+    if (input.charCodeAt(j) >= 0x80) {
+      error('not-basic');
+    }
+    output.push(input.charCodeAt(j));
+  }
+
+  // Main decoding loop: start just after the last delimiter if any basic code
+  // points were copied; start at the beginning otherwise.
+
+  for (let index = basic > 0 ? basic + 1 : 0; index < inputLength; /* no final expression */) {
+
+    // `index` is the index of the next character to be consumed.
+    // Decode a generalized variable-length integer into `delta`,
+    // which gets added to `i`. The overflow checking is easier
+    // if we increase `i` as we go, then subtract off its starting
+    // value at the end to obtain `delta`.
+    let oldi = i;
+    for (let w = 1, k = base; /* no condition */; k += base) {
+
+      if (index >= inputLength) {
+        error('invalid-input');
+      }
+
+      const digit = basicToDigit(input.charCodeAt(index++));
+
+      if (digit >= base || digit > floor((maxInt - i) / w)) {
+        error('overflow');
+      }
+
+      i += digit * w;
+
+      let t;
+      if (k <= bias) t = tMin;
+      else if (k >= bias + tMax) t = tMax;
+      else t = k - bias;
+
+      if (digit < t) {
+        break;
+      }
+
+      const baseMinusT = base - t;
+      if (w > floor(maxInt / baseMinusT)) {
+        error('overflow');
+      }
+
+      w *= baseMinusT;
+
+    }
+
+    const out = output.length + 1;
+    bias = adapt(i - oldi, out, oldi === 0);
+
+    // `i` was supposed to wrap around from `out` to `0`,
+    // incrementing `n` each time, so we'll fix that now:
+    if (floor(i / out) > maxInt - n) {
+      error('overflow');
+    }
+
+    n += floor(i / out);
+    i %= out;
+
+    // Insert `n` at position `i` of the output.
+    output.splice(i++, 0, n);
+
+  }
+
+  return String.fromCodePoint(...output);
+};
+
+/**
+ * Converts a Punycode string representing a domain name or an email address
+ * to Unicode. Only the Punycoded parts of the input will be converted, i.e.
+ * it doesn't matter if you call it on a string that has already been
+ * converted to Unicode.
+ * @memberOf punycode
+ * @param {String} input The Punycoded domain name or email address to
+ * convert to Unicode.
+ * @returns {String} The Unicode representation of the given Punycode
+ * string.
+ */
+const toUnicode = (input) => {
+  return mapDomain(input, (string) => {
+    return regexPunycode.test(string)
+      ? decode(string.slice(4).toLowerCase())
+      : string;
+  });
+};
+
+module.exports = { toUnicode: toUnicode };
diff --git a/test/decode_url.spec.js b/test/decode_url.spec.js
@@ -0,0 +1,87 @@
+'use strict';
+
+require('chai').should();
+
+describe('decodeURL', () => {
+  const decodeURL = require('../lib/decode_url');
+
+  it('regular', () => {
+    const content = 'http://foo.com/';
+    decodeURL(content).should.eql(content);
+  });
+
+  it('auth', () => {
+    const content = 'http://user:pass@foo.com/';
+    decodeURL(content).should.eql(content);
+  });
+
+  it('port', () => {
+    const content = 'http://foo.com:80/';
+    decodeURL(content).should.eql(content);
+  });
+
+  it('space', () => {
+    const content = 'http://foo.com/bar%20baz';
+    decodeURL(content).should.eql('http://foo.com/bar baz');
+  });
+
+  it('unicode', () => {
+    const content = 'http://foo.com/b%C3%A1r';
+    decodeURL(content).should.eql('http://foo.com/bár');
+  });
+
+  it('decode once', () => {
+    const content = 'http://fóo.com/bár';
+    decodeURL(content).should.eql(content);
+  });
+
+  it('hash', () => {
+    const content = 'http://foo.com/b%C3%A1r#b%C3%A0z';
+    decodeURL(content).should.eql('http://foo.com/bár#bàz');
+  });
+
+  it('query', () => {
+    const content = 'http://foo.com/bar?q%C3%BAery=b%C3%A1z';
+    decodeURL(content).should.eql('http://foo.com/bar?qúery=báz');
+  });
+
+  it('multiple queries', () => {
+    const content = 'http://foo.com/bar?query1=a%C3%A1a&query2=a%C3%A0a';
+    decodeURL(content).should.eql('http://foo.com/bar?query1=aáa&query2=aàa');
+  });
+
+  it('hash and query', () => {
+    const content = 'http://foo.com/bar?query=b%C3%A1z#f%C3%B3o';
+    decodeURL(content).should.eql('http://foo.com/bar?query=báz#fóo');
+  });
+
+  it('idn', () => {
+    const content = 'http://xn--br-mia.com/baz';
+    decodeURL(content).should.eql('http://bár.com/baz');
+  });
+
+  it('path', () => {
+    const content = '/foo/bar/';
+    decodeURL(content).should.eql(content);
+  });
+
+  it('path with space', () => {
+    const content = '/foo%20bar/baz/';
+    decodeURL(content).should.eql('/foo bar/baz/');
+  });
+
+  it('path with unicode', () => {
+    const content = '/foo/b%C3%A1r/';
+    decodeURL(content).should.eql('/foo/bár/');
+  });
+
+  it('decode path once', () => {
+    const content = '/foo/bár /';
+    decodeURL(content).should.eql(content);
+  });
+
+  it('anchor with unicode', () => {
+    const content = '#f%C3%B3o-b%C3%A1r';
+    decodeURL(content).should.eql('#fóo-bár');
+  });
+});